Got a pretty solid summary scraper working. Still need to clean all of it up. Keep testing against all of the index URLs to make this go faster and more efficient.

This commit is contained in:
Brendan McDevitt 2022-05-04 01:00:46 -05:00
parent 05c5c87808
commit 5899e5c14d
2 changed files with 278 additions and 20 deletions

View file

@ -0,0 +1,272 @@
class AdobeBulletinScraper
attr_accessor :url
# Creates a scraper bound to one page address.
#
# @param url [String] address that get_html will request; presumably a single
#   Adobe security bulletin page — TODO confirm against callers
def initialize(url)
@url = url
end
# Issues one HTTP GET for +url+ through RestClient.
#
# @return [String, nil] the response body when the status code is exactly 200;
#   for any other code the code is printed to stdout and nil (the value of
#   +puts+) is returned
def get_html
  response = RestClient::Request.execute(method: :get, url: url)
  return response.body if response.code == 200

  puts "HTTP Code #{response.code}"
end
# Parses markup with Nokogiri::HTML.
#
# @param html [String, nil] markup to parse; get_advisory passes the result of
#   get_html straight through, and get_html yields nil for a non-200 response
# @return the value returned by Nokogiri::HTML for +html+
def read_html(html)
Nokogiri::HTML(html)
end
# Downloads the page at +url+ and parses it.
#
# @return the value of read_html applied to the result of get_html
def get_advisory
  read_html(get_html)
end
# Builds an XPath expression selecting nodes matched by +xpath_driver+ whose
# text() contains +text+.
#
# The search text is emitted as a valid XPath 1.0 string literal. XPath 1.0 has
# no escape character, so text containing an apostrophe is wrapped in double
# quotes, and text containing both quote kinds is assembled with concat().
# Text without an apostrophe yields the single-quoted form.
#
# @param xpath_driver [String] node selector to filter, e.g. "//p"
# @param text [#to_s] substring to look for
# @return [String] e.g. "//p[contains(text(), 'update')]"
def xpath_contains_text(xpath_driver:, text:)
  needle = text.to_s
  literal =
    if !needle.include?("'")
      "'#{needle}'"
    elsif !needle.include?('"')
      "\"#{needle}\""
    else
      # Split on apostrophes and re-join the pieces with a double-quoted
      # apostrophe literal between them.
      pieces = needle.split("'", -1).map { |piece| "'#{piece}'" }
      "concat(#{pieces.join(%q{, "'", })})"
    end
  "#{xpath_driver}[contains(text(), #{literal})]"
end
# Builds an XPath expression selecting nodes matched by +xpath_driver+ whose
# id attribute equals +id_name+ exactly.
#
# Example:
#   xpath_id_search(xpath_driver: "//h1", id_name: "summary")
#   # => "//h1[@id='summary']"
#
# @param xpath_driver [String] node selector to filter, e.g. "//h1"
# @param id_name [#to_s] exact value of the id attribute
# @return [String] the assembled XPath expression
def xpath_id_search(xpath_driver:, id_name:)
  format("%s[@id='%s']", xpath_driver, id_name)
end
# Builds an XPath expression selecting nodes matched by +xpath_driver+ whose
# class attribute equals +class_name+ exactly (the whole attribute value is
# compared, not individual class tokens).
#
# @param xpath_driver [String] node selector to filter, e.g. "//table"
# @param class_name [#to_s] exact value of the class attribute
# @return [String] e.g. "//table[@class='dexter-Table']"
def xpath_class_search(xpath_driver:, class_name:)
  format("%s[@class='%s']", xpath_driver, class_name)
end
# XPath lookup table assembled with the xpath_* builder methods.
#
# Keys mirror those of adv_xpaths. The three *_table entries are nil because no
# builder-based expression exists for them here.
#
# NOTE: the summary text entries search for 'update' whereas adv_xpaths
# searches for 'updates'; 'update' also matches 'updates', so it is the broader
# of the two.
#
# @return [Hash{Symbol => String, nil}] section name => XPath expression
def adv_xpaths_methods
  {
    header_table: xpath_class_search(xpath_driver: "//table", class_name: "dexter-Table"),
    # Exact @class match: the value must be spelled in full ("overflowScroll"),
    # as it is in adv_xpaths.
    second_header_table: xpath_class_search(
      xpath_driver: "//table",
      class_name: "text aem-GridColumn aem-GridColumn--default--12 overflowScroll"
    ),
    summary_h1_upper: xpath_id_search(xpath_driver: "//h1", id_name: "Summary"),
    summary_h1_lower: xpath_id_search(xpath_driver: "//h1", id_name: "summary"),
    summary_h2_upper: xpath_id_search(xpath_driver: "//h2", id_name: "Summary"),
    summary_h2_lower: xpath_id_search(xpath_driver: "//h2", id_name: "summary"),
    summary_text_p: xpath_contains_text(xpath_driver: "//p", text: 'update'),
    summary_text_span: xpath_contains_text(xpath_driver: "//p/span", text: 'update'),
    affected_versions: xpath_id_search(xpath_driver: "//h1", id_name: 'AffectedVersions'),
    affected_versions_table: nil,
    solution: xpath_id_search(xpath_driver: "//h1", id_name: 'solution'),
    solution_table: nil,
    vulnerability_details: xpath_id_search(xpath_driver: "//h1", id_name: "Vulnerabilitydetails"),
    vulnerability_details_table: nil
  }
end
# Literal XPath expressions for each bulletin section; these are the
# expressions get_advisory_xpaths evaluates against the parsed page.
#
# The *_table entries are positional paths (one of them absolute from /html),
# so they depend on the exact layout of the page.
#
# @return [Hash{Symbol => String}] section name => XPath expression
def adv_xpaths
  {
    header_table: "//table[@class='dexter-Table']",
    second_header_table: "//table[@class='text aem-GridColumn aem-GridColumn--default--12 overflowScroll']",
    summary_h1_upper: "//h1[@id='Summary']",
    summary_h1_lower: "//h1[@id='summary']",
    summary_h2_upper: "//h2[@id='Summary']",
    summary_h2_lower: "//h2[@id='summary']",
    summary_text_p: "//p[contains(text(), 'updates')]",
    summary_text_span: "//p/span[contains(text(), 'updates')]",
    affected_versions: "//h1[@id='AffectedVersions']",
    affected_versions_table: "/html/body/div[2]/div/div[2]/div/div[3]/div/div/div[1]/div/div/div[7]/div/table",
    solution: "//h1[@id='solution']",
    solution_table: "//div[11]",
    vulnerability_details: "//h1[@id='Vulnerabilitydetails']",
    vulnerability_details_table: "//div[14]//div[1]//table[1]"
  }
end
# Fetches the bulletin, evaluates every expression in adv_xpaths against it and
# returns the fields parsed so far.
#
# Despite its name this returns the merged advisory data, not the XPath
# results: the output of header_table_hash (:bulletin_id, :date_published,
# :priority) merged with the output of summary_hash (:summary).
#
# No debugger breakpoint is embedded here; attach one locally when needed.
#
# TODO: merge in affected_versions_hash, solution_hash and
# vulnerability_details_hash once those parsers are finished.
#
# @return [Hash{Symbol => Object}] parsed advisory fields
def get_advisory_xpaths
  doc = get_advisory
  # Iterate adv_xpaths itself so each key is looked up in the same hash that
  # supplies its expression.
  xpath_hash = adv_xpaths.each_with_object({}) do |(key, expression), found|
    found[key] = doc.xpath(expression)
  end
  header_table_hash(xpath_hash).merge(summary_hash(xpath_hash))
end
# Extracts bulletin id, publication date and priority from the header table.
#
# The table text is collapsed to one line, the column captions
# "Bulletin ID Date Published Priority" are removed, and the remainder is split
# on whitespace. The first five tokens are read as id, month, day, year and
# priority, in that order.
#
# @param xpath_hash [Hash] must hold, under :header_table, an object responding
#   to #inner_text
# @return [Hash] :bulletin_id, :date_published ("month day year") and :priority
def header_table_hash(xpath_hash)
  one_line = xpath_hash[:header_table].inner_text.squish
  values_only = one_line.gsub("Bulletin ID Date Published Priority", "").squish
  bulletin_id, month, day, year, priority = values_only.split(" ")
  {
    bulletin_id: bulletin_id,
    date_published: "#{month} #{day} #{year}",
    priority: priority
  }
end
# Tells whether any of the four summary heading lookups matched something.
#
# The candidates are checked in order (h1 "Summary", h1 "summary",
# h2 "Summary", h2 "summary") and checking stops at the first non-empty one.
#
# @param xpath_hash [Hash] lookup results keyed by section name; each consulted
#   value must respond to #empty?
# @return [Boolean] true when at least one heading result is non-empty
def has_summary_heading?(xpath_hash)
  candidates = %i[summary_h1_upper summary_h1_lower summary_h2_upper summary_h2_lower]
  candidates.any? { |key| !xpath_hash[key].empty? }
end
# Returns the summary wording, whitespace-collapsed.
#
# The text usually sits directly in <p> elements; on some pages it is nested in
# <p><span> instead, so that lookup is only consulted when the first is empty.
#
# @param xpath_hash [Hash] lookup results; :summary_text_p (and, when that is
#   empty, :summary_text_span) must respond to #empty? and #text
# @return [String] the squished text, or '' when neither lookup matched
def find_summary_text(xpath_hash)
  paragraph_hits = xpath_hash[:summary_text_p]
  return paragraph_hits.text.squish unless paragraph_hits.empty?

  span_hits = xpath_hash[:summary_text_span]
  span_hits.empty? ? '' : span_hits.text.squish
end
# Wraps the summary text in a one-entry hash.
#
# @param xpath_hash [Hash] lookup results keyed by section name
# @return [Hash] { summary: text } when a summary heading was found, otherwise
#   { summary: nil }
def summary_hash(xpath_hash)
  summary_text = has_summary_heading?(xpath_hash) ? find_summary_text(xpath_hash) : nil
  { summary: summary_text }
end
# Collects the <tr> rows found under any <tbody> beneath +table+.
#
# @param table an object responding to #xpath; callers pass the first match of
#   one of the *_table expressions — presumably a Nokogiri node, TODO confirm
# @return the result of table.xpath(".//tbody/tr"); callers run it through
#   table_rows_drop_header, so the first row is treated as the header
def get_table_rows(table)
table.xpath(".//tbody/tr")
end
# Removes the first (header) row from +table_rows+ in place.
#
# @param table_rows a row collection responding to #shift
# @return the same collection object, now without its first element
def table_rows_drop_header(table_rows)
  table_rows.tap(&:shift)
end
# Lists every cell that carries a rowspan attribute, in document order.
#
# @param table_rows a collection of rows; each row's #children are inspected
# @return [Array<Hash>] one { product_name:, rowspan: } hash per cell having a
#   rowspan attribute; :product_name is the cell's squished text and :rowspan
#   the raw attribute value
def products_and_rowspans(table_rows)
  table_rows.flat_map do |row|
    spanning_cells = row.children.select { |cell| cell.has_attribute?('rowspan') }
    spanning_cells.map do |cell|
      {
        product_name: cell.children.text.squish,
        rowspan: cell.attributes.dig('rowspan').value
      }
    end
  end
end
# Flattens an affected-versions table into one hash per data row.
#
# Each product reported by products_and_rowspans claims as many consecutive
# rows as its own rowspan says, starting where the previous product stopped.
# Products therefore may span different numbers of rows, and a table without
# any rowspan cell yields an empty list.
#
# NOTE(review): every cell with a rowspan attribute is treated as a product
# cell, and rows of products without a rowspan attribute are not attributed —
# confirm against real bulletin tables.
#
# @param table table node handed to get_table_rows
# @return [Array<Hash>] { product_name:, version:, platform: } per row
def product_version_platform(table)
  rows = table_rows_drop_header(get_table_rows(table))
  row_list = rows.to_a
  cursor = 0
  products_and_rowspans(rows).flat_map do |product|
    name = product[:product_name]
    # A non-numeric or zero rowspan still covers the row the cell sits in.
    span = [product[:rowspan].to_i, 1].max
    claimed_rows = row_list[cursor, span] || []
    cursor += span
    claimed_rows.map do |tr|
      { product_name: name }.merge(version_and_platform(tr, name))
    end
  end
end
# Splits the text of one table row into version and platform.
#
# The row text is collapsed to a single line and the product name (when
# present) is removed. The last whitespace-separated token is taken as the
# platform and everything before it as the version. Only that trailing token is
# removed, so a version that happens to mention the platform word keeps it.
#
# NOTE(review): a multi-word platform such as "Windows and macOS" yields only
# its last word as :platform — confirm whether that is acceptable.
#
# @param tr_node an object responding to #text (a table row)
# @param product_name [String, nil] text to strip from the row; nil or ''
#   leaves the row text untouched
# @return [Hash] { version: String, platform: String or nil }; platform is nil
#   when nothing is left after removing the product name
def version_and_platform(tr_node, product_name)
  row_text = tr_node.text.gsub("\n", " ").squish
  name = product_name.to_s
  # gsub with an empty pattern would insert a space between every character.
  row_text = row_text.gsub(name, " ").squish unless name.empty?
  *version_tokens, platform = row_text.split(" ")
  { version: version_tokens.join(" "), platform: platform }
end
# Parses the affected-versions table into a one-entry hash.
#
# @param xpath_hash [Hash] lookup results; the first match under
#   :affected_versions_table is handed to product_version_platform
# @return [Hash] { affected_versions: <result of product_version_platform> }
def affected_versions_hash(xpath_hash)
  { affected_versions: product_version_platform(xpath_hash[:affected_versions_table].first) }
end
# Parses the solution table into one entry per data row.
#
# Each row's cell text is split on newlines and read positionally as product,
# updated version, platform and priority rating. The installation-instruction
# links are taken from the first data row only and attached to every entry.
#
# NOTE(review): presumably the links live in a cell that spans all rows, which
# would explain reading them from the first row — confirm.
#
# @param xpath_hash [Hash] lookup results; the first match under
#   :solution_table is the node searched for rows
# @return [Hash] { solution_info: Array<Hash> }, where each element is
#   { solution: { product:, updated_version:, platform:, priority_rating:,
#   installation_instruction_urls: } }; the array is empty when no table or no
#   data row was found
def solution_hash(xpath_hash)
  table = xpath_hash[:solution_table].first
  return { solution_info: [] } if table.nil?

  data_rows = table_rows_drop_header(get_table_rows(table))
  cells_per_row = data_rows.map { |tr| tr.xpath('./td') }
  return { solution_info: [] } if cells_per_row.empty?

  installation_instruction_urls = cells_per_row.first.children.xpath(".//a/@href").map(&:value)
  solutions = cells_per_row.map do |cells|
    product_name, updated_version, platform, priority_rating = cells.children.text.split("\n")
    {
      solution: {
        product: product_name,
        updated_version: updated_version,
        platform: platform,
        priority_rating: priority_rating,
        installation_instruction_urls: installation_instruction_urls
      }
    }
  end
  { solution_info: solutions }
end
# Reads the column captions of the vulnerability-details table.
#
# Only the <th> texts are extracted so far (newlines replaced by spaces, then
# whitespace-collapsed); parsing of the data rows is not implemented yet.
# No debugger breakpoint is embedded here; attach one locally when needed.
#
# @param xpath_hash [Hash] lookup results; the first match under
#   :vulnerability_details_table is the table searched
# @return [Array<String>] caption texts, or [] when no table was matched
def vulnerability_details_hash(xpath_hash)
  table = xpath_hash[:vulnerability_details_table].first
  return [] if table.nil?

  get_table_rows(table).xpath("./th").map { |th| th.text.gsub("\n", " ").squish }
end
#def advisory_hash
# {
# :bulletin_id => nil,
# :date_published => nil,
# :priority => nil,
# :summary => nil,
# :affected_versions => [{:product => nil, [:version => nil, :platform => nil}]],
# :solution => [{:product => nil, :updated_version => nil, :platform => nil, :priority_rating => nil, :installation_instructions => nil }],
# :vulnerability_details => [{:vulnerability_category => nil, :vulnerability_impact => nil, :severity => nil, :authentication_required_to_exploit? => nil, :exploit_requires_admin_privileges? => nil, :cvss_base_score => nil, :cvss_vector => nil, :cve_number => nil}]
# }
#end
end

View file

@ -1,4 +1,4 @@
class AdobeScraper
class AdobeIndexScraper
attr_accessor :index_url, :doc
APSB_ID_MATCHER = /apsb\d{2}-\d{2,3}/i
APSA_ID_MATCHER = /apsa\d{2}-\d{2,3}/i
@ -45,11 +45,14 @@ class AdobeScraper
end
def index_hash
# https://helpx.adobe.com/security/products/creative-cloud/apsb21-111.html: this was 404ing. We might have to omit this one.
a_with_href.map.with_index do |a, index|
url = a.attributes['href'].value
advisory_id = advisory_id_from_url(url)
if advisory_id == 'None'
next
elsif url == 'https://helpx.adobe.com/security/products/creative-cloud/apsb21-111.html'
next
else
{
:index => index,
@ -57,23 +60,6 @@ class AdobeScraper
:url => url
}
end
end
end.compact
end
def get_advisory(url)
html = get_html(url)
doc = read_html(html)
end
def advisory_hash
{
:bulletin_id => nil,
:date_published => nil,
:priority => nil,
:summary => nil,
:affected_versions => {:product => nil, :version => nil, :platform => nil},
:solution => {:product => nil, :updated_version => nil, :platform => nil, :priority_rating => nil, :installation_instructions => nil },
:vulnerability_details => {:vulnerability_category => nil, :vulnerability_impact => nil, :severity => nil, :authentication_required_to_exploit? => nil, :exploit_requires_admin_privileges? => nil, :cvss_base_score => nil, :cvss_vector => nil, :cve_number => nil}
}
end
end
end