Got a pretty solid summary scraper working. Still need to clean all of it up. Keep testing against all of the index URLs to make this faster and more efficient.
This commit is contained in:
parent
05c5c87808
commit
5899e5c14d
2 changed files with 278 additions and 20 deletions
272
lib/cna_scrapers/adobe/adobe_bulletin_scraper.rb
Normal file
272
lib/cna_scrapers/adobe/adobe_bulletin_scraper.rb
Normal file
|
@ -0,0 +1,272 @@
|
|||
class AdobeBulletinScraper
|
||||
|
||||
|
||||
attr_accessor :url
|
||||
# Creates a scraper bound to a single bulletin page.
#
# @param url [String] address of the bulletin page to scrape
def initialize(url)
  # Goes through the attr_accessor writer; the effect is the same as setting @url directly.
  self.url = url
end
|
||||
|
||||
# Performs an HTTP GET against +url+ and hands back the page body.
#
# @return [String, nil] the response body when the status code is 200;
#   otherwise the status code is printed to stdout and nil (the result of
#   +puts+) is returned.
# NOTE(review): RestClient may raise for non-2xx responses before the
# fallback line is ever reached — confirm against the RestClient version in use.
def get_html
  response = RestClient::Request.execute(method: :get, url: url)
  return response.body if response.code == 200

  puts "HTTP Code #{response.code}"
end
|
||||
|
||||
# Parses a raw HTML string with Nokogiri.
#
# @param html [String] markup to parse
# @return the document object produced by Nokogiri's HTML parser
def read_html(html)
  Nokogiri::HTML.parse(html)
end
|
||||
|
||||
# Downloads the bulletin page and returns it parsed.
#
# @return whatever read_html returns for the body fetched by get_html
def get_advisory
  read_html(get_html)
end
|
||||
|
||||
# Builds an XPath expression selecting nodes under +xpath_driver+ whose
# text contains +text+.
#
# Example:
#   xpath_contains_text(xpath_driver: "//p", text: "update")
#   # => "//p[contains(text(), 'update')]"
#
# @param xpath_driver [String] base XPath, e.g. "//p"
# @param text [String] substring to look for (inserted verbatim, not escaped)
# @return [String]
def xpath_contains_text(xpath_driver:, text:)
  predicate = "[contains(text(), '#{text}')]"
  "#{xpath_driver}#{predicate}"
end
|
||||
|
||||
# Builds an XPath expression selecting nodes under +xpath_driver+ whose
# id attribute equals +id_name+ exactly.
#
# Example:
#   xpath_id_search(xpath_driver: "//h1", id_name: "summary")
#   # => "//h1[@id='summary']"
#
# @param xpath_driver [String] base XPath, e.g. "//h1"
# @param id_name [String] id value to match (inserted verbatim, not escaped)
# @return [String]
def xpath_id_search(xpath_driver:, id_name:)
  id_filter = "[@id='#{id_name}']"
  "#{xpath_driver}#{id_filter}"
end
|
||||
|
||||
# Builds an XPath expression selecting nodes under +xpath_driver+ whose
# class attribute equals +class_name+ exactly (the whole attribute value is
# compared, not individual class tokens).
#
# Example:
#   xpath_class_search(xpath_driver: "//table", class_name: "dexter-Table")
#   # => "//table[@class='dexter-Table']"
#
# @param xpath_driver [String] base XPath, e.g. "//table"
# @param class_name [String] full class attribute value (inserted verbatim, not escaped)
# @return [String]
def xpath_class_search(xpath_driver:, class_name:)
  class_filter = "[@class='#{class_name}']"
  "#{xpath_driver}#{class_filter}"
end
|
||||
|
||||
# XPath expressions for each section of a bulletin page, assembled through
# the xpath_* builder helpers instead of hard-coded strings.
#
# The table entries are still nil: no builder-based expression has been
# written for them yet. get_advisory_xpaths currently only consumes the
# keys of this hash.
#
# Fix: the :second_header_table class name ended in "overflowScrol", which
# disagreed with the "overflowScroll" spelling used by adv_xpaths.
#
# NOTE(review): the summary text entries search for 'update' while
# adv_xpaths searches for 'updates' — confirm which one is intended.
#
# @return [Hash{Symbol => String, nil}]
def adv_xpaths_methods
  {
    header_table: xpath_class_search(xpath_driver: "//table", class_name: "dexter-Table"),
    second_header_table: xpath_class_search(
      xpath_driver: "//table",
      class_name: "text aem-GridColumn aem-GridColumn--default--12 overflowScroll"
    ),
    summary_h1_upper: xpath_id_search(xpath_driver: "//h1", id_name: "Summary"),
    summary_h1_lower: xpath_id_search(xpath_driver: "//h1", id_name: "summary"),
    summary_h2_upper: xpath_id_search(xpath_driver: "//h2", id_name: "Summary"),
    summary_h2_lower: xpath_id_search(xpath_driver: "//h2", id_name: "summary"),
    summary_text_p: xpath_contains_text(xpath_driver: "//p", text: 'update'),
    summary_text_span: xpath_contains_text(xpath_driver: "//p/span", text: 'update'),
    affected_versions: xpath_id_search(xpath_driver: "//h1", id_name: 'AffectedVersions'),
    affected_versions_table: nil,
    solution: xpath_id_search(xpath_driver: "//h1", id_name: 'solution'),
    solution_table: nil,
    vulnerability_details: xpath_id_search(xpath_driver: "//h1", id_name: "Vulnerabilitydetails"),
    vulnerability_details_table: nil
  }
end
|
||||
|
||||
# Hard-coded XPath expressions for each section of a bulletin page.
#
# The headings are located by id, the summary text by substring match, and
# the three detail tables by position (so those entries depend on the exact
# page layout).
#
# @return [Hash{Symbol => String}] a fresh hash on every call
def adv_xpaths
  {
    header_table: "//table[@class='dexter-Table']",
    second_header_table: "//table[@class='text aem-GridColumn aem-GridColumn--default--12 overflowScroll']",
    summary_h1_upper: "//h1[@id='Summary']",
    summary_h1_lower: "//h1[@id='summary']",
    summary_h2_upper: "//h2[@id='Summary']",
    summary_h2_lower: "//h2[@id='summary']",
    summary_text_p: "//p[contains(text(), 'updates')]",
    summary_text_span: "//p/span[contains(text(), 'updates')]",
    affected_versions: "//h1[@id='AffectedVersions']",
    affected_versions_table: "/html/body/div[2]/div/div[2]/div/div[3]/div/div/div[1]/div/div/div[7]/div/table",
    solution: "//h1[@id='solution']",
    solution_table: "//div[11]",
    vulnerability_details: "//h1[@id='Vulnerabilitydetails']",
    vulnerability_details_table: "//div[14]//div[1]//table[1]"
  }
end
|
||||
|
||||
# Fetches the bulletin page, runs every expression from adv_xpaths against
# it, and assembles the advisory data extracted so far.
#
# Changes from the previous version:
# * removed the `binding.pry` breakpoint that fired for one hard-coded
#   Photoshop bulletin URL (it would stall or crash an unattended run);
# * the lookups are built directly from adv_xpaths rather than by walking
#   the keys of adv_xpaths_methods and indexing adv_xpaths with them (both
#   hashes define the same keys in the same order).
#
# TODO: merge in affected_versions_hash, solution_hash and
# vulnerability_details_hash once those extractors are finished.
#
# @return [Hash] :bulletin_id, :date_published, :priority and :summary
def get_advisory_xpaths
  doc = get_advisory

  # section name => nodes matched by that section's XPath
  xpath_hash = adv_xpaths.each_with_object({}) do |(section, xpath), found|
    found[section] = doc.xpath(xpath)
  end

  header_table_hash(xpath_hash).merge(summary_hash(xpath_hash))
end
|
||||
|
||||
# Extracts bulletin id, publication date and priority from the header table.
#
# The table text is whitespace-normalised, the column captions
# ("Bulletin ID Date Published Priority") are stripped, and the remainder is
# split on spaces. The tokens are read positionally as: id, month, day, year,
# priority — so the date is assumed to be exactly three tokens long.
#
# @param xpath_hash [Hash] must hold the header table nodes under :header_table
# @return [Hash] :bulletin_id, :date_published ("month day year") and :priority
def header_table_hash(xpath_hash)
  table_text = xpath_hash[:header_table].inner_text.squish
  tokens = table_text.gsub("Bulletin ID Date Published Priority", "").squish.split(" ")
  bulletin_id, month, day, year, priority = tokens

  {
    bulletin_id: bulletin_id,
    date_published: "#{month} #{day} #{year}",
    priority: priority
  }
end
|
||||
|
||||
# Reports whether the page has a "summary" heading in any of the four
# variants the scraper looks for (h1/h2, capitalised or lower-case id).
#
# Replaces a four-level nested `if` pyramid (and a trailing check that was
# always true) with a single ordered scan; the variants are still tried in
# the same order and the scan stops at the first non-empty match.
#
# @param xpath_hash [Hash] section name => matched nodes (anything responding to #empty?)
# @return [Boolean]
def has_summary_heading?(xpath_hash)
  heading_keys = %i[summary_h1_upper summary_h1_lower summary_h2_upper summary_h2_lower]
  heading_keys.any? { |key| !xpath_hash[key].empty? }
end
|
||||
|
||||
# Returns the summary paragraph text.
#
# The text usually sits in plain //p nodes; on some pages it is wrapped in
# //p/span instead, so that is tried as a fallback. When neither matched, an
# empty string is returned.
#
# @param xpath_hash [Hash] must hold :summary_text_p (and :summary_text_span
#   when the former is empty)
# @return [String] whitespace-normalised text, or '' when nothing matched
def find_summary_text(xpath_hash)
  matches = xpath_hash[:summary_text_p]
  matches = xpath_hash[:summary_text_span] if matches.empty?

  matches.empty? ? '' : matches.text.squish
end
|
||||
|
||||
# Wraps the summary text in a hash.
#
# @param xpath_hash [Hash] section name => matched nodes
# @return [Hash] { summary: String } when the page has a summary heading,
#   { summary: nil } otherwise
def summary_hash(xpath_hash)
  text = has_summary_heading?(xpath_hash) ? find_summary_text(xpath_hash) : nil
  { summary: text }
end
|
||||
|
||||
# Selects every <tr> found under a <tbody> inside the given table node.
#
# @param table [#xpath] table node to search, relative to itself
# @return the result of the XPath query
def get_table_rows(table)
  row_selector = ".//tbody/tr"
  table.xpath(row_selector)
end
|
||||
|
||||
# Discards the first (header) row.
#
# Note: this removes the row from +table_rows+ itself and returns that same
# object, so the caller's collection is modified too.
#
# @param table_rows [#shift] row collection whose first entry is the header
# @return the same collection, minus its first entry
def table_rows_drop_header(table_rows)
  table_rows.tap(&:shift)
end
|
||||
|
||||
# Collects every cell that carries a rowspan attribute, together with its
# text and rowspan value.
#
# Cleaned up from the previous version: the index from `.with_index` was
# never used, and mapping non-matching cells to nil only to `compact` them
# away is replaced by selecting the matching cells first. The result is the
# same.
#
# @param table_rows [Enumerable] rows whose #children are the row's cells
# @return [Array<Hash>] one { product_name: String, rowspan: String } per
#   rowspan cell, in document order (rowspan is the raw attribute value)
def products_and_rowspans(table_rows)
  table_rows.flat_map do |tr|
    spanning_cells = tr.children.select { |cell| cell.has_attribute?('rowspan') }

    spanning_cells.map do |cell|
      {
        product_name: cell.children.text.squish,
        rowspan: cell.attributes['rowspan'].value
      }
    end
  end
end
|
||||
|
||||
# Builds one { product_name, version, platform } hash per data row of an
# "affected versions" style table.
#
# Rows are attributed to products by walking the rowspan cells in order:
# each product owns as many consecutive rows as its own rowspan says.
#
# Fixes over the previous version:
# * `in_groups(rowspan)` split the rows into `rowspan` groups rather than
#   into groups of `rowspan` rows, so rows were paired with the wrong (or a
#   nil) product whenever the two numbers differed;
# * only the first product's rowspan was used for every product;
# * a table without any rowspan cell raised NoMethodError on nil.
#
# NOTE(review): rows not covered by any rowspan cell (e.g. single-row
# products with no rowspan attribute) are not reported — confirm whether the
# bulletins contain such rows.
#
# @param table [#xpath] the table node
# @return [Array<Hash>] flat list of { product_name:, version:, platform: }
def product_version_platform(table)
  rows = table_rows_drop_header(get_table_rows(table))
  all_rows = rows.to_a
  offset = 0

  products_and_rowspans(rows).flat_map do |entry|
    product_name = entry[:product_name]
    span = entry[:rowspan].to_i
    owned_rows = all_rows[offset, span] || []
    offset += span

    owned_rows.map do |tr|
      { product_name: product_name }.merge(version_and_platform(tr, product_name))
    end
  end
end
|
||||
|
||||
# Splits a table row's text into a version string and a platform.
#
# The row text is flattened to one line, the product name is removed, and
# the last whitespace-separated token is taken as the platform; everything
# before it is the version.
#
# Fixes over the previous version:
# * the platform was removed with `gsub`, which also deleted every other
#   occurrence of that token inside the version text; now only the trailing
#   token is removed;
# * a row with no text left after removing the product name raised
#   TypeError (gsub with nil); it now yields an empty version and a nil
#   platform.
#
# NOTE(review): a multi-word platform (e.g. "Windows and macOS") still only
# contributes its last word — confirm how such rows should be handled.
#
# @param tr_node [#text] the table row
# @param product_name [String] product name to strip from the row text
# @return [Hash] { version: String, platform: String or nil }
def version_and_platform(tr_node, product_name)
  row_text = tr_node.text.gsub("\n", " ").squish.gsub(product_name, " ").squish
  tokens = row_text.split(" ")
  platform = tokens.pop

  { version: tokens.join(" "), platform: platform }
end
|
||||
|
||||
# Wraps the parsed "affected versions" table in a hash.
#
# @param xpath_hash [Hash] must hold the matched table nodes under
#   :affected_versions_table; only the first match is used
# @return [Hash] { affected_versions: <result of product_version_platform> }
def affected_versions_hash(xpath_hash)
  versions_table = xpath_hash[:affected_versions_table].first
  affected = product_version_platform(versions_table)

  { affected_versions: affected }
end
|
||||
|
||||
# Parses the "solution" table into one entry per data row.
#
# Each row's cell text is split on newlines and read positionally as
# product, updated version, platform and priority rating. The installation
# links are taken from the first data row only and attached to every entry.
#
# Changes from the previous version:
# * dropped the unused `header_values` lookup (its block variable also
#   shadowed the outer `t`);
# * a table that has no data rows no longer raises NoMethodError on nil —
#   it yields an empty :solution_info list;
# * the result local no longer reuses the method's own name.
#
# @param xpath_hash [Hash] must hold the matched nodes under :solution_table;
#   only the first match is used
# @return [Hash] { solution_info: [ { solution: {...} }, ... ] }
def solution_hash(xpath_hash)
  table = xpath_hash[:solution_table].first
  data_rows = table_rows_drop_header(get_table_rows(table))
  cells_per_row = data_rows.map { |tr| tr.xpath('./td') }

  first_row_cells = cells_per_row.first
  installation_instruction_urls =
    if first_row_cells
      first_row_cells.children.xpath(".//a/@href").map(&:value)
    else
      []
    end

  solutions = cells_per_row.map do |cells|
    product_name, updated_version, platform, priority_rating = cells.children.text.split("\n")

    {
      solution: {
        product: product_name,
        updated_version: updated_version,
        platform: platform,
        priority_rating: priority_rating,
        installation_instruction_urls: installation_instruction_urls
      }
    }
  end

  { solution_info: solutions }
end
|
||||
|
||||
# Reads the column captions of the "vulnerability details" table.
#
# Work in progress: despite the name, only the header captions are returned
# so far, not the row data.
#
# Fix: removed the `binding.pry` breakpoint, which stopped every call in a
# debugger (or raised when pry is not loaded).
#
# @param xpath_hash [Hash] must hold the matched nodes under
#   :vulnerability_details_table; only the first match is used
# @return [Array<String>] header cell texts, flattened to one line each
def vulnerability_details_hash(xpath_hash)
  table = xpath_hash[:vulnerability_details_table].first
  header_cells = get_table_rows(table).xpath("./th")

  header_cells.map { |th| th.text.gsub("\n", " ").squish }
end
|
||||
|
||||
#def advisory_hash
|
||||
# {
|
||||
# :bulletin_id => nil,
|
||||
# :date_published => nil,
|
||||
# :priority => nil,
|
||||
# :summary => nil,
|
||||
# :affected_versions => [{:product => nil, [:version => nil, :platform => nil}]],
|
||||
# :solution => [{:product => nil, :updated_version => nil, :platform => nil, :priority_rating => nil, :installation_instructions => nil }],
|
||||
# :vulnerability_details => [{:vulnerability_category => nil, :vulnerability_impact => nil, :severity => nil, :authentication_required_to_exploit? => nil, :exploit_requires_admin_privileges? => nil, :cvss_base_score => nil, :cvss_vector => nil, :cve_number => nil}]
|
||||
# }
|
||||
#end
|
||||
end
|
|
@ -1,4 +1,4 @@
|
|||
class AdobeScraper
|
||||
class AdobeIndexScraper
|
||||
attr_accessor :index_url, :doc
|
||||
APSB_ID_MATCHER = /apsb\d{2}-\d{2,3}/i
|
||||
APSA_ID_MATCHER = /apsa\d{2}-\d{2,3}/i
|
||||
|
@ -45,11 +45,14 @@ class AdobeScraper
|
|||
end
|
||||
|
||||
def index_hash
|
||||
# https://helpx.adobe.com/security/products/creative-cloud/apsb21-111.html: this was 404ing. We might have to omit this one.
|
||||
a_with_href.map.with_index do |a, index|
|
||||
url = a.attributes['href'].value
|
||||
advisory_id = advisory_id_from_url(url)
|
||||
if advisory_id == 'None'
|
||||
next
|
||||
elsif url == 'https://helpx.adobe.com/security/products/creative-cloud/apsb21-111.html'
|
||||
next
|
||||
else
|
||||
{
|
||||
:index => index,
|
||||
|
@ -57,23 +60,6 @@ class AdobeScraper
|
|||
:url => url
|
||||
}
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
def get_advisory(url)
|
||||
html = get_html(url)
|
||||
doc = read_html(html)
|
||||
end
|
||||
|
||||
def advisory_hash
|
||||
{
|
||||
:bulletin_id => nil,
|
||||
:date_published => nil,
|
||||
:priority => nil,
|
||||
:summary => nil,
|
||||
:affected_versions => {:product => nil, :version => nil, :platform => nil},
|
||||
:solution => {:product => nil, :updated_version => nil, :platform => nil, :priority_rating => nil, :installation_instructions => nil },
|
||||
:vulnerability_details => {:vulnerability_category => nil, :vulnerability_impact => nil, :severity => nil, :authentication_required_to_exploit? => nil, :exploit_requires_admin_privileges? => nil, :cvss_base_score => nil, :cvss_vector => nil, :cve_number => nil}
|
||||
}
|
||||
end.compact
|
||||
end
|
||||
end
|
Loading…
Add table
Reference in a new issue