data_importer/lib/cna_scrapers/adobe/adobe_bulletin_scraper.rb

273 lines
8.6 KiB
Ruby
Raw Permalink Normal View History

# Scrapes one Adobe security bulletin page and turns parts of it (header
# table, summary, and -- not yet wired in -- affected versions, solution and
# vulnerability details) into plain Ruby hashes.
#
# NOTE(review): this file has no `require` lines; RestClient, Nokogiri and
# String#squish are presumably loaded by the host application -- confirm.
class AdobeBulletinScraper
  # Keys of the XPath result hash that may hold the "Summary" heading.
  SUMMARY_HEADING_KEYS = %i[
    summary_h1_upper summary_h1_lower summary_h2_upper summary_h2_lower
  ].freeze

  attr_accessor :url

  # @param url [String] address of the bulletin page to scrape
  def initialize(url)
    @url = url
  end

  # Performs a GET request against +url+.
  #
  # @return [String, nil] the response body when the status code is 200;
  #   otherwise the code is printed to stdout and nil is returned
  def get_html
    response = RestClient::Request.execute(
      :method => :get,
      :url => url
    )
    return response.body if response.code == 200

    puts "HTTP Code #{response.code}"
    nil
  end

  # Parses an HTML string with Nokogiri.
  def read_html(html)
    Nokogiri::HTML(html)
  end

  # Downloads the bulletin and returns the parsed document.
  def get_advisory
    read_html(get_html)
  end

  # Builds an XPath matching +xpath_driver+ nodes whose text contains +text+.
  def xpath_contains_text(xpath_driver:, text:)
    "#{xpath_driver}[contains(text(), '#{text}')]"
  end

  # Builds an XPath matching +xpath_driver+ nodes with the given id.
  #
  # example:
  #   xpath_driver: "//h1"
  #   id_name: "summary"
  def xpath_id_search(xpath_driver:, id_name:)
    "#{xpath_driver}[@id='#{id_name}']"
  end

  # Builds an XPath matching +xpath_driver+ nodes whose class attribute is
  # exactly +class_name+.
  def xpath_class_search(xpath_driver:, class_name:)
    "#{xpath_driver}[@class='#{class_name}']"
  end

  # XPath table built through the xpath_* helper methods. Table entries that
  # have no helper-built expression yet are nil.
  #
  # NOTE(review): the summary text searches look for 'update' here but for
  # 'updates' in #adv_xpaths -- confirm which one is intended.
  def adv_xpaths_methods
    {
      :header_table => xpath_class_search(xpath_driver: "//table", class_name: "dexter-Table"),
      :second_header_table => xpath_class_search(xpath_driver: "//table", class_name: "text aem-GridColumn aem-GridColumn--default--12 overflowScroll"),
      :summary_h1_upper => xpath_id_search(xpath_driver: "//h1", id_name: "Summary"),
      :summary_h1_lower => xpath_id_search(xpath_driver: "//h1", id_name: "summary"),
      :summary_h2_upper => xpath_id_search(xpath_driver: "//h2", id_name: "Summary"),
      :summary_h2_lower => xpath_id_search(xpath_driver: "//h2", id_name: "summary"),
      :summary_text_p => xpath_contains_text(xpath_driver: "//p", text: 'update'),
      :summary_text_span => xpath_contains_text(xpath_driver: "//p/span", text: 'update'),
      :affected_versions => xpath_id_search(xpath_driver: "//h1", id_name: 'AffectedVersions'),
      :affected_versions_table => nil,
      :solution => xpath_id_search(xpath_driver: "//h1", id_name: 'solution'),
      :solution_table => nil,
      :vulnerability_details => xpath_id_search(xpath_driver: "//h1", id_name: "Vulnerabilitydetails"),
      :vulnerability_details_table => nil
    }
  end

  # Hard-coded XPath table; this is the one actually evaluated by
  # #get_advisory_xpaths.
  def adv_xpaths
    {
      :header_table => "//table[@class='dexter-Table']",
      :second_header_table => "//table[@class='text aem-GridColumn aem-GridColumn--default--12 overflowScroll']",
      :summary_h1_upper => "//h1[@id='Summary']",
      :summary_h1_lower => "//h1[@id='summary']",
      :summary_h2_upper => "//h2[@id='Summary']",
      :summary_h2_lower => "//h2[@id='summary']",
      :summary_text_p => "//p[contains(text(), 'updates')]",
      :summary_text_span => "//p/span[contains(text(), 'updates')]",
      :affected_versions => "//h1[@id='AffectedVersions']",
      :affected_versions_table => "/html/body/div[2]/div/div[2]/div/div[3]/div/div/div[1]/div/div/div[7]/div/table",
      :solution => "//h1[@id='solution']",
      :solution_table => "//div[11]",
      :vulnerability_details => "//h1[@id='Vulnerabilitydetails']",
      :vulnerability_details_table => "//div[14]//div[1]//table[1]"
    }
  end

  # Fetches the bulletin, evaluates every XPath from #adv_xpaths against it
  # and merges the header-table and summary information.
  #
  # affected_versions_hash, solution_hash and vulnerability_details_hash are
  # not merged into the result yet.
  #
  # @return [Hash] :bulletin_id, :date_published, :priority, :summary
  def get_advisory_xpaths
    doc = get_advisory
    xpath_hash = adv_xpaths.transform_values { |xpath| doc.xpath(xpath) }

    header_table_info = header_table_hash(xpath_hash)
    summary_info = summary_hash(xpath_hash)
    header_table_info.merge(summary_info)
  end

  # Extracts bulletin id, publication date and priority from the header table.
  # The table text, minus its column titles, is split on whitespace and read
  # positionally: id, month, day, year, priority.
  def header_table_hash(xpath_hash)
    fields = xpath_hash[:header_table]
             .inner_text.squish
             .gsub("Bulletin ID Date Published Priority", "")
             .squish.split(" ")
    adv_id, month, day, year, priority = fields

    {
      :bulletin_id => adv_id,
      :date_published => "#{month} #{day} #{year}",
      :priority => priority
    }
  end

  # @return [Boolean] true when any of the h1/h2 "Summary"/"summary" heading
  #   lookups returned at least one node
  def has_summary_heading?(xpath_hash)
    SUMMARY_HEADING_KEYS.any? { |key| !xpath_hash[key].empty? }
  end

  # Returns the squished summary text, preferring //p matches and falling
  # back to //p/span matches; '' when neither lookup matched.
  def find_summary_text(xpath_hash)
    # sometimes its just nested //p tags
    summary = xpath_hash[:summary_text_p]
    # sometimes its nested //p/span tags
    summary = xpath_hash[:summary_text_span] if summary.empty?

    summary.empty? ? '' : summary.text.squish
  end

  # @return [Hash] { :summary => text }, or { :summary => nil } when the page
  #   has no summary heading
  def summary_hash(xpath_hash)
    summary_text = has_summary_heading?(xpath_hash) ? find_summary_text(xpath_hash) : nil
    { :summary => summary_text }
  end

  # Returns the <tr> nodes found under any <tbody> inside +table+.
  def get_table_rows(table)
    table.xpath(".//tbody/tr")
  end

  # Removes the first (header) row. Mutates and returns +table_rows+.
  def table_rows_drop_header(table_rows)
    table_rows.shift
    table_rows
  end

  # Collects every cell carrying a rowspan attribute, in document order.
  #
  # @return [Array<Hash>] [{ :product_name => String, :rowspan => String }, ...]
  def products_and_rowspans(table_rows)
    table_rows.flat_map do |tr|
      tr.children.map do |td|
        next unless td.has_attribute?('rowspan')

        {
          :product_name => td.children.text.squish,
          :rowspan => td.attributes['rowspan'].value
        }
      end
    end.compact
  end

  # Produces one { :product_name, :version, :platform } hash per data row.
  #
  # Each product cell spans `rowspan` consecutive rows, so the rows are handed
  # out product by product using that product's own rowspan. (The previous
  # implementation split the rows into `rowspan` groups using only the first
  # product's value, which mis-assigned rows unless the number of products
  # happened to equal the rowspan.)
  #
  # @return [Array<Hash>] empty when the table has no rowspan cells
  def product_version_platform(table)
    rows = table_rows_drop_header(get_table_rows(table)).to_a
    offset = 0

    products_and_rowspans(rows).flat_map do |product|
      product_name = product[:product_name]
      # a missing/garbled rowspan still covers the row the cell lives in
      span = [product[:rowspan].to_i, 1].max
      product_rows = rows[offset, span] || []
      offset += span

      product_rows.map do |tr|
        { :product_name => product_name }.merge(version_and_platform(tr, product_name))
      end
    end
  end

  # Splits a row's text (with the product name removed) into version and
  # platform; the last whitespace-separated word is taken as the platform.
  def version_and_platform(tr_node, product_name)
    remainder = tr_node.text().gsub("\n", " ").squish.gsub(product_name, " ").squish
    platform = remainder.split(" ").last
    version = remainder.gsub(platform, " ").squish
    { :version => version, :platform => platform }
  end

  # @return [Hash] { :affected_versions => [per-row hashes] }
  def affected_versions_hash(xpath_hash)
    table = xpath_hash[:affected_versions_table].first
    { :affected_versions => product_version_platform(table) }
  end

  # Builds one :solution hash per data row of the solution table. Cell text is
  # split on newlines and read positionally: product, updated version,
  # platform, priority rating. The links found in the first data row are
  # attached to every entry.
  #
  # @return [Hash] { :solution_info => [...] } (empty list when there are no
  #   data rows)
  def solution_hash(xpath_hash)
    table = xpath_hash[:solution_table].first
    rows = table_rows_drop_header(get_table_rows(table))
    cells_per_row = rows.map { |tr| tr.xpath('./td') }

    first_row_cells = cells_per_row.first
    installation_instruction_urls =
      first_row_cells ? first_row_cells.children.xpath(".//a/@href").map(&:value) : []

    solutions = cells_per_row.map do |cells|
      product_name, updated_version, platform, priority_rating = cells.children.text().split("\n")
      {
        :solution => {
          :product => product_name,
          :updated_version => updated_version,
          :platform => platform,
          :priority_rating => priority_rating,
          :installation_instruction_urls => installation_instruction_urls
        }
      }
    end

    { :solution_info => solutions }
  end

  # Work in progress: currently returns only the squished header cell texts of
  # the vulnerability details table.
  def vulnerability_details_hash(xpath_hash)
    table = xpath_hash[:vulnerability_details_table].first
    rows = get_table_rows(table)
    rows.xpath("./th").map { |th| th.text.gsub("\n", " ").squish }
  end

  # Sketch of the intended final advisory shape (not implemented):
  #
  # def advisory_hash
  #   {
  #     :bulletin_id => nil,
  #     :date_published => nil,
  #     :priority => nil,
  #     :summary => nil,
  #     :affected_versions => [{:product => nil, :version => nil, :platform => nil}],
  #     :solution => [{:product => nil, :updated_version => nil, :platform => nil, :priority_rating => nil, :installation_instructions => nil }],
  #     :vulnerability_details => [{:vulnerability_category => nil, :vulnerability_impact => nil, :severity => nil, :authentication_required_to_exploit? => nil, :exploit_requires_admin_privileges? => nil, :cvss_base_score => nil, :cvss_vector => nil, :cve_number => nil}]
  #   }
  # end
end