Got a pretty solid summary scraper working. Still need to clean all of it up. Keep testing against all of the index URLs to make this go faster and more efficient.

2022-05-04 01:00:46 -05:00 · 2022-05-04 01:00:46 -05:00 · 5899e5c14d
commit 5899e5c14d
parent 05c5c87808
2 changed files with 278 additions and 20 deletions
--- a/lib/cna_scrapers/adobe/adobe_bulletin_scraper.rb
+++ b/lib/cna_scrapers/adobe/adobe_bulletin_scraper.rb
@ -0,0 +1,272 @@
+class AdobeBulletinScraper
+
+
+  attr_accessor :url 
+  # Creates a scraper for a single page.
+  #
+  # @param url the address that get_html hands to RestClient as :url
+  def initialize(url)
+    self.url = url
+  end
+
+  # Issues a GET request for url through RestClient.
+  #
+  # @return the response body when the status code is 200; for any other
+  #   code the code is printed with puts and the result of puts is returned
+  def get_html
+    response = RestClient::Request.execute(method: :get, url: url)
+    return response.body if response.code == 200
+
+    puts "HTTP Code #{response.code}"
+  end
+
+  # Hands the given markup to Nokogiri's HTML parser.
+  #
+  # @param html markup to parse
+  # @return whatever Nokogiri's HTML entry point returns for that input
+  def read_html(html)
+    Nokogiri.HTML(html)
+  end
+  
+  # Downloads the page with get_html and parses the result with read_html.
+  #
+  # @return the parsed document produced by read_html
+  def get_advisory
+    read_html(get_html)
+  end
+
+  # Appends a contains(text(), ...) predicate to an XPath expression.
+  #
+  # example:
+  # xpath_driver: "//p"
+  # text: "update"
+  # result: "//p[contains(text(), 'update')]"
+  def xpath_contains_text(xpath_driver:, text:)
+    predicate = "contains(text(), '#{text}')"
+    "#{xpath_driver}[#{predicate}]"
+  end
+
+  # Appends an exact-match @id predicate to an XPath expression.
+  #
+  # example:
+  # xpath_driver: "//h1"
+  # id_name: "summary"
+  # result: "//h1[@id='summary']"
+  def xpath_id_search(xpath_driver:, id_name:)
+    id_predicate = "@id='#{id_name}'"
+    "#{xpath_driver}[#{id_predicate}]"
+  end
+
+# Appends an exact-match @class predicate to an XPath expression.
+# The comparison is against the whole class attribute string.
+#
+# example:
+# xpath_driver: "//table"
+# class_name: "dexter-Table"
+# result: "//table[@class='dexter-Table']"
+def xpath_class_search(xpath_driver:, class_name:)
+  class_predicate = "@class='#{class_name}'"
+  "#{xpath_driver}[#{class_predicate}]"
+end
+
+# Builds the XPath lookup table through the xpath_* helper methods.
+#
+# Carries the same keys as adv_xpaths. The three *_table entries are nil
+# because no helper-built expression exists for them here.
+#
+# @return [Hash] symbol key => XPath string, or nil
+def adv_xpaths_methods
+  {
+    :header_table => xpath_class_search(xpath_driver: "//table", class_name: "dexter-Table"),
+    # Class value is spelled "overflowScroll", the same as the literal in adv_xpaths.
+    :second_header_table => xpath_class_search(xpath_driver: "//table", class_name: "text aem-GridColumn aem-GridColumn--default--12 overflowScroll"),
+    :summary_h1_upper => xpath_id_search(xpath_driver: "//h1", id_name: "Summary"),
+    :summary_h1_lower => xpath_id_search(xpath_driver: "//h1", id_name: "summary"),
+    :summary_h2_upper => xpath_id_search(xpath_driver: "//h2", id_name: "Summary"),
+    :summary_h2_lower => xpath_id_search(xpath_driver: "//h2", id_name: "summary"),
+    :summary_text_p => xpath_contains_text(xpath_driver: "//p", text: 'update'),
+    :summary_text_span => xpath_contains_text(xpath_driver: "//p/span", text: 'update'),
+    :affected_versions => xpath_id_search(xpath_driver: "//h1", id_name: 'AffectedVersions'),
+    :affected_versions_table => nil,
+    :solution => xpath_id_search(xpath_driver: "//h1", id_name: 'solution'),
+    :solution_table => nil,
+    :vulnerability_details => xpath_id_search(xpath_driver: "//h1", id_name: "Vulnerabilitydetails"),
+    :vulnerability_details_table => nil
+  }
+end
+
+  # Literal XPath expressions for each part of the page that is looked up.
+  #
+  # Headings are matched by @id, the summary text by a contains(text(), ...)
+  # predicate, and the *_table entries by positional paths.
+  #
+  # @return [Hash] symbol key => XPath string
+  def adv_xpaths
+    {
+      header_table: "//table[@class='dexter-Table']",
+      second_header_table: "//table[@class='text aem-GridColumn aem-GridColumn--default--12 overflowScroll']",
+      summary_h1_upper: "//h1[@id='Summary']",
+      summary_h1_lower: "//h1[@id='summary']",
+      summary_h2_upper: "//h2[@id='Summary']",
+      summary_h2_lower: "//h2[@id='summary']",
+      summary_text_p: "//p[contains(text(), 'updates')]",
+      summary_text_span: "//p/span[contains(text(), 'updates')]",
+      affected_versions: "//h1[@id='AffectedVersions']",
+      affected_versions_table: "/html/body/div[2]/div/div[2]/div/div[3]/div/div/div[1]/div/div/div[7]/div/table",
+      solution: "//h1[@id='solution']",
+      solution_table: "//div[11]",
+      vulnerability_details: "//h1[@id='Vulnerabilitydetails']",
+      vulnerability_details_table: "//div[14]//div[1]//table[1]"
+    }
+  end
+
+  # Fetches and parses the page, evaluates every expression in adv_xpaths
+  # against it, and returns the extracted fields as one hash.
+  #
+  # Only the header table and the summary are merged into the result so far.
+  # TODO: merge affected_versions_hash, solution_hash and
+  # vulnerability_details_hash as well.
+  #
+  # @return [Hash] :bulletin_id, :date_published, :priority and :summary
+  def get_advisory_xpaths
+    doc = get_advisory
+    # Keys and expressions both come from adv_xpaths so they cannot drift apart.
+    xpath_hash = adv_xpaths.each_with_object({}) do |(key, xpath), found|
+      found[key] = doc.xpath(xpath)
+    end
+
+    [header_table_hash(xpath_hash), summary_hash(xpath_hash)].inject(&:merge)
+  end
+
+  # Reads the bulletin id, publication date and priority from the header
+  # table: the column titles are stripped from its text and the remainder is
+  # split on spaces.
+  #
+  # Token order relied on: id, three date tokens, priority.
+  #
+  # @param xpath_hash [Hash] :header_table must respond to inner_text
+  # @return [Hash] :bulletin_id, :date_published and :priority
+  def header_table_hash(xpath_hash)
+    raw_text = xpath_hash[:header_table].inner_text.squish
+    without_titles = raw_text.gsub("Bulletin ID Date Published Priority", "").squish
+    bulletin_id, month, day, year, priority = without_titles.split(" ")
+    {
+      bulletin_id: bulletin_id,
+      date_published: "#{month} #{day} #{year}",
+      priority: priority
+    }
+  end
+
+  # Tells whether any of the four summary heading lookups found something.
+  #
+  # The lookups are checked in the order h1 "Summary", h1 "summary",
+  # h2 "Summary", h2 "summary", stopping at the first non-empty one.
+  #
+  # @param xpath_hash [Hash] lookup results keyed like adv_xpaths
+  # @return [Boolean]
+  def has_summary_heading?(xpath_hash)
+    heading_keys = %i[summary_h1_upper summary_h1_lower summary_h2_upper summary_h2_lower]
+    heading_keys.any? { |key| !xpath_hash[key].empty? }
+  end
+
+  # Returns the squished summary text, or '' when nothing matched.
+  #
+  # The //p lookup is preferred; the //p/span lookup is consulted only when
+  # the //p lookup came back empty.
+  #
+  # @param xpath_hash [Hash] lookup results keyed like adv_xpaths
+  # @return [String]
+  def find_summary_text(xpath_hash)
+    nodes = xpath_hash[:summary_text_p]
+    nodes = xpath_hash[:summary_text_span] if nodes.empty?
+    nodes.empty? ? '' : nodes.text.squish
+  end
+
+  # Wraps the summary text in a hash.
+  #
+  # @param xpath_hash [Hash] lookup results keyed like adv_xpaths
+  # @return [Hash] :summary is the text from find_summary_text when a summary
+  #   heading exists, nil otherwise
+  def summary_hash(xpath_hash)
+    text = has_summary_heading?(xpath_hash) ? find_summary_text(xpath_hash) : nil
+    { summary: text }
+  end
+
+  # Selects every tbody/tr descendant of the given node.
+  #
+  # @param table any object answering xpath
+  # @return the result of table.xpath for ".//tbody/tr"
+  def get_table_rows(table)
+    row_query = ".//tbody/tr"
+    table.xpath(row_query)
+  end
+
+  # Removes the first (header) row from the collection in place.
+  #
+  # @param table_rows collection answering shift; it is mutated
+  # @return the same collection object, now without its first element
+  def table_rows_drop_header(table_rows)
+    table_rows.tap(&:shift)
+  end
+
+  # Collects every cell that carries a rowspan attribute, in document order.
+  #
+  # @param table_rows rows whose children are the cells to inspect
+  # @return [Array<Hash>] one hash per such cell: :product_name is the
+  #   squished text of the cell's children, :rowspan the attribute value
+  def products_and_rowspans(table_rows)
+    table_rows.flat_map do |tr|
+      spanning_cells = tr.children.select { |cell| cell.has_attribute?('rowspan') }
+      spanning_cells.map do |cell|
+        {
+          product_name: cell.children.text.squish,
+          rowspan: cell.attributes['rowspan'].value
+        }
+      end
+    end
+  end
+
+  # Turns a product table into a flat list of product/version/platform hashes.
+  #
+  # The header row is dropped, then the data rows are handed out to the
+  # products found by products_and_rowspans: each product takes as many
+  # consecutive rows as its own rowspan says. (Array#in_groups(n) makes n
+  # groups rather than groups of n rows, so it is not used here.)
+  #
+  # @param table node passed to get_table_rows
+  # @return [Array<Hash>] :product_name, :version and :platform per row;
+  #   empty when no cell carries a rowspan
+  def product_version_platform(table)
+    rows = table_rows_drop_header(get_table_rows(table))
+    remaining = rows.to_a
+    products_and_rowspans(rows).flat_map do |product|
+      name = product[:product_name]
+      span = product[:rowspan].to_i
+      group = remaining.first(span)
+      # first/drop leave the rows collection itself untouched.
+      remaining = remaining.drop(span)
+      group.map { |tr| { product_name: name }.merge(version_and_platform(tr, name)) }
+    end
+  end
+
+  # Splits a row's text into version and platform.
+  #
+  # Newlines become spaces, every occurrence of product_name is removed, the
+  # last space-separated token is taken as the platform, and the version is
+  # what remains after every occurrence of that token is removed.
+  #
+  # @param tr_node object answering text
+  # @param product_name [String] text to strip from the row
+  # @return [Hash] :version and :platform
+  def version_and_platform(tr_node, product_name)
+    row_text = tr_node.text.gsub("\n", " ").squish
+    remainder = row_text.gsub(product_name, " ").squish
+    platform = remainder.split(" ").last
+    { version: remainder.gsub(platform, " ").squish, platform: platform }
+  end
+
+  # Wraps the parsed affected-versions table in a hash.
+  #
+  # @param xpath_hash [Hash] :affected_versions_table must answer first
+  # @return [Hash] :affected_versions => result of product_version_platform
+  def affected_versions_hash(xpath_hash)
+    first_match = xpath_hash[:affected_versions_table].first
+    { affected_versions: product_version_platform(first_match) }
+  end
+
+  # Turns the solution table into a list of per-row hashes.
+  #
+  # The header row is dropped; each remaining row's cell text is split on
+  # newlines and read positionally as product, updated version, platform and
+  # priority rating.
+  #
+  # @param xpath_hash [Hash] :solution_table must answer first
+  # @return [Hash] :solution_info => Array of { :solution => {...} } hashes;
+  #   the array is empty when the table has no data rows
+  def solution_hash(xpath_hash)
+    table = xpath_hash[:solution_table].first
+    rows = table_rows_drop_header(get_table_rows(table))
+    cells_per_row = rows.map { |tr| tr.xpath('./td') }
+
+    # Links are read from the first data row only and attached to every entry.
+    first_row_cells = cells_per_row.first
+    installation_instruction_urls =
+      first_row_cells ? first_row_cells.children.xpath(".//a/@href").map(&:value) : []
+
+    entries = cells_per_row.map do |cells|
+      product_name, updated_version, platform, priority_rating = cells.children.text.split("\n")
+      {
+        :solution => {
+          :product => product_name,
+          :updated_version => updated_version,
+          :platform => platform,
+          :priority_rating => priority_rating,
+          :installation_instruction_urls => installation_instruction_urls
+        }
+      }
+    end
+    { :solution_info => entries }
+  end
+
+  # Reads the column titles of the vulnerability details table.
+  #
+  # Only the th texts are extracted; the data rows are not parsed here.
+  #
+  # @param xpath_hash [Hash] :vulnerability_details_table must answer first
+  # @return [Array<String>] th texts with newlines replaced and whitespace
+  #   squished
+  def vulnerability_details_hash(xpath_hash)
+    table = xpath_hash[:vulnerability_details_table].first
+    header_cells = get_table_rows(table).xpath("./th")
+    header_cells.map { |th| th.text.gsub("\n", " ").squish }
+  end
+   
+  #def advisory_hash
+  #  {
+  #    :bulletin_id => nil,
+  #    :date_published => nil,
+  #    :priority => nil,
+  #    :summary => nil,
+  #    :affected_versions => [{:product => nil, [:version => nil, :platform => nil}]],
+  #    :solution => [{:product => nil, :updated_version => nil, :platform => nil, :priority_rating => nil, :installation_instructions => nil }],
+  #    :vulnerability_details => [{:vulnerability_category => nil, :vulnerability_impact => nil, :severity => nil, :authentication_required_to_exploit? => nil, :exploit_requires_admin_privileges? => nil, :cvss_base_score => nil, :cvss_vector => nil, :cve_number => nil}]
+  #  }
+  #end
+end
--- a/lib/cna_scrapers/adobe/adobe_index_scraper.rb
+++ b/lib/cna_scrapers/adobe/adobe_index_scraper.rb
@ -1,4 +1,4 @@
-class AdobeScraper
+class AdobeIndexScraper
  attr_accessor :index_url, :doc
  APSB_ID_MATCHER = /apsb\d{2}-\d{2,3}/i
  APSA_ID_MATCHER = /apsa\d{2}-\d{2,3}/i
@ -45,11 +45,14 @@ class AdobeScraper
  end

  def index_hash
+    # https://helpx.adobe.com/security/products/creative-cloud/apsb21-111.html: this was 404ing. We might have to omit this one.
    a_with_href.map.with_index do |a, index|
      url = a.attributes['href'].value
      advisory_id = advisory_id_from_url(url) 
      if advisory_id == 'None' 
        next
+      elsif url == 'https://helpx.adobe.com/security/products/creative-cloud/apsb21-111.html'
+        next
      else 
        {
          :index => index,
@ -57,23 +60,6 @@ class AdobeScraper
          :url => url
        }
      end
-    end
+    end.compact
  end
-
-  def get_advisory(url)
-    html = get_html(url)
-    doc = read_html(html)
-  end
-
-  def advisory_hash
-    {
-      :bulletin_id => nil,
-      :date_published => nil,
-      :priority => nil,
-      :summary => nil,
-      :affected_versions => {:product => nil, :version => nil, :platform => nil},
-      :solution => {:product => nil, :updated_version => nil, :platform => nil, :priority_rating => nil, :installation_instructions => nil },
-      :vulnerability_details => {:vulnerability_category => nil, :vulnerability_impact => nil, :severity => nil, :authentication_required_to_exploit? => nil, :exploit_requires_admin_privileges? => nil, :cvss_base_score => nil, :cvss_vector => nil, :cve_number => nil}
-    }
-  end
-end
+end