From db264a3f000e1097c48c477f21d94987aae29d93 Mon Sep 17 00:00:00 2001 From: bpmcdevitt Date: Wed, 28 Sep 2022 16:56:35 -0500 Subject: [PATCH] more added to the scraper. trying to pack the data from advisories now into a data hash --- .../mozilla_security_advisory_scraper.rb | 64 ++++++++++++------- 1 file changed, 40 insertions(+), 24 deletions(-) diff --git a/tools/mozilla/security_advisory_scraper/mozilla_security_advisory_scraper.rb b/tools/mozilla/security_advisory_scraper/mozilla_security_advisory_scraper.rb index a9198d0..805b6e7 100644 --- a/tools/mozilla/security_advisory_scraper/mozilla_security_advisory_scraper.rb +++ b/tools/mozilla/security_advisory_scraper/mozilla_security_advisory_scraper.rb @@ -1,45 +1,61 @@ require 'rest-client' require 'nokogiri' - - -## CURRENT ISSUE: 502 BAD GATEWAY WHEN TESTING GET_ADVISORY_URLS METHOD. -## TODO: COPY THE EXACT HEADERS THAT YOU ARE GIVING FROM THE WEB BROWSER -## AND SEND THEM WITH THIS REQUEST AND TEST AGAIN. class MozillaSecurityAdvisoryScraper - attr_accessor :index_url + attr_accessor :index_url, :default_headers, :advisory_urls + def initialize() @index_url = "https://www.mozilla.org/en-US/security/advisories" - end - - def get_index - headers = { + @default_headers = { :accept => 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8', :user_agent => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:105.0) Gecko/20100101 Firefox/105.0' } - response = RestClient::Request.execute( - :method => :get, - :url => index_url, - :headers => headers - ) - if response.code == 200 - response.body - else - puts "HTTP Status code: #{r.code}" - end + @advisory_urls ||= get_advisory_urls end - def parse_index(response_body) + def get_index + get(index_url) + end + + def get_advisory(advisory_url) + get(advisory_url) + end + + def parse_html(response_body) Nokogiri::HTML(response_body) end - def advisory_urls(html_doc) + def advisory_urls_in_html(html_doc) html_doc.xpath('//li[@class="level-item"]/a').map {|link| relative_url = link['href']; "https://www.mozilla.org#{relative_url}"} end def get_advisory_urls body = get_index - doc = parse_index(body) - advisory_urls(doc) + doc = parse_html(body) + advisory_urls_in_html(doc) + end + + def get(url) + response = RestClient::Request.execute( + :method => :get, + :url => url, + :headers => default_headers + ) + if response.code == 200 + response.body + else + puts "HTTP Status: #{response.code}" + end + end + + # we know its the very first dl tag w class name summary + def parse_advisory_summary(advisory_doc) + nodes = advisory_doc.xpath('//dl[@class="summary"]').first.children + # dd tags and dt tags have the info we care about. dt tag is the key, dd + # tag is the value + end + + def parse_advisory_cve(advisory_doc) + advisory_doc.xpath('//dl[@class="cve"]') end end