Add more to the scraper; now trying to pack the data from the advisories into a data hash.

This commit is contained in:
Brendan McDevitt 2022-09-28 16:56:35 -05:00
parent aae01c1e57
commit db264a3f00

View file

@ -1,45 +1,61 @@
require 'rest-client'
require 'nokogiri'
## CURRENT ISSUE: 502 BAD GATEWAY WHEN TESTING THE GET_ADVISORY_URLS METHOD.
## TODO: COPY THE EXACT HEADERS THAT THE WEB BROWSER SENDS,
## SEND THEM WITH THIS REQUEST, AND TEST AGAIN.
# Scrapes the Mozilla security advisories index page and the individual
# advisory pages it links to.
#
# Constructing an instance performs an HTTP request: the index page is fetched
# and parsed right away so that +advisory_urls+ is populated.
class MozillaSecurityAdvisoryScraper
  # Scheme and host that the site-relative advisory links are appended to.
  BASE_URL = 'https://www.mozilla.org'.freeze

  attr_accessor :index_url, :default_headers, :advisory_urls

  # Sets the index URL and the request headers, then fetches the advisory
  # URL list from the index page.
  def initialize
    @index_url = "#{BASE_URL}/en-US/security/advisories"
    # Headers modelled on what a Firefox browser sends.
    @default_headers = {
      accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
      user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:105.0) Gecko/20100101 Firefox/105.0'
    }
    @advisory_urls = get_advisory_urls
  end

  # Fetches the advisories index page.
  #
  # @return [String, nil] see {#get}
  def get_index
    get(index_url)
  end

  # Fetches a single advisory page.
  #
  # @param advisory_url [String] absolute URL of the advisory
  # @return [String, nil] see {#get}
  def get_advisory(advisory_url)
    get(advisory_url)
  end

  # Parses an HTML string with Nokogiri.
  #
  # @param response_body [String] raw HTML
  # @return the document object produced by +Nokogiri::HTML+
  def parse_html(response_body)
    Nokogiri::HTML(response_body)
  end

  # Collects the advisory links found in a parsed index page.
  #
  # Each +href+ under <tt>li.level-item > a</tt> is appended to BASE_URL, so
  # the hrefs are expected to be site-relative paths.
  #
  # @param html_doc parsed index document (must respond to +xpath+)
  # @return [Array<String>] absolute advisory URLs
  def advisory_urls_in_html(html_doc)
    html_doc.xpath('//li[@class="level-item"]/a').map do |anchor|
      "#{BASE_URL}#{anchor['href']}"
    end
  end

  # Fetches the index page, parses it and extracts the advisory URLs.
  #
  # @return [Array<String>] absolute advisory URLs
  def get_advisory_urls
    body = get_index
    doc = parse_html(body)
    advisory_urls_in_html(doc)
  end

  # Performs a GET request for +url+, sending +default_headers+.
  #
  # NOTE(review): rest-client appears to raise an exception for most 4xx/5xx
  # responses before the status check below is reached -- confirm against the
  # rest-client documentation.
  #
  # @param url [String] absolute URL to fetch
  # @return [String, nil] the response body when the status code is 200;
  #   otherwise the status is printed to stdout and nil is returned
  def get(url)
    response = RestClient::Request.execute(
      method: :get,
      url: url,
      headers: default_headers
    )
    return response.body if response.code == 200

    puts "HTTP Status: #{response.code}"
  end

  # Returns the child nodes of the first <tt><dl class="summary"></tt> element
  # in an advisory page. Within that list the dt tags carry the keys and the
  # dd tags carry the values.
  #
  # @param advisory_doc parsed advisory document (must respond to +xpath+)
  # @return the children of the first matching element
  # @raise [NoMethodError] when the document has no matching element, because
  #   +first+ then yields nil
  def parse_advisory_summary(advisory_doc)
    advisory_doc.xpath('//dl[@class="summary"]').first.children
  end

  # Returns every <tt><dl class="cve"></tt> element of an advisory page.
  #
  # @param advisory_doc parsed advisory document (must respond to +xpath+)
  # @return the node set produced by the XPath query
  def parse_advisory_cve(advisory_doc)
    advisory_doc.xpath('//dl[@class="cve"]')
  end
end