# Scrapes a single Adobe security bulletin page and extracts structured data
# from it.
#
# Relies on names that are not defined in this class: RestClient (HTTP),
# Nokogiri (HTML parsing / XPath) and String#squish (presumably ActiveSupport;
# confirm it is loaded wherever this class is used).
#
# Only the header table and the summary are currently merged into the result
# of #get_advisory_xpaths. #affected_versions_hash, #solution_hash and
# #vulnerability_details_hash exist but are not wired in yet.
#
# Planned shape of a complete advisory (from the original author's notes):
#
#   {
#     bulletin_id: nil,
#     date_published: nil,
#     priority: nil,
#     summary: nil,
#     affected_versions: [{ product: nil, version: nil, platform: nil }],
#     solution: [{ product: nil, updated_version: nil, platform: nil,
#                  priority_rating: nil, installation_instructions: nil }],
#     vulnerability_details: [{ vulnerability_category: nil,
#                               vulnerability_impact: nil, severity: nil,
#                               authentication_required_to_exploit?: nil,
#                               exploit_requires_admin_privileges?: nil,
#                               cvss_base_score: nil, cvss_vector: nil,
#                               cve_number: nil }]
#   }
class AdobeBulletinScraper
  attr_accessor :url

  # @param url [String] address of the bulletin page to scrape
  def initialize(url)
    @url = url
  end

  # Downloads the bulletin page.
  #
  # @return [String, nil] the response body when the status code is 200;
  #   otherwise the code is printed to stdout and nil is returned
  def get_html
    response = RestClient::Request.execute(method: :get, url: url)
    return response.body if response.code == 200

    puts "HTTP Code #{response.code}"
    nil
  end

  # Parses an HTML string with Nokogiri.
  #
  # @param html [String] raw HTML
  # @return the document produced by Nokogiri::HTML
  def read_html(html)
    Nokogiri::HTML(html)
  end

  # Downloads and parses the bulletin page in one step.
  #
  # @return the parsed document for #url
  def get_advisory
    read_html(get_html)
  end

  # Builds an XPath matching elements whose text contains +text+.
  #
  # example:
  #   xpath_driver: "//p"
  #   text: "update"
  #   => "//p[contains(text(), 'update')]"
  def xpath_contains_text(xpath_driver:, text:)
    "#{xpath_driver}[contains(text(), '#{text}')]"
  end

  # Builds an XPath matching elements with the given id.
  #
  # example:
  #   xpath_driver: "//h1"
  #   id_name: "summary"
  #   => "//h1[@id='summary']"
  def xpath_id_search(xpath_driver:, id_name:)
    "#{xpath_driver}[@id='#{id_name}']"
  end

  # Builds an XPath matching elements whose class attribute equals
  # +class_name+ exactly (the whole attribute value, not a single token).
  def xpath_class_search(xpath_driver:, class_name:)
    "#{xpath_driver}[@class='#{class_name}']"
  end

  # Same keys as #adv_xpaths, with the expressions produced by the builder
  # methods instead of hand-written strings. The *_table entries have no
  # builder form yet and are nil.
  #
  # NOTE(review): the summary text entries search for 'update' here but
  # 'updates' in #adv_xpaths -- confirm which one is intended.
  #
  # @return [Hash{Symbol => String, nil}]
  def adv_xpaths_methods
    {
      header_table: xpath_class_search(xpath_driver: '//table', class_name: 'dexter-Table'),
      # Class name corrected from "overflowScrol" so it agrees with #adv_xpaths.
      second_header_table: xpath_class_search(
        xpath_driver: '//table',
        class_name: 'text aem-GridColumn aem-GridColumn--default--12 overflowScroll'
      ),
      summary_h1_upper: xpath_id_search(xpath_driver: '//h1', id_name: 'Summary'),
      summary_h1_lower: xpath_id_search(xpath_driver: '//h1', id_name: 'summary'),
      summary_h2_upper: xpath_id_search(xpath_driver: '//h2', id_name: 'Summary'),
      summary_h2_lower: xpath_id_search(xpath_driver: '//h2', id_name: 'summary'),
      summary_text_p: xpath_contains_text(xpath_driver: '//p', text: 'update'),
      summary_text_span: xpath_contains_text(xpath_driver: '//p/span', text: 'update'),
      affected_versions: xpath_id_search(xpath_driver: '//h1', id_name: 'AffectedVersions'),
      affected_versions_table: nil,
      solution: xpath_id_search(xpath_driver: '//h1', id_name: 'solution'),
      solution_table: nil,
      vulnerability_details: xpath_id_search(xpath_driver: '//h1', id_name: 'Vulnerabilitydetails'),
      vulnerability_details_table: nil
    }
  end

  # Hand-written XPath expressions for every part of a bulletin page. These
  # are the expressions actually evaluated by #get_advisory_xpaths.
  #
  # @return [Hash{Symbol => String}]
  def adv_xpaths
    {
      header_table: "//table[@class='dexter-Table']",
      second_header_table: "//table[@class='text aem-GridColumn aem-GridColumn--default--12 overflowScroll']",
      summary_h1_upper: "//h1[@id='Summary']",
      summary_h1_lower: "//h1[@id='summary']",
      summary_h2_upper: "//h2[@id='Summary']",
      summary_h2_lower: "//h2[@id='summary']",
      summary_text_p: "//p[contains(text(), 'updates')]",
      summary_text_span: "//p/span[contains(text(), 'updates')]",
      affected_versions: "//h1[@id='AffectedVersions']",
      affected_versions_table: '/html/body/div[2]/div/div[2]/div/div[3]/div/div/div[1]/div/div/div[7]/div/table',
      solution: "//h1[@id='solution']",
      solution_table: '//div[11]',
      vulnerability_details: "//h1[@id='Vulnerabilitydetails']",
      vulnerability_details_table: '//div[14]//div[1]//table[1]'
    }
  end

  # Fetches the page, evaluates every expression from #adv_xpaths against it
  # and merges the header-table and summary information into one hash.
  #
  # @return [Hash] keys :bulletin_id, :date_published, :priority, :summary
  def get_advisory_xpaths
    doc = get_advisory
    # #adv_xpaths and #adv_xpaths_methods share the same keys, so the
    # evaluated expressions can be iterated directly.
    xpath_hash = adv_xpaths.each_with_object({}) do |(key, xpath), found|
      found[key] = doc.xpath(xpath)
    end

    # TODO: merge affected_versions_hash, solution_hash and
    # vulnerability_details_hash once they are ready.
    header_table_hash(xpath_hash).merge(summary_hash(xpath_hash))
  end

  # Reads bulletin id, publication date and priority from the header table.
  # After the column captions are removed, the remaining whitespace-separated
  # tokens are read positionally as: id, month, day, year, priority.
  #
  # @param xpath_hash [Hash] results keyed like #adv_xpaths
  # @return [Hash] keys :bulletin_id, :date_published, :priority
  def header_table_hash(xpath_hash)
    header_text = xpath_hash[:header_table].inner_text.squish
    tokens = header_text.gsub('Bulletin ID Date Published Priority', '').squish.split(' ')
    adv_id, month, day, year, priority = tokens

    {
      bulletin_id: adv_id,
      date_published: "#{month} #{day} #{year}",
      priority: priority
    }
  end

  # @param xpath_hash [Hash] results keyed like #adv_xpaths
  # @return [Boolean] true when any of the h1/h2 summary heading lookups
  #   (id "Summary" or "summary") found something
  def has_summary_heading?(xpath_hash)
    %i[summary_h1_upper summary_h1_lower summary_h2_upper summary_h2_lower].any? do |key|
      !xpath_hash[key].empty?
    end
  end

  # Returns the summary text, looking first at the //p matches and then at
  # the //p/span matches.
  #
  # @param xpath_hash [Hash] results keyed like #adv_xpaths
  # @return [String] whitespace-normalised text, or '' when neither matched
  def find_summary_text(xpath_hash)
    key = %i[summary_text_p summary_text_span].find { |candidate| !xpath_hash[candidate].empty? }
    key ? xpath_hash[key].text.squish : ''
  end

  # @param xpath_hash [Hash] results keyed like #adv_xpaths
  # @return [Hash] { summary: String } when a summary heading exists,
  #   { summary: nil } otherwise
  def summary_hash(xpath_hash)
    if has_summary_heading?(xpath_hash)
      { summary: find_summary_text(xpath_hash) }
    else
      { summary: nil }
    end
  end

  # @param table a node responding to #xpath
  # @return the table's body rows (".//tbody/tr")
  def get_table_rows(table)
    table.xpath('.//tbody/tr')
  end

  # Removes the first (header) row. Note that this mutates +table_rows+ in
  # place and returns the same object.
  def table_rows_drop_header(table_rows)
    table_rows.shift
    table_rows
  end

  # Collects every cell that carries a rowspan attribute; such a cell is
  # treated as the product name covering that many rows.
  #
  # @param table_rows rows of the table (header already removed)
  # @return [Array<Hash>] hashes with :product_name and :rowspan (a String)
  def products_and_rowspans(table_rows)
    table_rows.flat_map do |tr|
      spanning_cells = tr.children.select { |cell| cell.has_attribute?('rowspan') }
      spanning_cells.map do |cell|
        { product_name: cell.children.text.squish, rowspan: cell['rowspan'] }
      end
    end
  end

  # Builds one entry per table row, attributing each row to the product whose
  # rowspan covers it. Products are consumed in document order and every
  # product takes as many consecutive rows as its own rowspan says.
  #
  # @param table the "affected versions" table node
  # @return [Array<Hash>] hashes with :product_name, :version, :platform;
  #   empty when no cell has a rowspan attribute
  def product_version_platform(table)
    rows = table_rows_drop_header(get_table_rows(table))
    row_list = rows.to_a
    offset = 0

    products_and_rowspans(rows).flat_map do |product|
      span = product[:rowspan].to_i
      product_name = product[:product_name]
      covered_rows = row_list[offset, span] || []
      offset += span

      covered_rows.map do |tr|
        { product_name: product_name }.merge(version_and_platform(tr, product_name))
      end
    end
  end

  # Splits a row's text into version and platform. The product name is
  # removed first; the last remaining word is taken as the platform and
  # everything before it as the version.
  #
  # @param tr_node a node responding to #text
  # @param product_name [String] text to strip from the row
  # @return [Hash] keys :version and :platform (platform is nil and version
  #   is '' when the row has no text left)
  def version_and_platform(tr_node, product_name)
    row_text = tr_node.text.gsub("\n", ' ').squish.gsub(product_name, ' ').squish
    words = row_text.split(' ')
    # Take only the trailing word off; earlier occurrences of the same word
    # belong to the version text.
    platform = words.pop

    { version: words.join(' '), platform: platform }
  end

  # @param xpath_hash [Hash] results keyed like #adv_xpaths
  # @return [Hash] { affected_versions: Array<Hash> }
  def affected_versions_hash(xpath_hash)
    table = xpath_hash[:affected_versions_table].first
    { affected_versions: product_version_platform(table) }
  end

  # Extracts the rows of the solution table. The cell text of every row is
  # split on newlines and read positionally as product, updated version,
  # platform and priority rating. Link targets are collected from the first
  # data row only and attached to every entry.
  #
  # @param xpath_hash [Hash] results keyed like #adv_xpaths
  # @return [Hash] { solution_info: Array<Hash> }
  def solution_hash(xpath_hash)
    table = xpath_hash[:solution_table].first
    rows = table_rows_drop_header(get_table_rows(table))
    cells_per_row = rows.map { |tr| tr.xpath('./td') }

    first_row_cells = cells_per_row.first
    installation_instruction_urls =
      first_row_cells ? first_row_cells.children.xpath('.//a/@href').map(&:value) : []

    solutions = cells_per_row.map do |cells|
      product_name, updated_version, platform, priority_rating = cells.children.text.split("\n")
      {
        solution: {
          product: product_name,
          updated_version: updated_version,
          platform: platform,
          priority_rating: priority_rating,
          installation_instruction_urls: installation_instruction_urls
        }
      }
    end

    { solution_info: solutions }
  end

  # Work in progress: currently returns only the column captions of the
  # vulnerability details table.
  #
  # @param xpath_hash [Hash] results keyed like #adv_xpaths
  # @return [Array<String>] whitespace-normalised text of each th cell
  def vulnerability_details_hash(xpath_hash)
    table = xpath_hash[:vulnerability_details_table].first
    get_table_rows(table).xpath('./th').map { |th| th.text.gsub("\n", ' ').squish }
  end
end