# data_importer/lib/cna_scrapers/adobe_scraper.rb

# Scrapes Adobe's security bulletin index page and extracts the advisory
# links (APSB security bulletins and APSA security advisories) found in its
# tables.
#
# Constructing an instance performs an HTTP GET of +index_url+ and parses the
# response into +doc+.
class AdobeScraper
  attr_accessor :index_url, :doc

  # Matches Adobe product security bulletin ids such as "apsb22-16".
  APSB_ID_MATCHER = /apsb\d{2}-\d{2,3}/i
  # Matches Adobe product security advisory ids such as "apsa18-01".
  APSA_ID_MATCHER = /apsa\d{2}-\d{2,3}/i

  # Sets the index URL, downloads the page and stores the parsed document.
  def initialize
    @index_url = 'https://helpx.adobe.com/security/security-bulletin.html'
    @doc = read_html(get_html)
  end

  # Downloads the index page.
  #
  # @return [String, nil] the response body on HTTP 200; otherwise prints the
  #   status code to stdout and returns nil.
  def get_html
    response = RestClient::Request.execute(method: :get, url: index_url)
    return response.body if response.code == 200

    # NOTE(review): RestClient appears to raise for most non-2xx responses, so
    # this branch is probably only reached for other 2xx codes -- confirm.
    puts "HTTP Code #{response.code}"
    nil
  end

  # Parses an HTML string with Nokogiri.
  #
  # @param doc [String] raw HTML
  # @return the parsed Nokogiri document
  def read_html(doc)
    Nokogiri::HTML(doc)
  end

  # @return the <a> elements that sit directly inside a table cell and whose
  #   href starts with "https://"
  def a_with_href
    doc.xpath("//tr/td/a[starts-with(@href, 'https://')]")
  end

  # Extracts the upper-cased advisory id from a bulletin/advisory URL.
  #
  # Only URLs ending in ".html" are considered. A bulletin (APSB) id takes
  # precedence over an advisory (APSA) id when both appear in the URL.
  #
  # @param url [String] link target taken from the index page
  # @return [String, nil] the id (e.g. "APSB22-16"); the string 'None' when
  #   the URL ends in ".html" but contains no well-formed id; nil when the URL
  #   does not end in ".html"
  def advisory_id_from_url(url)
    # String#end_with? is core Ruby; ends_with? only exists with ActiveSupport.
    return nil unless url.end_with?('.html')

    # Indexing a string with a regexp yields nil when nothing matches, so a
    # URL that merely mentions "apsb"/"apsa" no longer raises on nil.upcase.
    advisory_id = url[APSB_ID_MATCHER] || url[APSA_ID_MATCHER]
    advisory_id ? advisory_id.upcase : 'None'
  end

  # Builds one entry per advisory link found on the index page.
  #
  # Links without a recognizable advisory id are skipped entirely; :index is
  # the link's position among all matched links, so gaps are expected.
  #
  # @return [Array<Hash>] hashes with the keys :index, :advisory_id and :url
  def index_hash
    entries = a_with_href.map.with_index do |a, index|
      url = a.attributes['href'].value
      advisory_id = advisory_id_from_url(url)
      # Skip both "no id" outcomes: nil (not an .html link) and 'None'.
      next if advisory_id.nil? || advisory_id == 'None'

      { index: index, advisory_id: advisory_id, url: url }
    end
    # `next` leaves nil placeholders in the mapped array; drop them.
    entries.compact
  end
end
Starting to add CNA scrapers; the first one is Adobe. Wrote up to the index_hash method, which lets me pull a hash of all URLs for each advisory ID. 2022-04-28 13:38:05 -05:00			`class AdobeScraper`
			`attr_accessor :index_url, :doc`
			`APSB_ID_MATCHER = /apsb\d{2}-\d{2,3}/i`
			`APSA_ID_MATCHER = /apsa\d{2}-\d{2,3}/i`

			`def initialize`
			`@index_url = 'https://helpx.adobe.com/security/security-bulletin.html'`
			`@doc = read_html(get_html)`
			`end`

			`def get_html`
			`r = RestClient::Request.execute(`
			`:method => :get,`
			`:url => index_url`
			`)`
			`if r.code == 200`
			`r.body`
			`else`
			`puts "HTTP Code #{r.code}"`
			`end`
			`end`

			`def read_html(doc)`
			`Nokogiri::HTML(doc)`
			`end`

			`def a_with_href`
			`doc.xpath("//tr/td/a[starts-with(@href, 'https://')]")`
			`end`

			`def advisory_id_from_url(url)`
			`# adobes advisory ids end with .html`
			`if url.ends_with?'.html'`
			`# adobe product security bulletin`
			`if url.include? 'apsb'`
			`result = url.scan(APSB_ID_MATCHER).first.upcase`
			`elsif url.include? 'apsa'`
			`result = url.scan(APSA_ID_MATCHER).first.upcase`
			`else`
			`result = 'None'`
			`end`
			`result`
			`end`
			`end`

			`def index_hash`
			`a_with_href.map.with_index do \|a, index\|`
			`url = a.attributes['href'].value`
			`advisory_id = advisory_id_from_url(url)`
			`if advisory_id == 'None'`
			`next`
			`else`
			`{`
			`:index => index,`
			`:advisory_id => advisory_id,`
			`:url => url`
			`}`
			`end`
			`end`
			`end`
			`end`