# Scrapes the Adobe security bulletin index page and extracts the advisory
# links (APSB bulletins and APSA advisories) found in its table cells.
#
# Relies on RestClient (HTTP) and Nokogiri (HTML parsing) being loaded by the
# surrounding application.
class AdobeIndexScraper
  attr_accessor :index_url, :doc

  # Adobe Product Security Bulletin id, e.g. "apsb21-111" (case-insensitive).
  APSB_ID_MATCHER = /apsb\d{2}-\d{2,3}/i
  # Adobe Product Security Advisory id, e.g. "apsa18-01" (case-insensitive).
  APSA_ID_MATCHER = /apsa\d{2}-\d{2,3}/i

  # Sentinel returned by #advisory_id_from_url for .html pages that carry no
  # recognisable advisory id.
  NO_ADVISORY_ID = 'None'.freeze

  # URLs that are listed on the index page but must be left out of the result.
  # The apsb21-111 page was returning 404, so it is omitted.
  SKIPPED_URLS = [
    'https://helpx.adobe.com/security/products/creative-cloud/apsb21-111.html'.freeze
  ].freeze

  # Fetches and parses the index page immediately, so constructing an
  # instance performs an HTTP request.
  def initialize
    @index_url = 'https://helpx.adobe.com/security/security-bulletin.html'
    @doc = read_html(get_html(index_url))
  end

  # Performs a GET request against +url+.
  #
  # @param url [String] the URL to fetch
  # @return [String, nil] the response body when the status code is 200;
  #   otherwise the status code is printed and nil is returned
  def get_html(url)
    response = RestClient::Request.execute(method: :get, url: url)
    return response.body if response.code == 200

    puts "HTTP Code #{response.code}"
    nil
  end

  # Parses an HTML string into a Nokogiri document.
  #
  # @param doc [String, nil] raw HTML
  # @return [Nokogiri::HTML::Document]
  def read_html(doc)
    Nokogiri::HTML(doc)
  end

  # Selects every anchor that sits directly inside a table cell and whose
  # href starts with "https://".
  #
  # @return [Nokogiri::XML::NodeSet]
  def a_with_href
    doc.xpath("//tr/td/a[starts-with(@href, 'https://')]")
  end

  # Extracts the upper-cased advisory id from an advisory page URL.
  #
  # @param url [String] an absolute URL taken from the index page
  # @return [String, nil] the id (e.g. "APSB21-09"); 'None' when the URL ends
  #   with ".html" but contains no well-formed APSB/APSA id; nil when the URL
  #   does not end with ".html" at all
  def advisory_id_from_url(url)
    # Adobe's advisory pages end with .html. String#end_with? is core Ruby,
    # so this works without ActiveSupport's ends_with? extension.
    return unless url.end_with?('.html')

    matcher =
      if url.include?('apsb')    # Adobe product security bulletin
        APSB_ID_MATCHER
      elsif url.include?('apsa') # Adobe product security advisory
        APSA_ID_MATCHER
      end
    return NO_ADVISORY_ID unless matcher

    # The URL may mention "apsb"/"apsa" without a well-formed id; treat that
    # as "no advisory" instead of calling upcase on nil.
    id = url[matcher]
    id ? id.upcase : NO_ADVISORY_ID
  end

  # Builds the list of advisories linked from the index page.
  #
  # Links without an advisory id (nil or 'None') and links listed in
  # SKIPPED_URLS are left out. :index is the position of the anchor among all
  # anchors returned by #a_with_href, so it may contain gaps.
  #
  # @return [Array<Hash>] hashes with the keys :index, :advisory_id and :url
  def index_hash
    a_with_href.map.with_index do |a, index|
      url = a.attributes['href'].value
      advisory_id = advisory_id_from_url(url)

      next if advisory_id.nil? || advisory_id == NO_ADVISORY_ID
      next if SKIPPED_URLS.include?(url)

      { index: index, advisory_id: advisory_id, url: url }
    end.compact
  end
end