require 'git'
require 'json'
require 'date'
require 'set'
require 'rdoc'
require 'bulk_insert'

# Imports CVE proof-of-concept data from a git checkout of the trickest/cve
# repository into the TrickestPocCve model.
#
# Markdown files are read from per-year directories inside the checkout,
# rendered to HTML with RDoc, scraped with Nokogiri and reduced to one
# attribute hash per file; hashes whose CVE id is not yet stored are
# bulk-inserted.
#
# NOTE: Nokogiri and the TrickestPocCve model are not required by this file and
# must already be loaded by the host application.
class TrickestPocCveImporter
  DEFAULT_REPO_URL = 'https://github.com/trickest/cve.git'
  DEFAULT_REPO_PATH = '/data_importer/data/trickest_cve'
  # First year directory that is scanned for markdown files.
  FIRST_YEAR = 1999

  attr_accessor :repo_url, :repo_path

  # @param repo_url [String] remote to clone from
  # @param repo_path [String] local directory holding the checkout
  def initialize(repo_url: DEFAULT_REPO_URL, repo_path: DEFAULT_REPO_PATH)
    @repo_url = repo_url
    @repo_path = repo_path
  end

  # Clones repo_url into repo_path.
  def git_clone_repo
    Git.clone(repo_url, repo_path)
  end

  # Runs `git pull` inside repo_path.
  #
  # The command is passed as an argument vector with `chdir:` rather than as a
  # shell string, so the path is never interpreted by a shell and the pull can
  # never run in the current directory when repo_path is missing. A failed pull
  # is reported on stderr but does not raise.
  #
  # @return [nil]
  def pull_latest_changes
    puts "Now pulling latest changes from #{repo_path}"
    pulled = system('git', 'pull', chdir: repo_path)
    warn "git pull failed in #{repo_path}" unless pulled
    nil
  end

  # Pulls when the checkout directory exists, clones otherwise.
  #
  # Only the existence check uses the argument; the pull/clone itself always
  # operates on the instance's repo_path.
  #
  # @param repo_path [String] directory to test, defaults to the instance's repo_path
  def pull_or_clone(repo_path = self.repo_path)
    if Dir.exist?(repo_path)
      pull_latest_changes
    else
      git_clone_repo
    end
  end

  # Renders one markdown file to an HTML string via RDoc.
  #
  # @param filename [String] path of the markdown file
  # @return [String] HTML document
  def read_markdown(filename)
    data = File.read(filename)
    formatter = RDoc::Markup::ToHtml.new(RDoc::Options.new, nil)
    RDoc::Markdown.parse(data).accept(formatter)
  end

  # Extracts the CVE fields from the HTML produced by #read_markdown.
  #
  # Missing elements yield nil values (or an empty link list) instead of
  # raising, so one malformed document cannot abort a whole import run.
  #
  # @param html [String]
  # @return [Hash{String => Object}] keys 'cve_id', 'cve_url', 'description', 'poc_links'
  def html_to_hash(html)
    doc = Nokogiri::HTML5.parse(html)

    # The CVE id is the text of the first child of the first <h3>; its URL is
    # the href of the first link found inside an <h3>.
    heading = doc.at_xpath('//h3')
    id_node = heading && heading.children.first
    heading_link = doc.at_xpath('//h3/a')

    paragraphs = doc.xpath('//p').map(&:text)

    {
      'cve_id' => id_node && id_node.text,
      'cve_url' => heading_link && heading_link['href'],
      # The original author notes that the first <p> is always a lone ' ', so
      # the description is taken from the second paragraph.
      'description' => paragraphs[1],
      # Only href values: other attributes on a link are not PoC URLs.
      'poc_links' => doc.xpath('//p/a/@href').map(&:value)
    }
  end

  # @param year [Integer, String]
  # @return [Array<String>] paths of the markdown files in that year's directory
  def list_mds_for_year(year)
    Dir["#{repo_path}/#{year}/*.md"]
  end

  # @param year [Integer, String]
  # @return [Array<String>] rendered HTML for every markdown file of the year
  def read_mds_for_year(year)
    list_mds_for_year(year).map { |filename| read_markdown(filename) }
  end

  # @return [Array<Array<String>>] rendered HTML grouped per year, FIRST_YEAR..current year
  def read_all_mds
    (FIRST_YEAR..Date.today.year).map { |year| read_mds_for_year(year.to_s) }
  end

  # Converts the string-keyed hash from #html_to_hash into the symbol-keyed
  # attribute hash handed to the bulk inserter.
  #
  # @param json [Hash{String => Object}]
  # @return [Hash{Symbol => Object}]
  def cve_attrs_from_item(json)
    {
      cve_id: json['cve_id'],
      cve_url: json['cve_url'],
      description: json['description'],
      poc_links: json['poc_links']
    }
  end

  # Builds the attribute hashes (for bulk inserting) of one year.
  #
  # Files from which no CVE id could be extracted are skipped with a warning
  # rather than being passed on as records without an id.
  #
  # @param year [Integer, String]
  # @return [Array<Hash{Symbol => Object}>]
  def cves_for_year(year)
    list_mds_for_year(year).each_with_object([]) do |filename, cves|
      attrs = cve_attrs_from_item(html_to_hash(read_markdown(filename)))
      if attrs[:cve_id]
        cves << attrs
      else
        warn "Skipping #{filename}: no CVE id heading found"
      end
    end
  end

  # Refreshes the checkout, then inserts every CVE that is not yet stored,
  # one year at a time.
  #
  # @return [Array] the #bulk_insert result of each year
  def import
    pull_or_clone
    puts "Now starting import for #{repo_url}."
    puts '----------' * 12

    (FIRST_YEAR..Date.today.year).map do |year|
      cves_from_markdown = cves_for_year(year)
      ids = cves_from_markdown.map { |cve| cve[:cve_id] }
      # A Set keeps the membership test below constant-time per CVE.
      known_ids = TrickestPocCve.where(cve_id: ids).pluck(:cve_id).to_set
      new_cves = cves_from_markdown.reject { |cve| known_ids.include?(cve[:cve_id]) }
      puts "Importing any new CVEs from #{year}"
      bulk_insert(new_cves)
    end
  end

  # Adds every attribute hash to a single TrickestPocCve.bulk_insert worker.
  #
  # @param cves [Array<Hash{Symbol => Object}>]
  def bulk_insert(cves)
    TrickestPocCve.bulk_insert do |worker|
      cves.each { |attrs| worker.add(attrs) }
    end
  end
end