require 'git'
require 'json'
require 'date'
require 'set'
require 'rdoc'
require 'nokogiri'
require 'bulk_insert'

# Keeps a local checkout of the trickest/cve repository up to date, converts
# its per-CVE markdown files to HTML, extracts the CVE id, description and
# links from that HTML, and bulk-inserts the CVEs that are not yet stored in
# TrickestPocCve.
class TrickestPocCveImporter
  # First year sub-directory of the checkout that is scanned.
  FIRST_YEAR = 1999

  # Shape of a CVE identifier; used to find the CVE-id key that html_to_hash
  # stores alongside its fixed 'Description' and 'POC' keys.
  CVE_ID_PATTERN = /\ACVE-\d{4}-\d+\z/i.freeze

  attr_accessor :repo_url, :repo_path

  def initialize
    @repo_url = 'https://github.com/trickest/cve.git'
    @repo_path = '/data_importer/data/trickest_cve'
  end

  # Clones repo_url into repo_path.
  def git_clone_repo
    Git.clone(repo_url, repo_path)
  end

  # Runs `git pull` inside repo_path.
  #
  # @return [Boolean, nil] the result of Kernel#system (true on success)
  def pull_latest_changes
    puts "Now pulling latest changes from #{repo_path}"
    # Argument-list form of system: repo_path is never interpreted by a shell.
    # stdout is discarded, as it was when the command ran in backticks.
    pulled = system('git', '-C', repo_path, 'pull', out: File::NULL)
    warn "git pull failed in #{repo_path}" unless pulled
    pulled
  end

  # Pulls when the checkout directory exists, clones otherwise.
  #
  # @param repo_path [String] directory whose existence is checked; the pull
  #   and clone themselves always operate on the instance's repo_path
  def pull_or_clone(repo_path = self.repo_path)
    if Dir.exist?(repo_path)
      pull_latest_changes
    else
      git_clone_repo
    end
  end

  # Reads a markdown file and renders it to HTML with RDoc.
  #
  # @param filename [String] path of the markdown file
  # @return [String] the rendered HTML
  def read_markdown(filename)
    markdown = File.read(filename)
    formatter = RDoc::Markup::ToHtml.new(RDoc::Options.new, nil)
    RDoc::Markdown.parse(markdown).accept(formatter)
  end

  # Extracts the CVE id, its URL, the description and all paragraph links
  # from the HTML of one CVE document.
  #
  # @param html [String] HTML as produced by read_markdown
  # @return [Hash] { "<cve id>" => url, 'Description' => text, 'POC' => [values] };
  #   the id key is "" and the url nil when the document has no h3 link
  def html_to_hash(html)
    doc = Nokogiri::HTML5.parse(html)

    # The CVE id is the first child of the first h3, and its URL is the first
    # link found under an h3.
    cve_id = doc.at_xpath('//h3')&.children&.first&.text
    cve_url = doc.at_xpath('//h3/a')&.[]('href')

    # The second paragraph is taken as the description (the original author
    # noted that the first paragraph is always blank).
    paragraphs = doc.xpath('//p').map(&:text)

    # Links under the POC and Github headings are hard to tell apart in the
    # HTML, so every attribute value of every paragraph link is collected
    # under the single 'POC' key.
    poc_links = doc.xpath('//p/a').flat_map(&:values)

    {
      cve_id.to_s => cve_url,
      'Description' => paragraphs[1],
      'POC' => poc_links
    }
  end

  # @param year [String, Integer] name of the year directory
  # @return [Array<String>] paths of the markdown files in that directory
  def list_mds_for_year(year)
    Dir["#{repo_path}/#{year}/*.md"]
  end

  # @param year [String, Integer] name of the year directory
  # @return [Array<String>] rendered HTML of every markdown file of that year
  def read_mds_for_year(year)
    list_mds_for_year(year).map { |filename| read_markdown(filename) }
  end

  # @return [Array<Array<String>>] rendered HTML grouped per year, from
  #   FIRST_YEAR up to the current year
  def read_all_mds
    (FIRST_YEAR..Date.today.year).map { |year| read_mds_for_year(year.to_s) }
  end

  # Builds the attribute hash for one CVE.
  #
  # @param json [Hash] a hash as returned by html_to_hash (the parameter name
  #   is kept for compatibility)
  # @return [Hash] attributes for bulk insertion, or {} when the item carries
  #   no key that looks like a CVE id
  def cve_attrs_from_item(json)
    id_key = json.keys.find { |key| key.to_s.strip.match?(CVE_ID_PATTERN) }
    return {} unless id_key

    # NOTE(review): :cve_id is the only column this file demonstrably uses;
    # :description and :poc are assumed column names - confirm against the
    # TrickestPocCve schema.
    {
      cve_id: id_key.to_s.strip,
      description: json['Description'],
      poc: json['POC']
    }
  end

  # Attribute hashes for every CVE document of a year (for bulk inserting).
  # Documents from which no CVE id could be extracted are skipped.
  #
  # @param year [String, Integer] name of the year directory
  # @return [Array<Hash>]
  def cves_for_year(year)
    read_mds_for_year(year)
      .map { |html| cve_attrs_from_item(html_to_hash(html)) }
      .reject(&:empty?)
  end

  # Updates the checkout, then inserts, year by year, every CVE whose id is
  # not already present in TrickestPocCve.
  #
  # @return [Array] the per-year results of bulk_insert
  def import
    pull_or_clone(repo_path)
    puts 'Now starting import for TrickestPocCve.'
    (FIRST_YEAR..Date.today.year).map do |year|
      cves = cves_for_year(year)
      ids = cves.map { |cve| cve[:cve_id] }
      # A Set keeps the membership test below cheap for large years.
      known_ids = TrickestPocCve.where(cve_id: ids).pluck(:cve_id).to_set
      new_cves = cves.reject { |cve| known_ids.include?(cve[:cve_id]) }
      puts "Importing any new CVEs from #{year}"
      bulk_insert(new_cves)
    end
  end

  # Inserts the given attribute hashes through TrickestPocCve.bulk_insert.
  #
  # @param cves [Array<Hash>] attribute hashes, one per row
  def bulk_insert(cves)
    TrickestPocCve.bulk_insert do |worker|
      cves.each { |attrs| worker.add(attrs) }
    end
  end
end