2022-04-06 22:31:52 -05:00
|
|
|
require 'git'
|
|
|
|
require 'json'
|
|
|
|
require 'date'
|
|
|
|
require 'bulk_insert'
|
|
|
|
|
|
|
|
# Imports proof-of-concept (PoC) references for CVEs from the trickest/cve
# git repository into the TrickestPocCve model.
#
# The repository is cloned to +repo_path+ (or pulled when a checkout already
# exists). Each "<year>/*.md" file is rendered to HTML with RDoc, parsed with
# Nokogiri, and every CVE whose id is not stored yet is bulk-inserted.
class TrickestPocCveImporter
  DEFAULT_REPO_URL = 'https://github.com/trickest/cve.git'
  DEFAULT_REPO_PATH = '/data_importer/data/trickest_cve'
  # First year directory scanned by #read_all_mds and #import.
  FIRST_YEAR = 1999

  attr_accessor :repo_url, :repo_path

  # @param repo_url [String] remote to clone from
  # @param repo_path [String] local directory holding the checkout
  def initialize(repo_url: DEFAULT_REPO_URL, repo_path: DEFAULT_REPO_PATH)
    @repo_url = repo_url
    @repo_path = repo_path
  end

  # Clones +repo_url+ into +repo_path+ using the git gem.
  def git_clone_repo
    Git.clone(repo_url, repo_path)
  end

  # Runs `git pull` inside +repo_path+.
  #
  # The command is spawned without a shell and with an explicit working
  # directory, so a missing checkout raises instead of pulling in whatever
  # directory the process happens to be in, and the path is never
  # interpreted by a shell. stdout is captured and discarded.
  #
  # @return [nil]
  def pull_latest_changes
    puts "Now pulling latest changes from #{repo_path}"
    IO.popen(%w[git pull], chdir: repo_path, &:read)
    warn "git pull failed in #{repo_path}" unless Process.last_status.success?
  end

  # Pulls when the checkout directory exists, clones otherwise.
  #
  # @param repo_path [String] directory whose existence decides between pull
  #   and clone; defaults to the importer's own +repo_path+. The pull/clone
  #   itself always operates on the importer's configured path.
  def pull_or_clone(repo_path = self.repo_path)
    if Dir.exist?(repo_path)
      pull_latest_changes
    else
      git_clone_repo
    end
  end

  # Renders a markdown file to HTML.
  #
  # @param filename [String] path of the markdown file
  # @return [String] HTML produced by RDoc's ToHtml formatter
  def read_markdown(filename)
    data = File.read(filename)
    formatter = RDoc::Markup::ToHtml.new(RDoc::Options.new, nil)
    RDoc::Markdown.parse(data).accept(formatter)
  end

  # Extracts the CVE fields from one rendered markdown document.
  #
  # Missing elements yield nil values (or an empty link list) instead of
  # raising, so one malformed file cannot abort a whole import.
  #
  # @param html [String] HTML as returned by #read_markdown
  # @return [Hash{String => Object}] keys 'cve_id', 'cve_url', 'description'
  #   and 'poc_links'
  def html_to_hash(html)
    doc = Nokogiri::HTML5.parse(html)

    # The CVE id is the text of the first <h3>; its link is the first <h3><a>.
    cve_id = doc.at_xpath('//h3')&.children&.first&.text
    cve_url = doc.at_xpath('//h3/a')&.[]('href')

    paragraphs = doc.xpath('//p').map(&:text)
    # Only href values are links; other <a> attributes are ignored.
    poc_links = doc.xpath('//p/a/@href').map(&:value)

    {
      'cve_id' => cve_id,
      'cve_url' => cve_url,
      # paragraphs[0] is always an ' ', so the description is the second one.
      'description' => paragraphs[1],
      'poc_links' => poc_links
    }
  end

  # @param year [Integer, String] year directory name inside the checkout
  # @return [Array<String>] paths of the markdown files for that year
  def list_mds_for_year(year)
    Dir["#{repo_path}/#{year}/*.md"]
  end

  # @param year [Integer, String] year directory name inside the checkout
  # @return [Array<String>] rendered HTML, one entry per markdown file
  def read_mds_for_year(year)
    list_mds_for_year(year).map { |filename| read_markdown(filename) }
  end

  # @return [Array<Array<String>>] rendered HTML grouped by year, from
  #   FIRST_YEAR through the current year
  def read_all_mds
    (FIRST_YEAR..Date.today.year).map { |year| read_mds_for_year(year.to_s) }
  end

  # Converts the string-keyed hash from #html_to_hash into the symbol-keyed
  # attribute hash handed to the bulk inserter.
  #
  # @param json [Hash{String => Object}]
  # @return [Hash{Symbol => Object}]
  def cve_attrs_from_item(json)
    {
      cve_id: json['cve_id'],
      cve_url: json['cve_url'],
      description: json['description'],
      poc_links: json['poc_links']
    }
  end

  # Builds the attribute hashes for every markdown file of +year+ (used for
  # bulk inserting). Documents from which no CVE id could be extracted are
  # dropped, since they cannot be de-duplicated against stored rows.
  #
  # @param year [Integer, String]
  # @return [Array<Hash{Symbol => Object}>]
  def cves_for_year(year)
    read_mds_for_year(year)
      .map { |html| cve_attrs_from_item(html_to_hash(html)) }
      .select { |attrs| attrs[:cve_id] }
  end

  # Refreshes the checkout, then inserts every CVE that is not stored yet,
  # one year at a time.
  #
  # @return [Array] the #bulk_insert result for each processed year
  def import
    pull_or_clone

    puts "Now starting import for #{repo_url}."
    puts '----------' * 12

    (FIRST_YEAR..Date.today.year).map do |year|
      cves_from_markdown = cves_for_year(year)
      ids = cves_from_markdown.map { |cve| cve[:cve_id] }

      # Hash lookup keeps the filtering linear in the number of CVEs.
      stored = TrickestPocCve.where(cve_id: ids).pluck(:cve_id)
      already_stored = stored.each_with_object({}) { |id, seen| seen[id] = true }
      new_cves = cves_from_markdown.reject { |cve| already_stored.key?(cve[:cve_id]) }

      puts "Importing any new CVEs from #{year}"
      bulk_insert(new_cves)
    end
  end

  # Inserts the given attribute hashes through TrickestPocCve.bulk_insert.
  #
  # @param cves [Array<Hash{Symbol => Object}>]
  def bulk_insert(cves)
    TrickestPocCve.bulk_insert do |worker|
      cves.each { |attrs| worker.add(attrs) }
    end
  end
end
|