require 'git'
require 'json'
require 'date'
require 'set'
require 'bulk_insert'

# Imports proof-of-concept repository metadata from a local checkout of the
# nomi-sec/PoC-in-GitHub repository into the GithubPoc model.
#
# The checkout is expected to contain one directory per year, each holding
# JSON files named after a CVE id; every file is parsed and its entries are
# turned into attribute hashes that are bulk-inserted.
class PocInGithubImporter
  # Matches a CVE identifier such as "CVE-2021-44228" (upper- or lower-case prefix).
  CVE_MATCHER = /(CVE|cve)-\d{4}-\d{4,7}/

  # First year directory scanned by #read_all_jsons and #import.
  FIRST_YEAR = 1999

  # JSON keys copied unchanged (as symbol keys) into the attribute hash.
  # The order is kept identical to the original assignment order.
  PASSTHROUGH_FIELDS = %w[
    name full_name owner html_url description fork
    created_at updated_at pushed_at
    stargazers_count watchers_count forks_count
    allow_forking is_template topics visibility
    forks watchers score
  ].freeze

  attr_accessor :repo_url, :repo_path

  def initialize
    @repo_url = 'https://github.com/nomi-sec/PoC-in-GitHub.git'
    @repo_path = '/data_importer/data/poc_in_github'
  end

  # Clones repo_url into repo_path using the git gem.
  def git_clone_repo
    Git.clone(repo_url, repo_path)
  end

  # Runs `git pull` inside repo_path.
  #
  # The command is passed to Kernel#system as an argument list with `chdir:`,
  # so no shell is involved: a repo_path containing spaces or shell
  # metacharacters is safe, and the pull never runs in some other directory
  # when repo_path cannot be entered. git's stdout is discarded, as it was
  # when the command ran in backticks.
  #
  # @return [Boolean, nil] truthy only when `git pull` exited successfully
  def pull_latest_changes
    puts "Now pulling latest changes from #{repo_path}"
    pulled = system('git', 'pull', chdir: repo_path, out: File::NULL)
    warn "git pull failed in #{repo_path}" unless pulled
    pulled
  end

  # Parses the JSON document stored in +filename+.
  #
  # @param filename [String] path of the file to read
  # @return [Object] the parsed JSON value
  def read_json(filename)
    JSON.parse(File.read(filename))
  end

  # Returns the file name without directory or extension. Per the original
  # author's note, all the files in this repo are named CVE-year-1234.json,
  # so this is the CVE id.
  #
  # @param filename [String]
  # @return [String]
  def cve_from_filename(filename)
    File.basename(filename, File.extname(filename))
  end

  # Extracts a CVE id from the 'name', 'full_name' or 'description' entry of
  # +json+ (checked in that order) and returns it upper-cased.
  #
  # Each field is coerced with to_s and searched with String#[], so a missing
  # field or a field without a CVE id simply falls through to the next one
  # instead of raising NoMethodError on nil.
  #
  # @param json [Hash] one parsed repository entry
  # @return [String, nil] the upper-cased CVE id, or nil when no field contains one
  def cve_from_json_names(json)
    name = json['name']
    fullname = json['full_name']
    description = json['description']

    id = name.to_s[CVE_MATCHER] ||
         fullname.to_s[CVE_MATCHER] ||
         description.to_s[CVE_MATCHER]
    cve_id = id && id.upcase

    debug_hash = { name: name, fullname: fullname, description: description, id: cve_id }
    puts debug_hash
    cve_id
  end

  # Lists the JSON files directly inside the directory for +year+.
  #
  # @param year [String, Integer]
  # @return [Array<String>] matching paths (empty when the directory is absent)
  def list_jsons_for_year(year)
    year_fp = "#{repo_path}/#{year}"
    Dir["#{year_fp}/*.json"]
  end

  # Reads every JSON file for +year+.
  #
  # @param year [String, Integer]
  # @return [Array<Hash>] one hash per file with :cve_id (from the file name)
  #   and :file_data (the parsed JSON)
  def read_jsons_for_year(year)
    list_jsons_for_year(year).map do |path|
      { cve_id: cve_from_filename(path), file_data: read_json(path) }
    end
  end

  # Reads the JSON files of every year from FIRST_YEAR through the current year.
  #
  # @return [Array<Array<Hash>>] one #read_jsons_for_year result per year
  def read_all_jsons
    (FIRST_YEAR..Date.today.year).map do |year|
      read_jsons_for_year(year.to_s)
    end
  end

  # Copies the fields of one repository entry into +cve_attrs+.
  #
  # 'id' is stored under :github_poc_id; every key in PASSTHROUGH_FIELDS is
  # stored under the same name as a symbol. Keys absent from +json+ are
  # stored as nil.
  #
  # @param json [Hash] one parsed repository entry
  # @param cve_attrs [Hash] hash to fill; it is mutated and returned
  # @return [Hash] +cve_attrs+
  def cve_attrs_from_item(json, cve_attrs = {})
    cve_attrs[:github_poc_id] = json['id']
    PASSTHROUGH_FIELDS.each { |field| cve_attrs[field.to_sym] = json[field] }
    cve_attrs
  end

  # Builds the attribute hashes (ready for bulk inserting) for every entry of
  # every JSON file of +year+. Each hash carries the :cve_id taken from the
  # name of the file the entry came from.
  #
  # @param year [String, Integer]
  # @return [Array<Hash>]
  def cves_for_year(year)
    read_jsons_for_year(year).flat_map do |info_hash|
      cve_id = info_hash[:cve_id]
      info_hash[:file_data].map do |entry|
        cve_attrs_from_item(entry, { cve_id: cve_id })
      end
    end
  end

  # Updates (or clones) the local checkout, then inserts every entry whose
  # github_poc_id is not yet stored in GithubPoc, one year at a time.
  #
  # @return [Array] the #bulk_insert result of each year
  def import
    if Dir.exist?(repo_path)
      pull_latest_changes
    else
      git_clone_repo
    end

    puts "Now starting import for #{repo_url}."
    (FIRST_YEAR..Date.today.year).map do |year|
      cves_from_json = cves_for_year(year)
      ids = cves_from_json.map { |cve| cve[:github_poc_id] }
      # A Set gives constant-time membership checks while filtering below.
      ids_in_db = GithubPoc.where(github_poc_id: ids).pluck(:github_poc_id).to_set
      new_cves = cves_from_json.reject { |cve| ids_in_db.include?(cve[:github_poc_id]) }
      puts "Importing any new CVEs from #{year}"
      bulk_insert(new_cves)
    end
  end

  # Adds every attribute hash in +cves+ to a GithubPoc bulk-insert worker.
  #
  # @param cves [Array<Hash>]
  def bulk_insert(cves)
    GithubPoc.bulk_insert do |worker|
      cves.each { |attrs| worker.add(attrs) }
    end
  end
end