# frozen_string_literal: true

# Source recovered from a whitespace-collapsed `git format-patch` email.
#
#   Commit:  b91923a9e49ba0b00e906466f33bb35e8b630197
#   Author:  Brendan McDevitt
#   Date:    Tue, 5 Apr 2022 22:09:21 -0500
#   Subject: added model and working importer for PoC in Github nomi-sec json
#            data into postgres
#
# The patch creates three files (160 insertions). Each section below is headed
# by the path of the file it belongs to.

# ---------------------------------------------------------------------------
# app/models/github_poc.rb
# ---------------------------------------------------------------------------

# ActiveRecord model for one row of the `github_pocs` table.
class GithubPoc < ActiveRecord::Base
end

# ---------------------------------------------------------------------------
# db/migrate/20220405230622_github_pocs.rb
# ---------------------------------------------------------------------------

# Creates the `github_pocs` table that PocInGithubImporter fills.
class GithubPocs < ActiveRecord::Migration[7.0]
  def change
    create_table :github_pocs do |t|
      # Value of the `id` field of an imported JSON entry. The unique index
      # means one entry id can be stored only once.
      t.integer :github_poc_id
      t.index :github_poc_id, unique: true
      t.string :cve_id, default: "None"
      t.string :name
      t.string :full_name
      t.jsonb :owner
      t.string :html_url
      t.string :description
      t.boolean :fork
      # NOTE(review): these are `date` columns, so any time-of-day part in the
      # source values is not kept -- confirm that is intended.
      t.date :created_at
      t.date :updated_at
      t.date :pushed_at
      t.integer :stargazers_count
      t.integer :watchers_count
      t.integer :forks_count
      t.boolean :allow_forking
      t.boolean :is_template
      t.string :topics, array: true
      t.string :visibility
      t.integer :forks
      t.integer :watchers
      t.integer :score
    end
  end
end

# ---------------------------------------------------------------------------
# lib/poc_in_github_importer.rb
# ---------------------------------------------------------------------------

require 'git'
require 'json'
require 'date'
require 'set'
require 'bulk_insert'

# Imports the JSON files of the nomi-sec/PoC-in-GitHub repository into the
# `github_pocs` table.
#
# The repository is cloned (or updated when a checkout already exists), the
# per-year `*.json` files are read, and entries whose id is not yet stored are
# bulk-inserted through GithubPoc.
class PocInGithubImporter
  CVE_MATCHER = /(CVE|cve)-\d{4}-\d{4,7}/

  DEFAULT_REPO_URL = 'https://github.com/nomi-sec/PoC-in-GitHub.git'
  DEFAULT_REPO_PATH = '/data_importer/data/poc_in_github'

  # First year directory that is scanned; the scan runs up to the current year.
  FIRST_YEAR = 1999

  # JSON key of an entry => attribute/column it is stored under. Only `id` is
  # renamed; the order matches the column order of the migration.
  ITEM_COLUMNS = {
    'id' => :github_poc_id,
    'name' => :name,
    'full_name' => :full_name,
    'owner' => :owner,
    'html_url' => :html_url,
    'description' => :description,
    'fork' => :fork,
    'created_at' => :created_at,
    'updated_at' => :updated_at,
    'pushed_at' => :pushed_at,
    'stargazers_count' => :stargazers_count,
    'watchers_count' => :watchers_count,
    'forks_count' => :forks_count,
    'allow_forking' => :allow_forking,
    'is_template' => :is_template,
    'topics' => :topics,
    'visibility' => :visibility,
    'forks' => :forks,
    'watchers' => :watchers,
    'score' => :score
  }.freeze

  attr_accessor :repo_url, :repo_path

  # @param repo_url [String] git URL of the data repository
  # @param repo_path [String] local directory holding the checkout
  def initialize(repo_url: DEFAULT_REPO_URL, repo_path: DEFAULT_REPO_PATH)
    @repo_url = repo_url
    @repo_path = repo_path
  end

  # Clones repo_url into repo_path.
  def git_clone_repo
    Git.clone(repo_url, repo_path)
  end

  # Runs `git pull` inside repo_path.
  #
  # git is invoked with an argument vector instead of a shell string, so a
  # path containing spaces or shell metacharacters is passed through verbatim,
  # and the pull can no longer run in the wrong directory when changing into
  # repo_path fails. A failed pull is reported on stderr; it does not raise.
  #
  # @return [nil]
  def pull_latest_changes
    puts "Now pulling latest changes from #{repo_path}"
    # stdout is discarded, as it was when the command ran in backticks.
    pulled = system('git', '-C', repo_path, 'pull', out: File::NULL)
    warn "git pull failed in #{repo_path}" unless pulled
    nil
  end

  # @param filename [String] path of a JSON file
  # @return [Object] the parsed JSON document
  def read_json(filename)
    JSON.parse(File.read(filename))
  end

  # All the files are named CVE-year-1234.json in this repo, so the file name
  # without its extension is the CVE id.
  #
  # @param filename [String]
  # @return [String]
  def cve_from_filename(filename)
    File.basename(filename, '.*')
  end

  # Extracts a CVE id from an entry, trying `name`, then `full_name`, then
  # `description`. Missing (nil) fields and fields without a CVE id are
  # skipped instead of raising.
  #
  # @param json [Hash] one entry of a parsed JSON file
  # @return [String, nil] the upper-cased CVE id, or nil when no field has one
  def cve_from_json_names(json)
    json.values_at('name', 'full_name', 'description').each do |text|
      id = text.to_s[CVE_MATCHER]
      return id.upcase if id
    end
    nil
  end

  # @param year [Integer, String] name of the year directory
  # @return [Array<String>] sorted paths of that year's JSON files
  def list_jsons_for_year(year)
    Dir.glob(File.join(repo_path, year.to_s, '*.json')).sort
  end

  # @param year [Integer, String]
  # @return [Array<Hash>] one `{cve_id:, file_data:}` hash per JSON file
  def read_jsons_for_year(year)
    list_jsons_for_year(year).map do |path|
      { cve_id: cve_from_filename(path), file_data: read_json(path) }
    end
  end

  # @return [Array<Array<Hash>>] result of read_jsons_for_year for every year
  #   from FIRST_YEAR up to the current year
  def read_all_jsons
    (FIRST_YEAR..Date.today.year).map { |year| read_jsons_for_year(year.to_s) }
  end

  # Copies the columns listed in ITEM_COLUMNS from a JSON entry into cve_attrs.
  #
  # @param json [Hash] one entry of a parsed JSON file
  # @param cve_attrs [Hash] hash to fill; it is modified in place
  # @return [Hash] cve_attrs
  def cve_attrs_from_item(json, cve_attrs = {})
    ITEM_COLUMNS.each { |key, column| cve_attrs[column] = json[key] }
    cve_attrs
  end

  # Builds the attribute hashes for bulk inserting.
  #
  # @param year [Integer, String]
  # @return [Array<Hash>] one attribute hash per entry of every file of the
  #   year, each tagged with the CVE id taken from its file name
  def cves_for_year(year)
    read_jsons_for_year(year).flat_map do |file|
      file[:file_data].map do |entry|
        cve_attrs_from_item(entry, { cve_id: file[:cve_id] })
      end
    end
  end

  # Updates (or clones) the repository, then inserts every entry that is not
  # stored yet, one year at a time.
  #
  # @return [Array] the result of bulk_insert for each year
  def import
    if Dir.exist?(repo_path)
      pull_latest_changes
    else
      git_clone_repo
    end

    puts 'Now starting import for PocInGithub.'
    (FIRST_YEAR..Date.today.year).map do |year|
      new_cves = new_cves_for_year(year)
      puts "Importing any new CVEs from #{year}"
      bulk_insert(new_cves)
    end
  end

  # @param cves [Array<Hash>] attribute hashes to insert through GithubPoc
  def bulk_insert(cves)
    GithubPoc.bulk_insert do |worker|
      cves.each { |attrs| worker.add(attrs) }
    end
  end

  private

  # Attribute hashes of the year whose github_poc_id is not in the table yet.
  #
  # Rows repeating a github_poc_id inside the same year are reduced to the
  # first occurrence: `github_pocs.github_poc_id` has a unique index, so
  # inserting the repeats would be rejected by the database.
  def new_cves_for_year(year)
    candidates = cves_for_year(year).uniq { |cve| cve[:github_poc_id] }
    ids = candidates.map { |cve| cve[:github_poc_id] }
    stored = GithubPoc.where(github_poc_id: ids).pluck(:github_poc_id).to_set
    candidates.reject { |cve| stored.include?(cve[:github_poc_id]) }
  end
end