From 752aef639206f0c56851031816703b6c332233d2 Mon Sep 17 00:00:00 2001 From: Brendan McDevitt Date: Wed, 6 Apr 2022 22:31:52 -0500 Subject: [PATCH] add base support for trickest poc cves to github --- .gitignore | 3 + Gemfile | 3 + Gemfile.lock | 77 ++++++++++++++- config/initializers/twitter_config.rb | 15 +++ docker-compose.yml | 2 + lib/trickest_poc_cve_importer.rb | 134 ++++++++++++++++++++++++++ 6 files changed, 230 insertions(+), 4 deletions(-) create mode 100644 config/initializers/twitter_config.rb create mode 100644 lib/trickest_poc_cve_importer.rb diff --git a/.gitignore b/.gitignore index 81452db..9e1b5ec 100644 --- a/.gitignore +++ b/.gitignore @@ -29,3 +29,6 @@ # Ignore master key for decrypting credentials and more. /config/master.key + +# Any API keys or envars we dont want to commit add here. +/twitter_credentials.env diff --git a/Gemfile b/Gemfile index 4d700f1..5b6786b 100644 --- a/Gemfile +++ b/Gemfile @@ -9,6 +9,9 @@ gem 'actionpack' gem 'sass-rails' gem 'railties' gem 'rest-client' +gem 'twitter' +gem 'tweetkit', github: 'julianfssen/tweetkit' # for twitter v2 api support +gem 'nokogiri' # Use postgres as the database for Active Record gem 'pg' diff --git a/Gemfile.lock b/Gemfile.lock index 09da9df..5129332 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -1,3 +1,12 @@ +GIT + remote: https://github.com/julianfssen/tweetkit.git + revision: e9ff2e807089547548a3caeea24b06cbdb1defd3 + specs: + tweetkit (0.2.0) + faraday (~> 1.9.3) + faraday_middleware (~> 1.2.0) + simple_oauth (~> 0.3.0) + GEM remote: https://rubygems.org/ specs: @@ -74,6 +83,7 @@ GEM bindex (0.8.1) bootsnap (1.11.1) msgpack (~> 1.2) + buftok (0.2.0) builder (3.2.4) bulk_insert (1.9.0) activerecord (>= 3.2.0) @@ -104,16 +114,54 @@ GEM digest (3.1.0) domain_name (0.5.20190701) unf (>= 0.0.5, < 1.0.0) + equalizer (0.0.11) erubi (1.10.0) execjs (2.8.1) + faraday (1.9.3) + faraday-em_http (~> 1.0) + faraday-em_synchrony (~> 1.0) + faraday-excon (~> 1.1) + faraday-httpclient (~> 1.0) + 
faraday-multipart (~> 1.0) + faraday-net_http (~> 1.0) + faraday-net_http_persistent (~> 1.0) + faraday-patron (~> 1.0) + faraday-rack (~> 1.0) + faraday-retry (~> 1.0) + ruby2_keywords (>= 0.0.4) + faraday-em_http (1.0.0) + faraday-em_synchrony (1.0.0) + faraday-excon (1.1.0) + faraday-httpclient (1.0.1) + faraday-multipart (1.0.3) + multipart-post (>= 1.2, < 3) + faraday-net_http (1.0.1) + faraday-net_http_persistent (1.2.0) + faraday-patron (1.0.0) + faraday-rack (1.0.0) + faraday-retry (1.0.3) + faraday_middleware (1.2.0) + faraday (~> 1.0) ffi (1.15.5) + ffi-compiler (1.0.1) + ffi (>= 1.0.0) + rake git (1.10.2) rchardet (~> 1.8) globalid (1.0.0) activesupport (>= 5.0) + http (4.4.1) + addressable (~> 2.3) + http-cookie (~> 1.0) + http-form_data (~> 2.2) + http-parser (~> 1.2.0) http-accept (1.7.0) http-cookie (1.0.4) domain_name (~> 0.5) + http-form_data (2.3.0) + http-parser (1.2.3) + ffi-compiler (>= 1.0, < 2.0) + http_parser.rb (0.6.0) i18n (1.10.0) concurrent-ruby (~> 1.0) interception (0.5) @@ -132,6 +180,8 @@ GEM mini_mime (>= 0.1.1) marcel (1.0.2) matrix (0.4.2) + memoizable (0.4.2) + thread_safe (~> 0.3, >= 0.3.1) method_source (1.0.0) mime-types (3.4.1) mime-types-data (~> 3.2015) @@ -139,7 +189,9 @@ GEM mini_mime (1.1.2) mini_portile2 (2.8.0) minitest (5.15.0) - msgpack (1.4.5) + msgpack (1.5.0) + multipart-post (2.1.1) + naught (1.1.0) net-imap (0.2.3) digest net-protocol @@ -160,12 +212,12 @@ GEM mini_portile2 (~> 2.8.0) racc (~> 1.4) pg (1.3.5) - pry (0.13.1) + pry (0.14.1) coderay (~> 1.1) method_source (~> 1.0) - pry-byebug (3.9.0) + pry-byebug (3.8.0) byebug (~> 11.0) - pry (~> 0.13.0) + pry (~> 0.10) pry-doc (1.3.0) pry (~> 0.11) yard (~> 0.9.11) @@ -220,6 +272,7 @@ GEM mime-types (>= 1.16, < 4.0) netrc (~> 0.8) rexml (3.2.5) + ruby2_keywords (0.0.5) ruby_dep (1.5.0) rubyzip (2.3.2) sass-rails (6.0.0) @@ -236,6 +289,7 @@ GEM childprocess (>= 0.5, < 5.0) rexml (~> 3.2, >= 3.2.5) rubyzip (>= 1.2.2) + simple_oauth (0.3.1) spring (2.1.1) 
spring-watcher-listen (2.0.1) listen (>= 2.7, < 4.0) @@ -249,11 +303,23 @@ GEM sprockets (>= 3.0.0) strscan (3.0.1) thor (1.2.1) + thread_safe (0.3.6) tilt (2.0.10) timeout (0.2.0) turbolinks (5.2.1) turbolinks-source (~> 5.2) turbolinks-source (5.2.0) + twitter (7.0.0) + addressable (~> 2.3) + buftok (~> 0.2.0) + equalizer (~> 0.0.11) + http (~> 4.0) + http-form_data (~> 2.0) + http_parser.rb (~> 0.6.0) + memoizable (~> 0.4.0) + multipart-post (~> 2.0) + naught (~> 1.0) + simple_oauth (~> 0.3.0) tzinfo (2.0.4) concurrent-ruby (~> 1.0) uglifier (4.2.0) @@ -291,6 +357,7 @@ DEPENDENCIES git jbuilder (~> 2.5) listen (>= 3.0.5, < 3.2) + nokogiri pg pry pry-byebug @@ -307,6 +374,8 @@ DEPENDENCIES spring spring-watcher-listen (~> 2.0.0) turbolinks (~> 5) + tweetkit! + twitter tzinfo-data uglifier (>= 1.3.0) web-console (>= 3.3.0) diff --git a/config/initializers/twitter_config.rb b/config/initializers/twitter_config.rb new file mode 100644 index 0000000..456eeb7 --- /dev/null +++ b/config/initializers/twitter_config.rb @@ -0,0 +1,15 @@ +require 'twitter' +require 'tweetkit' + +twitter_client = Twitter::REST::Client.new do |config| + config.consumer_key = ENV['twitter_api_key'] + config.consumer_secret = ENV['twitter_api_key_secret'] + config.access_token = ENV['twitter_access_token'] + config.access_token_secret = ENV['twitter_access_token_secret'] +end + +tweetkit_client = Tweetkit::Client.new do |config| + config.bearer_token = ENV['twitter_bearer_token'] + config.consumer_key = ENV['twitter_api_key'] + config.consumer_secret = ENV['twitter_api_key_secret'] +end \ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml index 64d69bf..ea48071 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -8,6 +8,8 @@ services: POSTGRES_PASSWORD: password web: build: . 
+    env_file:
+      - twitter_credentials.env
     command: bash -c "rm -f tmp/pids/server.pid && bundle exec rails s -p 3000 -b '0.0.0.0'"
     volumes:
       - .:/data_importer
diff --git a/lib/trickest_poc_cve_importer.rb b/lib/trickest_poc_cve_importer.rb
new file mode 100644
index 0000000..3bcfe38
--- /dev/null
+++ b/lib/trickest_poc_cve_importer.rb
@@ -0,0 +1,157 @@
+require 'git'
+require 'json'
+require 'date'
+require 'set'
+require 'rdoc'
+require 'nokogiri'
+require 'bulk_insert'
+
+# Imports proof-of-concept (PoC) data for CVEs from a git checkout of the
+# trickest/cve repository: the markdown files under <repo_path>/<year>/ are
+# rendered to HTML, parsed, and bulk inserted as TrickestPocCve rows.
+class TrickestPocCveImporter
+  DEFAULT_REPO_URL = 'https://github.com/trickest/cve.git'
+  DEFAULT_REPO_PATH = '/data_importer/data/trickest_cve'
+  # First year whose directory is scanned for markdown files.
+  FIRST_YEAR = 1999
+
+  attr_accessor :repo_url, :repo_path
+
+  # @param repo_url [String] git URL the repository is cloned from
+  # @param repo_path [String] local directory that holds the checkout
+  def initialize(repo_url: DEFAULT_REPO_URL, repo_path: DEFAULT_REPO_PATH)
+    @repo_url = repo_url
+    @repo_path = repo_path
+  end
+
+  # Clones repo_url into repo_path.
+  def git_clone_repo
+    Git.clone(repo_url, repo_path)
+  end
+
+  # Runs `git pull` inside the checkout at repo_path.
+  #
+  # @return [Boolean, nil] result of Kernel#system for the git command
+  def pull_latest_changes
+    puts "Now pulling latest changes from #{repo_path}"
+    # Argument-list form of system: repo_path reaches git verbatim instead of
+    # being interpolated into a shell command, and `-C` makes git fail rather
+    # than pull whatever repository the current directory belongs to when
+    # repo_path cannot be entered.
+    pulled = system('git', '-C', repo_path, 'pull')
+    warn "git pull failed for #{repo_path}" unless pulled
+    pulled
+  end
+
+  # Pulls when the directory exists, clones otherwise. The pull or clone
+  # itself always operates on the importer's own repo_path.
+  #
+  # @param repo_path [String] directory whose existence is checked; defaults
+  #   to the importer's repo_path
+  def pull_or_clone(repo_path = self.repo_path)
+    Dir.exist?(repo_path) ? pull_latest_changes : git_clone_repo
+  end
+
+  # Renders one markdown file to HTML with RDoc's markdown parser.
+  #
+  # @param filename [String] path of the markdown file
+  # @return [String] the rendered HTML
+  def read_markdown(filename)
+    data = File.read(filename)
+    formatter = RDoc::Markup::ToHtml.new(RDoc::Options.new, nil)
+    RDoc::Markdown.parse(data).accept(formatter)
+  end
+
+  # Extracts the CVE id and URL, the description and the link targets from
+  # the HTML rendered for one markdown file.
+  #
+  # @param html [String] HTML as returned by read_markdown
+  # @return [Hash] '<cve id>' => url, 'Description' => text, 'POC' => links
+  def html_to_hash(html)
+    doc = Nokogiri::HTML5.parse(html)
+
+    # The first <h3> names the CVE and its link is the CVE URL. Both lookups
+    # are nil-safe so a document without that heading yields nil values
+    # instead of raising NoMethodError.
+    cve_id = doc.at_xpath('//h3')&.children&.first&.text
+    cve_url = doc.at_xpath('//h3/a')&.[]('href')
+    paragraphs = doc.xpath('//p').map(&:text)
+
+    {
+      cve_id.to_s => cve_url,
+      # NOTE: the first paragraph is expected to be a lone ' ' (TODO confirm),
+      # so the description is read from the second one.
+      'Description' => paragraphs[1],
+      # Links under the POC and Github headings are hard to tell apart and
+      # both can hold several values, so every attribute value of each link
+      # inside a paragraph is collected under one key.
+      'POC' => doc.xpath('//p/a').flat_map(&:values)
+    }
+  end
+
+  # @param year [Integer, String] year directory inside the checkout
+  # @return [Array<String>] paths of that year's markdown files
+  def list_mds_for_year(year)
+    Dir["#{repo_path}/#{year}/*.md"]
+  end
+
+  # @param year [Integer, String] year directory inside the checkout
+  # @return [Array<String>] rendered HTML, one entry per markdown file
+  def read_mds_for_year(year)
+    list_mds_for_year(year).map { |filename| read_markdown(filename) }
+  end
+
+  # @return [Array<Array<String>>] rendered HTML grouped by year, from
+  #   FIRST_YEAR up to the current year
+  def read_all_mds
+    (FIRST_YEAR..Date.today.year).map { |year| read_mds_for_year(year.to_s) }
+  end
+
+  # Maps one parsed document to the attributes of a TrickestPocCve row.
+  #
+  # TODO: no attribute is mapped yet, so this always returns an empty hash.
+  # import needs at least :cve_id to be set here before it inserts anything.
+  #
+  # @param json [Hash] parsed document as returned by html_to_hash
+  # @return [Hash] attributes for one row
+  def cve_attrs_from_item(json)
+    {}
+  end
+
+  # Builds the attribute hashes for every markdown file of a year, ready for
+  # bulk inserting.
+  #
+  # @param year [Integer, String] year directory inside the checkout
+  # @return [Array<Hash>]
+  def cves_for_year(year)
+    read_mds_for_year(year).map { |html| cve_attrs_from_item(html_to_hash(html)) }
+  end
+
+  # Refreshes the checkout, then inserts, year by year, every CVE whose
+  # :cve_id is not stored in TrickestPocCve yet.
+  def import
+    pull_or_clone
+
+    puts 'Now starting import for TrickestPocCve.'
+    (FIRST_YEAR..Date.today.year).map do |year|
+      # Entries without a :cve_id cannot be matched against the table, so
+      # they are skipped instead of being inserted as blank rows.
+      cves = cves_for_year(year).select { |cve| cve[:cve_id] }
+      ids = cves.map { |cve| cve[:cve_id] }
+      known_ids = TrickestPocCve.where(cve_id: ids).pluck(:cve_id).to_set
+      new_cves = cves.reject { |cve| known_ids.include?(cve[:cve_id]) }
+      puts "Importing any new CVEs from #{year}"
+
+      bulk_insert(new_cves)
+    end
+  end
+
+  # Inserts the given attribute hashes through the bulk_insert gem.
+  #
+  # @param cves [Array<Hash>] attributes, one hash per row
+  def bulk_insert(cves)
+    TrickestPocCve.bulk_insert do |worker|
+      cves.each { |attrs| worker.add(attrs) }
+    end
+  end
+end