Add base support for importing Trickest PoC CVEs from GitHub

This commit is contained in:
Brendan McDevitt 2022-04-06 22:31:52 -05:00
parent d81c31febe
commit 752aef6392
6 changed files with 230 additions and 4 deletions

3
.gitignore vendored
View file

@ -29,3 +29,6 @@
# Ignore master key for decrypting credentials and more.
/config/master.key
# Add any files holding API keys or environment variables we don't want to commit here.
/twitter_credentials.env

View file

@ -9,6 +9,9 @@ gem 'actionpack'
gem 'sass-rails'
gem 'railties'
gem 'rest-client'
gem 'twitter'
gem 'tweetkit', github: 'julianfssen/tweetkit' # for twitter v2 api support
gem 'nokogiri'
# Use postgres as the database for Active Record
gem 'pg'

View file

@ -1,3 +1,12 @@
GIT
remote: https://github.com/julianfssen/tweetkit.git
revision: e9ff2e807089547548a3caeea24b06cbdb1defd3
specs:
tweetkit (0.2.0)
faraday (~> 1.9.3)
faraday_middleware (~> 1.2.0)
simple_oauth (~> 0.3.0)
GEM
remote: https://rubygems.org/
specs:
@ -74,6 +83,7 @@ GEM
bindex (0.8.1)
bootsnap (1.11.1)
msgpack (~> 1.2)
buftok (0.2.0)
builder (3.2.4)
bulk_insert (1.9.0)
activerecord (>= 3.2.0)
@ -104,16 +114,54 @@ GEM
digest (3.1.0)
domain_name (0.5.20190701)
unf (>= 0.0.5, < 1.0.0)
equalizer (0.0.11)
erubi (1.10.0)
execjs (2.8.1)
faraday (1.9.3)
faraday-em_http (~> 1.0)
faraday-em_synchrony (~> 1.0)
faraday-excon (~> 1.1)
faraday-httpclient (~> 1.0)
faraday-multipart (~> 1.0)
faraday-net_http (~> 1.0)
faraday-net_http_persistent (~> 1.0)
faraday-patron (~> 1.0)
faraday-rack (~> 1.0)
faraday-retry (~> 1.0)
ruby2_keywords (>= 0.0.4)
faraday-em_http (1.0.0)
faraday-em_synchrony (1.0.0)
faraday-excon (1.1.0)
faraday-httpclient (1.0.1)
faraday-multipart (1.0.3)
multipart-post (>= 1.2, < 3)
faraday-net_http (1.0.1)
faraday-net_http_persistent (1.2.0)
faraday-patron (1.0.0)
faraday-rack (1.0.0)
faraday-retry (1.0.3)
faraday_middleware (1.2.0)
faraday (~> 1.0)
ffi (1.15.5)
ffi-compiler (1.0.1)
ffi (>= 1.0.0)
rake
git (1.10.2)
rchardet (~> 1.8)
globalid (1.0.0)
activesupport (>= 5.0)
http (4.4.1)
addressable (~> 2.3)
http-cookie (~> 1.0)
http-form_data (~> 2.2)
http-parser (~> 1.2.0)
http-accept (1.7.0)
http-cookie (1.0.4)
domain_name (~> 0.5)
http-form_data (2.3.0)
http-parser (1.2.3)
ffi-compiler (>= 1.0, < 2.0)
http_parser.rb (0.6.0)
i18n (1.10.0)
concurrent-ruby (~> 1.0)
interception (0.5)
@ -132,6 +180,8 @@ GEM
mini_mime (>= 0.1.1)
marcel (1.0.2)
matrix (0.4.2)
memoizable (0.4.2)
thread_safe (~> 0.3, >= 0.3.1)
method_source (1.0.0)
mime-types (3.4.1)
mime-types-data (~> 3.2015)
@ -139,7 +189,9 @@ GEM
mini_mime (1.1.2)
mini_portile2 (2.8.0)
minitest (5.15.0)
msgpack (1.4.5)
msgpack (1.5.0)
multipart-post (2.1.1)
naught (1.1.0)
net-imap (0.2.3)
digest
net-protocol
@ -160,12 +212,12 @@ GEM
mini_portile2 (~> 2.8.0)
racc (~> 1.4)
pg (1.3.5)
pry (0.13.1)
pry (0.14.1)
coderay (~> 1.1)
method_source (~> 1.0)
pry-byebug (3.9.0)
pry-byebug (3.8.0)
byebug (~> 11.0)
pry (~> 0.13.0)
pry (~> 0.10)
pry-doc (1.3.0)
pry (~> 0.11)
yard (~> 0.9.11)
@ -220,6 +272,7 @@ GEM
mime-types (>= 1.16, < 4.0)
netrc (~> 0.8)
rexml (3.2.5)
ruby2_keywords (0.0.5)
ruby_dep (1.5.0)
rubyzip (2.3.2)
sass-rails (6.0.0)
@ -236,6 +289,7 @@ GEM
childprocess (>= 0.5, < 5.0)
rexml (~> 3.2, >= 3.2.5)
rubyzip (>= 1.2.2)
simple_oauth (0.3.1)
spring (2.1.1)
spring-watcher-listen (2.0.1)
listen (>= 2.7, < 4.0)
@ -249,11 +303,23 @@ GEM
sprockets (>= 3.0.0)
strscan (3.0.1)
thor (1.2.1)
thread_safe (0.3.6)
tilt (2.0.10)
timeout (0.2.0)
turbolinks (5.2.1)
turbolinks-source (~> 5.2)
turbolinks-source (5.2.0)
twitter (7.0.0)
addressable (~> 2.3)
buftok (~> 0.2.0)
equalizer (~> 0.0.11)
http (~> 4.0)
http-form_data (~> 2.0)
http_parser.rb (~> 0.6.0)
memoizable (~> 0.4.0)
multipart-post (~> 2.0)
naught (~> 1.0)
simple_oauth (~> 0.3.0)
tzinfo (2.0.4)
concurrent-ruby (~> 1.0)
uglifier (4.2.0)
@ -291,6 +357,7 @@ DEPENDENCIES
git
jbuilder (~> 2.5)
listen (>= 3.0.5, < 3.2)
nokogiri
pg
pry
pry-byebug
@ -307,6 +374,8 @@ DEPENDENCIES
spring
spring-watcher-listen (~> 2.0.0)
turbolinks (~> 5)
tweetkit!
twitter
tzinfo-data
uglifier (>= 1.3.0)
web-console (>= 3.3.0)

View file

@ -0,0 +1,15 @@
require 'twitter'
require 'tweetkit'
# Builds two Twitter API clients from credentials found in the process
# environment. Any variable that is not set is passed through as nil.
#
# NOTE(review): both clients are assigned to file-local variables, so nothing
# outside this file can reach them as written - confirm how they are meant to
# be shared with the rest of the application.

# The consumer key/secret pair is shared by both clients.
api_key = ENV['twitter_api_key']
api_key_secret = ENV['twitter_api_key_secret']

# REST client from the `twitter` gem, authenticated with consumer + access tokens.
twitter_client = Twitter::REST::Client.new do |rest|
  rest.consumer_key = api_key
  rest.consumer_secret = api_key_secret
  rest.access_token = ENV['twitter_access_token']
  rest.access_token_secret = ENV['twitter_access_token_secret']
end

# Client from the `tweetkit` gem, authenticated with a bearer token plus the
# consumer credentials.
tweetkit_client = Tweetkit::Client.new do |kit|
  kit.bearer_token = ENV['twitter_bearer_token']
  kit.consumer_key = api_key
  kit.consumer_secret = api_key_secret
end

View file

@ -8,6 +8,8 @@ services:
POSTGRES_PASSWORD: password
web:
build: .
env_file:
- twitter_credentials.env
command: bash -c "rm -f tmp/pids/server.pid && bundle exec rails s -p 3000 -b '0.0.0.0'"
volumes:
- .:/data_importer

View file

@ -0,0 +1,134 @@
require 'date'
require 'json'
require 'rdoc'
require 'set'

require 'bulk_insert'
require 'git'
# Imports proof-of-concept (PoC) references for CVEs from a local checkout of
# the trickest/cve GitHub repository into the TrickestPocCve model.
#
# The repository is expected to contain one directory per year, each holding
# one Markdown file per CVE. Every file is rendered to HTML with RDoc, parsed
# with Nokogiri, reduced to an attribute hash and bulk-inserted when its CVE id
# is not already stored.
#
# NOTE(review): Nokogiri is not required by this file; presumably it is loaded
# through Bundler because it is listed in the Gemfile - confirm.
class TrickestPocCveImporter
  # Upstream repository and the default location of the local checkout.
  DEFAULT_REPO_URL = 'https://github.com/trickest/cve.git'
  DEFAULT_REPO_PATH = '/data_importer/data/trickest_cve'
  # First year directory visited by #read_all_mds and #import.
  FIRST_YEAR = 1999

  attr_accessor :repo_url, :repo_path

  def initialize
    @repo_url = DEFAULT_REPO_URL
    @repo_path = DEFAULT_REPO_PATH
  end

  # Clones repo_url into +path+.
  #
  # @param path [String] target directory, defaults to the instance's repo_path
  def git_clone_repo(path = repo_path)
    Git.clone(repo_url, path)
  end

  # Runs `git pull` inside +path+.
  #
  # The command is spawned with an argument list and a +chdir+ option, so the
  # path is never interpolated into a shell string. Unlike the previous
  # backtick call, git's stdout is no longer captured and discarded.
  #
  # @param path [String] checkout to update, defaults to the instance's repo_path
  # @return [Boolean, nil] result of Kernel#system; a falsy result is reported
  #   on stderr but does not raise, so the import stays best-effort
  def pull_latest_changes(path = repo_path)
    # Announce before pulling so the message precedes git's own output.
    puts "Now pulling latest changes from #{path}"
    pulled = system('git', 'pull', chdir: path)
    warn "git pull failed in #{path}" unless pulled
    pulled
  end

  # Updates the checkout when it already exists, otherwise clones it.
  #
  # @param repo_path [String] checkout directory, defaults to the instance's
  #   repo_path; the same path is used for the existence check and the git action
  def pull_or_clone(repo_path = self.repo_path)
    if Dir.exist?(repo_path)
      pull_latest_changes(repo_path)
    else
      git_clone_repo(repo_path)
    end
  end

  # Renders a Markdown file to HTML.
  #
  # @param filename [String] path of the Markdown file
  # @return [String] the HTML produced by RDoc's Markdown parser and HTML formatter
  def read_markdown(filename)
    markdown = File.read(filename)
    to_html = RDoc::Markup::ToHtml.new(RDoc::Options.new, nil)
    RDoc::Markdown.parse(markdown).accept(to_html)
  end

  # Extracts the CVE id, CVE url, description and PoC links from rendered HTML.
  #
  # The returned hash is built in this order:
  #   "<cve id>"    => href of the first link that is a direct child of an h3
  #   'Description' => text of the second paragraph (the first one is blank)
  #   'POC'         => attribute values of every link inside a paragraph
  # Links under the POC and Github headings are deliberately merged into one
  # list; only the URLs are needed.
  #
  # A document without an h3 heading or heading link yields an empty-string key
  # and a nil url instead of raising NoMethodError.
  #
  # @param html [String] HTML as returned by #read_markdown
  # @return [Hash]
  def html_to_hash(html)
    doc = Nokogiri::HTML5.parse(html)
    # The CVE id is the text of the first node inside the first h3 heading.
    cve_id = doc.at_xpath('//h3')&.children&.first&.text
    # The CVE url is always the first link in the Markdown document.
    cve_link = doc.at_xpath('//h3/a')
    cve_url = cve_link && cve_link['href']
    paragraphs = doc.xpath('//p').map(&:text)

    {
      cve_id.to_s => cve_url,
      'Description' => paragraphs[1],
      'POC' => doc.xpath('//p/a').flat_map(&:values)
    }
  end

  # @param year [Integer, String] year directory name
  # @return [Array<String>] paths of the Markdown files stored for +year+
  def list_mds_for_year(year)
    Dir["#{repo_path}/#{year}/*.md"]
  end

  # @param year [Integer, String] year directory name
  # @return [Array<String>] rendered HTML of every Markdown file of +year+
  def read_mds_for_year(year)
    list_mds_for_year(year).map { |filename| read_markdown(filename) }
  end

  # @return [Array<Array<String>>] rendered HTML grouped per year, from
  #   FIRST_YEAR up to the current year
  def read_all_mds
    (FIRST_YEAR..Date.today.year).map { |year| read_mds_for_year(year.to_s) }
  end

  # Converts a hash produced by #html_to_hash into model attributes.
  #
  # The parameter keeps its historical name; it is a Hash, not JSON.
  #
  # NOTE(review): only :cve_id is known to be a TrickestPocCve column (it is
  # queried in #import). The remaining names are assumptions to confirm against
  # the schema; bulk_insert appears to ignore keys that match no column -
  # verify.
  #
  # @param json [Hash] hash whose first entry is cve id => cve url
  # @return [Hash] attributes keyed by symbol
  def cve_attrs_from_item(json)
    cve_id, cve_url = json.first
    {
      cve_id: cve_id,
      cve_url: cve_url,
      description: json['Description'],
      poc: json['POC']
    }
  end

  # Builds the attribute hashes for every CVE file of +year+, ready for bulk
  # inserting. Entries without a CVE id are dropped.
  #
  # @param year [Integer, String] year directory name
  # @return [Array<Hash>]
  def cves_for_year(year)
    read_mds_for_year(year)
      .map { |html| cve_attrs_from_item(html_to_hash(html)) }
      .reject { |attrs| attrs[:cve_id].to_s.empty? }
  end

  # Refreshes the checkout, then inserts every CVE that is not yet stored,
  # one year at a time.
  #
  # @return [Array] the per-year results of #bulk_insert
  def import
    pull_or_clone
    puts 'Now starting import for TrickestPocCve.'
    (FIRST_YEAR..Date.today.year).map do |year|
      cves = cves_for_year(year)
      ids = cves.map { |cve| cve[:cve_id] }
      # A Set keeps the membership test below constant-time per CVE.
      known_ids = TrickestPocCve.where(cve_id: ids).pluck(:cve_id).to_set
      new_cves = cves.reject { |cve| known_ids.include?(cve[:cve_id]) }
      puts "Importing any new CVEs from #{year}"
      bulk_insert(new_cves)
    end
  end

  # Inserts the given attribute hashes through the bulk_insert worker.
  #
  # @param cves [Array<Hash>] attribute hashes as built by #cves_for_year
  def bulk_insert(cves)
    TrickestPocCve.bulk_insert do |worker|
      cves.each { |attrs| worker.add(attrs) }
    end
  end
end