data_importer/lib/importers/gsd_importer.rb

77 lines
2.2 KiB
Ruby

# frozen_string_literal: true
require 'git'
require 'json'
require 'date'
require '/data_importer/lib/importers/github_repo'
require '/data_importer/lib/json_helper'
# Imports GSD records from the cloudsecurityalliance/gsd-database git
# repository into the Gsd model, one year directory at a time.
#
# NOTE(review): pull_or_clone, read_json, repo_url and repo_path are not
# defined in this class; presumably they are inherited from GithubRepo.
class GsdImporter < GithubRepo
  # Remote repository that holds the GSD JSON files.
  REPO_URL = 'https://github.com/cloudsecurityalliance/gsd-database.git'
  # Local checkout location handed to GithubRepo.
  REPO_PATH = '/data_importer/data/gsd_database'
  # First year directory that import walks.
  FIRST_IMPORT_YEAR = 1999
  # Key of the GitLab namespace inside a record's :namespaces hash.
  GITLAB_NAMESPACE = :"gitlab.com"

  # Keys kept on every record handed to Gsd.upsert_all.
  EXPECTED_KEYS = %i[
    cve_id
    gsd_id
    gsd
    namespaces
  ].freeze
  # Template with every expected key set to nil, so that all records end up
  # with the same set of keys.
  EMPTY_HASH = EXPECTED_KEYS.map { |key| [key, nil] }.to_h.freeze

  # Passes the repository URL and the local checkout path to GithubRepo
  # positionally (same argument order as before).
  def initialize
    super(REPO_URL, REPO_PATH)
  end

  # Lists the JSON files stored two directory levels below the year folder.
  #
  # @param year [Integer, String] name of the year directory
  # @return [Array<String>] paths matching <repo_path>/<year>/*/*.json
  def list_jsons_for_year(year)
    Dir["#{repo_path}/#{year}/*/*.json"]
  end

  # Reads every JSON file of a year and normalises it into a record that
  # contains exactly EXPECTED_KEYS (missing keys are filled with nil).
  #
  # @param year [Integer, String] name of the year directory
  # @return [Array<Hash>] one record per JSON file
  def read_jsons_for_year(year)
    list_jsons_for_year(year).map do |filename|
      # NOTE(review): deep_transform_keys presumably yields symbol keys, as
      # the rest of this class indexes records with symbols - confirm.
      record = append_ids_to_hash(JsonHelper.deep_transform_keys(read_json(filename)))
      record.slice(*EXPECTED_KEYS).reverse_merge(EMPTY_HASH)
    end
  end

  # Copies the identifiers nested under :gsd to the top level of the record.
  # Either value is nil when the nested key is absent.
  #
  # @param json [Hash] record to extend (mutated in place)
  # @return [Hash] the same record, with :cve_id and :gsd_id set
  def append_ids_to_hash(json)
    json[:cve_id] = json.dig(:gsd, :alias)
    json[:gsd_id] = json.dig(:gsd, :id)
    json
  end

  # Cleans the description of every gitlab.com advisory: upstream descriptions
  # can contain \u0000, which Postgres does not accept.
  #
  # Records without namespaces (e.g. a SUSE CVE from 2009), without a
  # gitlab.com namespace, or without advisories are returned untouched.
  #
  # @param json [Hash] record to sanitize (advisories are mutated in place)
  # @return [Hash] the same record
  def sanitize_gitlab_advisories(json)
    advisories = json[:namespaces]&.dig(GITLAB_NAMESPACE, :advisories)
    return json unless advisories

    advisories.each do |advisory|
      advisory[:description] = JsonHelper.fix_null_byte(advisory[:description])
    end
    json
  end

  # Refreshes the local checkout, then upserts every record of every year
  # from FIRST_IMPORT_YEAR up to the current year, keyed on :gsd_id.
  #
  # @return [Array<Array<Hash>>] the imported records, grouped per year
  def import
    pull_or_clone
    puts "Now starting import for #{repo_url}."
    puts '----------' * 12
    (FIRST_IMPORT_YEAR..Date.today.year).map do |year|
      puts "Now importing GSDs for #{year}"
      read_jsons_for_year(year).each do |record|
        # Records are upserted one at a time, as before; the sanitizer is a
        # no-op for records that have no namespaces.
        Gsd.upsert_all([sanitize_gitlab_advisories(record)], unique_by: :gsd_id)
      end
    end
  end
end