Add a paste_max limit option so callers can choose how many pastes to query

This commit is contained in:
booboy 2019-02-04 22:45:26 -06:00
parent 8d781e2499
commit 0415c64960
4 changed files with 16 additions and 11 deletions

View file

@ -6,10 +6,13 @@ require 'pry'
# setup our object and grab a session key # setup our object and grab a session key
pb = Pastebinner.new(ENV['pastebin_api_key'], ENV['pastebin_username'], ENV['pastebin_password']) pb = Pastebinner.new(ENV['pastebin_api_key'], ENV['pastebin_username'], ENV['pastebin_password'])
# Set the command-line client to grab 50 pastes by default. This should become a configurable option once we add configuration methods.
paste_max = 50
# pass in the Pastebinner.new client. # pass in the Pastebinner.new client.
# will download all of the raw pastes from the public scrape results, each into its own file in the data dir. # will download all of the raw pastes from the public scrape results, each into its own file in the data dir.
def download_pastes_json(pb) def download_pastes_json(pb, paste_max)
pub_pastes = pb.scrape_public_pastes pub_pastes = pb.scrape_public_pastes(paste_max)
keys = pb.get_unique_paste_keys(pub_pastes) keys = pb.get_unique_paste_keys(pub_pastes)
data_dir = '../data/' data_dir = '../data/'
filename = 'pastebin_paste_key' filename = 'pastebin_paste_key'
@ -22,8 +25,8 @@ def download_pastes_json(pb)
end end
end end
def download_pastes_raw(pb) def download_pastes_raw(pb, paste_max)
pub_pastes = pb.scrape_public_pastes pub_pastes = pb.scrape_public_pastes(paste_max)
keys = pb.get_unique_paste_keys(pub_pastes) keys = pb.get_unique_paste_keys(pub_pastes)
data_dir = '../data/' data_dir = '../data/'
filename = 'pastebin_paste_key' filename = 'pastebin_paste_key'
@ -39,14 +42,14 @@ end
options = OptionParser.parse! options = OptionParser.parse!
if options[:s] if options[:s]
puts pb.scrape_public_pastes puts pb.scrape_public_paste(paste_max)
elsif options[:r] && options[:k] elsif options[:r] && options[:k]
key = options[:k] key = options[:k]
puts pb.raw_paste_data(key) puts pb.raw_paste_data(key)
elsif options[:t] elsif options[:t]
puts pb.list_trending_pastes puts pb.list_trending_pastes
elsif options[:g] elsif options[:g]
r = pb.scrape_public_pastes r = pb.scrape_public_pastes(paste_max)
puts pb.get_unique_paste_keys(r) puts pb.get_unique_paste_keys(r)
elsif options[:j] elsif options[:j]
puts 'Downloading paste data as a json into the data directory...' puts 'Downloading paste data as a json into the data directory...'

View file

@ -19,7 +19,9 @@ params = { "api_dev_key": api_dev_key, "api_option": 'paste', "api_paste_code":
puts pb.create_paste(params) puts pb.create_paste(params)
#### SCRAPE PUBLIC PASTES #### SCRAPE PUBLIC PASTES
puts pb.scrape_public_pastes paste_max = 50
# Set to scrape 50 pastes; the max is 250 (requests can sometimes get rate limited when near the 250 range).
puts pb.scrape_public_pastes(paste_max)
#### SCRAPING - WHITELISTED IP ONLY #### SCRAPING - WHITELISTED IP ONLY
#### SCRAPE RAW PASTE DATA OF A PASTE KEY #### SCRAPE RAW PASTE DATA OF A PASTE KEY

View file

@ -1,8 +1,8 @@
class PasteToEs class PasteToEs
include Sidekiq::Worker include Sidekiq::Worker
def perform(es_object, pb_object) def perform(es_object, pb_object, paste_max)
Logger.new(STDOUT).info("PasteToEs started") Logger.new(STDOUT).info("PasteToEs started")
pastes = pb_object.scrape_public_pastes pastes = pb_object.scrape_public_pastes(paste_max)
keys = pb_object.get_unique_paste_keys(pastes) keys = pb_object.get_unique_paste_keys(pastes)
json_data = pb_object.json_paste(keys) json_data = pb_object.json_paste(keys)
es_object.json_to_es_bulk(json_data) es_object.json_to_es_bulk(json_data)

View file

@ -101,10 +101,10 @@ class Pastebinner
# params is optional for now. to query specific language ?lang=ruby as an example # params is optional for now. to query specific language ?lang=ruby as an example
# right now its set to grab the max 250, default is 50. param is ?limit=value # right now its set to grab the max 250, default is 50. param is ?limit=value
def scrape_public_pastes(_params = nil) def scrape_public_pastes(_params = nil, limit)
response = RestClient::Request.execute( response = RestClient::Request.execute(
method: :get, method: :get,
url: @scraping_api_url + ENDPOINTS[:scraping] + '?limit=250' url: @scraping_api_url + ENDPOINTS[:scraping] + "?limit=#{limit}"
) )
end end