From 0415c64960a8a15e61118f62f7f5d783a39cd6b9 Mon Sep 17 00:00:00 2001
From: booboy
Date: Mon, 4 Feb 2019 22:45:26 -0600
Subject: [PATCH] made a paste_max limit option so you can query how many pastes you want

---
Review notes (text in this section is ignored by `git am`):
- The `-s` branch now calls `scrape_public_pastes`; the singular
  `scrape_public_paste` is not defined anywhere in this patch.
- Every new `paste_max` / `limit` parameter defaults to 250, the value that was
  hard-coded before this patch, so call sites this patch does not touch keep
  their old behaviour instead of raising ArgumentError.
- `scrape_public_pastes` takes `limit` first, so no optional parameter precedes
  a required one; `scrape_public_pastes(paste_max)` binds `limit` as before.
- Line breaks, blank context lines and indentation were restored from the hunk
  headers. The blob ids on the `index` lines predate the fixes above.

 bin/pastebinner          | 15 +++++++++------
 lib/examples/examples.rb |  4 +++-
 lib/paste_to_es.rb       |  4 ++--
 lib/pastebinner.rb       |  4 ++--
 4 files changed, 16 insertions(+), 11 deletions(-)

diff --git a/bin/pastebinner b/bin/pastebinner
index 609c985..bf85048 100755
--- a/bin/pastebinner
+++ b/bin/pastebinner
@@ -6,10 +6,13 @@ require 'pry'
 # setup our object and grab a session key
 pb = Pastebinner.new(ENV['pastebin_api_key'], ENV['pastebin_username'], ENV['pastebin_password'])

+# set the commandline client to grab 50 pastes by default. this should be an option to config though once we add configuration methods
+paste_max = 50
+
 # pass in the Pastebinner.new client.
 # will download all of the raw pastes from the public scrape results into each own file in data dir.
-def download_pastes_json(pb)
-  pub_pastes = pb.scrape_public_pastes
+def download_pastes_json(pb, paste_max = 250)
+  pub_pastes = pb.scrape_public_pastes(paste_max)
   keys = pb.get_unique_paste_keys(pub_pastes)
   data_dir = '../data/'
   filename = 'pastebin_paste_key'
@@ -22,8 +25,8 @@ def download_pastes_json(pb)
   end
 end

-def download_pastes_raw(pb)
-  pub_pastes = pb.scrape_public_pastes
+def download_pastes_raw(pb, paste_max = 250)
+  pub_pastes = pb.scrape_public_pastes(paste_max)
   keys = pb.get_unique_paste_keys(pub_pastes)
   data_dir = '../data/'
   filename = 'pastebin_paste_key'
@@ -39,14 +42,14 @@ end

 options = OptionParser.parse!
 if options[:s]
-  puts pb.scrape_public_pastes
+  puts pb.scrape_public_pastes(paste_max)
 elsif options[:r] && options[:k]
   key = options[:k]
   puts pb.raw_paste_data(key)
 elsif options[:t]
   puts pb.list_trending_pastes
 elsif options[:g]
-  r = pb.scrape_public_pastes
+  r = pb.scrape_public_pastes(paste_max)
   puts pb.get_unique_paste_keys(r)
 elsif options[:j]
   puts 'Downloading paste data as a json into the data directory...'
diff --git a/lib/examples/examples.rb b/lib/examples/examples.rb
index 7a6e63e..512c7be 100755
--- a/lib/examples/examples.rb
+++ b/lib/examples/examples.rb
@@ -19,7 +19,9 @@ params = { "api_dev_key": api_dev_key, "api_option": 'paste', "api_paste_code":
 puts pb.create_paste(params)

 #### SCRAPE PUBLIC PASTES
-puts pb.scrape_public_pastes
+paste_max = 50
+# set to scrape 50 pastes, max is 250 (sometimes can get rate limited when around 250 range)
+puts pb.scrape_public_pastes(paste_max)

 #### SCRAPING - WHITELISTED IP ONLY
 #### SCRAPE RAW PASTE DATA OF A PASTE KEY
diff --git a/lib/paste_to_es.rb b/lib/paste_to_es.rb
index 9bf6d07..a01acf8 100644
--- a/lib/paste_to_es.rb
+++ b/lib/paste_to_es.rb
@@ -1,8 +1,8 @@
 class PasteToEs
   include Sidekiq::Worker
-  def perform(es_object, pb_object)
+  def perform(es_object, pb_object, paste_max = 250)
     Logger.new(STDOUT).info("PasteToEs started")
-    pastes = pb_object.scrape_public_pastes
+    pastes = pb_object.scrape_public_pastes(paste_max)
     keys = pb_object.get_unique_paste_keys(pastes)
     json_data = pb_object.json_paste(keys)
     es_object.json_to_es_bulk(json_data)
diff --git a/lib/pastebinner.rb b/lib/pastebinner.rb
index 27f3f51..cb995b2 100755
--- a/lib/pastebinner.rb
+++ b/lib/pastebinner.rb
@@ -101,9 +101,9 @@ class Pastebinner

   # params is optional for now. to query specific language ?lang=ruby as an example
   # right now its set to grab the max 250, default is 50. param is ?limit=value
-  def scrape_public_pastes(_params = nil)
+  def scrape_public_pastes(limit = 250, _params = nil)
     response = RestClient::Request.execute(
       method: :get,
-      url: @scraping_api_url + ENDPOINTS[:scraping] + '?limit=250'
+      url: @scraping_api_url + ENDPOINTS[:scraping] + "?limit=#{limit}"
     )
   end