# DarknetMarketsNoobs_ML_Study/lib.rb

require 'open3'
require 'json'
require 'nokogiri'
require 'date'
require 'lda-ruby'
require 'lingua/stemmer'
require 'textoken'
require 'stopwords'
require 'lemmatizer'
class Config
  def initialize
    @json = JSON.parse(File.read("config.json"))
  end

  attr_reader :json

  # curl command for fetching a single post body (see config.json).
  def get_body
    json["body_url"]
  end

  # curl command for fetching the subforum index page.
  def get_url
    json["sub_url"]
  end
end
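
# A minimal sketch of the expected config.json. The keys "body_url" and
# "sub_url" are the only ones this file reads; the commands and .onion
# address below are hypothetical, assuming both values are full curl
# invocations routed through a local Tor SOCKS proxy:
#
#   {
#     "sub_url":  "curl --socks5-hostname 127.0.0.1:9050 http://example.onion/forum",
#     "body_url": "curl --socks5-hostname 127.0.0.1:9050 http://example.onion"
#   }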
class Cmd
  # Standalone helper: run a curl command and extract the post body
  # from the returned page. Mirrors the fetch step used in GetBody.
  def initialize(url)
    @url = url
  end

  attr_reader :url

  def run_command
    stdout, _stderr, _status = Open3.capture3(url)
    page = Nokogiri::HTML(stdout)
    page.xpath("/html/body/div/div[2]/div[2]/div/div[2]/div").text.strip
  end
end
class GetUrls < Config
  # Fetch the subforum index and append each thread URL to a text file.
  def run
    stdout, _stderr, _status = Open3.capture3(get_url)
    page = Nokogiri::HTML(stdout)
    out = []
    # Thread links sit in numbered divs; XPath indices start at 1, not 0.
    (1..20).each do |i|
      link = page.xpath("/html/body/div/div[2]/div[3]/div[#{i}]/div[1]/a")
      url = link.attribute('href').to_s.strip
      out << url unless url.empty?
    end
    File.open("DarknetMarketsNoobs_urls.txt", "a") { |file| file.puts(out.join("\n")) }
  end
end
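
# GetUrls appends one thread URL per line to DarknetMarketsNoobs_urls.txt;
# GetBody below reads that file back as its work list.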
class Clean
  # Lowercase a post, tokenize it with punctuation stripped,
  # and drop English stopwords.
  def initialize(line)
    @line = line
  end

  attr_reader :line

  def cleanize
    filter = Stopwords::Snowball::Filter.new("en")
    tokens = Textoken(line.downcase, exclude: 'punctuations').tokens
    filter.filter(tokens)
  end
end
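
# Illustrative use of Clean, assuming the Snowball English list
# treats "some" as a stopword:
#   Clean.new("Some RAW post text!!").cleanize  #=> ["raw", "post", "text"]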
class GetBody < Config
  # Fetch every saved thread URL, extract and clean each post body,
  # then dump the results to JSON keyed by the scrape date.
  def run
    date = DateTime.now.strftime("%m/%d/%Y")
    @f_json = []
    File.readlines("DarknetMarketsNoobs_urls.txt").each do |l|
      l = l.strip
      # Splice the thread path into the base curl command from config.json.
      curl = get_body.to_s.gsub(".onion", ".onion#{l}")
      stdout, _stderr, _status = Open3.capture3(curl)
      page = Nokogiri::HTML(stdout)
      out = page.xpath("/html/body/div/div[2]/div[2]/div/div[2]/div").text.strip
      @f_json << Clean.new(out).cleanize.join(" ")
    end
    File.open("fucking_Work.json", "w") { |file| file.write(JSON.pretty_generate({ date => @f_json })) }
  end
end
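
# Entry point: collect the thread URLs first, then fetch and clean each body.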
GetUrls.new.run
GetBody.new.run