#!/usr/bin/ruby
#PornFind is free software; you can redistribute it and/or modify
#it under the terms of the GNU General Public License as published by
#the Free Software Foundation; either version 2 of the License, or
#(at your option) any later version.
#PornFind is distributed in the hope that it will be useful,
#but WITHOUT ANY WARRANTY; without even the implied warranty of
#MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
#GNU General Public License for more details.
#
#You should have received a copy of the GNU General Public License
#along with PornFind; if not, write to the Free Software
#Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
require 'shellwords'
require 'thread'
# Number of worker threads: first command-line argument, "5" when it is absent.
MaxThread = (ARGV[0] || "5").to_i
# Content length threshold used when picking a page and when reporting a score.
MINSIZE = 3500
# An exception escaping any thread aborts the whole program.
Thread.abort_on_exception = true
# Score a piece of text with an external filter (bogofilter by default).
#
# file_content:: text written to the filter's standard input.
# command::      command line to run; defaults to the bogofilter invocation
#                using /etc/pornfind/bogofilter.cf.
#
# Returns the number parsed from the first output line after skipping its
# first two characters, or 0 when the command cannot be run, fails, or
# prints nothing.
def note(file_content, command = "bogofilter -v -c /etc/pornfind/bogofilter.cf -t")
  # The block form closes the pipe even when writing or reading raises.
  IO.popen(command, "w+") do |cmd|
    cmd.puts(file_content)
    # Signal end of input so the filter can produce its answer.
    cmd.close_write
    line = cmd.gets
    return 0 unless line
    return line[2..-1].to_f
  end
rescue StandardError
  # Best effort: a missing or failing filter scores the text as 0.
  0
end
# One web site together with the URL paths that were requested on it.
class Site
  # URLs ending with one of these suffixes are never recorded.
  IGNORED_SUFFIX = /(wmv|rpm|ram|bmp|png|gif|jpg|jpeg|js|avi|mpg|mpeg|ico|class|css|xml|csf|cab|swf|zip|pdf|doc|mid|wav|mp3|mov|iso)(%\d\d)*\??$/i
  # Capture 1 is the path kept from a URL; anything from the first
  # ?, &, ; or ' onwards is dropped.
  PATH_PART = /^([a-zA-Z0-9._\-\/\%]+)([\?&;'].*)?$/

  # address:: site name, used as the prefix of every fetched URL.
  def initialize(address)
    @site = address
    # path => number of times it was added
    @url = Hash.new(0)
  end

  # Add an URL to the list.
  # Returns 0 when the URL is ignored, otherwise the updated count for its path.
  def add(url)
    return 0 if url =~ IGNORED_SUFFIX
    return 0 if url == "/" or url == "" or url == "/?"
    match = PATH_PART.match(url)
    return 0 unless match
    @url[match[1]] += 1
  end

  # Return the list of all the URL paths of the site.
  def getAllUrl
    @url.keys
  end

  # Return the content of the root of the site.
  def getRoot
    fetch(@site)
  end

  # Return [url, content] for a page picked in random order.
  # The first page whose content is at least +min+ bytes long wins; when no
  # page is that large the longest one seen is returned, and ["", ""] when
  # nothing usable was fetched. Every URL that was tried is removed from the
  # list, so it is fetched at most once.
  def getPage(min=MINSIZE)
    best = ["", ""]
    @url.keys.shuffle.each do |url|
      # Really remove the entry: storing nil would keep the key listed and
      # make a later add of the same path fail on nil + 1.
      @url.delete(url)
      res = fetch(@site + url)
      next if res !~ /<html>|<body>/i
      return [url, res] if res.length >= min
      best = [url, res] if res.length > best[1].length
    end
    best
  end

  private

  # Build the shell pipeline that downloads +target+ with wget under
  # `timeout.rb 10` and pipes the result through mybogo.rb.
  # The target comes from log data, so it is shell-escaped.
  def fetch_command(target)
    "timeout.rb 10 wget -O - -- #{Shellwords.escape(target)} 2> /dev/null | mybogo.rb"
  end

  # Run the download pipeline and return what it printed.
  def fetch(target)
    `#{fetch_command(target)}`
  end
end
# Collects the sites and URLs found in a log stream.
class LogSquid
  # io::   object read with +gets+ until it returns nil.
  # good:: hash keyed by site name; sites present in it are skipped.
  def initialize(io,good = Hash.new)
    @all_site = Hash.new
    while (line = io.gets)
      # skip if the site is already rejected
      #next if line =~ /NONE\/- -|DIRECT\/127.0.0.1 text\/html/
      # skip if the request was from localhost (depends on the configuration)
      # next if line =~ /^[^ ]+ +[^ ]+ +127\.0\.0\.1 /
      # site in capture 1, url in capture 2
      #line =~ /GET http:\/\/([^\/]*)(\/[^ ]*)/
      found = /(?:^|\s)+http:\/\/([^\/]*)(\/[^ ]*)/.match(line)
      next unless found
      host = found[1]
      path = found[2]
      # do not add it when it is listed in good
      next if good[host]
      @all_site[host] ||= Site.new(host)
      @all_site[host].add(path)
    end
  end

  # Returns every collected site as a [name, Site] pair.
  def getAllSite
    @all_site.to_a
  end
end
# Bounded, thread-safe container shared between the worker threads and the
# main thread. Items come out in last-in, first-out order.
class SiteQueue
  # max:: number of items the queue holds before push starts to block.
  def initialize(max=MaxThread * 2)
    @max = max
    @list = []
    @mutex = Mutex.new
    # Signalled when room becomes available / when an item becomes available.
    @not_full = ConditionVariable.new
    @not_empty = ConditionVariable.new
  end

  # Store +tab+, waiting while the queue is full. The item is always stored
  # before returning (it is never dropped). Returns 0.
  def push(tab)
    @mutex.synchronize do
      @not_full.wait(@mutex) while @list.length >= @max
      @list.push(tab)
      @not_empty.signal
    end
    0
  end

  # Remove and return the most recently pushed item, waiting while the queue
  # is empty. The caller must make sure something will eventually be pushed.
  def pop
    @mutex.synchronize do
      @not_empty.wait(@mutex) while @list.empty?
      item = @list.pop
      @not_full.signal
      item
    end
  end

  # True when the queue currently holds no item.
  def empty?
    @mutex.synchronize { @list.empty? }
  end
end
# Turns page content into word, word-pair and tag/attribute tokens.
class Tokenize
  # Runs of these characters separate two tokens.
  SEPARATORS = /[\s"'<>\\\/+&?:;,.()={}*]+/

  # The cleaned-up content.
  attr_reader :content

  # text:: page content; it is copied, the caller's string is left untouched.
  def initialize(text)
    # NOTE(review): the original first step replaced ' ' with ' ' (a no-op),
    # presumably a mangled '&nbsp;' substitution -- confirm. Doing it before
    # the generic rule keeps '&nbsp;' from being reduced to the letter 'n'.
    @content = text.gsub('&nbsp;', ' ')
    # Every remaining lowercase named entity is reduced to its first letter
    # (for instance '&eacute;' becomes 'e').
    @content.gsub!(/&([a-z])[a-z]+;/, '\1')
  end

  # Returns the content split into single tokens.
  def single
    @content.split(SEPARATORS)
  end

  # Returns the list of "word_nextword" pairs built from consecutive words
  # longer than one character, after html tags have been stripped.
  #
  # tmp:: working string whose tags are stripped in place; defaults to a copy
  #       of the content.
  def bi_key(tmp = String.new(@content))
    # strip the html tags; comments and javascript are kept
    while tmp.sub!(/<\w+[^>]+>/, '')
    end
    words = tmp.split(SEPARATORS).reject { |word| word.length <= 1 }
    words.each_cons(2).map { |left, right| left + "_" + right }
  end

  # Returns one "tag__token" entry for every token found inside a tag
  # (attribute names and values), everything lowercased (A-Z only).
  def meta
    found = []
    @content.scan(/<(\w+)([^>]*)>/) do |tag, attributes|
      next if attributes.empty?
      tag = tag.tr('A-Z', 'a-z')
      # String#map no longer exists (Ruby >= 1.9) and its result was
      # discarded anyway; lowercase the attribute text directly.
      attributes = attributes.tr('A-Z', 'a-z')
      attributes.split(SEPARATORS).each do |info|
        found << tag + "__" + info unless info.empty?
      end
    end
    found
  end
end
# Main program: read a log on standard input, fetch the root page and one
# other page of every site that is not listed as good, score the combined
# content with note and print one line per site.
good = Hash.new
if ARGV[1] # file name as second parameter: one good site per line
  # File.foreach closes the file once every line has been read.
  File.foreach(ARGV[1]) { |line| good[line.chomp] = 1 }
end
log = LogSquid.new($stdin,good)
$stdout.sync = true
allSite = log.getAllSite
queue = SiteQueue.new
tg = ThreadGroup.new
# Workers: each one takes [name, Site] pairs off the shared list until it is
# empty and queues [name, "/", root content, page url, page content].
MaxThread.times do
  worker = Thread.new {
    while (entry = allSite.pop)
      name, site = entry
      page_url, page = site.getPage
      queue.push([name, "/", site.getRoot, page_url, page])
    end
  }
  tg.add(worker)
end
# Main thread: report results until every worker is done and the queue is
# drained.
while tg.list.length > 0 or not queue.empty? do
  # pop blocks on an empty queue, and the last worker may finish without
  # queueing anything more; wait here instead so the loop condition is
  # evaluated again and the program can terminate.
  if queue.empty?
    sleep 0.1
    next
  end
  site, _url1, content1, url2, content2 = queue.pop
  # content = "\n\n"
  # tok = Tokenize.new(content1+content2)
  # content += (tok.bi_key * " ") + "\n" + (tok.meta * " ") + "\n" + content1+content2
  score = (note(content1 + content2) * 100).to_i
  if content1.length + content2.length > MINSIZE
    puts "#{score} http://#{site+url2}"
  else
    # Too little content was fetched for the score to be trusted.
    puts "??? #{score} http://#{site+url2}"
  end
end