download parse_porn.rb
Language: Ruby
License: GPL
LOC: 164
Project Info
Porn web page detector(pornfind)
Server: Savannah NonGNU
Type: cvs
...pornfind\pornfind\pornfind\
   basic-nocache.rb
   bogofilter.cf
   class.rb
   install.sh
   mybogo.rb
   parse_porn.rb
   select_date.rb
   timeout.rb

#!/usr/bin/ruby


#PornFind is free software; you can redistribute it and/or modify
#it under the terms of the GNU General Public License as published by
#the Free Software Foundation; either version 2 of the License, or
#(at your option) any later version.

#PornFind is distributed in the hope that it will be useful,
#but WITHOUT ANY WARRANTY; without even the implied warranty of
#MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#GNU General Public License for more details.
#
#You should have received a copy of the GNU General Public License
#along with PornFind; if not, write to the Free Software
#Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

require 'shellwords'
require 'thread'

# Number of worker threads: first command-line argument, 5 when it is absent.
MaxThread = (ARGV[0] || "5").to_i
# Size threshold, in characters of fetched text: default minimum for
# Site#getPage and the limit under which a result line is flagged with "???".
MINSIZE = 3_500

# An uncaught exception in any thread aborts the whole script.
Thread.abort_on_exception = true


# Score a document with bogofilter.
#
# Writes +file_content+ to the standard input of
# "bogofilter -v -c /etc/pornfind/bogofilter.cf -t", reads the first line it
# prints, skips the two leading characters of that line and converts the rest
# with to_f.
#
# file_content:: text to classify
#
# Returns the parsed Float, or 0 when bogofilter cannot be started, prints
# nothing, or the pipe breaks.
def note(file_content)
	# Block form: the pipe is closed even when reading or parsing raises.
	IO.popen("bogofilter -v -c /etc/pornfind/bogofilter.cf -t","w+") do |cmd|
		cmd.puts(file_content)
		# bogofilter only answers once its input is complete
		cmd.close_write
		cmd.gets[2..-1].to_f
	end
rescue StandardError
	# Best effort: an unscorable document counts as 0 rather than killing the run.
	0
end

# One web site (host) together with the paths that were requested on it.
class Site
	# address:: the site as passed to wget (the callers in this file pass the
	#           host part of an http:// URL)
	def initialize(address)
		@site = address
		# path => number of times it was added
		@url = Hash.new(0)
	end
	
	# Add a URL to the list.
	#
	# Rejected (returns 0): URLs ending in one of the listed media/binary
	# extensions, the bare root ("/", "", "/?"), and URLs whose path part
	# contains characters outside [a-zA-Z0-9._-/%].  The query string
	# (everything from the first ?, &, ; or ') is dropped before counting.
	#
	# Returns the new hit count of the path, or 0 when the URL is rejected.
	def add(url)
		return 0 if url =~ /(wmv|rpm|ram|bmp|png|gif|jpg|jpeg|js|avi|mpg|mpeg|ico|class|css|xml|csf|cab|swf|zip|pdf|doc|mid|wav|mp3|mov|iso)(%\d\d)*\??$/i
		return 0 if url == "/" or url == "" or url == "/?"
		path = url[/^([a-zA-Z0-9._\-\/\%]+)([\?&;'].*)?$/, 1]
		return 0 unless path
		@url[path] += 1
	end
	
	# Return the list of all the paths still recorded for the site.
	def getAllUrl
		@url.keys
	end
	
	# Return the (filtered) content of the root of the site.
	def getRoot
		fetch(@site)
	end
	
	# Return [path, content] for a randomly chosen page of the site.
	#
	# Paths are tried in random order and removed from the list as they are
	# tried.  Pages whose content does not contain <html> or <body> are
	# skipped.  The first page of at least +min+ characters is returned; if
	# none is that large, the largest page seen is returned, or ["",""] when
	# nothing usable was fetched.
	def getPage(min=MINSIZE)
		max = ["",""]
		@url.keys.shuffle.each {|url|
			# Really remove the path: leaving a nil count behind would keep it
			# in getAllUrl and make a later add() fail on nil + 1.
			@url.delete(url)
			res = fetch(@site+url)
			next if res !~ /<html>|<body>/i
			return [url,res] if res.length >= min
			max = [url,res] if res.length > max[1].length
		}
		max
	end
	
	private
	
	# Download +target+ with wget (run under timeout.rb with argument 10) and
	# return the output of the mybogo.rb filter.
	# The target comes from log input, so it is shell-escaped.
	def fetch(target)
		`timeout.rb 10 wget -O - -- #{Shellwords.escape(target)} 2> /dev/null | mybogo.rb `
	end
end


# Reads log lines and groups the http:// URLs found in them by site.
class LogSquid
	# io::   object read line by line with gets until it returns nil
	# good:: hash keyed by site name; sites present in it are ignored
	def initialize(io,good = Hash.new)
		@all_site = Hash.new
		until (line = io.gets).nil?
			# Filters kept from the original, still disabled:
			#   skip if the site is already rejected
			#   next if line =~ /NONE\/- -|DIRECT\/127.0.0.1 text\/html/
			#   skip if the request was from localhost (depends on the config)
			#   next if line =~ /^[^ ]+ +[^ ]+ +127\.0\.0\.1 /
			# capture 1 is the site, capture 2 the path
			hit = line.match(/(?:^|\s)+http:\/\/([^\/]*)(\/[^ ]*)/)
			next unless hit
			site, url = hit[1], hit[2]
			# do not record the site if it is listed in good
			next if good[site]
			@all_site[site] ||= Site.new(site)
			@all_site[site].add(url)
		end
	end

	# Return every recorded site as an array of [site name, Site] pairs.
	def getAllSite
		@all_site.to_a
	end
end

# Bounded, thread-safe hand-over buffer between producer and consumer threads.
# Items come out in last-in, first-out order (Array#pop).
class SiteQueue
	# max:: number of items the buffer holds before push blocks
	def initialize(max=MaxThread * 2)
		@max = max
		@list = []
		@mutex = Mutex.new
		@not_full = ConditionVariable.new
		@not_empty = ConditionVariable.new
	end

	# Store +tab+, blocking while the buffer is full.
	# A push into an empty buffer is always accepted, so a +max+ of 0 or less
	# cannot block forever.  The item is never dropped.
	# Returns 0.
	def push(tab)
		@mutex.synchronize {
			@not_full.wait(@mutex) until @list.length < @max || @list.empty?
			@list.push(tab)
			@not_empty.signal
		}
		0
	end

	# Remove and return the most recently pushed item, blocking while the
	# buffer is empty.
	def pop
		@mutex.synchronize {
			@not_empty.wait(@mutex) while @list.empty?
			item = @list.pop
			@not_full.signal
			item
		}
	end

	# True when no item is waiting.  Must not be called while holding @mutex
	# (Mutex is not reentrant).
	def empty?
		@mutex.synchronize { @list.empty? }
	end
end

# Splits a page into word, word-pair and tag/attribute tokens.
class Tokenize
        # Characters that separate tokens.
        SEPARATORS = /[\s"'<>\\\/+&?:;,.()={}*]+/

        # text:: page content.  It is not modified: a cleaned copy is kept in
        #        which every &nbsp; becomes a space and every other named
        #        entity (&xyz;) is reduced to its first letter.
        def initialize(text)
                @content = text.gsub('&nbsp;',' ').gsub(/&([a-z])[a-z]+;/,'\1')
        end

        # The cleaned text.
        def content
                return @content
        end

        # Every token of the cleaned text (may include empty strings produced
        # by a leading separator).
        def single
                return @content.split(SEPARATORS)
        end

        # Pairs of consecutive words joined by "_", computed on the text with
        # HTML tags removed; one-character words are ignored.
        def bi_key
                tmp = String.new(@content)
                # strip the html tags, keeping comments and javascript;
                # repeated until stable, as removing one tag can expose another
                while tmp.sub!(/<\w+[^>]+>/,'') do
                end
                words = tmp.split(SEPARATORS).reject {|word| word.length<=1 }
                return words.each_cons(2).map {|first,second| first+"_"+second }
        end

        # One "tag__word" token per word found inside each tag's attribute
        # text, both parts lower-cased (ASCII only).  Tags without attribute
        # text give nothing.
        def meta
                meta = []
                @content.scan(/<(\w+)([^>]*)>/) {|tag,attrs|
                        next if attrs.empty?
                        tag = tag.tr('A-Z','a-z')
                        attrs.tr('A-Z','a-z').split(SEPARATORS).each {|info|
                                meta << tag+"__"+info unless info.empty?
                        }
                }
                return meta
        end
end


# Optional second argument: name of a file listing known-good sites, one per
# line.  Those sites are skipped when the log is read.
good = Hash.new
if ARGV[1]
	# foreach closes the file once it has been read
	File.foreach(ARGV[1]) {|line| good[line.chomp]=1 }
end


# The log itself is read from standard input.
log = LogSquid.new($stdin,good)
$stdout.sync = true


allSite = log.getAllSite

queue = SiteQueue.new
tg = ThreadGroup.new


# Workers: each one takes [site name, Site] pairs off allSite until none is
# left, fetches a random page and the root page of the site, and hands both
# to the main thread through the queue.
MaxThread.times do
	tg.add(Thread.new {
		while pair = allSite.pop do
			name, site_obj = pair
			page_url, page = site_obj.getPage
			queue.push([name,"/",site_obj.getRoot,page_url,page])
		end
	})
end


# Consumer: score what the workers deliver until every worker has finished
# and the queue is drained.
while tg.list.length > 0 or not queue.empty? do
	# pop blocks on an empty queue; only call it when an item is waiting, so
	# the loop cannot hang if the last worker exits without pushing anything.
	if queue.empty?
		sleep 0.1
		next
	end
	site,_url1,content1,url2,content2 = queue.pop
#	content = "\n\n"
#	tok = Tokenize.new(content1+content2)
#	content += (tok.bi_key * " ") + "\n" + (tok.meta * " ") + "\n" + content1+content2
	score = (note(content1 + content2)*100).to_i
	# results based on too little text are flagged with "???"
	flag = content1.length+content2.length > MINSIZE ? "" : "??? "
	puts "#{flag}#{score} http://#{site+url2}"
end

About Koders | Resources | Downloads | Support | Black Duck | Submit Project | Terms of Service | DMCA | Privacy Policy | Site Map| Contact Us