# filters.rb
# Language: Ruby
# LOC: 150
# Project info: Weft QDA (weft-qda)
# Server: RubyForge (SVN)
# Type: svn
# Path (truncated in the original listing): ...ches\PRESSURE\weft-qda\lib\
#   filters.rb
#   weft.rb

require 'English'
require 'open3'

require 'category'
require 'coding'

module QDA
  # Base class for filters that import an external source as a
  # QDA::Document. Subclasses supply +read_content+, which receives an
  # open stream and returns its text; this class then builds the
  # document line by line and notifies any registered indexers.
  class InputFilter
    # Methods that +process_content+ calls on every registered indexer.
    INDEXER_INTERFACE = [:prepare, :feed, :terminate].freeze

    # Initialised to 0; nothing in this class advances it.
    attr_reader :cursor

    def initialize
      @cursor = 0
      @indexers = []
    end

    # Registers +indexer+ so that it is notified while a document is
    # processed. Raises a RuntimeError naming the missing methods if the
    # object does not respond to everything in INDEXER_INTERFACE; checking
    # here gives a clear error instead of a NoMethodError part-way through
    # an import.
    def add_indexer(indexer)
      missing = INDEXER_INTERFACE.reject { |name| indexer.respond_to?(name) }
      unless missing.empty?
        raise "Document indexers should respond to: #{missing.join(', ')}"
      end
      @indexers.push(indexer)
    end

    # Reads +file+ and creates a new document titled +doctitle+. +file+
    # may be a String filename, an open stream (an IO, or any other
    # object that responds to +read+, such as a StringIO or Tempfile), or
    # an existing QDA::Document whose text is re-used.
    #
    # Under the hood this calls +read_content+ to extract the content;
    # that method must be implemented in subclasses. +process_content+
    # is then called to create the document's text. This class does
    # something reasonable with plain text, but structured text formats
    # will want to override that method to process non-text information
    # (for example, HTML or XML tags).
    #
    # Any other kind of +file+ yields a document with no content.
    # Returns the document built by +process_content+.
    def read(file, doctitle)
      @content = ''
      @content =
        case file
        when IO
          read_content(file)
        when QDA::Document
          file.text
        when String
          # The block form closes the handle opened here even when the
          # subclass does not close it itself or raises while reading.
          File.open(file) { |stream| read_content(stream) }
        else
          # Duck-typed streams that are not IO instances.
          file.respond_to?(:read) ? read_content(file) : ''
        end
      process_content(doctitle)
    end

    # Builds a QDA::Document titled +doctitle+ from @content. Each line is
    # appended to the document without its line ending, while indexers
    # are fed the unmodified line. Returns the document after calling
    # +create+ on it.
    def process_content(doctitle)
      # signal to indexers we're about to start
      @indexers.each { |indexer| indexer.prepare(@content) }
      doc = QDA::Document.new(doctitle)
      @content.each_line do |line|
        doc.append(line.chomp)
        # inform AutoCoders, reverse indexers and so on.
        @indexers.each { |indexer| indexer.feed(line) }
      end
      @indexers.each { |indexer| indexer.terminate }
      doc.create
      doc
    end
  end

  # Input filter for plain text; EXTENSIONS lists the 'txt' suffix.
  class TextFilter < InputFilter
    EXTENSIONS = [ 'txt' ]

    # Returns everything read from +file+ and closes it. The close sits
    # in an +ensure+ clause so the stream is released even when reading
    # raises; the method's value is still the result of +read+.
    def read_content(file)
      file.read
    ensure
      file.close
    end
  end

  # Input filter for PDF files. Text is extracted by running the
  # external program named by PDF_TO_TEXT_EXEC on the file's path.
  class PDFFilter < InputFilter
    EXTENSIONS = [ 'pdf' ]

    PDF_TO_TEXT_EXEC = 'pdftotext'
    NO_COPYING_ERROR_TEXT =
      "The author or publisher of this PDF document has locked it to
prevent copying and extraction of its text. It is not possible to
import this document."

    # Runs PDF_TO_TEXT_EXEC on +file.path+ with '-' as the output
    # argument and returns the captured output. +file+ must respond to
    # +path+; its contents are not read through the Ruby stream itself.
    #
    # Raises RuntimeError when the executable cannot be found, when the
    # tool reports that copying is not allowed (NO_COPYING_ERROR_TEXT),
    # or when it exits unsuccessfully for any other reason (the tool's
    # output is included in the message).
    def read_content(file)
      unless PDF_TO_TEXT_EXEC
        raise RuntimeError, "Could not find pdftotext executable"
      end

      begin
        # Arguments are passed as a list, so no shell is involved and
        # paths containing spaces or shell metacharacters are safe.
        # capture2e merges stderr into the returned text, as the former
        # "2>&1" redirection did. Requires Ruby 1.9.2 or later.
        text, status = Open3.capture2e(PDF_TO_TEXT_EXEC, file.path, '-')
      rescue Errno::ENOENT
        raise RuntimeError, "Could not find pdftotext executable"
      end

      return text if status.success?

      if text =~ /Copying of text from this document is not allowed/
        raise RuntimeError, NO_COPYING_ERROR_TEXT
      end
      raise RuntimeError, "Could not extract PDF text: #{text}"
    end

  end

  # Base class for output filters. It defines no methods or state of its
  # own here; HTMLFilter below is its only subclass in this file.
  # NOTE(review): presumably a placeholder for export support -- confirm.
  class OutputFilter

  end

  # Empty OutputFilter subclass; no HTML-specific behaviour is defined
  # in this file.
  # NOTE(review): presumably intended for HTML export -- confirm.
  class HTMLFilter < OutputFilter

  end

  # Base class for objects that consume text one line at a time while
  # keeping a running offset (+cursor+) of how much text has been fed.
  # +prepare+ and +terminate+ are empty hooks that subclasses may
  # override.
  class Indexer
    # Total length of all lines fed so far; starts at 0.
    attr_reader :cursor

    def initialize
      @cursor = 0
    end

    # Hook called before any line is fed. Does nothing here.
    def prepare(content); end

    # Moves the cursor past +line+ and returns the new cursor value.
    def feed(line)
      @cursor = @cursor + line.length
    end

    # Hook called after the last line has been fed. Does nothing here.
    def terminate; end

    # Convenience entry point: calls +prepare+ with +str+, then feeds
    # each of its lines in turn. Returns +str+.
    def index(str)
      prepare(str)
      str.each_line do |segment|
        feed(segment)
      end
    end
  end
  
  # An indexer which records the position of words for later reverse
  # retrieval. +words+ maps each word to the list of offsets at which
  # it was seen (the cursor at the start of the line plus the word's
  # position within that line).
  class WordIndexer < Indexer
    attr_reader :words

    # A word is a run of letters, digits and apostrophes. On Ruby 1.9+
    # the POSIX class [[:alnum:]] also matches accented letters in
    # Unicode strings, so no explicit byte range is needed. The former
    # "\192-\255" range wrote decimal codes in octal escapes, which did
    # not describe the accented Latin-1 letters it was meant to.
    WORD_TOKENIZER = /[[:alnum:]']+/

    def initialize
      super
      # Each unseen word starts with its own empty list of positions.
      @words = Hash.new { |hash, key| hash[key] = [] }
    end

    # Records the offset of every word in +line+ (single-character words
    # are skipped), then lets Indexer#feed advance the cursor.
    def feed(line)
      line.scan(WORD_TOKENIZER) do |word|
        next if word.length == 1
        @words[word].push(cursor + Regexp.last_match.begin(0))
      end
      super
    end
  end

  # An indexer that uses text patterns to identify, for example,
  # passages by a particular speaker, or text headings.
  # The default rules recognise the following:
  # **HEADING**
  # SPEAKER:
  #
  # Each rule maps a pattern to a code type. When a line matches, a
  # code named "<type>: <first capture>" becomes the open code for that
  # type, and the previously open code of the same type is applied to
  # the text between its start and the current cursor.
  class AutoCoder < Indexer
    STANDARD_TRIGGER_RULES = {
      /^(\w+)\:\s*$/   => 'Speaker',
      /^\*\*(.*)\*\*$/ => 'Heading'
    }

    # Hash of code name => category object, for every code triggered.
    attr_reader :codes

    # +rules+ maps patterns (anything responding to +match+) to code
    # type names.
    def initialize(rules = STANDARD_TRIGGER_RULES)
      super()
      @trigger_rules = rules
      @codes      = {}
      # code type => [ category, start offset ] for the span still open
      @curr_codes = {}
    end

    # Checks +line+ against every rule, triggering a code for each match,
    # then lets Indexer#feed advance the cursor.
    def feed(line)
      @trigger_rules.each do |rule, type|
        match = rule.match(line)
        trigger(cursor, type, match[1]) if match
      end
      super
    end

    # Returns the category for +codename+, creating and remembering a
    # new QDA::Category the first time a name is requested.
    def get_code(codename)
      @codes[codename] ||= QDA::Category.new(codename)
    end

    # Closes every span that is still open at the current cursor. The
    # open spans are then forgotten, so that a repeated +terminate+ or a
    # later trigger cannot apply the same span a second time.
    def terminate
      @curr_codes.each_value { |open_span| close_span(open_span, cursor) }
      @curr_codes.clear
    end

    # Starts a new span for "<codetype>: <codevalue>" at +cursor+,
    # first closing the span currently open for +codetype+, if any.
    def trigger(cursor, codetype, codevalue)
      code = get_code("#{codetype}: #{codevalue}")
      close_span(@curr_codes[codetype], cursor) if @curr_codes[codetype]
      @curr_codes[codetype] = [ code, cursor ]
    end

    private

    # Applies the category in +open_span+ ([ category, start ]) to the
    # text from its start offset up to +end_offset+.
    def close_span(open_span, end_offset)
      code, start = open_span
      # -1 here is a placeholder value for a document id to be
      # supplied later
      code.code(-1, start, end_offset - start)
    end
  end


end

# About Koders | Resources | Downloads | Support | Black Duck | Submit Project | Terms of Service | DMCA | Privacy Policy | Site Map | Contact Us