require 'category'
require 'coding'
require 'English'
module QDA
# Base class for filters that import an external source as a
# QDA::Document, passing every line to the registered indexers on the way.
#
# Subclasses must implement +read_content(file)+, which is handed an open
# stream and returns the text to import.
class InputFilter
  # Initialised to 0; this class does not modify it afterwards.
  attr_reader :cursor

  def initialize
    @cursor = 0
    @indexers = []
  end

  # Registers +indexer+ so that it receives every line of each document
  # processed by this filter.
  #
  # Only +feed+ is mandatory. +prepare+ and +terminate+ are optional hooks
  # and are invoked only when the indexer defines them.
  #
  # Raises RuntimeError if +indexer+ does not respond to +feed+.
  def add_indexer(indexer)
    raise 'Document indexers should have a feed method' unless indexer.respond_to?(:feed)

    @indexers.push(indexer)
  end

  # Reads +file+ and creates a new document titled +doctitle+. +file+
  # may be a String filename, an open IO stream, or an existing
  # QDA::Document whose text is re-processed. Any other kind of argument
  # yields a document with no content.
  #
  # Under the hood, calls +read_content+ to extract the content. This
  # method must be implemented in subclasses. Then +process_content+
  # is called to create the document's text. This class does something
  # reasonable with plain text, but structured text formats will want
  # to override that method to process non-text information (for
  # example, HTML or XML tags).
  #
  # Returns the document built by +process_content+.
  def read(file, doctitle)
    @content = ''
    case file
    when IO
      @content = read_content(file)
    when QDA::Document
      @content = file.text
    when String
      # The handle is opened here, so it is released here too: the block
      # form closes it even when read_content raises or does not close
      # the stream itself.
      @content = File.open(file) { |stream| read_content(stream) }
    end
    process_content(doctitle)
  end

  # Builds a QDA::Document titled +doctitle+ from the content gathered by
  # +read+, appending one chomped line at a time and feeding the raw line
  # (terminator included) to every indexer.
  #
  # Returns the document after calling +create+ on it.
  def process_content(doctitle)
    # signal to indexers we're about to start
    @indexers.each do |indexer|
      indexer.prepare(@content) if indexer.respond_to?(:prepare)
    end

    doc = QDA::Document.new(doctitle)
    @content.each_line do |line|
      doc.append(line.to_s.chomp)
      # inform AutoCoders, reverse indexers and so on.
      @indexers.each { |indexer| indexer.feed(line) }
    end

    # let indexers finish any work that was waiting for the end of input
    @indexers.each do |indexer|
      indexer.terminate if indexer.respond_to?(:terminate)
    end

    doc.create
    doc
  end
end
# Input filter for plain-text sources.
class TextFilter < InputFilter
  EXTENSIONS = ['txt']

  # Returns everything that can be read from +file+.
  #
  # The stream is always closed before this method returns, including
  # when reading raises, so a failed import does not leave it open.
  def read_content(file)
    file.read
  ensure
    file.close
  end
end
# Input filter that extracts the text of a PDF by running the external
# +pdftotext+ program.
class PDFFilter < InputFilter
  EXTENSIONS = ['pdf']
  PDF_TO_TEXT_EXEC = 'pdftotext'
  NO_COPYING_ERROR_TEXT =
    "The author or publisher of this PDF document has locked it to\n" \
    "prevent copying and extraction of its text. It is not possible to\n" \
    'import this document.'

  # Runs pdftotext on +file.path+ and returns its output as a String.
  #
  # Raises RuntimeError when the executable cannot be found, when the
  # output says that copying text from the document is not allowed, or
  # when the program exits unsuccessfully for any other reason (the
  # captured output is included in the message).
  def read_content(file)
    text = run_pdftotext(file.path)
    return text if $CHILD_STATUS.success?

    if text =~ /Copying of text from this document is not allowed/
      raise NO_COPYING_ERROR_TEXT
    end

    raise "Could not extract PDF text: #{text}"
  end

  private

  # Runs the converter with stderr merged into stdout and returns the
  # combined output. The command is passed as an argument array, so no
  # shell is involved and +path+ may contain spaces or shell
  # metacharacters. The block form of IO.popen sets the child status
  # that the caller inspects.
  def run_pdftotext(path)
    IO.popen([PDF_TO_TEXT_EXEC, path, '-'], err: [:child, :out], &:read)
  rescue Errno::ENOENT
    raise 'Could not find pdftotext executable'
  end
end
# Root of the output-filter hierarchy. It declares no behaviour of its
# own here; subclasses inherit from it.
class OutputFilter; end
# ...
# Output filter for HTML. No methods are defined on it in this file.
class HTMLFilter < OutputFilter; end
# Base class for objects that consume a text one line at a time.
#
# +cursor+ is the running total of the lengths of all lines fed so far,
# i.e. the offset at which the next line will start. Subclasses override
# +prepare+, +feed+ and +terminate+ as needed and call +super+ from
# +feed+ to keep the cursor advancing.
class Indexer
  attr_reader :cursor

  def initialize
    @cursor = 0
  end

  # Convenience driver: announces +str+ through +prepare+ and then feeds
  # it line by line. +terminate+ is not invoked here. Returns +str+.
  def index(str)
    prepare(str)
    str.each_line(&method(:feed))
  end

  # Hook invoked once all lines have been fed. Does nothing by default.
  def terminate; end

  # Hook invoked with the complete content before the first line is fed.
  # Does nothing by default.
  def prepare(content); end

  # Moves the cursor past +line+ and returns the new cursor value.
  def feed(line)
    @cursor = cursor + line.length
  end
end
# An indexer which records the position of words for later reverse
# retrieval.
#
# +words+ maps each word to the list of offsets at which it starts,
# measured on the same scale as +cursor+.
class WordIndexer < Indexer
  attr_reader :words

  # A word is a run of letters, digits and apostrophes. The POSIX
  # bracket class is not limited to ASCII, so accented letters are
  # covered without spelling out a byte range. The previous explicit
  # range was written with decimal character codes, which a regexp
  # reads as octal escapes, so it did not describe the intended
  # characters.
  WORD_TOKENIZER = /[[:alnum:]']+/

  def initialize
    super
    @words = Hash.new { |hash, word| hash[word] = [] }
  end

  # Records the starting offset of every word in +line+ that is longer
  # than one character, then advances the cursor via +super+.
  def feed(line)
    line.scan(WORD_TOKENIZER) do |word|
      next if word.length == 1

      # cursor still points at the start of this line; add the position
      # of the match within the line.
      @words[word].push(cursor + Regexp.last_match.begin(0))
    end
    super
  end
end
# An indexer that uses text patterns to identify, for example,
# passages by a particular speaker, or text headings.
# The default rules recognise the following:
#   **HEADING**
#   SPEAKER:
#
# Each rule maps a Regexp to a code type. When a line matches, the first
# capture group becomes the code value and a span of that type is opened
# at the current cursor. The span is closed when the next line of the
# same type is seen, or when +terminate+ is called.
class AutoCoder < Indexer
  STANDARD_TRIGGER_RULES = {
    /^(\w+)\:\s*$/ => 'Speaker',
    /^\*\*(.*)\*\*$/ => 'Heading'
  }

  # Categories created so far, keyed by "<type>: <value>".
  attr_reader :codes

  # +rules+ is a Hash of Regexp => code type name.
  def initialize(rules = STANDARD_TRIGGER_RULES)
    super()
    @trigger_rules = rules
    @codes = {}
    # code type => [category, start offset] of the span currently open
    @curr_codes = {}
  end

  # Checks +line+ against every rule, opening a new span for each rule
  # that matches, then advances the cursor via +super+.
  def feed(line)
    @trigger_rules.each do |rule, type|
      match = rule.match(line)
      trigger(cursor, type, match[1]) if match
    end
    super
  end

  # Returns the category named +codename+, creating and remembering it on
  # first use.
  def get_code(codename)
    @codes[codename] ||= QDA::Category.new(codename)
  end

  # Closes every span that is still open at the current cursor.
  #
  # The open spans are then forgotten, so that calling +terminate+ again,
  # or feeding further text, cannot apply the same span a second time.
  def terminate
    @curr_codes.each_value { |code, start| close_span(code, start, cursor) }
    @curr_codes.clear
  end

  # Opens a span of type +codetype+ with value +codevalue+ at +cursor+,
  # first closing any span of the same type that is still open.
  def trigger(cursor, codetype, codevalue)
    code = get_code("#{codetype}: #{codevalue}")
    open_code, open_start = @curr_codes[codetype]
    close_span(open_code, open_start, cursor) if open_code
    @curr_codes[codetype] = [code, cursor]
  end

  private

  # Applies +code+ to the text between +start+ and +finish+.
  # -1 here is a placeholder value for a document id to be supplied later.
  def close_span(code, start, finish)
    code.code(-1, start, finish - start)
  end
end
end