summaryrefslogtreecommitdiffstats
path: root/pdfspider
blob: 503025ffc20f7bb5cff6826d6ff2989f4f7a07f5 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
#!/usr/bin/env python2
import sys
import xapian
import string
import subprocess
import shlex
import os.path as P
from pyPdf import PdfFileReader

# Default location of the Xapian index.  "add" mode may override it on the
# command line; "query" mode always searches this one.
datadir = "/dev/shm/pindex"
# Command used to discover candidate PDF files.
sstring = "locate *.pdf"
# Shown (with the program name filled in) whenever the arguments are unusable.
# It mirrors the order in which the arguments are actually read.
USAGE = "Usage: %s add [database [limit]] | query limit term1 [term2 ...]"


def pdf_paths(locate_output):
    """Split the text printed by ``locate`` into a list of file paths.

    Blank lines (such as the one that follows the trailing newline) are
    dropped so the indexer never tries to open an empty file name.
    """
    return [line for line in locate_output.split("\n") if line]


def locate_pdfs():
    """Run the ``locate`` command and return the PDF paths it reports."""
    finder = subprocess.Popen(shlex.split(sstring),
                              stdout=subprocess.PIPE,
                              stderr=subprocess.PIPE)
    output, _ = finder.communicate()
    return pdf_paths(output)


def document_title(pdf, path):
    """Return the title recorded in *pdf*, or the base name of *path*.

    The fallback is used both when the document information cannot be read
    and when it carries no title at all; without the second case a missing
    title would be None and could not be turned into an index term.
    """
    try:
        title = pdf.getDocumentInfo().title
    except Exception:
        title = None
    return title if title else P.basename(path)


def index_pdf(database, indexer, path):
    """Add one Xapian document per page of the PDF at *path*.

    Each document stores the page text as its data, the file path in value
    slot 1, the zero-based page index in value slot 2, and a "T"-prefixed
    title term.  Any error while reading or indexing propagates to the caller.
    """
    # The reader pulls pages from the stream on demand, so all page access
    # has to happen before the file is closed.
    with open(path, "rb") as stream:
        pdf = PdfFileReader(stream)
        title = document_title(pdf, path)  # same for every page of the file
        for page_index in range(pdf.getNumPages()):
            text = pdf.getPage(page_index).extractText()
            doc = xapian.Document()
            doc.set_data(text)
            doc.add_value(1, path)
            doc.add_value(2, str(page_index))
            doc.add_term("T" + title)
            indexer.set_document(doc)
            indexer.index_text(text)
            database.add_document(doc)


def add_documents(database, paths, limit):
    """Index the PDFs in *paths* until *limit* files have been added.

    A file that fails is reported on stderr and does not count towards the
    limit.  A negative limit never reaches zero and therefore means
    "no limit".
    """
    indexer = xapian.TermGenerator()
    indexer.set_stemmer(xapian.Stem("english"))
    for path in paths:
        if limit == 0:
            break
        print("Adding file %s" % path)
        try:
            index_pdf(database, indexer, path)
        except Exception as e:
            sys.stderr.write("Exception: %s\n" % str(e))
            continue
        limit = limit - 1


def run_query(database, limit, terms):
    """Search *database* for *terms* and print at most *limit* matches.

    Returns the process exit status: 0 on success, 1 if the search could not
    be set up.
    """
    try:
        enquire = xapian.Enquire(database)
    except Exception as e:
        print("Exception: %s" % str(e))
        return 1

    # The query parser takes a single string, not the argv slice itself.
    query_string = " ".join(terms)
    qp = xapian.QueryParser()
    qp.set_stemmer(xapian.Stem("english"))
    qp.set_database(database)
    qp.set_stemming_strategy(xapian.QueryParser.STEM_SOME)
    query = qp.parse_query(query_string)
    print("Parsed query is: %s" % str(query))

    enquire.set_query(query)
    matches = enquire.get_mset(0, limit)
    # Display the results.
    print("%i results found." % matches.get_matches_estimated())
    print("Results 1-%i:" % matches.size())

    for m in matches:
        print("%i: %i%% docid=%i [%s]" % (m.rank + 1, m.percent, m.docid,
                                          m.document.get_data()))
        for ter in m.document.values():
            print("%d : %s" % (ter.num, ter.value))
    return 0


def parse_args(argv):
    """Interpret the command line.

    Returns ``(mode, database_dir, limit, terms)``, or None when the
    arguments do not form a valid "add" or "query" invocation (unknown mode,
    "query" without a limit, or a limit that is not an integer).
    """
    mode = argv[1] if len(argv) > 1 else "query"
    try:
        if mode == "add":
            target = argv[2] if len(argv) > 2 else datadir
            limit = int(argv[3]) if len(argv) > 3 else 10
            return mode, target, limit, []
        if mode == "query" and len(argv) > 2:
            return mode, datadir, int(argv[2]), argv[3:]
    except ValueError:
        return None
    return None


def main(argv):
    """Entry point: dispatch to indexing or searching; return the exit status."""
    parsed = parse_args(argv)
    if parsed is None:
        print(USAGE % argv[0])
        return 1
    mode, target, limit, terms = parsed

    # Opened only after the arguments are known, so that the directory given
    # to "add" is the one actually written to.
    database = xapian.WritableDatabase(target, xapian.DB_CREATE_OR_OPEN)
    if mode == "add":
        add_documents(database, locate_pdfs(), limit)
        return 0
    return run_query(database, limit, terms)


if __name__ == "__main__":
    sys.exit(main(sys.argv))