I'd like to share a simple Python class for indexing documents:
omnifind.py:
------------------------------------------------------------------
# Copyright (C) 2008
# Author: Fernando Lunardelli
# Contact: flunardelli at gmail.com
"""OmniFind interaction Class
"""
__version__ = "0.1"
# __all__ may only name module-level objects; listing a method name here
# makes "from omnifind import *" fail with AttributeError.
__all__ = ['Document', 'Search']
import httplib, urllib, base64
import mimetypes, datetime
import anydbm as cache
import sha
import os.path
class Document:
    """
    Client that adds documents to one OmniFind collection over HTTP.

    collection: Name of collection to interact.

    Server address, language settings and the API key are plain instance
    attributes assigned in __init__; change them on the instance before
    calling document_add() if the defaults do not fit.
    """

    def __init__(self, collection):
        self.server = "127.0.0.1:8888"  # Local server and port
        self.doc_lang = "pt_BR"  # Main document language, see API documentation.
        self.doc_locale = "pt_BR"
        self.doc_knownlang = "pt_BR"
        self.doc_id = ""
        # API key, see documentation. Note: do not remove the 'api:' prefix.
        # b64encode is used rather than encodestring because encodestring
        # appends a newline (and wraps long input), which would end up
        # inside the Authorization header value.
        self.apikey = base64.b64encode('api:--key--')
        self.headers = {}
        self.collection = collection
        # Cache file location, used to avoid submitting a document twice.
        self.cachefile = os.path.join("/tmp", "omnifind-" + collection + ".cache")
        self.debug = None

    def _documentParser(self, doc_type):
        """
        Add the main API headers for the current document to self.headers.

        doc_type: MIME type sent as the "docType" header.
        """
        doc_lastmodified = datetime.datetime.now().isoformat()
        self.headers.update({
            "collection": self.collection,
            "docId": self.doc_id,
            "docType": doc_type,
            "docLang": self.doc_lang,
            "locale": self.doc_locale,
            "docKnownLang": self.doc_knownlang,
            "lastModified": doc_lastmodified,
        })

    def _connectApi(self, doc_content):
        """
        POST doc_content to /api/document on self.server.

        Returns True when the server answers with status 200 (the document
        id is then recorded in the cache file), False for any other status
        or when the request raises an exception.
        """
        self.headers.update({"Authorization": "Basic %s" % self.apikey})
        conn = None
        try:
            conn = httplib.HTTPConnection(self.server)
            if self.debug:
                # Has to be enabled before the request is sent to trace it.
                conn.set_debuglevel(1)
            conn.request("POST", "/api/document", doc_content, self.headers)
            response = conn.getresponse()
            if self.debug:
                data = response.read()
                print("%s %s" % (response.status, response.reason))
                print(data)
                print(response.getheaders())
            if response.status == 200:
                self._cache_update()
                print("OK: document (%s) has been indexed" % (self.doc_id))
                return True
            print("Error: document (%s) not indexed: %s %s"
                  % (self.doc_id, response.status, response.reason))
            return False
        except Exception as e:
            # Exception (not BaseException) so Ctrl-C and SystemExit still
            # propagate to the caller.
            print(e)
            return False
        finally:
            if conn is not None:
                conn.close()

    def document_add(self, doc_id, doc_content, doc_type=None, metadata=None):
        """
        Public method to add documents to engine

        doc_id (required): Document id, may be a URL. See documentation.
        doc_content (required): Document content.
        doc_type (opt): Document MIME type; guessed from doc_id when omitted.
        metadata (opt): Dict of additional metadata headers, e.g.
            {'name': 'omniapi.doc', 'internallink': 'http://localhost/1234'}

        Returns True when the document is already in the cache, when
        running in debug mode, or when the server accepted it; False when
        the submission failed.
        """
        self.doc_id = doc_id
        if not doc_type:
            # guess_type() returns (type, encoding); only the type is used.
            doc_type = mimetypes.guess_type(self.doc_id)[0]
            if not doc_type:
                # Avoid sending the literal string "None" as docType.
                doc_type = "application/octet-stream"
        self.headers.update({"action": "addDocument"})
        if metadata:
            self.headers.update(metadata)
        # Applied after metadata, so the core API headers always win.
        self._documentParser(doc_type)
        if self.debug:
            for header_name, header_value in self.headers.items():
                print("headers: %s: %s" % (header_name, header_value))
            print("content size: %s" % len(doc_content))
        if self._cache_indexed():
            print("Warn: document (%s) already indexed.\n" % (self.doc_id))
            return True
        print("Info: connected for inclusion.")
        if self.debug:
            # NOTE(review): with debug set the original returns here without
            # contacting the server (dry run); kept as-is, confirm intent.
            print("---------------------------------------")
            return True
        return self._connectApi(doc_content)

    def document_del(self):
        """Not implemented yet."""
        pass

    def _cache_key(self):
        """Return the SHA-1 digest of the current doc_id (cache file key)."""
        # Local import: hashlib replaces the deprecated 'sha' module and
        # yields the same digest.
        import hashlib
        return hashlib.sha1(str(self.doc_id)).digest()

    def _cache_indexed(self):
        """Return a true value when the current doc_id is in the cache file."""
        db = cache.open(self.cachefile, "c")
        try:
            return db.has_key(self._cache_key())
        finally:
            # Always release the dbm handle, even if the lookup fails.
            db.close()

    def _cache_update(self):
        """
        Record the current doc_id in the cache file with a timestamp.

        Returns True when a new entry was written, None when the id was
        already present.
        """
        key = self._cache_key()
        db = cache.open(self.cachefile, "c")
        try:
            if db.has_key(key):
                return None
            db[key] = datetime.datetime.now().isoformat()
            return True
        finally:
            db.close()
class Search:
    """Placeholder for the search side of the API. TODO: not implemented."""
----------------------------------------------------------------
Usage:
#!/usr/bin/python
# -*- coding: utf-8 -*-
import os,sys
import time,datetime
import omnifind
from os.path import join, getsize
# Collection name handed to omnifind.Document for every file.
collection = "Files"
# Root of the directory tree that main() walks.
dirname = "/home/fernando_lunardelli/Files/"
# Sent along with each document as the 'X-author' metadata value.
username = "fernando_lunardelli"
def main():
    """
    Walk dirname and submit every non-empty file to the collection.

    Each file is read in binary mode and passed to
    omnifind.Document.document_add() with 'X-name' and 'X-author'
    metadata. Empty files are skipped.
    """
    for root, dirs, files in os.walk(dirname):
        for name in files:
            filename = os.path.join(root, name)
            # open() is the builtin; 'fopen' does not exist in Python.
            f = open(filename, 'rb')
            try:
                # read() yields one string for the request body;
                # readlines() would hand the HTTP layer a list.
                content = f.read()
            finally:
                f.close()
            if not content:
                continue
            doc_id = 'file:/%s' % (filename)
            doc_type = 'text/html'
            print("%s: %s - %s - %s" % (
                datetime.datetime.now().isoformat(), doc_id, name, doc_type))
            entry = omnifind.Document(collection)
            entry.debug = False
            entry.document_add(
                doc_id, content, doc_type,
                {'X-name': name, 'X-author': username})
            # Pause between submissions.
            time.sleep(1)
    # No connection object exists in this scope (Document closes its own),
    # so there is nothing to close here.
    print("Finished")
# Run the indexing walk only when executed as a script, not on import.
if __name__ == '__main__':
    main()