Welcome to the IBM OmniFind Yahoo! Edition Forum
August 20, 2008, 06:42:36 PM *
Welcome, Guest. Please login or register.

Login with username and password
News:
 
   Home   Help Search Login Register  
« previous next »
Pages: [1] Print
Author Topic: Simple Python integration  (Read 403 times)
flunardelli
Newbie
*
Posts: 2


View Profile
« on: July 14, 2008, 10:49:46 AM »

I'd like to share a simple Python class for indexing documents:

omnifind.py:
------------------------------------------------------------------
# Copyright (C) 2008
# Author: Fernando Lunardelli
# Contact: flunardelli at gmail.com

"""OmniFind interaction Class

"""
__version__ = "0.1"
# Names exported by "from omnifind import *".  Every entry must be a
# module-level name; listing a method here (as 'document_add' was) makes the
# star-import fail with AttributeError.
__all__ = ['Document', 'Search']

import httplib, urllib, base64
import mimetypes, datetime
import anydbm as cache
import sha
import os.path

class Document:
    """
        Main class
            collection: Name of collection to interact.
    """
   
    def __init__(self, collection):
        self.server = "127.0.0.1:8888" #Local server and port
        self.doc_lang = "pt_BR" #main document language, see API documentation.
        self.doc_locale = "pt_BR"
        self.doc_knownlang = "pt_BR"
        self.doc_id = ""
        self.apikey = base64.encodestring('api:--key--') #API key, see documentation. Obs: dont remove 'api:'
        self.headers = {}
        self.collection = collection
        self.cachefile = os.path.join("/tmp","omnifind-" + collection + ".cache") #Cache file localtion, used to control file inclusion
        self.debug = None
                       
    def _documentParser(self, doc_type):
        """
        Add main api header
        """
        doc_lastmodified = datetime.datetime.isoformat(datetime.datetime.now())
        headers = {
            "collection": self.collection,
            "docId": self.doc_id,
            "docType": doc_type,
            "docLang": self.doc_lang,
            "locale": self.doc_locale,
            "docKnownLang": self.doc_knownlang,
            "lastModified": doc_lastmodified,
        }
        self.headers.update(headers)

    def _connectApi(self, doc_content):
        """
        Connection method
        """   
        self.headers.update({"Authorization": "Basic %s" % self.apikey})
        try:
            conn = httplib.HTTPConnection(self.server)
            conn.request("POST", "/api/document", doc_content, self.headers)
            response = conn.getresponse()

            # Debug
            if self.debug:
                conn.set_debuglevel(1)
           
                data = response.read()
                print response.status, response.reason
                print data   
                print response.getheaders()

            if response.status == 200:
                self._cache_update()
                print "OK: document (%s) has been indexed" % (self.doc_id)

            conn.close()
        except BaseException, e:
            print e

    def document_add(self, doc_id, doc_content, doc_type = None, metadata = {}):
        """
        Public method to add documents to engine
            doc_id (required): Documento id, may be a url. See documentation.
            doc_content (required): Document content.
            doc_type (opt): Document mimetype.
            metadata (opt): Dict of adicional metadatas. ex: {'name': 'omniapi.doc', 'internallink': 'http://localhost/1234'}
        """

        self.doc_id = doc_id

        if not doc_type:
            mimetypes.init()
            doc_type,doc_subtype = mimetypes.guess_type(self.doc_id)       

        self.headers.update({"action": "addDocument"})

        if metadata:
            self.headers.update(metadata)

        self._documentParser(doc_type)

        if self.debug:
            for i,f in self.headers.iteritems():
                print "headers: %s: %s" %(i,f)
            print "content size: %s" % len(doc_content)

        if not self._cache_indexed():
            print "Info: connected for inclusion."
            if self.debug:
                print "---------------------------------------"
                return True
            else:
                return self._connectApi(doc_content)
        else:
            print "Warn: document (%s) already indexed.\n" % (self.doc_id)
            return True

    def document_del(self):
        pass
   
    def _cache_indexed(self):
        key = sha.sha(str(self.doc_id)).digest()
        e = cache.open(self.cachefile,"c")
        return e.has_key(key)

    def _cache_update(self):
        key = sha.sha(str(self.doc_id)).digest()
        e = cache.open(self.cachefile,"c")
        r = True
        if e.has_key(key):
            r = None
        else:
            e[key] = datetime.datetime.isoformat(datetime.datetime.now())
        e.close()
        return r

class Search:
    """Placeholder class; nothing is implemented yet (TODO)."""
----------------------------------------------------------------
Usage:
#!/usr/bin/python
# -*- coding: utf-8 -*-

import os,sys
import time,datetime
import omnifind
from os.path import join, getsize


# Collection that receives the documents.
collection = "Files"
# Root of the directory tree that main() walks.
dirname = "/home/fernando_lunardelli/Files/"
# Sent with every document as the 'X-author' metadata value.
username = "fernando_lunardelli"
def main():
    for root, dirs, files in os.walk(dirname):
        for name in files:
            filename = os.path.join(root,name)
       f = fopen(filename,'rb')
            content = f.readlines();
            if content:
                doc_id = 'file:/%s' % (filename)
                doc_content = content
                doc_type = 'text/html'
                doc_name = name
                doc_author = username
                print "%s: %s - %s - %s" % (datetime.datetime.isoformat(datetime.datetime.now()),doc_id,doc_name,doc_type)
                entry = omnifind.Document(collection)
                entry.debug = False
                entry.document_add(doc_id,doc_content,doc_type,{'X-name': doc_name, 'X-author': doc_author})
                time.sleep(1)
                #    print "Erro: %s", reg
                #    break
        f.close()
    print "Finished"
    conn.close()

# Run the indexer only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Logged
Pages: [1] Print 
« previous next »
Jump to:  

IBM OmniFind Yahoo! Edition Forum | Powered by SMF 1.1.2.
© 2005, Simple Machines LLC. All Rights Reserved.