Author: | Ed Summers <ehs@pobox.com> |
---|---|
Version: | 1 |
# Build the "chipy-index" Lucene index from the messages in chipy.mbox.
from mailbox import UnixMailbox
from PyLucene import Field, Document, StandardAnalyzer, FSDirectory, \
    IndexWriter

# NOTE(review): EmailDoc is used below but is not imported by this script;
# it is presumably defined in a companion module -- confirm and import it.

# The trailing True arguments are Lucene's "create" flags (per the
# FSDirectory.getDirectory / IndexWriter signatures): start a new index
# rather than opening an existing one.
store = FSDirectory.getDirectory("chipy-index", True)
writer = IndexWriter(store, StandardAnalyzer(), True)
try:
    mbox_file = open('chipy.mbox')
    try:
        mbox = UnixMailbox(mbox_file)
        # mbox.next() returns None once the mailbox is exhausted, so iter()
        # with a None sentinel visits every message exactly once.
        for msg in iter(mbox.next, None):
            writer.addDocument(EmailDoc(msg))
    finally:
        # Release the mailbox file even if indexing a message fails.
        mbox_file.close()
finally:
    # Always close the writer so the index is not left open after an error.
    writer.close()
from PyLucene import Document, Field


class EmailDoc(Document):
    """Lucene document built from a single mailbox message.

    Fields added to the document:

    * ``from``, ``subject``, ``body`` -- Field.Text values taken from the
      message headers and the message body.
    * ``id`` -- the Message-ID header, as a Field.Keyword.
    * ``date`` -- the Date header, as a Field.Keyword, so that search
      results can display it.
    * ``all`` -- sender, subject and body joined with spaces, as a single
      catch-all Field.Text.

    A header that is absent from the message is stored as an empty string.
    """

    def __init__(self, msg):
        """Populate the document from *msg*.

        *msg* must provide ``getheader(name)`` and a readable ``fp``
        positioned at the message body (assumed to be an rfc822-style
        message as produced by UnixMailbox -- TODO confirm).
        """
        Document.__init__(self)

        # getheader() gives None for a missing header; fall back to '' so
        # neither the Field constructors nor the join below receive None.
        sender = msg.getheader('From') or ''
        subject = msg.getheader('Subject') or ''
        date = msg.getheader('Date') or ''
        message_id = msg.getheader('Message-ID') or ''
        body = msg.fp.read()

        self.add(Field.Text('from', sender))
        self.add(Field.Text('subject', subject))
        self.add(Field.Text('body', body))
        self.add(Field.Keyword('id', message_id))
        self.add(Field.Keyword('date', date))
        # Join with spaces: plain concatenation would fuse the last word of
        # one part with the first word of the next into a single token.
        self.add(Field.Text('all', ' '.join((sender, subject, body))))
# Search the "chipy-index" index with the query given as the first
# command-line argument and print a summary of every matching message.
from sys import argv
from PyLucene import FSDirectory, IndexSearcher, QueryParser, \
    StandardAnalyzer


def _field_text(doc, name):
    """Return the string value of field *name* in *doc*, or '' if absent.

    Lucene's Document.getField() yields no field object (None) for a name
    the document does not carry, so guard before calling stringValue().
    """
    field = doc.getField(name)
    if field is None:
        return ''
    return field.stringValue()


query_text = argv[1].strip()
directory = FSDirectory.getDirectory('chipy-index', False)
searcher = IndexSearcher(directory)
try:
    # Parse the user's query against the catch-all 'all' field.
    query = QueryParser.parse(query_text, 'all', StandardAnalyzer())
    hits = searcher.search(query)
    for i in range(hits.length()):
        doc = hits.doc(i)
        # Single-argument print(...) produces the same output as the
        # print statement on Python 2.
        print("ID: %s" % _field_text(doc, 'id'))
        print("From: %s" % _field_text(doc, 'from'))
        print("Subject: %s" % _field_text(doc, 'subject'))
        print("Date: %s" % _field_text(doc, 'date'))
        print("")
finally:
    # Release the index reader even if parsing or searching fails.
    searcher.close()
# Look up one message in the "chipy-index" index by its exact 'id' value
# (first command-line argument) and print its headers and body.
from sys import argv
from PyLucene import FSDirectory, IndexSearcher, TermQuery, Term


def _field_text(doc, name):
    """Return the string value of field *name* in *doc*, or '' if absent.

    Lucene's Document.getField() yields no field object (None) for a name
    the document does not carry, so guard before calling stringValue().
    """
    field = doc.getField(name)
    if field is None:
        return ''
    return field.stringValue()


message_id = argv[1].strip()
directory = FSDirectory.getDirectory('chipy-index', False)
searcher = IndexSearcher(directory)
try:
    # TermQuery matches the 'id' term exactly; the text is not analyzed.
    hits = searcher.search(TermQuery(Term('id', message_id)))
    if hits.length() == 0:
        # Without this guard hits.doc(0) would fail on an empty result set.
        raise SystemExit("No message found with ID: %s" % message_id)

    doc = hits.doc(0)
    # Single-argument print(...) produces the same output as the print
    # statement on Python 2.
    print("ID: %s" % _field_text(doc, 'id'))
    print("From: %s" % _field_text(doc, 'from'))
    print("Subject: %s" % _field_text(doc, 'subject'))
    print("Date: %s" % _field_text(doc, 'date'))
    print(_field_text(doc, 'body'))
    print("")
finally:
    # Release the index reader on every exit path, including SystemExit.
    searcher.close()
You can download the source code for these examples here. If you want an mbox to play with, you can grab one from the chipy list archives.
Thanks to rst2s5 these slides were written in reStructuredText.