"""
Module MontyLingua

MONTY LINGUA - An end-to-end natural language processor
               for English, for the Python/Java platform

Author: Hugo Liu <hugo@media.mit.edu>
Project Page: http://web.media.mit.edu/~hugo/montylingua

Copyright (c) 2002-2004 by Hugo Liu, MIT Media Lab
All rights reserved.

Non-commercial use is free, as provided in the GNU GPL
By downloading and using MontyLingua, you agree to abide
by the additional copyright and licensing information in
"license.txt", included in this distribution

If you use this software in your research, please
acknowledge MontyLingua and its author, and link back
to the project page http://web.media.mit.edu/~hugo/montylingua.
Please cite montylingua in academic publications as:

    Liu, Hugo (2004). MontyLingua: An end-to-end natural
    language processor with common sense. Available
    at: web.media.mit.edu/~hugo/montylingua.

************************************************
DOCUMENTATION OVERVIEW

About MontyLingua:
- MontyTokenizer
    - normalizes punctuation, spacing and
      contractions, with sensitivity to abbrevs.
- MontyTagger
    - Part-of-speech tagging using PENN TREEBANK tagset
    - enriched with "Common Sense" from the Open Mind
      Common Sense project
    - exceeds accuracy of Brill94 tbl tagger
      using default training files
- MontyREChunker
    - chunks tagged text into verb, noun, and adjective
      chunks (VX,NX, and AX respectively)
    - incredible speed and accuracy improvement over
      previous MontyChunker
- MontyExtractor
    - extracts verb-argument structures, phrases, and
      other semantically valuable information
      from sentences and returns sentences as "digests"
- MontyLemmatiser
    - part-of-speech sensitive lemmatisation
    - strips plurals (geese-->goose) and
      tense (were-->be, had-->have)
    - includes regexps from Humphreys and Carroll's
      morph.lex, and UPENN's XTAG corpus
- MontyNLGenerator
    - generates summaries
    - generates surface form sentences
    - determines and numbers NPs and tenses verbs
    - accounts for sentence_type

WHERE MUST THE DATAFILES BE?
- the "datafiles" include all files ending in *.MDF
- the best solution is to create an environment variable called
  "MONTYLINGUA" and put the path to the datafiles there
- alternatively, MontyLingua can find the datafiles if they are
  in the operating system "PATH" variable, or in the current
  working directory

API:
The MontyLingua Python API is MontyLingua.html
The MontyLingua Java API is JMontyLingua.html

RUNNING:
MontyLingua can be called from Python, Java,
or run at the command line.

A. From Python, import the MontyLingua.py file
B. From your Java code:
    1. make sure "montylingua.jar" is
       in your class path, in addition to
       associated subdirectories and data files
    2. in your code, you need something like:

       import montylingua.JMontyLingua; // loads namespace
       public class YourClassHere {
           public static JMontyLingua j = new JMontyLingua();
           public yourFunction(String raw, String toked) {
               jisted = j.jist_predicates(raw); // an example function

    3. For a good use case example, see Sample.java.
C. From the command line:
    1. if you have python installed and in your path:
       type "run.bat"
    2. if you have java installed and in your path:
       type "runJavaCommandline.bat"

VERSION HISTORY:

New in version 2.1 (6 Aug 2004)
- new MontyNLGenerator component (in Beta phase)
- includes version 2.0.1 bugfix for problem where
  java api wasn't being exposed

New in version 2.0 (29 Jul 2004)
- 2.5X speed enhancement for whole system
  2X speed enhancement for tagger component
- rule-based chunker replaced with much faster
  and more accurate regular expression chunker
- common sense added to MontyTagger component
  improves word-level tagger accuracy to 97%
- updated and expanded lexicon for English
- added a user-customizable lexicon CUSTOMLEXICON.MDF
- improvements to MontyLemmatiser incorporating
  exception cases
- html documentation added
- speed optimizations to all code
- improvements made to semantic extraction
- added a morphological analyzer component, MontyMorph
- expanded Java API

New in version 1.3.1 (11 Nov 2003)
- mainly bugfixes
- datafiles can now sit in the current working
  directory (".") or in the path of either of the
  two environment variables "MONTYLINGUA" or "PATH"
- presence of the '/' token in input won't crash system

New in Version 1.3 (5 Nov 2003)
- lisp-style predicate output added
- Sample.java example file added to illustrate API

New in Version 1.2 (12 Sep 2003)
- MontyChunker rules expanded
- MontyLingua JAVA API added
- MontyLingua documentation added

New in Version 1.1 (1 Sep 2003)
- MontyTagger optimized, 2X loading and 2.5X tagging speed
- MontyLemmatiser added to MontyLingua suite
- MontyChunker added
- MontyLingua command-line capability added

New in Version 1.0 (3 Aug 2003)
- First release
- MontyTagger (since 15 Jan 2001) added to MontyLingua

--please send bugs & suggestions to hugo@media.mit.edu--
"""

__author__ = "Hugo Liu "
__version__ = "2.1"

import MontyTokenizer
import MontyTagger
import MontyLemmatiser
import MontyREChunker
import MontyExtractor
import MontyNLGenerator


class MontyLingua:
    """
    Facade that wires the MontyLingua components together: tokenizer,
    lemmatiser, tagger, chunker, extractor and natural-language
    generator. Every public method delegates to one of the component
    objects created in __init__.
    """

    def __init__(self, trace_p=0):
        """
        prints a start-up banner and instantiates each component.
        trace_p is stored on the instance and handed to MontyTagger;
        the tagger also receives the shared lemmatiser instance.
        """
        # NOTE: every print in this file uses the single-argument,
        # parenthesised form, which produces the same output as the
        # original Python 2 print statements.
        print('\n****** MontyLingua v.' + __version__ + ' ******')
        print('***** by hugo@media.mit.edu *****')
        self.trace_p = trace_p
        self.theMontyTokenizer = MontyTokenizer.MontyTokenizer()
        self.theMontyLemmatiser = MontyLemmatiser.MontyLemmatiser()
        self.theMontyTagger = MontyTagger.MontyTagger(trace_p, self.theMontyLemmatiser)
        self.theMontyChunker = MontyREChunker.MontyREChunker()
        self.theMontyExtractor = MontyExtractor.MontyExtractor()
        self.theMontyNLGenerator = MontyNLGenerator.MontyNLGenerator()
        print('*********************************\n')

    #
    # MAIN FUNCTIONS
    #

    def generate_summary(self, vsoos):
        """
        uses MontyNLGenerator to generate a paragraph
        text summary in the past tense

        inputs a flat list of
        verb-subject-object-object tuples
        """
        return self.theMontyNLGenerator.generate_summary(vsoos)

    def generate_sentence(self, vsoo, sentence_type='declaration', tense='past',
                          s_dtnum=('', 1), o1_dtnum=('', 1),
                          o2_dtnum=('', 1), o3_dtnum=('', 1)):
        """
        inputs verb-subject-object-object tuple
        outputs a generated sentence

        valid sentence types:
            declarative, imperative,
            (can|may|would|should|could),
            (who|what|when|where|why|how),
            question
        valid tenses:
            past, present, progressive,
            past_progressive, future, infinitive

        dtnum is a pair of determiner, number
            e.g. ('the',1),('some',2)
            valid determiners = 'a','the','some','',etc
            valid numbers = 1,2,3

        NOTE(review): the default sentence_type is 'declaration' while
        the list above says 'declarative' -- confirm which spelling
        MontyNLGenerator expects. The default is left unchanged here.
        """
        return self.theMontyNLGenerator.generate_sentence(
            vsoo,
            sentence_type=sentence_type,
            tense=tense,
            s_dtnum=s_dtnum,
            o1_dtnum=o1_dtnum,
            o2_dtnum=o2_dtnum,
            o3_dtnum=o3_dtnum)

    def jist_predicates(self, text):
        """
        similar to jist() except output is simpler

        returns a list (document-level) of lists (sentence-level)
        of lisp-style predicate argument structures
        - each structure should look something like this:
          - ("verb" "subject" "obj1" "obj2" ... )
          - words are all lemmatised, and determiners
            and modals are stripped out
          - obj's can be direct or indirect, but not
            subordinate clauses for now.
        """
        # one entry per sentence digest returned by jist()
        return [info['verb_arg_structures_concise'] for info in self.jist(text)]

    def jist(self, text):
        """
        inputs raw text, outputs a list of dictionaries with
        information digests of each sentence

        pipeline: split_sentences -> tokenize -> tag_tokenized
                  -> chunk_tagged -> extract_info
        """
        # Each stage is run over every sentence before the next stage
        # starts, exactly as the original chain of map() calls did.
        sentences = self.split_sentences(text)
        tokenized = [self.tokenize(sentence) for sentence in sentences]
        tagged = [self.tag_tokenized(toked) for toked in tokenized]
        chunked = [self.chunk_tagged(tagged_text) for tagged_text in tagged]
        return [self.extract_info(chunked_text) for chunked_text in chunked]

    def pp_info(self, extracted_infos):
        """
        pretty prints sentence information digests returned by jist()

        prints to stdout, keys in sorted order; returns nothing
        """
        number = 0
        for info in extracted_infos:
            number += 1
            keys = list(info.keys())
            keys.sort()
            print("\n\n SENTENCE #%s DIGEST:\n" % str(number))
            for key in keys:
                print((key + ": ").rjust(22) + str(info[key]))

    def split_paragraphs(self, text):
        """inputs a raw text and outputs a list of paragraph segments"""
        return self.theMontyTokenizer.split_paragraphs(text)

    def split_sentences(self, text):
        """inputs a raw text and outputs a list of sentence segments"""
        return self.theMontyTokenizer.split_sentences(text)

    def tokenize(self, sentence, expand_contractions_p=1):
        """
        inputs a raw text sentence and outputs that sentence
        with punctuation tokenized, except in the case
        of abbreviations

        iff expand_contractions_p == 1, then contractions
        will be resolved (e.g. "can't"-->"can not")
        """
        return self.theMontyTokenizer.tokenize(sentence, expand_contractions_p)

    def tag_tokenized(self, tokenized_text):
        """
        takes tokenized text and returns Penn Treebank tagset
        tagged text: i.e.: "This/DT is/VB a/DT sentence/NN".

        more information on the tagset can be found at:
        http://www.cis.upenn.edu/~treebank/
        """
        return self.theMontyTagger.tag_tokenized(tokenized_text)

    def strip_tags(self, tagged_or_chunked_text):
        """
        strips part-of-speech and chunk tags from text
        and returns plaintext

        only whitespace-separated tokens containing '/' are kept
        (chunk markers such as "(NX" and "NX)" carry no '/'); for each
        kept token the text before the first '/' is the word
        """
        words = [token.split('/')[0]
                 for token in tagged_or_chunked_text.split()
                 if '/' in token]
        return ' '.join(words)

    def parse_pred_arg(self, pp):
        """
        parses the predicate-argument string returned by
        jist_predicates(), of the form:
            '("pred name" "arg 1" "arg 2" etc)'
        and returns them as a list
        """
        # drop surrounding whitespace and the outer parentheses, then
        # collapse internal runs of whitespace to single spaces
        inner = ' '.join(pp.strip()[1:-1].split())
        # drop the first and last double quote, split on the '" "' seams
        return inner[1:-1].split('" "')

    def chunk_tagged(self, tagged_text):
        """
        chunks tagged text and outputs the form:
        "(NX He/PRP NX) (VX is/VB VX) (NX the/DT mailman/NN NX)"
        """
        return self.theMontyChunker.Process(tagged_text)

    def chunk_lemmatised(self, lemmatised_text):
        """
        inputs lemmatised text of the form:
            "He/PRP/he ran/VBD/run"
        and outputs the form:
            "(NX He/PRP/he NX) (VX is/VB/be VX) (NX the/DT/the mailman/NN/mailman NX)"
        """
        return self.theMontyChunker.chunk_multitag(lemmatised_text)

    def lemmatise_tagged(self, tagged_text):
        """
        lemmatises tagged text and outputs the form:
            'These/DT/These sentences/NNS/sentence were/VBZ/be false/JJ/false'
        (lemma follows the pos tag)
        """
        return self.theMontyLemmatiser.lemmatise_tagged_sentence(tagged_text)

    def extract_info(self, chunked_text):
        """
        extracts many useful things from chunked_text
        outputted in a dictionary, which can be printed
        using pp_info()

        its keys and sample values:
            noun_phrases: ['the dog','the cat']
            noun_phrases_tagged: ['the/DT dog/NN','the/DT cat/NN']
            verb_phrases: ['will go quickly','go slowly']
            verb_phrases_tagged: ['will/MD go/VB quickly/RB','go/VB slowly/RB']
            prep_phrases: ['by the road','by chance']
            prep_phrases_tagged: ['by/IN the/DT road/NN','by/IN chance/NN']
            modifiers: ['red','best','quickly']
            modifiers_tagged: ['red/JJ','best/JJS','quickly/RB']
            verb_arg_structures: ['will/MD go/VB quickly/RB','the/DT dog/NN','to/IN the/DT cats/NNS']
            verb_arg_structures_concise: ['("go" "dog" "to cat")']
        """
        # the extractor is handed the lemmatiser's bound method as a callback
        return self.theMontyExtractor.extract_info(
            chunked_text,
            self.theMontyLemmatiser.lemmatise_tagged_sentence)

# END MONTYLINGUA CLASS

#
# COMMAND LINE PROGRAM
#
# rsm 2007-10-21: added flushes and removed raw_input() call, so it interacts
# nicely with montylingua.rb.

if __name__ == "__main__":
    import sys
    import time

    if '/?' in sys.argv or '-?' in sys.argv:
        print("""
        USAGE:
        >> python MontyLingua.py
        """)
        sys.exit(0)

    m = MontyLingua()

    # show command prompt interface
    print('\n')
    sys.stdout.flush()  # rsm
    try:
        while 1:
            sys.stdout.write('> ')  # rsm
            sys.stdout.flush()
            sentence = sys.stdin.readline()
            if not sentence:
                # readline() returns '' only at end of input (a blank
                # line is '\n'). Unlike raw_input() it raises no
                # EOFError, so stop here instead of looping on empty
                # reads once the input is closed.
                break
            time1 = time.time()
            print('\n')
            extractions = m.jist(sentence)
            # pp_info() prints the digests itself and returns nothing;
            # wrapping it in print would emit a stray "None".
            m.pp_info(extractions)
            # Flatten the per-sentence predicate strings and parse each
            # one. A plain loop (rather than reduce() without an
            # initializer) also copes with zero extracted sentences.
            predicates_list = []
            for digest in extractions:
                for predicate in digest['verb_arg_structures_concise']:
                    predicates_list.append(m.parse_pred_arg(predicate))
            print(predicates_list)
            print('\nGENERATED SUMMARY:\n' + m.generate_summary(predicates_list))
            time2 = time.time()
            print("-- monty took " + str(round(time2 - time1, 2)) + " seconds. --\n")
            sys.stdout.flush()  # rsm
    except (KeyboardInterrupt, EnvironmentError):
        #print "\n-- monty says goodbye! --"
        sys.exit(0)