AD_token.py

"""
Module token
Author: Branimira Nikolova

"""

# MontyTokenizer and MontyTagger are part of the MontyLingua NLP toolkit.
import MontyTokenizer, MontyTagger

class Token:

    """
    class Token
        arguments:
        token: the token
        tag: the part-of-speech tag
        tagSetDict: dictionary of some important tag groups
    """

    def __init__(self, token, tag):

        self.token = token
        self.tag = tag
        self.tagSetDict = {"noun": ["NNP", "NNPS", "NN", "NNS", "CD"],
                           "stopTags": ["CC", "IN", "TO", "RP", "PRP", "PRP$", "WP", "WP$"],
                           "delimiters": [",", ";", ".", "!", "?"],
                           "open": "(",
                           "digit": "CD",
                           "close": ")"}

    def __str__(self):
        """
        String representation of the Token for printing.
        """
        return '%s, %s' % (self.token, self.tag)

    def tagTextWithMT(self, text):
        """
        The input text is tokenized and tagged using the Penn Treebank tag set.

        Returns a list of Token objects.
        """
        tokenList = []
        new_MTA = MontyTagger.MontyTagger()
        tokText = new_MTA.tag(text)
        ltTokens = tokText.split()

        for t in ltTokens:
            tt = t.split("/")
            newToken = Token(token=tt[0], tag=tt[1])
            #print newToken
            tokenList.append(newToken)

        return tokenList
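
A minimal usage sketch, assuming MontyLingua's MontyTagger is installed and its data files are available; the sample sentence is illustrative only:

# Example (Python 2, matching the module above): tag a sentence and print each Token.
if __name__ == "__main__":
    t = Token(token="", tag="")                      # throwaway instance; tagTextWithMT does not use self's fields
    for tok in t.tagTextWithMT("The WHO published a new report."):
        print tok                                    # e.g. prints "The, DT"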
