00001 """
Module token
00003 Author: Branimira Nikolova
00004
00005 """
00006
00007 import MontyTokenizer, MontyTagger
00008
class Token:

    """
    A single token paired with its part-of-speech tag.

    Attributes:
        token: the surface form of the token
        tag: the Penn-Treebank part-of-speech tag
        tagSetDict: dictionary grouping tags of special interest
                    (nouns, stop tags, delimiters, brackets, digits)
    """

    def __init__(self, token, tag):
        self.token = token
        self.tag = tag
        # Built per-instance (as in the original design) so a caller may
        # customize one Token's tag groups without affecting others.
        self.tagSetDict = {"noun": ["NNP", "NNPS", "NN", "NNS", "CD"],
                           "stopTags": ["CC", "IN", "TO", "RP", "PRP", "PRP$", "WP", "WP$"],
                           "delimiters": [",", ";", ".", "!", "?"],
                           "open": "(",
                           "digit": "CD",
                           "close": ")"}

    def __str__(self):
        """
        Return the token and its tag as 'token, tag' for printing.
        """
        return '%s, %s' % (self.token, self.tag)

    def tagTextWithMT(self, text):
        """
        Tokenize and tag the input text with MontyTagger, using the
        Penn-Treebank tag set.

        Args:
            text: raw text to tokenize and tag.

        Returns:
            A list of Token objects, one per tagged token.
        """
        tokenList = []
        new_MTA = MontyTagger.MontyTagger()
        tokText = new_MTA.tag(text)
        ltTokens = tokText.split()

        for t in ltTokens:
            # MontyTagger emits "word/TAG" pairs. rpartition splits on the
            # LAST slash so words that themselves contain '/' (e.g. "1/2/CD")
            # keep their full surface form; a plain split("/") would have
            # taken "2" as the tag.
            word, _, tag = t.rpartition("/")
            tokenList.append(Token(token=word, tag=tag))

        return tokenList