"""
Module AcronymDecomposer
Programmier-Gesellenstueck
Author: Branimira Nikolova
SS 2006

This module searches text for acronym-expansion pairs and tries to match
them to each other.

"""
00011
00012
00013 import re
00014 from AD_token import Token
00015
class Acro(Token):
    """
    Finds acronym candidates in tagged text and pairs them with an expansion.

    Attributes:
        acronym:      the accepted acronym (set on result instances)
        expansion:    the accepted expansion text (set on result instances)
        candidate:    the acronym candidate currently being examined
        expCandidate: list of tokens forming the expansion candidate; index 0
                      is the token that was collected/reversed to the front
        position:     token index used to compute the search space
        digitDict:    words a digit inside an acronym may stand for

    The token objects used here expose ``.token`` (the text) and ``.tag``.
    ``tagTextWithMT`` and ``tagSetDict`` are not defined in this class; they
    are presumably provided by ``Token`` -- verify against AD_token.
    ``tagSetDict`` is read with the keys 'noun', 'open', 'close', 'digit',
    'delimiters' and 'stopTags'.

    NOTE(review): the listing this class was recovered from had lost its
    indentation; the nesting of the branches in findAcroCandidate and
    checkTrueExp was reconstructed from the docstrings and should be
    confirmed against the original file.
    """

    # Token that starts with a capital letter followed by at least one letter.
    _capitalizedRe = re.compile('([A-Z][a-zA-Z]+)')

    def __init__(self):
        self.acronym = ''
        self.expansion = ''
        self.candidate = ''
        self.expCandidate = ''
        self.position = 0
        # Every value is a list: a bare string here would make the ``in``
        # test of checkDigitDict a substring match ('hi' in 'third').
        self.digitDict = {'2': ['to', 'two', 'second'],
                          '3': ['third'],
                          '4': ['for', 'fourth']}

    def __str__(self):
        """
        Return 'acronym, expansion' for printing.
        """
        return '%s, %s' % (self.acronym, self.expansion)

    def findAcroCandidate(self, text):
        '''
        Search for acronym candidates: uppercase or capitalized noun tokens
            - in parentheses
            - in front of parentheses
            - after ", or"
            - in front of ", or"

        text is handed to self.tagTextWithMT, whose result is used as a list
        of tagged tokens.

        Returns the list of Acro instances accepted by checkTrueExp.
        '''
        tagSetDict = Token(token=None, tag=None).tagSetDict
        tokenList = self.tagTextWithMT(text)
        total = len(tokenList)
        resultList = []

        for it in range(total):
            current = tokenList[it]
            word = current.token

            if not (word.istitle() or word.isupper()
                    or self._capitalizedRe.match(word)):
                continue

            # Length filter: at least 2 and fewer than 10 characters.  The
            # former test ``not 2 > len(word) >= 10`` could never reject
            # anything because ``2 > n >= 10`` is always false.
            if not 2 <= len(word) < 10:
                continue

            if current.tag not in tagSetDict["noun"]:
                continue

            # Neighbour lookups are bounds-checked so that tokens at the start
            # of the text do not wrap around to the end (negative index) and
            # tokens at the end do not raise IndexError.
            hasNext = it + 1 < total

            if it >= 1 and tokenList[it - 1].tag == tagSetDict["open"]:
                # "expansion words (ACRO)"
                if hasNext and tokenList[it + 1].tag == tagSetDict['close']:
                    self.candidate = word
                    self.position = it
                    self.expCandidate = self.computeSearchSpace(
                        self.candidate, self.position, tokenList, tagSetDict)
                    self._collect(resultList, tagSetDict)

            elif hasNext and tokenList[it + 1].tag == tagSetDict["open"]:
                # "ACRO (expansion words)"
                if it + 3 < total and \
                        tokenList[it + 3].tag != tagSetDict["close"]:
                    self.expCandidate = []
                    if current.tag == tagSetDict["digit"]:
                        continue
                    self.candidate = word

                    # Collect the tokens following the candidate up to the
                    # closing tag or the first delimiter.
                    pos = it
                    while pos + 1 < total and \
                            tokenList[pos + 1].tag != tagSetDict["close"]:
                        if tokenList[pos + 1].tag in tagSetDict['delimiters']:
                            break
                        pos += 1
                        self.expCandidate.append(tokenList[pos])

                    if len(self.expCandidate) < 2:
                        continue
                    # checkTrueExp walks the expansion back to front.
                    self.expCandidate.reverse()
                    self._collect(resultList, tagSetDict)

            elif (it >= 2 and tokenList[it - 1].token == 'or'
                  and tokenList[it - 2].token == ','):
                # "expansion words, or ACRO"
                self.candidate = word
                self.position = it - 1
                self.expCandidate = self.computeSearchSpace(
                    self.candidate, self.position, tokenList, tagSetDict)
                self._collect(resultList, tagSetDict)

            elif (it + 2 < total and tokenList[it + 1].token == ','
                  and tokenList[it + 2].token == 'or'):
                # "ACRO, or expansion words"
                self.expCandidate = []
                self.candidate = word
                self.position = it

                # Collect everything after ", or" up to the next delimiter
                # (or the end of the text).
                pos = it + 3
                while pos < total and \
                        tokenList[pos].tag not in tagSetDict['delimiters']:
                    self.expCandidate.append(tokenList[pos])
                    pos += 1

                if len(self.expCandidate) < 2:
                    continue
                self.expCandidate.reverse()
                self._collect(resultList, tagSetDict)

        return resultList

    def _collect(self, resultList, tagSetDict):
        '''Run checkTrueExp on the current candidate pair and keep a hit.'''
        result = self.checkTrueExp(self.candidate, self.expCandidate,
                                   tagSetDict)
        if result:
            resultList.append(result)

    def computeSearchSpace(self, candidate, position, tokenList, tagSetDict):
        '''
        Collect the tokens preceding an acronym candidate that may form its
        expansion.

        The number of tokens inspected is
            - len(candidate) + 5  if the candidate has 5 or more characters,
            - len(candidate) * 2  otherwise.
        Collection starts at tokenList[position - 2], walks towards the start
        of the text and stops early at a token whose tag is in
        tagSetDict['delimiters'] or at the beginning of tokenList.

        Returns the collected tokens as a list; because of the backward walk
        the token nearest to the candidate comes first.
        '''
        if len(candidate) >= 5:
            searchSpace = len(candidate) + 5
        else:
            searchSpace = len(candidate) * 2

        start = position - 2
        end = start - searchSpace

        expCandidate = []
        # ``start >= 0`` keeps the walk from wrapping to the end of the list
        # through negative indices.
        while start > end and start >= 0:
            if tokenList[start].tag in tagSetDict['delimiters']:
                break
            expCandidate.append(tokenList[start])
            start -= 1

        return expCandidate

    def checkTrueExp(self, candidate, expCandidate, tagSetDict):
        '''
        Compare the acronym candidate with the expansion candidate backwards.

        The characters of candidate are processed last to first against the
        tokens of expCandidate (index 0 first).  A character matches a token
        when it occurs anywhere in it (case-insensitive); the first character
        of the acronym must equal the first character of a token whose tag is
        not in tagSetDict['stopTags'].  Digits may match a digit token, a
        word from digitDict, or a count of words (see countFirstChar).

        Returns a new Acro instance (which is also printed) when at least
        len(candidate) expansion tokens were matched, otherwise 0.  Running
        out of expansion tokens counts as "no further match" rather than
        raising IndexError.
        '''
        if not candidate:
            return 0

        clist = list(candidate)
        clist.reverse()
        lastChar = len(clist) - 1
        expLen = len(expCandidate)

        itc = 0             # index into the reversed acronym characters
        its = 0             # index into the expansion tokens
        match = ''          # last (character, token) pair that was accepted
        found = 0           # characters matched against the current token
        expansion = []
        expansionEnd = 0

        while 1:
            if its >= expLen:
                # Expansion candidate exhausted.
                break

            if itc == lastChar:
                # First character of the acronym: must be word-initial.
                if expCandidate[its].tag in tagSetDict['stopTags']:
                    its += 1
                    continue
                first = expCandidate[its].token
                if clist[itc].lower() == first[:1].lower():
                    expansion.append(first)
                    expansionEnd = its
                    break
                its += 1

            elif clist[itc].lower() in expCandidate[its].token.lower():
                if match == (clist[itc], expCandidate[its]):
                    # Same character already matched this token; try the next.
                    its += 1
                else:
                    expansion.append(expCandidate[its].token)
                    match = (clist[itc], expCandidate[its])
                    found += 1
                    itc += 1
                    # At most three acronym characters per expansion token.
                    if found >= 3:
                        its += 1
                        found = 0

            elif clist[itc].isdigit():
                its += 1
                if its >= expLen:
                    break
                if expCandidate[its].tag == tagSetDict['digit']:
                    if clist[itc] == expCandidate[its].token:
                        expansion.append(expCandidate[its].token)
                        itc += 1
                        its += 1
                elif self.checkDigitDict(clist[itc], expCandidate[its].token):
                    expansion.append(expCandidate[its].token)
                    itc += 1
                    its += 1
                elif self.countFirstChar(clist, itc, expCandidate, its):
                    # The digit counts words: take that many tokens at once.
                    its += 1
                    step = int(clist[itc])
                    for tok in expCandidate[its:(its + step)]:
                        expansion.append(tok.token)
                    itc += 1
                    its += step - 1
                    if itc == lastChar:
                        # Clamp so the join below stays inside the list.
                        expansionEnd = min(its, expLen - 1)
                        break
                else:
                    break

            else:
                its += 1

            if itc > len(clist):
                break

        if len(expansion) < len(candidate):
            return 0

        # Walk from the matched token back to index 0; each token is
        # prefixed with a blank, so the text starts with a space.
        full = ''
        i = expansionEnd
        while i >= 0:
            full = "%s %s" % (full, expCandidate[i].token)
            i -= 1

        newA = Acro()
        newA.acronym = candidate
        newA.expansion = full

        print(newA)
        return newA

    def checkDigitDict(self, ac, ec):
        '''
        Check whether the digit ac in the acronym stands for the word ec,
        e.g. "for" (4) or "to" (2).

        Returns 1 if ec (lower-cased) is listed for ac in digitDict, else 0.
        '''
        if ac in self.digitDict:
            if ec.lower() in self.digitDict[ac]:
                return 1
        return 0

    def countFirstChar(self, ac, acPos, ec, ecPos):
        '''
        Check whether the digit ac[acPos] counts words of the expansion.

        ac is the reversed character list of the acronym, ec the list of
        expansion tokens.
            - If ac[acPos] equals ac[-1], all tokens of ec whose first
              character equals ac[acPos - 1] are counted.
            - Otherwise the tokens in ec[ecPos:len(ec) - 1] whose first
              character equals ac[acPos + 1] are counted.
        The digit and every counted token are printed.

        Returns 1 if the count equals int(ac[acPos]), else 0.

        NOTE(review): the first test compares character values, not
        positions; the slice leaves out the last token; the comparisons are
        case-sensitive -- confirm that all three are intended.
        '''
        count = 0
        ecEnd = len(ec) - 1

        if ac[acPos] == ac[-1]:
            print(ac[acPos])
            for it in ec:
                if ac[acPos - 1] == it.token[0]:
                    print(it.token)
                    count += 1

        elif ac[acPos + 1]:
            print(ac[acPos])
            for it in ec[ecPos:ecEnd]:
                if ac[acPos + 1] == it.token[0]:
                    print(it.token)
                    count += 1

        if count == int(ac[acPos]):
            return 1
        return 0
if __name__ == "__main__":
    # Usage:
    #   no argument       -> analyse the built-in test file
    #   <directory>       -> analyse every entry of that directory
    # All pairs found are printed as a numbered list at the end.

    import os
    import sys

    def _analyseFile(path):
        """Read path and return the acronym/expansion pairs found in it."""
        fh = open(path, 'r')
        try:
            text = fh.read()
        finally:
            # Close the handle even if reading fails.
            fh.close()
        return Acro().findAcroCandidate(text)

    # Bound up front so the summary loop below is safe when nothing could
    # be read.
    result = []

    if len(sys.argv) == 1:
        filename = "\\t\\testFile.txt"
        try:
            result = _analyseFile(filename)
        except IOError:
            # Only I/O failures are reported as a missing file; errors from
            # the analysis itself are no longer disguised as "not found".
            print("%s not found!" % (filename))

    else:
        for f in os.listdir(sys.argv[1]):
            filename = os.path.join(sys.argv[1], f)
            print(filename)
            try:
                # Accumulate: the results of earlier files must not be
                # overwritten by later ones.
                result.extend(_analyseFile(filename))
            except IOError:
                print("Error while reading %s!" % (filename))

    num = 0
    for item in result:
        num += 1
        print("%s %s" % (num, item))