AcronymDecomposer: acronymDecomposer.py Source File

"""
Module AcronymDecomposer
Programmier-Gesellenstueck
Author: Branimira Nikolova
SS 2006

This module searches text for acronym-expansion pairs and tries to match
them to each other.

"""
00011 
00012 
00013 import re
00014 from AD_token import Token
00015                        
class Acro(Token):
    """
    An acronym/expansion pair together with the search state used to find
    such pairs in a tagged token list.

    Attributes:
        acronym: the acronym itself
        expansion: the resulting expansion
        expCandidate: the candidate tokens for the expansion
        candidate: the candidate string for the acronym
        position: the current position, needed to calculate the search space
        digitDict: words a digit inside an acronym may stand for
    """

    # An acronym candidate must have at least minCandidateLength and fewer
    # than maxCandidateLength characters.
    minCandidateLength = 2
    maxCandidateLength = 10

    def __init__(self):
        # NOTE(review): Token.__init__ is not called here (same as before);
        # confirm that no inherited method relies on state it would set up.
        self.acronym = ''
        self.expansion = ''
        self.candidate = ''
        self.expCandidate = ''
        self.position = 0
        # Every value is a list so that the lookup in checkDigitDict is a
        # whole-word test and never a substring test.
        self.digitDict = {'2': ['to', 'two', 'second'],
                          '3': ['third'],
                          '4': ['for', 'fourth']}

    def __str__(self):
        """
        Return the printable form "<acronym>, <expansion>".
        """
        return '%s, %s' % (self.acronym, self.expansion)

    def _tokenAt(self, tokenList, index):
        """
        Return tokenList[index], or None when index lies outside the list.

        Negative indices are treated as "no such token" instead of wrapping
        around to the end of the list.
        """
        if 0 <= index < len(tokenList):
            return tokenList[index]
        return None

    def findAcroCandidate(self, text):
        """
        Search for acronym candidates: uppercase or capitalized noun tokens
        - in parentheses
        - in front of parentheses
        - in front of ", or"
        - after ", or"

        For every candidate the neighbouring tokens are collected as the
        expansion candidate and both are handed to checkTrueExp.

        text: the raw text; it is tagged with the inherited tagTextWithMT.

        Returns the list of Acro instances that checkTrueExp accepted.
        """
        newToken = Token(token=None, tag=None)
        tagSet = newToken.tagSetDict
        tokenList = self.tagTextWithMT(text)
        total = len(tokenList)

        resultList = []
        regex = re.compile('([A-Z][a-zA-Z]+)')

        for idx in range(total):
            current = tokenList[idx]
            word = current.token

            if not (word.istitle() or word.isupper() or regex.match(word)):
                continue
            # The former test "not 2 > len(word) >= 10" was always true and
            # therefore filtered nothing.
            if not (self.minCandidateLength <= len(word) < self.maxCandidateLength):
                continue
            if current.tag not in tagSet["noun"]:
                continue

            before = self._tokenAt(tokenList, idx - 1)
            after = self._tokenAt(tokenList, idx + 1)
            result = None

            # acronym in parentheses: "( ACR )"
            if before is not None and before.tag == tagSet["open"]:
                if after is not None and after.tag == tagSet['close']:
                    self.candidate = word
                    self.position = idx
                    self.expCandidate = self.computeSearchSpace(
                        self.candidate, self.position, tokenList, tagSet)
                    result = self.checkTrueExp(
                        self.candidate, self.expCandidate, tagSet)

            # acronym in front of parentheses: "ACR ( expansion ... )"
            elif after is not None and after.tag == tagSet["open"]:
                closing = self._tokenAt(tokenList, idx + 3)
                # A parenthesis holding a single token is not an expansion.
                if closing is None or closing.tag != tagSet["close"]:
                    self.expCandidate = []
                    if current.tag == tagSet["digit"]:
                        continue
                    self.candidate = word

                    # Collect the tokens after the acronym up to the closing
                    # tag, a delimiter or the end of the text. The opening
                    # token itself is collected unless it is a delimiter.
                    cursor = idx
                    while cursor + 1 < total:
                        following = tokenList[cursor + 1]
                        if following.tag == tagSet["close"]:
                            break
                        if following.tag in tagSet['delimiters']:
                            break
                        cursor += 1
                        self.expCandidate.append(following)

                    if len(self.expCandidate) < 2:
                        continue
                    # checkTrueExp expects the token nearest to the end of
                    # the expansion first.
                    self.expCandidate.reverse()
                    result = self.checkTrueExp(
                        self.candidate, self.expCandidate, tagSet)

            # acronym follows ", or": "expansion , or ACR"
            elif (before is not None and before.token == 'or'
                  and self._tokenAt(tokenList, idx - 2) is not None
                  and tokenList[idx - 2].token == ','):
                self.candidate = word
                # One step back so that the search space starts in front of
                # the ", or".
                self.position = idx - 1
                self.expCandidate = self.computeSearchSpace(
                    self.candidate, self.position, tokenList, tagSet)
                result = self.checkTrueExp(
                    self.candidate, self.expCandidate, tagSet)

            # acronym in front of ", or": "ACR , or expansion"
            elif (after is not None and after.token == ','
                  and self._tokenAt(tokenList, idx + 2) is not None
                  and tokenList[idx + 2].token == 'or'):
                self.expCandidate = []
                self.candidate = word
                self.position = idx

                # Collect everything behind ", or" up to the next delimiter
                # or the end of the text.
                cursor = idx + 3
                while cursor < total:
                    if tokenList[cursor].tag in tagSet['delimiters']:
                        break
                    self.expCandidate.append(tokenList[cursor])
                    cursor += 1

                if len(self.expCandidate) < 2:
                    continue
                self.expCandidate.reverse()
                result = self.checkTrueExp(
                    self.candidate, self.expCandidate, tagSet)

            if result:
                resultList.append(result)

        return resultList

    def computeSearchSpace(self, candidate, position, tokenList, tagSetDict):
        """
        Compute the search space in front of an acronym candidate:
        - if the candidate has 5 or more characters the search space is the
          length of the candidate + 5 tokens;
        - otherwise it is the length of the candidate * 2 tokens.

        The scan starts two tokens in front of position, walks backwards and
        stops early at a delimiter token or at the beginning of the text.

        Returns the expansion candidate as a list of tokens, the token
        nearest to the acronym first.
        """
        if len(candidate) >= 5:
            searchSpace = len(candidate) + 5
        else:
            searchSpace = len(candidate) * 2

        start = position - 2
        # Never scan past the first token: a negative index would wrap
        # around to the end of the list.
        end = max(start - searchSpace, -1)

        expCandidate = []
        while start > end:
            current = tokenList[start]
            if current.tag in tagSetDict['delimiters']:
                break
            expCandidate.append(current)
            start -= 1

        return expCandidate

    def checkTrueExp(self, candidate, expCandidate, tagSetDict):
        """
        Compare the acronym candidate and the expansion candidate backwards.
        Each character of the acronym candidate must appear in one of the
        tokens of the expansion candidate, in the same order as in the
        acronym; the first character of the acronym must match the initial
        character of the first word of the expansion.

        candidate: the acronym candidate string
        expCandidate: list of tokens, the token nearest to the end of the
            expansion first
        tagSetDict: tag set dictionary; the keys 'stopTags' and 'digit' are
            read here

        Prints and returns a new Acro instance on success, otherwise 0.
        """
        if not candidate or not expCandidate:
            return 0

        # acronym characters, last character first
        clist = list(candidate)
        clist.reverse()
        lastChar = len(clist) - 1
        tokenCount = len(expCandidate)

        # iterator over the acronym characters (clist)
        itc = 0
        # iterator over the expansion candidate tokens
        its = 0
        match = ''
        found = 0
        # tokens matched so far
        expansion = []
        expansionEnd = 0

        while 1:
            # the expansion candidate is exhausted
            if its >= tokenCount:
                break

            # the first character of the acronym is reached
            if itc == lastChar:
                # no prepositions and conjunctions as first token
                if expCandidate[its].tag in tagSetDict['stopTags']:
                    its += 1
                    continue
                else:
                    first = expCandidate[its].token
                    # first[:1] keeps an empty token from raising
                    if clist[itc].lower() == first[:1].lower():
                        expansion.append(expCandidate[its].token)
                        expansionEnd = its
                        break
                    else:
                        its += 1
                        if its > tokenCount - 1:
                            break

            elif clist[itc].lower() in expCandidate[its].token.lower():
                # if the same character already matched in the same token,
                # move one token forward
                if match == (clist[itc], expCandidate[its]):
                    its += 1
                else:
                    expansion.append(expCandidate[its].token)
                    match = (clist[itc], expCandidate[its])
                    found += 1
                    itc += 1

                    # max 3 matches in the same token allowed
                    if found >= 3:
                        its += 1
                        found = 0

            elif clist[itc].isdigit():
                its += 1
                if its >= tokenCount:
                    break

                if expCandidate[its].tag == tagSetDict['digit']:
                    # the digit appears literally in the expansion
                    if clist[itc] == expCandidate[its].token:
                        expansion.append(expCandidate[its].token)
                        itc += 1
                        its += 1

                elif self.checkDigitDict(clist[itc], expCandidate[its].token):
                    # the digit stands for a word such as "to" or "for"
                    expansion.append(expCandidate[its].token)
                    itc += 1
                    its += 1

                elif self.countFirstChar(clist, itc, expCandidate, its):
                    # the digit counts words with the same initial character
                    # NOTE(review): its was already advanced at the top of
                    # this branch; this second step makes the collected slice
                    # start one token behind the position that countFirstChar
                    # counted from - confirm that this is intended.
                    its += 1
                    step = int(clist[itc])

                    for counted in expCandidate[its:(its + step)]:
                        expansion.append(counted.token)

                    itc += 1
                    its += step - 1
                    if itc == lastChar:
                        expansionEnd = its
                        break
                else:
                    break

            else:
                its += 1
                if its >= tokenCount:
                    break

            if itc > len(clist):
                break

        if len(expansion) < len(candidate):
            return 0

        # Build the expansion string in text order. Every token is preceded
        # by a blank, so the string starts with a blank as it always did.
        i = min(expansionEnd, tokenCount - 1)
        full = ''
        while i >= 0:
            full = "%s %s" % (full, expCandidate[i].token)
            i -= 1

        newA = Acro()
        newA.acronym = candidate
        newA.expansion = full

        print(newA)
        return newA

    def checkDigitDict(self, ac, ec):
        """
        Check if the digit in the acronym stands for a word like
        "for" (4) or "to" (2).

        ac: a single character of the acronym
        ec: a word of the expansion candidate

        Returns 1 if ec, lowercased, is listed for ac in digitDict,
        otherwise 0.
        """
        words = self.digitDict.get(ac)
        if words is None:
            return 0
        # Tolerate a bare string value: compare whole words, not substrings.
        if not isinstance(words, (list, tuple)):
            words = [words]
        if ec.lower() in words:
            return 1
        return 0

    def countFirstChar(self, ac, acPos, ec, ecPos):
        """
        If the acronym starts with a digit, or a digit is inside of the
        acronym, count the words of the expansion which start with the
        character following (digit first) or preceding (digit inside) the
        digit. The comparison is case-sensitive.

        ac: the acronym characters in reversed order
        acPos: index of the digit in ac
        ec: the expansion candidate tokens
        ecPos: index in ec where counting starts for a digit inside

        Prints the digit and every counted word, and returns 1 if the number
        of counted words equals the digit, otherwise 0.
        """
        count = 0
        ecEnd = len(ec) - 1

        # ac is reversed, so the acronym starts with the digit when the digit
        # sits at the last index. The position is compared here; the former
        # value comparison also fired for a digit that merely equals the
        # first character of the acronym.
        if acPos == len(ac) - 1:
            print(ac[acPos])
            for current in ec:
                if ac[acPos - 1] == current.token[:1]:
                    print(current.token)
                    count += 1

        # the digit is inside
        else:
            print(ac[acPos])
            # NOTE(review): ecEnd is the last index, so this slice leaves out
            # the last token of ec - possibly an off-by-one; confirm.
            for current in ec[ecPos:ecEnd]:
                if ac[acPos + 1] == current.token[:1]:
                    print(current.token)
                    count += 1

        if count == int(ac[acPos]):
            return 1
        return 0
00365                 
if __name__=="__main__":
    # Script entry point.
    # - without arguments: analyse the built-in test file;
    # - with arguments: treat the first argument as a directory, analyse
    #   every file in it and print the numbered matches of all files.

    import sys
    import os

    def _readText(path):
        """Return the content of path; the file is closed even if reading fails."""
        fh = open(path, 'r')
        try:
            return fh.read()
        finally:
            fh.close()

    if len(sys.argv) == 1:
        filename = "\\t\\testFile.txt"
        try:
            text = _readText(filename)
        except (IOError, OSError):
            print("%s not found!" % (filename))
        else:
            # Errors raised by the analysis are no longer reported as a
            # missing file; they surface with their own traceback.
            newAcro = Acro()
            result = newAcro.findAcroCandidate(text)

    elif len(sys.argv) > 1:
        result = []
        for f in os.listdir(sys.argv[1]):
            filename = os.path.join(sys.argv[1], f)
            print(filename)
            try:
                text = _readText(filename)
            except (IOError, OSError):
                print("Error while reading %s!" % (filename))
                continue

            try:
                newAcro = Acro()
                # Collect the matches of all files; assigning here would keep
                # only the matches of the last file.
                result.extend(newAcro.findAcroCandidate(text))
            except Exception:
                # Keep going with the remaining files, but say what failed.
                print("Error while processing %s: %s"
                      % (filename, sys.exc_info()[1]))

        num = 0
        for item in result:
            num += 1
            print("%s %s" % (num, item))
00402