#include <DetLemma.h>
Public Member Functions | |
DetLemma () | |
default constructor | |
~DetLemma () | |
destructor | |
bool | fillFlex (char *) |
initializes the levels from XML file | |
bool | detLemma (string &, JXDictionary *, list< result_lemma > &, FILE *) |
this function creates HypoLemma objects for possible segmentation of the hiragana ending | |
bool | confirmLemma (string &, string &, JXDictionary *, result_lemma &) |
searches a hypothetical lemma in the dictionary | |
bool | findInLevel (string &, UINT) |
searches a hiragana string in a specific level | |
bool | insertFlextoLevel (FlexLevel *, int) |
inserts a single Flexlevel into vector m_flexlevel | |
void | HypoPrint () |
prints all hypotheses to stdout | |
bool | insertPossibleHypos (JVerbalFlexion *jvf, UMString hirastr, int j) |
this function adds to the possible reduction hypotheses | |
Private Types | |
typedef vector< FlexLevel * >::iterator | IT |
typedef vector< HypoLemma >::iterator | HIT |
Private Member Functions | |
int | getHighestLevel (char *) |
finds the highest level in grammar | |
Private Attributes | |
vector< FlexLevel * > | m_flexlevel |
vector< HypoLemma > | m_hypothesen |
int | m_number_of_levels |
int | m_number_of_entries |
int | m_flexmaxlength |
int | m_level_counter [MAXFLEX] |
list< result_lemma > | m_result_list |
string | m_cur_reqform |
string | m_cur_reqflexart |
JVerbalFlexion * | hypo |
|
iterator for the hypothesis vector |
|
iterator for the flexation levels |
|
default constructor
|
|
destructor iterates through the map of flexlevels and deletes each one |
|
searches a hypothetical lemma in the dictionary by checking not only the lemma string but also the kind of flexation; results are good. However, there can be more than one result for the same lookup, especially when the initial string is an all-hiragana string
|
|
this function creates HypoLemma objects for possible segmentation of the hiragana ending. This is the heart of lemmatization. First there is the hiragana string; every possible way to split it results in a new hypothesis about stem and ending. For every hypothesis the ending is searched in the grammar, starting at the highest level. Once an ending is found, it gets cut off and the rest of the hiragana string is treated again as above. Once all the levels of flexation and hypotheses are checked, the dictionary form ending of the verb is selected from the grammar, agglutinated and checked against the dictionary. Also the kind of verb associated with the grammatical rules is checked to make sure that similar forms like passive and potential are separated if possible.
instead of the multibyte string |
|
initializes the levels from XML file. This function is responsible for the grammar input. The grammatical verb forms are hardcoded, which is no big deal if you stick to Japanese, because the numbers and kinds are very much fixed. It prints a list of the loaded Flexlevels to stdout.
|
|
searches a hiragana string in a specific level
|
|
finds the highest level in grammar. Internal function to determine how many levels there are |
|
prints all hypotheses to stdout
|
|
inserts a single Flexlevel into vector m_flexlevel
|
|
this function adds to the possible reduction hypotheses. This function is called when changes to the hiragana string occurred, especially when an ending has been recognized and deleted from the string. The new, shorter string is split into parts again, and for each new possibility a hypothesis is added
|
|
pointer for processing current hypothesis |
|
variable for processing current hypothesis |
|
variable for processing current hypothesis |
|
the flexation levels of the grammar file are stored here |
|
variable that defines the maximum length of a grammatical unit of hiragana |
|
hypotheses for lemma resolution are stored here |
|
array storing the numbers corresponding to the flexlevels |
|
variable, that defines the number of entries |
|
variable, that stores the number of levels found in the grammar file |
|
since sometimes there is more than one resolution, results are stored in a list |