$search
package instruction.wordnet;

import instruction.exceptions.WordNetException;
import instruction.semanticObjects.Word;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.text.DecimalFormat;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import edu.mit.jwi.Dictionary;
import edu.mit.jwi.IDictionary;
import edu.mit.jwi.item.IIndexWord;
import edu.mit.jwi.item.IWordID;
import edu.mit.jwi.item.POS;

/**
 * Static helper around a JWI {@link IDictionary} that is opened lazily from the
 * directory <code>$WNHOME/dict</code>.
 * <p>
 * Besides plain dictionary lookups the class keeps four lookup tables:
 * irregular verb and noun forms read from <code>verb.exc</code> and
 * <code>noun.exc</code>, and two small hand-maintained tables of nouns and
 * verbs that are mapped to a synset id directly.
 * <p>
 * NOTE(review): all state is held in unsynchronized static fields; nothing in
 * this class makes concurrent first use safe.
 */
public class WordNet {

    /** When <code>true</code>, the irregular-form files are read during initialization. */
    public static final boolean CACHE_IRREGULAR_FORMS = true;

    /** The opened dictionary; stays <code>null</code> until initialization fully succeeded. */
    private static IDictionary dict = null;

    /** First two tokens of every <code>verb.exc</code> line: token 0 maps to token 1. */
    private static HashMap<String, String> irregularVerbs = null;

    /** First two tokens of every <code>noun.exc</code> line: token 0 maps to token 1. */
    private static HashMap<String, String> irregularNouns = null;

    /** Hand-maintained noun to synset-id table, see {@link #initializeMissingMappings()}. */
    private static HashMap<String, String> missingNouns = null;

    /** Hand-maintained verb to synset-id table, see {@link #initializeMissingMappings()}. */
    private static HashMap<String, String> missingVerbs = null;

    /**
     * Collects the synset ids known for <code>word</code> when read as the
     * given word type.
     * <p>
     * Ids taken from the dictionary are the synset offsets formatted as eight
     * digits with leading zeros. For nouns and infinitive verbs an id from the
     * hand-maintained tables is added first, if one exists. Past participles
     * ending in "ed" and nouns ending in "s"/"es" are also tried with the
     * suffix stripped, followed by the irregular-form tables.
     * <p>
     * NOTE(review): <code>Word.TYPE_GERUND</code> is mapped to a part of speech
     * but has no lookup branch here, so gerunds always yield an empty list.
     *
     * @param word the word to look up; <code>null</code> yields an empty list
     * @param type one of the <code>Word.TYPE_*</code> constants; a type without
     *        a part-of-speech mapping yields an empty list
     * @return the synset ids found, possibly empty, never <code>null</code>
     * @throws Exception if the dictionary cannot be initialized; a
     *         {@link MalformedURLException} raised inside the lookup is
     *         rethrown as {@link WordNetException}
     */
    public static ArrayList<String> getSynIDs( String word, int type ) throws Exception {

        if ( dict == null ) {
            getInstance();
        }

        try {
            ArrayList<String> synsetIDs = new ArrayList<String>();
            POS pos = convertPOS( type );

            if ( pos == null || word == null )
                return synsetIDs;

            IIndexWord idxWord = null;

            if ( type == Word.TYPE_PAST_PARTICIPLE ) {
                if ( word.endsWith( "ed" ) ) {
                    // Strip only the "d" first, then the whole "ed"
                    idxWord = lookup( word.substring( 0, word.length() - 1 ), pos );
                    if ( idxWord == null )
                        idxWord = lookup( word.substring( 0, word.length() - 2 ), pos );
                }

                if ( idxWord == null )
                    idxWord = lookup( irregularVerbs.get( word ), pos );
            }

            if ( type == Word.TYPE_ADV ) {
                idxWord = lookup( word, pos );
            }

            if ( type == Word.TYPE_NOUN ) {
                String syn = missingNouns.get( normalizeKey( word ) );
                if ( syn != null )
                    synsetIDs.add( syn );
            }
            else if ( type == Word.TYPE_VERB_INFINITIVE ) {
                String syn = missingVerbs.get( normalizeKey( word ) );
                if ( syn != null )
                    synsetIDs.add( syn );
            }
            else if ( type == Word.TYPE_ADJ ) {
                idxWord = lookup( word, pos );
            }

            if ( idxWord == null ) {
                if ( type == Word.TYPE_NOUN ) {
                    // Plural heuristics first, then the word itself, then irregular forms
                    if ( word.endsWith( "s" ) )
                        idxWord = lookup( word.substring( 0, word.length() - 1 ), pos );
                    if ( idxWord == null && word.endsWith( "es" ) )
                        idxWord = lookup( word.substring( 0, word.length() - 2 ), pos );
                    if ( idxWord == null )
                        idxWord = lookup( word, pos );
                    if ( idxWord == null )
                        idxWord = lookup( irregularNouns.get( word ), pos );
                }
                else if ( type == Word.TYPE_VERB_INFINITIVE ) {
                    idxWord = lookup( word, pos );
                    if ( idxWord == null )
                        idxWord = lookup( irregularVerbs.get( word ), pos );
                }
            }

            if ( idxWord == null )
                return synsetIDs;

            // One formatter for all ids instead of a new one per iteration
            DecimalFormat df = new DecimalFormat( "00000000" );
            List<IWordID> wordIDs = idxWord.getWordIDs();
            for ( IWordID wID : wordIDs ) {
                synsetIDs.add( df.format( wID.getSynsetID().getOffset() ) );
            }

            return synsetIDs;
        }
        catch ( MalformedURLException e ) {
            throw new WordNetException( e.getMessage() );
        }
    }

    /**
     * Checks whether <code>word</code> exists as the given word type, either
     * exactly or after a simple suffix strip ("ed", "ing", "s", "es") or via
     * the irregular-form tables.
     *
     * @param word the word to check; <code>null</code> yields <code>false</code>
     * @param type one of the <code>Word.TYPE_*</code> constants
     * @return <code>true</code> if any of the tried forms is found
     * @throws Exception if the dictionary cannot be initialized
     */
    public static boolean wordExistsAs( String word, int type ) throws Exception {

        // Without this guard the suffix checks below would dereference null
        if ( word == null )
            return false;

        boolean found = wordExactlyExistsAs( word, type );

        if ( ! found ) {
            // For the three types handled below the exact check above has
            // already gone through lookup(), so the irregular tables are loaded.
            switch ( type ) {
            case Word.TYPE_PAST_PARTICIPLE:

                if ( word.endsWith( "ed" ) ) {
                    found = wordExactlyExistsAs( word.substring( 0, word.length() - 1 ), type );
                    if ( ! found )
                        found = wordExactlyExistsAs( word.substring( 0, word.length() - 2 ), type );
                }

                if ( ! found )
                    found = wordExactlyExistsAs( irregularVerbs.get( word ), type );
                break;

            case Word.TYPE_GERUND:

                if ( word.endsWith( "ing" ) ) {
                    found = wordExactlyExistsAs( word.substring( 0, word.length() - 3 ), type );
                }

                if ( ! found )
                    found = wordExactlyExistsAs( irregularVerbs.get( word ), type );
                break;

            case Word.TYPE_NOUN:
                if ( word.endsWith( "s" ) ) {
                    found = wordExactlyExistsAs( word.substring( 0, word.length() - 1 ), type );
                }
                if ( ! found && word.endsWith( "es" ) ) {
                    found = wordExactlyExistsAs( word.substring( 0, word.length() - 2 ), type );
                }
                if ( ! found ) {
                    found = wordExactlyExistsAs( irregularNouns.get( word ), type );
                }
                break;
            }
        }

        return found;
    }

    /**
     * Lists the word types, out of noun, infinitive verb, adjective and adverb
     * (in that order), for which {@link #wordExistsAs(String, int)} succeeds.
     *
     * @param w the word to classify
     * @return the matching <code>Word.TYPE_*</code> constants, possibly empty
     * @throws Exception if the dictionary cannot be initialized
     */
    public static ArrayList<Integer> getPossiblePOS( String w ) throws Exception {

        int[] candidates = { Word.TYPE_NOUN, Word.TYPE_VERB_INFINITIVE, Word.TYPE_ADJ, Word.TYPE_ADV };
        ArrayList<Integer> pos = new ArrayList<Integer>();

        for ( int i = 0; i < candidates.length; i++ ) {
            if ( wordExistsAs( w, candidates[i] ) )
                pos.add( candidates[i] );
        }

        return pos;
    }

    /**
     * Maps a <code>Word.TYPE_*</code> constant to the JWI part of speech.
     *
     * @param type the word type
     * @return the part of speech, or <code>null</code> for an unmapped type
     */
    private static POS convertPOS( int type ) {

        if ( type == Word.TYPE_VERB_INFINITIVE || type == Word.TYPE_PAST_PARTICIPLE || type == Word.TYPE_GERUND )
            return POS.VERB;
        if ( type == Word.TYPE_NOUN )
            return POS.NOUN;
        if ( type == Word.TYPE_ADV )
            return POS.ADVERB;
        if ( type == Word.TYPE_ADJ )
            return POS.ADJECTIVE;

        return null;
    }

    /**
     * Checks whether <code>word</code>, exactly as given, exists as the given
     * word type. Nouns and infinitive verbs that are not in the dictionary are
     * additionally checked against the hand-maintained tables.
     *
     * @param word the word to check; <code>null</code> yields <code>false</code>
     * @param type one of the <code>Word.TYPE_*</code> constants; a type without
     *        a part-of-speech mapping yields <code>false</code>
     * @return <code>true</code> if the word is found
     * @throws Exception if the dictionary cannot be initialized; a
     *         {@link MalformedURLException} raised inside the lookup is
     *         rethrown as {@link WordNetException}
     */
    public static boolean wordExactlyExistsAs( String word, int type ) throws Exception {

        if ( word == null )
            return false;

        try {
            POS pos = convertPOS( type );
            if ( pos == null )
                return false;

            if ( lookup( word, pos ) != null )
                return true;

            // Same key normalization as in getSynIDs, so both methods agree
            if ( type == Word.TYPE_NOUN )
                return missingNouns.get( normalizeKey( word ) ) != null;
            if ( type == Word.TYPE_VERB_INFINITIVE )
                return missingVerbs.get( normalizeKey( word ) ) != null;

            return false;
        }
        catch ( MalformedURLException e ) {
            throw new WordNetException( e.getMessage() );
        }
    }

    /**
     * Looks up a lemma in the dictionary. The dictionary (and with it all
     * lookup tables) is initialized before the lemma is inspected.
     *
     * @param lemma the lemma to look up
     * @param pos the part of speech to search in
     * @return the index word, or <code>null</code> if the lemma is
     *         <code>null</code>, blank or unknown
     * @throws Exception if the dictionary cannot be initialized
     */
    private static IIndexWord lookup( String lemma, POS pos ) throws Exception {

        IDictionary wn = getInstance();

        // Suffix stripping can leave nothing behind (e.g. "s" or "ed").
        // NOTE(review): JWI appears to reject blank lemmas - confirm against its API docs.
        if ( lemma == null || lemma.trim().length() == 0 )
            return null;

        return wn.getIndexWord( lemma, pos );
    }

    /**
     * Lower-cases a word for use as key of the hand-maintained tables,
     * independent of the platform's default locale.
     */
    private static String normalizeKey( String word ) {
        return word.toLowerCase( Locale.ENGLISH );
    }

    /**
     * Returns the shared dictionary, initializing it on first use from
     * <code>$WNHOME/dict</code>.
     * <p>
     * The static field is assigned only after the dictionary was opened and
     * all lookup tables were built, so a failed attempt is retried on the next
     * call instead of leaving a half-initialized state behind.
     *
     * @return the opened dictionary
     * @throws Exception if <code>WNHOME</code> is not set, or if opening the
     *         dictionary or reading the irregular-form files fails
     */
    private static IDictionary getInstance() throws Exception {

        if ( dict == null ) {
            String wnhome = System.getenv( "WNHOME" );
            if ( wnhome == null )
                throw new Exception( "Environment variable \"WNHOME\" not set. WordNet could not be found." );
            String path = wnhome + File.separator + "dict";
            URL url = new URL( "file", null, path );
            IDictionary opened = new Dictionary( url );

            boolean ready = false;
            try {
                if ( ! opened.isOpen() )
                    opened.open();

                // Cache irregular verb and noun forms
                if ( CACHE_IRREGULAR_FORMS ) {
                    cacheIrregularVerbs( path );
                    cacheIrregularNouns( path );
                }
                else {
                    // Callers query these tables unconditionally
                    irregularVerbs = new HashMap<String, String>();
                    irregularNouns = new HashMap<String, String>();
                }
                initializeMissingMappings();
                ready = true;
            }
            finally {
                if ( ! ready )
                    opened.close();
            }

            dict = opened;
        }

        return dict;
    }

    /**
     * Reads <code>verb.exc</code> from the given directory into the
     * irregular-verb table. Of every non-blank line the first token is mapped
     * to the second one; further tokens are ignored.
     *
     * @param wnPath the WordNet dictionary directory
     * @throws IOException if the file is missing or cannot be read
     * @throws WordNetException if a non-blank line has fewer than two tokens
     */
    private static void cacheIrregularVerbs( String wnPath ) throws IOException, WordNetException {

        File verb_exc = new File( wnPath + File.separator + "verb.exc" );
        if ( ! verb_exc.exists() )
            throw new FileNotFoundException( "Mapping File for irregular Verb forms (" + verb_exc.getAbsolutePath()
                    + ") not found" );

        String[] mappings = splitLines( readFile( verb_exc ) );
        HashMap<String, String> parsed = new HashMap<String, String>();

        for ( int i = 0; i < mappings.length; i++ ) {
            if ( mappings[i].trim().length() == 0 )
                continue;
            String[] keyValueSet = mappings[i].split( " " );
            if ( keyValueSet.length < 2 )
                throw new WordNetException( "Illegal Mapping entry in " + verb_exc.getName() + " -> "
                        + keyValueSet.length + " Tokens " + i );
            parsed.put( keyValueSet[0], keyValueSet[1] );
        }

        irregularVerbs = parsed;
    }

    /**
     * Reads <code>noun.exc</code> from the given directory into the
     * irregular-noun table. Of every non-blank line the first token is mapped
     * to the second one; further tokens are ignored.
     *
     * @param wnPath the WordNet dictionary directory
     * @throws IOException if the file is missing or cannot be read
     * @throws WordNetException if a non-blank line has fewer than two tokens
     */
    private static void cacheIrregularNouns( String wnPath ) throws IOException, WordNetException {

        File noun_exc = new File( wnPath + File.separator + "noun.exc" );
        if ( ! noun_exc.exists() )
            throw new FileNotFoundException( "Mapping File for irregular Noun forms (" + noun_exc.getAbsolutePath()
                    + ") not found" );

        String[] mappings = splitLines( readFile( noun_exc ) );
        HashMap<String, String> parsed = new HashMap<String, String>();

        for ( int i = 0; i < mappings.length; i++ ) {
            if ( mappings[i].trim().length() == 0 )
                continue;
            String[] keyValueSet = mappings[i].split( " " );
            if ( keyValueSet.length < 2 )
                throw new WordNetException( "Illegal Mapping entry in " + noun_exc.getName() );
            parsed.put( keyValueSet[0], keyValueSet[1] );
        }

        irregularNouns = parsed;
    }

    /**
     * Reads a whole text file into a string and always closes the reader.
     */
    private static String readFile( File source ) throws IOException {

        FileReader reader = new FileReader( source );
        try {
            StringBuilder content = new StringBuilder();
            char[] buffer = new char[1024];
            int read;
            while ( ( read = reader.read( buffer ) ) >= 0 )
                content.append( buffer, 0, read );
            return content.toString();
        }
        finally {
            reader.close();
        }
    }

    /**
     * Splits text into lines on any of "\r\n", "\r" or "\n", so the result
     * does not depend on the line separator of the running platform.
     */
    private static String[] splitLines( String text ) {
        return text.split( "\r\n|\r|\n" );
    }

    /**
     * (Re)creates the hand-maintained tables that map selected nouns and
     * verbs directly to a synset id.
     */
    public static void initializeMissingMappings() {

        missingNouns = new HashMap<String, String>();
        missingNouns.put( "stove burner", "04163507" );
        missingNouns.put( "stove top", "TUM00001" );
        missingNouns.put( "eating bowl", "02778691" );
        missingNouns.put( "packet soup", "07115585" );
        missingNouns.put( "contents", "07474342" );
        missingNouns.put( "iced tea", "N07456631" );

        missingVerbs = new HashMap<String, String>();
        missingVerbs.put( "look at", "TUM00002" );
        missingVerbs.put( "bring out", "01145176" );

    }

    /**
     * Small manual check: reports whether "chopped" is found as a past participle.
     *
     * @param args unused
     */
    public static void main( String[] args ) {

        try {
            if ( wordExistsAs( "chopped", Word.TYPE_PAST_PARTICIPLE ) )
                System.out.println( "Word found" );
            else
                System.out.println( "Word not found" );
        }
        catch ( MalformedURLException e ) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        }
        catch ( Exception e ) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        }
    }
}