00001 package instruction.wordnet;
00002
00003 import instruction.exceptions.WordNetException;
00004 import instruction.semanticObjects.Word;
00005 import java.io.File;
00006 import java.io.FileNotFoundException;
00007 import java.io.FileReader;
00008 import java.io.IOException;
00009 import java.net.MalformedURLException;
00010 import java.net.URL;
00011 import java.text.DecimalFormat;
00012 import java.util.ArrayList;
00013 import java.util.HashMap;
00014 import java.util.Iterator;
00015 import java.util.List;
00016 import edu.mit.jwi.Dictionary;
00017 import edu.mit.jwi.IDictionary;
00018 import edu.mit.jwi.item.IIndexWord;
00019 import edu.mit.jwi.item.IWordID;
00020 import edu.mit.jwi.item.POS;
00021
00022 public class WordNet {
00023
00024 public static final boolean CACHE_IRREGULAR_FORMS = true;
00025
00026 private static IDictionary dict = null;
00027
00028 private static HashMap<String, String> irregularVerbs = null;
00029
00030 private static HashMap<String, String> irregularNouns = null;
00031
00032 private static HashMap<String, String> missingNouns = null;
00033
00034 private static HashMap<String, String> missingVerbs = null;
00035
00045 public static ArrayList<String> getSynIDs( String word, int type ) throws Exception {
00046
00047 if ( dict == null ) {
00048 getInstance();
00049 }
00050
00051 try {
00052 ArrayList<String> synsetIDs = new ArrayList<String>();
00053 POS pos = convertPOS( type );
00054
00055 if ( pos == null || word == null )
00056 return synsetIDs;
00057
00058 IIndexWord idxWord = null;
00059
00060 if ( type == Word.TYPE_PAST_PARTICIPLE ) {
00061 if ( word.endsWith( "ed" ) ) {
00062 idxWord = getInstance().getIndexWord( word.substring( 0, word.length() - 1 ), pos );
00063 if ( idxWord == null )
00064 idxWord = getInstance().getIndexWord( word.substring( 0, word.length() - 2 ), pos );
00065 }
00066
00067 if ( idxWord == null )
00068 if ( irregularVerbs.get( word ) != null )
00069 idxWord = getInstance().getIndexWord( irregularVerbs.get( word ), pos );
00070 }
00071
00072 if ( type == Word.TYPE_ADV ) {
00073 idxWord = getInstance().getIndexWord( word, pos );
00074 }
00075
00076 if ( type == Word.TYPE_NOUN ) {
00077 String syn = missingNouns.get( word.toLowerCase() );
00078 if ( syn != null )
00079 synsetIDs.add( syn );
00080 }
00081
00082 else if ( type == Word.TYPE_VERB_INFINITIVE ) {
00083 String syn = missingVerbs.get( word.toLowerCase() );
00084 if ( syn != null )
00085 synsetIDs.add( syn );
00086 }
00087
00088 else if ( type == Word.TYPE_ADJ ) {
00089 idxWord = getInstance().getIndexWord( word, pos );
00090 }
00091
00092 if ( idxWord == null ) {
00093 if ( type == Word.TYPE_NOUN ) {
00094 if ( word.endsWith( "s" ) ) {
00095 idxWord = getInstance().getIndexWord( word.substring( 0, word.length() - 1 ), pos );
00096 }
00097 if ( idxWord == null && word.endsWith( "es" ) ) {
00098 idxWord = getInstance().getIndexWord( word.substring( 0, word.length() - 2 ), pos );
00099 }
00100 if ( idxWord == null ) {
00101 idxWord = getInstance().getIndexWord( word, pos );
00102 }
00103 if ( idxWord == null ) {
00104 if ( irregularNouns.get( word ) != null )
00105 idxWord = getInstance().getIndexWord( irregularNouns.get( word ), pos );
00106 }
00107 }
00108 else if ( type == Word.TYPE_VERB_INFINITIVE ) {
00109 idxWord = getInstance().getIndexWord( word, pos );
00110 if ( idxWord == null ) {
00111 if ( irregularVerbs.get( word ) != null )
00112 idxWord = getInstance().getIndexWord( irregularVerbs.get( word ), pos );
00113 }
00114 }
00115 }
00116
00117 if ( idxWord == null )
00118 return synsetIDs;
00119
00120 else {
00121 List<IWordID> wordIDs = idxWord.getWordIDs();
00122
00123 for ( Iterator<IWordID> i = wordIDs.iterator(); i.hasNext(); ) {
00124 IWordID wID = i.next();
00125 DecimalFormat df = new DecimalFormat( "00000000" );
00126 synsetIDs.add( df.format( wID.getSynsetID().getOffset() ) );
00127 }
00128
00129 return synsetIDs;
00130 }
00131 }
00132 catch ( MalformedURLException e ) {
00133 throw new WordNetException( e.getMessage() );
00134 }
00135 }
00136
00149 public static boolean wordExistsAs( String word, int type ) throws Exception {
00150
00151 boolean found = wordExactlyExistsAs( word, type );
00152
00153 if ( ! found ) {
00154 switch ( type ) {
00155 case Word.TYPE_PAST_PARTICIPLE:
00156
00157 if ( word.endsWith( "ed" ) ) {
00158 found = wordExactlyExistsAs( word.substring( 0, word.length() - 1 ), type );
00159 if ( ! found )
00160 found = wordExactlyExistsAs( word.substring( 0, word.length() - 2 ), type );
00161 }
00162
00163 if ( ! found )
00164 found = wordExactlyExistsAs( irregularVerbs.get( word ), type );
00165 break;
00166
00167 case Word.TYPE_GERUND:
00168
00169 if ( word.endsWith( "ing" ) ) {
00170 found = wordExactlyExistsAs( word.substring( 0, word.length() - 3 ), type );
00171 }
00172
00173 if ( ! found )
00174 found = wordExactlyExistsAs( irregularVerbs.get( word ), type );
00175 break;
00176
00177 case Word.TYPE_NOUN:
00178 if ( word.endsWith( "s" ) ) {
00179 found = wordExactlyExistsAs( word.substring( 0, word.length() - 1 ), type );
00180 }
00181 if ( ! found && word.endsWith( "es" ) ) {
00182 found = wordExactlyExistsAs( word.substring( 0, word.length() - 2 ), type );
00183 }
00184 if ( ! found ) {
00185 found = wordExactlyExistsAs( irregularNouns.get( word ), type );
00186 }
00187 break;
00188 }
00189 }
00190
00191 return found;
00192 }
00193
00194 public static ArrayList<Integer> getPossiblePOS( String w ) throws Exception {
00195
00196 ArrayList<Integer> pos = new ArrayList<Integer>();
00197
00198 if ( wordExistsAs( w, Word.TYPE_NOUN ) )
00199 pos.add( Word.TYPE_NOUN );
00200
00201 if ( wordExistsAs( w, Word.TYPE_VERB_INFINITIVE ) )
00202 pos.add( Word.TYPE_VERB_INFINITIVE );
00203
00204 if ( wordExistsAs( w, Word.TYPE_ADJ ) )
00205 pos.add( Word.TYPE_ADJ );
00206
00207 if ( wordExistsAs( w, Word.TYPE_ADV ) )
00208 pos.add( Word.TYPE_ADV );
00209
00210 return pos;
00211 }
00212
00213 private static POS convertPOS( int type ) {
00214
00215 POS pos = null;
00216
00217 if ( type == Word.TYPE_VERB_INFINITIVE || type == Word.TYPE_PAST_PARTICIPLE || type == Word.TYPE_GERUND )
00218 pos = POS.VERB;
00219 else if ( type == Word.TYPE_NOUN )
00220 pos = POS.NOUN;
00221 else if ( type == Word.TYPE_ADV )
00222 pos = POS.ADVERB;
00223 else if ( type == Word.TYPE_ADJ )
00224 pos = POS.ADJECTIVE;
00225
00226 return pos;
00227 }
00228
00240 public static boolean wordExactlyExistsAs( String word, int type ) throws Exception {
00241
00242 if ( word == null )
00243 return false;
00244
00245 try {
00246 POS pos = convertPOS( type );
00247 if ( pos == null )
00248 return false;
00249
00250 IIndexWord idxWord = getInstance().getIndexWord( word, pos );
00251
00252 if ( idxWord == null ) {
00253 if ( type == Word.TYPE_NOUN ) {
00254 if ( missingNouns.get( word ) != null )
00255 return true;
00256 }
00257 else if ( type == Word.TYPE_VERB_INFINITIVE ) {
00258 if ( missingVerbs.get( word ) != null )
00259 return true;
00260 }
00261 }
00262 else
00263 return true;
00264 return false;
00265 }
00266 catch ( MalformedURLException e ) {
00267 throw new WordNetException( e.getMessage() );
00268 }
00269 }
00270
00271 private static IDictionary getInstance() throws Exception {
00272
00273 if ( dict == null ) {
00274 String wnhome = System.getenv( "WNHOME" );
00275 if ( wnhome == null )
00276 throw new Exception( "Environment variable \"WNHOME\" not set. WordNet could not be found." );
00277 String path = wnhome + File.separator + "dict";
00278 URL url = new URL( "file", null, path );
00279 dict = new Dictionary( url );
00280 if ( ! dict.isOpen() )
00281 dict.open();
00282
00283
00284 if ( CACHE_IRREGULAR_FORMS ) {
00285 cacheIrregularVerbs( path );
00286 cacheIrregularNouns( path );
00287 }
00288 initializeMissingMappings();
00289 }
00290
00291 return dict;
00292 }
00293
00294 private static void cacheIrregularVerbs( String wnPath ) throws IOException, WordNetException {
00295
00296 File verb_exc = new File( wnPath + File.separator + "verb.exc" );
00297 if ( ! verb_exc.exists() )
00298 throw new FileNotFoundException( "Mapping File for irregular Verb forms (" + verb_exc.getAbsolutePath()
00299 + ") not found" );
00300 else {
00301 FileReader reader = new FileReader( verb_exc );
00302 StringBuilder file = new StringBuilder();
00303 char[] buffer = new char[1024];
00304 int read = 0;
00305 while ( ( read = reader.read( buffer ) ) >= 0 )
00306 file.append( buffer, 0, read );
00307 irregularVerbs = new HashMap<String, String>();
00308
00309 String[] mappings = file.toString().split( System.getProperty( "line.separator" ) );
00310 for ( int i = 0; i < mappings.length; i++ ) {
00311 String[] keyValueSet = mappings[i].split( " " );
00312 if ( keyValueSet.length < 2 )
00313 throw new WordNetException( "Illegal Mapping entry in " + verb_exc.getName() + " -> "
00314 + keyValueSet.length + " Tokens " + i );
00315 else {
00316 irregularVerbs.put( keyValueSet[0], keyValueSet[1] );
00317 }
00318 }
00319
00320 }
00321 }
00322
00323 private static void cacheIrregularNouns( String wnPath ) throws IOException, WordNetException {
00324
00325 File noun_exc = new File( wnPath + File.separator + "noun.exc" );
00326 if ( ! noun_exc.exists() )
00327 throw new FileNotFoundException( "Mapping File for irregular Noun forms (" + noun_exc.getAbsolutePath()
00328 + ") not found" );
00329 else {
00330 FileReader reader = new FileReader( noun_exc );
00331 StringBuilder file = new StringBuilder();
00332 char[] buffer = new char[1024];
00333 int read = 0;
00334 while ( ( read = reader.read( buffer ) ) >= 0 )
00335 file.append( buffer, 0, read );
00336 irregularNouns = new HashMap<String, String>();
00337
00338 String[] mappings = file.toString().split( System.getProperty( "line.separator" ) );
00339 for ( int i = 0; i < mappings.length; i++ ) {
00340 String[] keyValueSet = mappings[i].split( " " );
00341 if ( keyValueSet.length < 2 )
00342 throw new WordNetException( "Illegal Mapping entry in " + noun_exc.getName() );
00343 else {
00344 irregularNouns.put( keyValueSet[0], keyValueSet[1] );
00345 }
00346 }
00347
00348 }
00349 }
00350
00351 public static void initializeMissingMappings() {
00352
00353 missingNouns = new HashMap<String, String>();
00354 missingNouns.put( "stove burner", "04163507" );
00355 missingNouns.put( "stove top", "TUM00001" );
00356 missingNouns.put( "eating bowl", "02778691" );
00357 missingNouns.put( "packet soup", "07115585" );
00358 missingNouns.put( "contents", "07474342" );
00359 missingNouns.put( "iced tea", "N07456631" );
00360
00361 missingVerbs = new HashMap<String, String>();
00362 missingVerbs.put( "look at", "TUM00002" );
00363 missingVerbs.put( "bring out", "01145176" );
00364
00365 }
00366
00367 public static void main( String[] args ) {
00368
00369 try {
00370 if ( wordExistsAs( "chopped", Word.TYPE_PAST_PARTICIPLE ) )
00371 System.out.println( "Word found" );
00372 else
00373 System.out.println( "Word not found" );
00374 }
00375 catch ( MalformedURLException e ) {
00376
00377 e.printStackTrace();
00378 }
00379 catch ( Exception e ) {
00380
00381 e.printStackTrace();
00382 }
00383 }
00384 }