WordNet.java
Go to the documentation of this file.
00001 package instruction.wordnet;
00002 
00003 import instruction.exceptions.WordNetException;
00004 import instruction.semanticObjects.Word;
00005 import java.io.File;
00006 import java.io.FileNotFoundException;
00007 import java.io.FileReader;
00008 import java.io.IOException;
00009 import java.net.MalformedURLException;
00010 import java.net.URL;
00011 import java.text.DecimalFormat;
00012 import java.util.ArrayList;
00013 import java.util.HashMap;
00014 import java.util.Iterator;
00015 import java.util.List;
00016 import edu.mit.jwi.Dictionary;
00017 import edu.mit.jwi.IDictionary;
00018 import edu.mit.jwi.item.IIndexWord;
00019 import edu.mit.jwi.item.IWordID;
00020 import edu.mit.jwi.item.POS;
00021 
00022 public class WordNet {
00023 
00024         public static final boolean CACHE_IRREGULAR_FORMS = true;
00025 
00026         private static IDictionary dict = null;
00027 
00028         private static HashMap<String, String> irregularVerbs = null;
00029 
00030         private static HashMap<String, String> irregularNouns = null;
00031 
00032         private static HashMap<String, String> missingNouns = null;
00033 
00034         private static HashMap<String, String> missingVerbs = null;
00035 
00045         public static ArrayList<String> getSynIDs( String word, int type ) throws Exception {
00046 
00047                 if ( dict == null ) {
00048                         getInstance();
00049                 }
00050 
00051                 try {
00052                         ArrayList<String> synsetIDs = new ArrayList<String>();
00053                         POS pos = convertPOS( type );
00054 
00055                         if ( pos == null || word == null )
00056                                 return synsetIDs;
00057 
00058                         IIndexWord idxWord = null;
00059 
00060                         if ( type == Word.TYPE_PAST_PARTICIPLE ) {
00061                                 if ( word.endsWith( "ed" ) ) {
00062                                         idxWord = getInstance().getIndexWord( word.substring( 0, word.length() - 1 ), pos );
00063                                         if ( idxWord == null )
00064                                                 idxWord = getInstance().getIndexWord( word.substring( 0, word.length() - 2 ), pos );
00065                                 }
00066 
00067                                 if ( idxWord == null )
00068                                         if ( irregularVerbs.get( word ) != null )
00069                                                 idxWord = getInstance().getIndexWord( irregularVerbs.get( word ), pos );
00070                         }
00071 
00072                         if ( type == Word.TYPE_ADV ) {
00073                                 idxWord = getInstance().getIndexWord( word, pos );
00074                         }
00075 
00076                         if ( type == Word.TYPE_NOUN ) {
00077                                 String syn = missingNouns.get( word.toLowerCase() );
00078                                 if ( syn != null )
00079                                         synsetIDs.add( syn );
00080                         }
00081 
00082                         else if ( type == Word.TYPE_VERB_INFINITIVE ) {
00083                                 String syn = missingVerbs.get( word.toLowerCase() );
00084                                 if ( syn != null )
00085                                         synsetIDs.add( syn );
00086                         }
00087 
00088                         else if ( type == Word.TYPE_ADJ ) {
00089                                 idxWord = getInstance().getIndexWord( word, pos );
00090                         }
00091 
00092                         if ( idxWord == null ) {
00093                                 if ( type == Word.TYPE_NOUN ) {
00094                                         if ( word.endsWith( "s" ) ) {
00095                                                 idxWord = getInstance().getIndexWord( word.substring( 0, word.length() - 1 ), pos );
00096                                         }
00097                                         if ( idxWord == null && word.endsWith( "es" ) ) {
00098                                                 idxWord = getInstance().getIndexWord( word.substring( 0, word.length() - 2 ), pos );
00099                                         }
00100                                         if ( idxWord == null ) {
00101                                                 idxWord = getInstance().getIndexWord( word, pos );
00102                                         }
00103                                         if ( idxWord == null ) {
00104                                                 if ( irregularNouns.get( word ) != null )
00105                                                         idxWord = getInstance().getIndexWord( irregularNouns.get( word ), pos );
00106                                         }
00107                                 }
00108                                 else if ( type == Word.TYPE_VERB_INFINITIVE ) {
00109                                         idxWord = getInstance().getIndexWord( word, pos );
00110                                         if ( idxWord == null ) {
00111                                                 if ( irregularVerbs.get( word ) != null )
00112                                                         idxWord = getInstance().getIndexWord( irregularVerbs.get( word ), pos );
00113                                         }
00114                                 }
00115                         }
00116 
00117                         if ( idxWord == null )
00118                                 return synsetIDs;
00119 
00120                         else {
00121                                 List<IWordID> wordIDs = idxWord.getWordIDs();
00122 
00123                                 for ( Iterator<IWordID> i = wordIDs.iterator(); i.hasNext(); ) {
00124                                         IWordID wID = i.next();
00125                                         DecimalFormat df = new DecimalFormat( "00000000" );
00126                                         synsetIDs.add( df.format( wID.getSynsetID().getOffset() ) );
00127                                 }
00128 
00129                                 return synsetIDs;
00130                         }
00131                 }
00132                 catch ( MalformedURLException e ) {
00133                         throw new WordNetException( e.getMessage() );
00134                 }
00135         }
00136 
00149         public static boolean wordExistsAs( String word, int type ) throws Exception {
00150 
00151                 boolean found = wordExactlyExistsAs( word, type );
00152 
00153                 if ( ! found ) {
00154                         switch ( type ) {
00155                         case Word.TYPE_PAST_PARTICIPLE:
00156 
00157                                 if ( word.endsWith( "ed" ) ) {
00158                                         found = wordExactlyExistsAs( word.substring( 0, word.length() - 1 ), type );
00159                                         if ( ! found )
00160                                                 found = wordExactlyExistsAs( word.substring( 0, word.length() - 2 ), type );
00161                                 }
00162 
00163                                 if ( ! found )
00164                                         found = wordExactlyExistsAs( irregularVerbs.get( word ), type );
00165                                 break;
00166 
00167                         case Word.TYPE_GERUND:
00168 
00169                                 if ( word.endsWith( "ing" ) ) {
00170                                         found = wordExactlyExistsAs( word.substring( 0, word.length() - 3 ), type );
00171                                 }
00172 
00173                                 if ( ! found )
00174                                         found = wordExactlyExistsAs( irregularVerbs.get( word ), type );
00175                                 break;
00176 
00177                         case Word.TYPE_NOUN:
00178                                 if ( word.endsWith( "s" ) ) {
00179                                         found = wordExactlyExistsAs( word.substring( 0, word.length() - 1 ), type );
00180                                 }
00181                                 if ( ! found && word.endsWith( "es" ) ) {
00182                                         found = wordExactlyExistsAs( word.substring( 0, word.length() - 2 ), type );
00183                                 }
00184                                 if ( ! found ) {
00185                                         found = wordExactlyExistsAs( irregularNouns.get( word ), type );
00186                                 }
00187                                 break;
00188                         }
00189                 }
00190 
00191                 return found;
00192         }
00193 
00194         public static ArrayList<Integer> getPossiblePOS( String w ) throws Exception {
00195 
00196                 ArrayList<Integer> pos = new ArrayList<Integer>();
00197 
00198                 if ( wordExistsAs( w, Word.TYPE_NOUN ) )
00199                         pos.add( Word.TYPE_NOUN );
00200 
00201                 if ( wordExistsAs( w, Word.TYPE_VERB_INFINITIVE ) )
00202                         pos.add( Word.TYPE_VERB_INFINITIVE );
00203 
00204                 if ( wordExistsAs( w, Word.TYPE_ADJ ) )
00205                         pos.add( Word.TYPE_ADJ );
00206 
00207                 if ( wordExistsAs( w, Word.TYPE_ADV ) )
00208                         pos.add( Word.TYPE_ADV );
00209 
00210                 return pos;
00211         }
00212 
00213         private static POS convertPOS( int type ) {
00214 
00215                 POS pos = null;
00216 
00217                 if ( type == Word.TYPE_VERB_INFINITIVE || type == Word.TYPE_PAST_PARTICIPLE || type == Word.TYPE_GERUND )
00218                         pos = POS.VERB;
00219                 else if ( type == Word.TYPE_NOUN )
00220                         pos = POS.NOUN;
00221                 else if ( type == Word.TYPE_ADV )
00222                         pos = POS.ADVERB;
00223                 else if ( type == Word.TYPE_ADJ )
00224                         pos = POS.ADJECTIVE;
00225 
00226                 return pos;
00227         }
00228 
00240         public static boolean wordExactlyExistsAs( String word, int type ) throws Exception {
00241 
00242                 if ( word == null )
00243                         return false;
00244 
00245                 try {
00246                         POS pos = convertPOS( type );
00247                         if ( pos == null )
00248                                 return false;
00249 
00250                         IIndexWord idxWord = getInstance().getIndexWord( word, pos );
00251 
00252                         if ( idxWord == null ) {
00253                                 if ( type == Word.TYPE_NOUN ) {
00254                                         if ( missingNouns.get( word ) != null )
00255                                                 return true;
00256                                 }
00257                                 else if ( type == Word.TYPE_VERB_INFINITIVE ) {
00258                                         if ( missingVerbs.get( word ) != null )
00259                                                 return true;
00260                                 }
00261                         }
00262                         else
00263                                 return true;
00264                         return false;
00265                 }
00266                 catch ( MalformedURLException e ) {
00267                         throw new WordNetException( e.getMessage() );
00268                 }
00269         }
00270 
00271         private static IDictionary getInstance() throws Exception {
00272 
00273                 if ( dict == null ) {
00274                         String wnhome = System.getenv( "WNHOME" );
00275                         if ( wnhome == null )
00276                                 throw new Exception( "Environment variable \"WNHOME\" not set. WordNet could not be found." );
00277                         String path = wnhome + File.separator + "dict";
00278                         URL url = new URL( "file", null, path );
00279                         dict = new Dictionary( url );
00280                         if ( ! dict.isOpen() )
00281                                 dict.open();
00282 
00283                         // Cache irregular verb and noun forms
00284                         if ( CACHE_IRREGULAR_FORMS ) {
00285                                 cacheIrregularVerbs( path );
00286                                 cacheIrregularNouns( path );
00287                         }
00288                         initializeMissingMappings();
00289                 }
00290 
00291                 return dict;
00292         }
00293 
00294         private static void cacheIrregularVerbs( String wnPath ) throws IOException, WordNetException {
00295 
00296                 File verb_exc = new File( wnPath + File.separator + "verb.exc" );
00297                 if ( ! verb_exc.exists() )
00298                         throw new FileNotFoundException( "Mapping File for irregular Verb forms (" + verb_exc.getAbsolutePath()
00299                                         + ") not found" );
00300                 else {
00301                         FileReader reader = new FileReader( verb_exc );
00302                         StringBuilder file = new StringBuilder();
00303                         char[] buffer = new char[1024];
00304                         int read = 0;
00305                         while ( ( read = reader.read( buffer ) ) >= 0 )
00306                                 file.append( buffer, 0, read );
00307                         irregularVerbs = new HashMap<String, String>();
00308 
00309                         String[] mappings = file.toString().split( System.getProperty( "line.separator" ) );
00310                         for ( int i = 0; i < mappings.length; i++ ) {
00311                                 String[] keyValueSet = mappings[i].split( " " );
00312                                 if ( keyValueSet.length < 2 )
00313                                         throw new WordNetException( "Illegal Mapping entry in " + verb_exc.getName() + " -> "
00314                                                         + keyValueSet.length + " Tokens " + i );
00315                                 else {
00316                                         irregularVerbs.put( keyValueSet[0], keyValueSet[1] );
00317                                 }
00318                         }
00319 
00320                 }
00321         }
00322 
00323         private static void cacheIrregularNouns( String wnPath ) throws IOException, WordNetException {
00324 
00325                 File noun_exc = new File( wnPath + File.separator + "noun.exc" );
00326                 if ( ! noun_exc.exists() )
00327                         throw new FileNotFoundException( "Mapping File for irregular Noun forms (" + noun_exc.getAbsolutePath()
00328                                         + ") not found" );
00329                 else {
00330                         FileReader reader = new FileReader( noun_exc );
00331                         StringBuilder file = new StringBuilder();
00332                         char[] buffer = new char[1024];
00333                         int read = 0;
00334                         while ( ( read = reader.read( buffer ) ) >= 0 )
00335                                 file.append( buffer, 0, read );
00336                         irregularNouns = new HashMap<String, String>();
00337 
00338                         String[] mappings = file.toString().split( System.getProperty( "line.separator" ) );
00339                         for ( int i = 0; i < mappings.length; i++ ) {
00340                                 String[] keyValueSet = mappings[i].split( " " );
00341                                 if ( keyValueSet.length < 2 )
00342                                         throw new WordNetException( "Illegal Mapping entry in " + noun_exc.getName() );
00343                                 else {
00344                                         irregularNouns.put( keyValueSet[0], keyValueSet[1] );
00345                                 }
00346                         }
00347 
00348                 }
00349         }
00350 
00351         public static void initializeMissingMappings() {
00352 
00353                 missingNouns = new HashMap<String, String>();
00354                 missingNouns.put( "stove burner", "04163507" );
00355                 missingNouns.put( "stove top", "TUM00001" );
00356                 missingNouns.put( "eating bowl", "02778691" );
00357                 missingNouns.put( "packet soup", "07115585" );
00358                 missingNouns.put( "contents", "07474342" );
00359                 missingNouns.put( "iced tea", "N07456631" );
00360 
00361                 missingVerbs = new HashMap<String, String>();
00362                 missingVerbs.put( "look at", "TUM00002" );
00363                 missingVerbs.put( "bring out", "01145176" );
00364 
00365         }
00366 
00367         public static void main( String[] args ) {
00368 
00369                 try {
00370                         if ( wordExistsAs( "chopped", Word.TYPE_PAST_PARTICIPLE ) )
00371                                 System.out.println( "Word found" );
00372                         else
00373                                 System.out.println( "Word not found" );
00374                 }
00375                 catch ( MalformedURLException e ) {
00376                         // TODO Auto-generated catch block
00377                         e.printStackTrace();
00378                 }
00379                 catch ( Exception e ) {
00380                         // TODO Auto-generated catch block
00381                         e.printStackTrace();
00382                 }
00383         }
00384 }
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Properties Friends


comp_ehow
Author(s): Moritz Tenorth, Daniel Nyga
autogenerated on Tue Apr 16 2013 00:18:03