re_vision: tokenizer.cpp Source File

Go to the documentation of this file.
00001 #include "tokenizer.h"
00002 
00003 Token::Token(){
00004     nPalabras = 0;
00005     tokenizado = false;
00006     linea[0] = '\0'; // Borramos la linea
00007     ficheroLeido = false;
00008     finFich = false;
00009     indice = 0;
00010     cuantosBuffer1 = cuantosBuffer2 = 0;
00011 }
00012 
00013 /*****************************************************************************/
00014 
00015 Token::~Token (){
00016     destruyeToken();
00017 }
00018 
00019 /*****************************************************************************/
00020 
00021 void Token::destruyeToken(){
00022     nPalabras = 0;
00023     linea[0] = '\0';
00024     tokenizado = false;
00025     nomFichero[0] = '\0';
00026     if(fichero.is_open()){
00027         fichero.close();
00028     }
00029     ficheroLeido = false;
00030     finFich = false;
00031     indice = 0;
00032     cuantosBuffer1 = cuantosBuffer2 = 0;
00033 }
00034 
00035 /*****************************************************************************/
00036 
00037 int Token::abrirFichero(const char * f){
00038     try {
00039         fichero.open(f);
00040         if(!fichero) throw (f);
00041     } catch (const char * ){
00042         estado = noAbierto;
00043         return noAbierto;
00044     }
00045     strcpy(nomFichero, f);
00046 
00047     //Llenado de los buffers
00048     indice = 0;
00049     fichero.read(buffer, TAM_BUFFER);
00050     cuantosBuffer1 = fichero.gcount();
00051     ficheroLeido = (cuantosBuffer1 < TAM_BUFFER);
00052     if(!ficheroLeido){
00053        fichero.read(buffer+INICIO_BUFFER_2, TAM_BUFFER);
00054        cuantosBuffer2 = fichero.gcount();
00055        ficheroLeido = (cuantosBuffer2 < TAM_BUFFER);
00056     }
00057 
00058     estado = ok;
00059     finFich = (cuantosBuffer1 == 0);
00060     if(finFich)
00061        estado = eof;
00062     return ok;
00063 }
00064 
00065 /*****************************************************************************/
00066 
00067 int Token::leeLinea(){
00068 
00069     int i;
00070     char c;
00071 
00072     if (!fichero.is_open()){
00073          estado = noAbierto;
00074          return noAbierto;
00075     }
00076 
00077     if (finFich){
00078        estado = eof;
00079        return eof;
00080     }
00081 
00082     linea[0] = '\0';
00083     tokenizado = false;
00084 
00085     /* Leemos lineas hasta que encontremos una que no este vacia, es
00086        decir que contenga al menos un caracter != '\n', o hasta eof
00087     */
00088     while( (!finFich) && (strlen(linea) == 0) ){
00089         i = 0;
00090         do{
00091            c = buffer[indice];
00092            if(c != 13){ // 13 == 0Dh == \n windows
00093               if (c == '\n')
00094                  linea[i] = 0;
00095               else
00096                  linea[i] = c;
00097               i++;
00098            }
00099            indice ++;
00100            if(indice == cuantosBuffer1){
00101               // BUFFER 1 VACIO => POSIBLE LLENADO DEL BUFFER 1
00102               if(indice < FIN_BUFFER1){
00103                  finFich = true;
00104               }
00105               if(!ficheroLeido){
00106                  fichero.read(buffer, TAM_BUFFER);
00107                  cuantosBuffer1 = fichero.gcount();
00108                  ficheroLeido = (cuantosBuffer1 < TAM_BUFFER);
00109               }
00110            }else if (indice == INICIO_BUFFER_2 + cuantosBuffer2){
00111               // BUFFER 2 VACIO => POSIBLE LLENADO DEL BUFFER 2
00112               if(indice < FIN_BUFFER2){
00113                  finFich = true;
00114               }
00115               if(!ficheroLeido){
00116                  fichero.read(buffer+INICIO_BUFFER_2, TAM_BUFFER);
00117                  cuantosBuffer2 = fichero.gcount();
00118                  ficheroLeido = (cuantosBuffer2 < TAM_BUFFER);
00119               }
00120               indice = 0;
00121            }
00122         }while(c != '\n' && !finFich);
00123     }
00124 
00125     nPalabras = 0;
00126 
00127     (strlen(linea) == 0 ? estado = eof : estado = ok);
00128     return estado;
00129 
00130 }
00131 
00132 /*****************************************************************************/
00133 
00134 int Token::lee(int ncampos, char * palInicial){
00135     leeLinea();
00136     if(estado != ok) return estado;
00137 
00138     estado = ok;
00139     if ((tokenizar(" ") != ncampos)
00140         || (palInicial && (strcmp(token(0), palInicial) !=0)))
00141         estado = malFormato;
00142     return estado;
00143 }
00144 
00145 /*****************************************************************************/
00146 
00147 int Token::tokenizar(const char * s){
00148 
00149    if (tokenizado) return nPalabras;
00150 
00151    strcpy(auxLinea, linea);
00152 
00153     unsigned int i, j; // Indices de las cadenas
00154     unsigned int lonAuxLinea; // Es de uso obligatorio ya que hay momentos
00155                               // en los que truncamos auxLinea poniendo
00156                               // caracteres a cero => strlen(auxLinea) cambia
00157     unsigned int lons; // Se utiliza por eficiencia, para no estar llamando
00158                        // todo el rato a strlen(s).
00159 
00160     palabras[nPalabras] = auxLinea;
00161 
00162     lonAuxLinea = strlen(auxLinea);
00163     lons = strlen(s);
00164 
00165     // Obtiene los tokens utilizando para delimitarlos los caracteres
00166     // de la cadena s
00167     for (i = 0; i < lonAuxLinea; i++){
00168        for (j = 0; j < lons; j++){
00169           if(s[j] == auxLinea[i]){
00170              auxLinea[i] = '\0';
00171              if (strlen(palabras[nPalabras]) > 0){
00172                 nPalabras ++;
00173              }
00174              palabras[nPalabras] = auxLinea + i + 1;
00175              continue;
00176           }
00177       }
00178    }
00179    if (strlen(palabras[nPalabras]) > 0){
00180        nPalabras ++;
00181    }
00182 
00183    // Devolvemos el n� de tokens encontrados
00184    return nPalabras;
00185 }
00186 
00187 /*****************************************************************************/
00188 
00189 char * Token::token(int n){
00190     if(n >= nPalabras) return NULL;
00191     return palabras[n];
00192 }