svm-scale.c
Go to the documentation of this file.
00001 #include <float.h>
00002 #include <stdio.h>
00003 #include <stdlib.h>
00004 #include <ctype.h>
00005 #include <string.h>
00006 
/*
 * Print the command-line usage summary to stdout and terminate the
 * process with exit status 1.  This function does not return.
 */
void exit_with_help()
{
	static const char usage_text[] =
		"Usage: svm-scale [options] data_filename\n"
		"options:\n"
		"-l lower : x scaling lower limit (default -1)\n"
		"-u upper : x scaling upper limit (default +1)\n"
		"-y y_lower y_upper : y scaling limits (default: no y scaling)\n"
		"-s save_filename : save scaling parameters to save_filename\n"
		"-r restore_filename : restore scaling parameters from restore_filename\n";

	fputs(usage_text, stdout);
	exit(1);
}
00020 
/* Shared input buffer filled by readline(); allocated in main(). */
char *line = NULL;
/* Current capacity of `line` in bytes; doubled by readline() when a line does not fit. */
int max_line_len = 1024;

/* Target range for scaled feature values (options -l / -u). */
double lower = -1.0;
double upper = 1.0;
/* Target range for scaled labels (option -y); only meaningful when y_scaling is set. */
double y_lower;
double y_upper;
/* Non-zero when label scaling was requested on the command line or by the restore file. */
int y_scaling = 0;

/* Per-feature observed maximum / minimum, indexed by feature index. */
double *feature_max;
double *feature_min;
/* Observed label maximum / minimum, seeded so the first label replaces them. */
double y_max = -DBL_MAX;
double y_min = DBL_MAX;

/* Largest and smallest feature index encountered. */
int max_index;
int min_index;

/* Number of feature entries read from the input, and number written after scaling. */
long int num_nonzeros = 0;
long int new_num_nonzeros = 0;

/* NOTE: both macros evaluate their arguments more than once. */
#define max(x,y) (((x) > (y)) ? (x) : (y))
#define min(x,y) (((x) < (y)) ? (x) : (y))

/* Print one (optionally scaled) label followed by a space. */
void output_target(double value);
/* Print one scaled "index:value " pair, unless the feature is constant or scales to 0. */
void output(int index, double value);
/* Read the next full line of `input` into the global `line` buffer. */
char* readline(FILE *input);
00040 
00041 int main(int argc,char **argv)
00042 {
00043         int i,index;
00044         FILE *fp, *fp_restore = NULL;
00045         char *save_filename = NULL;
00046         char *restore_filename = NULL;
00047 
00048         for(i=1;i<argc;i++)
00049         {
00050                 if(argv[i][0] != '-') break;
00051                 ++i;
00052                 switch(argv[i-1][1])
00053                 {
00054                         case 'l': lower = atof(argv[i]); break;
00055                         case 'u': upper = atof(argv[i]); break;
00056                         case 'y':
00057                                 y_lower = atof(argv[i]);
00058                                 ++i;
00059                                 y_upper = atof(argv[i]);
00060                                 y_scaling = 1;
00061                                 break;
00062                         case 's': save_filename = argv[i]; break;
00063                         case 'r': restore_filename = argv[i]; break;
00064                         default:
00065                                 fprintf(stderr,"unknown option\n");
00066                                 exit_with_help();
00067                 }
00068         }
00069 
00070         if(!(upper > lower) || (y_scaling && !(y_upper > y_lower)))
00071         {
00072                 fprintf(stderr,"inconsistent lower/upper specification\n");
00073                 exit(1);
00074         }
00075         
00076         if(restore_filename && save_filename)
00077         {
00078                 fprintf(stderr,"cannot use -r and -s simultaneously\n");
00079                 exit(1);
00080         }
00081 
00082         if(argc != i+1) 
00083                 exit_with_help();
00084 
00085         fp=fopen(argv[i],"r");
00086         
00087         if(fp==NULL)
00088         {
00089                 fprintf(stderr,"can't open file %s\n", argv[i]);
00090                 exit(1);
00091         }
00092 
00093         line = (char *) malloc(max_line_len*sizeof(char));
00094 
00095 #define SKIP_TARGET\
00096         while(isspace(*p)) ++p;\
00097         while(!isspace(*p)) ++p;
00098 
00099 #define SKIP_ELEMENT\
00100         while(*p!=':') ++p;\
00101         ++p;\
00102         while(isspace(*p)) ++p;\
00103         while(*p && !isspace(*p)) ++p;
00104         
00105         /* assumption: min index of attributes is 1 */
00106         /* pass 1: find out max index of attributes */
00107         max_index = 0;
00108         min_index = 1;
00109 
00110         if(restore_filename)
00111         {
00112                 int idx, c;
00113 
00114                 fp_restore = fopen(restore_filename,"r");
00115                 if(fp_restore==NULL)
00116                 {
00117                         fprintf(stderr,"can't open file %s\n", restore_filename);
00118                         exit(1);
00119                 }
00120 
00121                 c = fgetc(fp_restore);
00122                 if(c == 'y')
00123                 {
00124                         readline(fp_restore);
00125                         readline(fp_restore);
00126                         readline(fp_restore);
00127                 }
00128                 readline(fp_restore);
00129                 readline(fp_restore);
00130 
00131                 while(fscanf(fp_restore,"%d %*f %*f\n",&idx) == 1)
00132                         max_index = max(idx,max_index);
00133                 rewind(fp_restore);
00134         }
00135 
00136         while(readline(fp)!=NULL)
00137         {
00138                 char *p=line;
00139 
00140                 SKIP_TARGET
00141 
00142                 while(sscanf(p,"%d:%*f",&index)==1)
00143                 {
00144                         max_index = max(max_index, index);
00145                         min_index = min(min_index, index);
00146                         SKIP_ELEMENT
00147                         num_nonzeros++;
00148                 }
00149         }
00150 
00151         if(min_index < 1)
00152                 fprintf(stderr,
00153                         "WARNING: minimal feature index is %d, but indices should start from 1\n", min_index);
00154 
00155         rewind(fp);
00156 
00157         feature_max = (double *)malloc((max_index+1)* sizeof(double));
00158         feature_min = (double *)malloc((max_index+1)* sizeof(double));
00159 
00160         if(feature_max == NULL || feature_min == NULL)
00161         {
00162                 fprintf(stderr,"can't allocate enough memory\n");
00163                 exit(1);
00164         }
00165 
00166         for(i=0;i<=max_index;i++)
00167         {
00168                 feature_max[i]=-DBL_MAX;
00169                 feature_min[i]=DBL_MAX;
00170         }
00171 
00172         /* pass 2: find out min/max value */
00173         while(readline(fp)!=NULL)
00174         {
00175                 char *p=line;
00176                 int next_index=1;
00177                 double target;
00178                 double value;
00179 
00180                 sscanf(p,"%lf",&target);
00181                 y_max = max(y_max,target);
00182                 y_min = min(y_min,target);
00183                 
00184                 SKIP_TARGET
00185 
00186                 while(sscanf(p,"%d:%lf",&index,&value)==2)
00187                 {
00188                         for(i=next_index;i<index;i++)
00189                         {
00190                                 feature_max[i]=max(feature_max[i],0);
00191                                 feature_min[i]=min(feature_min[i],0);
00192                         }
00193                         
00194                         feature_max[index]=max(feature_max[index],value);
00195                         feature_min[index]=min(feature_min[index],value);
00196 
00197                         SKIP_ELEMENT
00198                         next_index=index+1;
00199                 }               
00200 
00201                 for(i=next_index;i<=max_index;i++)
00202                 {
00203                         feature_max[i]=max(feature_max[i],0);
00204                         feature_min[i]=min(feature_min[i],0);
00205                 }       
00206         }
00207 
00208         rewind(fp);
00209 
00210         /* pass 2.5: save/restore feature_min/feature_max */
00211         
00212         if(restore_filename)
00213         {
00214                 /* fp_restore rewinded in finding max_index */
00215                 int idx, c;
00216                 double fmin, fmax;
00217                 int next_index = 1;
00218                 
00219                 if((c = fgetc(fp_restore)) == 'y')
00220                 {
00221                         fscanf(fp_restore, "%lf %lf\n", &y_lower, &y_upper);
00222                         fscanf(fp_restore, "%lf %lf\n", &y_min, &y_max);
00223                         y_scaling = 1;
00224                 }
00225                 else
00226                         ungetc(c, fp_restore);
00227 
00228                 if (fgetc(fp_restore) == 'x') 
00229                 {
00230                         fscanf(fp_restore, "%lf %lf\n", &lower, &upper);
00231                         while(fscanf(fp_restore,"%d %lf %lf\n",&idx,&fmin,&fmax)==3)
00232                         {
00233                                 for(i = next_index;i<idx;i++)
00234                                         if(feature_min[i] != feature_max[i])
00235                                                 fprintf(stderr,
00236                                                         "WARNING: feature index %d appeared in file %s was not seen in the scaling factor file %s.\n",
00237                                                         i, argv[argc-1], restore_filename);
00238 
00239                                 feature_min[idx] = fmin;
00240                                 feature_max[idx] = fmax;
00241 
00242                                 next_index = idx + 1;
00243                         }
00244                         
00245                         for(i=next_index;i<=max_index;i++)
00246                                 if(feature_min[i] != feature_max[i])
00247                                         fprintf(stderr,
00248                                                 "WARNING: feature index %d appeared in file %s was not seen in the scaling factor file %s.\n",
00249                                                 i, argv[argc-1], restore_filename);
00250                 }
00251                 fclose(fp_restore);
00252         }
00253 
00254         if(save_filename)
00255         {
00256                 FILE *fp_save = fopen(save_filename,"w");
00257                 if(fp_save==NULL)
00258                 {
00259                         fprintf(stderr,"can't open file %s\n", save_filename);
00260                         exit(1);
00261                 }
00262                 if(y_scaling)
00263                 {
00264                         fprintf(fp_save, "y\n");
00265                         fprintf(fp_save, "%.16g %.16g\n", y_lower, y_upper);
00266                         fprintf(fp_save, "%.16g %.16g\n", y_min, y_max);
00267                 }
00268                 fprintf(fp_save, "x\n");
00269                 fprintf(fp_save, "%.16g %.16g\n", lower, upper);
00270                 for(i=1;i<=max_index;i++)
00271                 {
00272                         if(feature_min[i]!=feature_max[i])
00273                                 fprintf(fp_save,"%d %.16g %.16g\n",i,feature_min[i],feature_max[i]);
00274                 }
00275 
00276                 if(min_index < 1)
00277                         fprintf(stderr,
00278                                 "WARNING: scaling factors with indices smaller than 1 are not stored to the file %s.\n", save_filename);
00279 
00280                 fclose(fp_save);
00281         }
00282         
00283         /* pass 3: scale */
00284         while(readline(fp)!=NULL)
00285         {
00286                 char *p=line;
00287                 int next_index=1;
00288                 double target;
00289                 double value;
00290                 
00291                 sscanf(p,"%lf",&target);
00292                 output_target(target);
00293 
00294                 SKIP_TARGET
00295 
00296                 while(sscanf(p,"%d:%lf",&index,&value)==2)
00297                 {
00298                         for(i=next_index;i<index;i++)
00299                                 output(i,0);
00300                         
00301                         output(index,value);
00302 
00303                         SKIP_ELEMENT
00304                         next_index=index+1;
00305                 }               
00306 
00307                 for(i=next_index;i<=max_index;i++)
00308                         output(i,0);
00309 
00310                 printf("\n");
00311         }
00312 
00313         if (new_num_nonzeros > num_nonzeros)
00314                 fprintf(stderr, 
00315                         "WARNING: original #nonzeros %ld\n"
00316                         "         new      #nonzeros %ld\n"
00317                         "Use -l 0 if many original feature values are zeros\n",
00318                         num_nonzeros, new_num_nonzeros);
00319 
00320         free(line);
00321         free(feature_max);
00322         free(feature_min);
00323         fclose(fp);
00324         return 0;
00325 }
00326 
00327 char* readline(FILE *input)
00328 {
00329         int len;
00330         
00331         if(fgets(line,max_line_len,input) == NULL)
00332                 return NULL;
00333 
00334         while(strrchr(line,'\n') == NULL)
00335         {
00336                 max_line_len *= 2;
00337                 line = (char *) realloc(line, max_line_len);
00338                 len = (int) strlen(line);
00339                 if(fgets(line+len,max_line_len-len,input) == NULL)
00340                         break;
00341         }
00342         return line;
00343 }
00344 
00345 void output_target(double value)
00346 {
00347         if(y_scaling)
00348         {
00349                 if(value == y_min)
00350                         value = y_lower;
00351                 else if(value == y_max)
00352                         value = y_upper;
00353                 else value = y_lower + (y_upper-y_lower) *
00354                              (value - y_min)/(y_max-y_min);
00355         }
00356         printf("%g ",value);
00357 }
00358 
00359 void output(int index, double value)
00360 {
00361         /* skip single-valued attribute */
00362         if(feature_max[index] == feature_min[index])
00363                 return;
00364 
00365         if(value == feature_min[index])
00366                 value = lower;
00367         else if(value == feature_max[index])
00368                 value = upper;
00369         else
00370                 value = lower + (upper-lower) * 
00371                         (value-feature_min[index])/
00372                         (feature_max[index]-feature_min[index]);
00373 
00374         if(value != 0)
00375         {
00376                 printf("%d:%g ",index, value);
00377                 new_num_nonzeros++;
00378         }
00379 }


ml_classifiers
Author(s): Scott Niekum
autogenerated on Mon Oct 6 2014 02:20:58