svm-scale.c
Go to the documentation of this file.
00001 #include <float.h>
00002 #include <stdio.h>
00003 #include <stdlib.h>
00004 #include <ctype.h>
00005 #include <string.h>
00006 
/*
 * Print the command-line usage summary to stdout and terminate the
 * process with exit status 1. Never returns.
 */
void exit_with_help()
{
	static const char usage[] =
		"Usage: svm-scale [options] data_filename\n"
		"options:\n"
		"-l lower : x scaling lower limit (default -1)\n"
		"-u upper : x scaling upper limit (default +1)\n"
		"-y y_lower y_upper : y scaling limits (default: no y scaling)\n"
		"-s save_filename : save scaling parameters to save_filename\n"
		"-r restore_filename : restore scaling parameters from restore_filename\n";

	/* The text holds no conversion specifiers, so it is written verbatim. */
	fputs(usage, stdout);
	exit(1);
}
00020 
/* Shared input buffer: readline() stores the current line here and grows
 * it (doubling max_line_len) when a line does not fit. */
char *line = NULL;
int max_line_len = 1024;

/* Target range for scaled feature values (-l / -u). */
double lower = -1.0;
double upper = 1.0;

/* Target range for scaled labels (-y); only meaningful when y_scaling is set. */
double y_lower;
double y_upper;
int y_scaling = 0;

/* Per-feature extrema, indexed by feature index; allocated in main(). */
double *feature_max;
double *feature_min;

/* Observed label extrema, seeded so that any real value replaces them. */
double y_max = -DBL_MAX;
double y_min = DBL_MAX;

/* Largest / smallest feature index encountered. */
int max_index;
int min_index;

/* Number of feature entries read from the input vs. entries written out. */
long int num_nonzeros = 0;
long int new_num_nonzeros = 0;

/* NOTE: both macros evaluate their arguments more than once. */
#define max(a,b) (((a)>(b))?(a):(b))
#define min(a,b) (((a)<(b))?(a):(b))

void output_target(double value);
void output(int index, double value);
char* readline(FILE *input);
int clean_up(FILE *fp_restore, FILE *fp, const char *msg);
00041 
00042 int main(int argc,char **argv)
00043 {
00044         int i,index;
00045         FILE *fp, *fp_restore = NULL;
00046         char *save_filename = NULL;
00047         char *restore_filename = NULL;
00048 
00049         for(i=1;i<argc;i++)
00050         {
00051                 if(argv[i][0] != '-') break;
00052                 ++i;
00053                 switch(argv[i-1][1])
00054                 {
00055                         case 'l': lower = atof(argv[i]); break;
00056                         case 'u': upper = atof(argv[i]); break;
00057                         case 'y':
00058                                 y_lower = atof(argv[i]);
00059                                 ++i;
00060                                 y_upper = atof(argv[i]);
00061                                 y_scaling = 1;
00062                                 break;
00063                         case 's': save_filename = argv[i]; break;
00064                         case 'r': restore_filename = argv[i]; break;
00065                         default:
00066                                 fprintf(stderr,"unknown option\n");
00067                                 exit_with_help();
00068                 }
00069         }
00070 
00071         if(!(upper > lower) || (y_scaling && !(y_upper > y_lower)))
00072         {
00073                 fprintf(stderr,"inconsistent lower/upper specification\n");
00074                 exit(1);
00075         }
00076         
00077         if(restore_filename && save_filename)
00078         {
00079                 fprintf(stderr,"cannot use -r and -s simultaneously\n");
00080                 exit(1);
00081         }
00082 
00083         if(argc != i+1) 
00084                 exit_with_help();
00085 
00086         fp=fopen(argv[i],"r");
00087         
00088         if(fp==NULL)
00089         {
00090                 fprintf(stderr,"can't open file %s\n", argv[i]);
00091                 exit(1);
00092         }
00093 
00094         line = (char *) malloc(max_line_len*sizeof(char));
00095 
00096 #define SKIP_TARGET\
00097         while(isspace(*p)) ++p;\
00098         while(!isspace(*p)) ++p;
00099 
00100 #define SKIP_ELEMENT\
00101         while(*p!=':') ++p;\
00102         ++p;\
00103         while(isspace(*p)) ++p;\
00104         while(*p && !isspace(*p)) ++p;
00105         
00106         /* assumption: min index of attributes is 1 */
00107         /* pass 1: find out max index of attributes */
00108         max_index = 0;
00109         min_index = 1;
00110 
00111         if(restore_filename)
00112         {
00113                 int idx, c;
00114 
00115                 fp_restore = fopen(restore_filename,"r");
00116                 if(fp_restore==NULL)
00117                 {
00118                         fprintf(stderr,"can't open file %s\n", restore_filename);
00119                         exit(1);
00120                 }
00121 
00122                 c = fgetc(fp_restore);
00123                 if(c == 'y')
00124                 {
00125                         readline(fp_restore);
00126                         readline(fp_restore);
00127                         readline(fp_restore);
00128                 }
00129                 readline(fp_restore);
00130                 readline(fp_restore);
00131 
00132                 while(fscanf(fp_restore,"%d %*f %*f\n",&idx) == 1)
00133                         max_index = max(idx,max_index);
00134                 rewind(fp_restore);
00135         }
00136 
00137         while(readline(fp)!=NULL)
00138         {
00139                 char *p=line;
00140 
00141                 SKIP_TARGET
00142 
00143                 while(sscanf(p,"%d:%*f",&index)==1)
00144                 {
00145                         max_index = max(max_index, index);
00146                         min_index = min(min_index, index);
00147                         SKIP_ELEMENT
00148                         num_nonzeros++;
00149                 }
00150         }
00151 
00152         if(min_index < 1)
00153                 fprintf(stderr,
00154                         "WARNING: minimal feature index is %d, but indices should start from 1\n", min_index);
00155 
00156         rewind(fp);
00157 
00158         feature_max = (double *)malloc((max_index+1)* sizeof(double));
00159         feature_min = (double *)malloc((max_index+1)* sizeof(double));
00160 
00161         if(feature_max == NULL || feature_min == NULL)
00162         {
00163                 fprintf(stderr,"can't allocate enough memory\n");
00164                 exit(1);
00165         }
00166 
00167         for(i=0;i<=max_index;i++)
00168         {
00169                 feature_max[i]=-DBL_MAX;
00170                 feature_min[i]=DBL_MAX;
00171         }
00172 
00173         /* pass 2: find out min/max value */
00174         while(readline(fp)!=NULL)
00175         {
00176                 char *p=line;
00177                 int next_index=1;
00178                 double target;
00179                 double value;
00180 
00181                 if (sscanf(p,"%lf",&target) != 1)
00182                         return clean_up(fp_restore, fp, "ERROR: failed to read labels\n");
00183                 y_max = max(y_max,target);
00184                 y_min = min(y_min,target);
00185                 
00186                 SKIP_TARGET
00187 
00188                 while(sscanf(p,"%d:%lf",&index,&value)==2)
00189                 {
00190                         for(i=next_index;i<index;i++)
00191                         {
00192                                 feature_max[i]=max(feature_max[i],0);
00193                                 feature_min[i]=min(feature_min[i],0);
00194                         }
00195                         
00196                         feature_max[index]=max(feature_max[index],value);
00197                         feature_min[index]=min(feature_min[index],value);
00198 
00199                         SKIP_ELEMENT
00200                         next_index=index+1;
00201                 }               
00202 
00203                 for(i=next_index;i<=max_index;i++)
00204                 {
00205                         feature_max[i]=max(feature_max[i],0);
00206                         feature_min[i]=min(feature_min[i],0);
00207                 }       
00208         }
00209 
00210         rewind(fp);
00211 
00212         /* pass 2.5: save/restore feature_min/feature_max */
00213         
00214         if(restore_filename)
00215         {
00216                 /* fp_restore rewinded in finding max_index */
00217                 int idx, c;
00218                 double fmin, fmax;
00219                 int next_index = 1;
00220                 
00221                 if((c = fgetc(fp_restore)) == 'y')
00222                 {
00223                         if(fscanf(fp_restore, "%lf %lf\n", &y_lower, &y_upper) != 2 ||
00224                            fscanf(fp_restore, "%lf %lf\n", &y_min, &y_max) != 2)
00225                                 return clean_up(fp_restore, fp, "ERROR: failed to read scaling parameters\n");
00226                         y_scaling = 1;
00227                 }
00228                 else
00229                         ungetc(c, fp_restore);
00230 
00231                 if (fgetc(fp_restore) == 'x') 
00232                 {
00233                         if(fscanf(fp_restore, "%lf %lf\n", &lower, &upper) != 2)
00234                                 return clean_up(fp_restore, fp, "ERROR: failed to read scaling parameters\n");
00235                         while(fscanf(fp_restore,"%d %lf %lf\n",&idx,&fmin,&fmax)==3)
00236                         {
00237                                 for(i = next_index;i<idx;i++)
00238                                         if(feature_min[i] != feature_max[i])
00239                                                 fprintf(stderr,
00240                                                         "WARNING: feature index %d appeared in file %s was not seen in the scaling factor file %s.\n",
00241                                                         i, argv[argc-1], restore_filename);
00242 
00243                                 feature_min[idx] = fmin;
00244                                 feature_max[idx] = fmax;
00245 
00246                                 next_index = idx + 1;
00247                         }
00248                         
00249                         for(i=next_index;i<=max_index;i++)
00250                                 if(feature_min[i] != feature_max[i])
00251                                         fprintf(stderr,
00252                                                 "WARNING: feature index %d appeared in file %s was not seen in the scaling factor file %s.\n",
00253                                                 i, argv[argc-1], restore_filename);
00254                 }
00255                 fclose(fp_restore);
00256         }
00257 
00258         if(save_filename)
00259         {
00260                 FILE *fp_save = fopen(save_filename,"w");
00261                 if(fp_save==NULL)
00262                 {
00263                         fprintf(stderr,"can't open file %s\n", save_filename);
00264                         exit(1);
00265                 }
00266                 if(y_scaling)
00267                 {
00268                         fprintf(fp_save, "y\n");
00269                         fprintf(fp_save, "%.16g %.16g\n", y_lower, y_upper);
00270                         fprintf(fp_save, "%.16g %.16g\n", y_min, y_max);
00271                 }
00272                 fprintf(fp_save, "x\n");
00273                 fprintf(fp_save, "%.16g %.16g\n", lower, upper);
00274                 for(i=1;i<=max_index;i++)
00275                 {
00276                         if(feature_min[i]!=feature_max[i])
00277                                 fprintf(fp_save,"%d %.16g %.16g\n",i,feature_min[i],feature_max[i]);
00278                 }
00279 
00280                 if(min_index < 1)
00281                         fprintf(stderr,
00282                                 "WARNING: scaling factors with indices smaller than 1 are not stored to the file %s.\n", save_filename);
00283 
00284                 fclose(fp_save);
00285         }
00286         
00287         /* pass 3: scale */
00288         while(readline(fp)!=NULL)
00289         {
00290                 char *p=line;
00291                 int next_index=1;
00292                 double target;
00293                 double value;
00294                 
00295                 if (sscanf(p,"%lf",&target) != 1)
00296                         return clean_up(NULL, fp, "ERROR: failed to read labels\n");
00297                 output_target(target);
00298 
00299                 SKIP_TARGET
00300 
00301                 while(sscanf(p,"%d:%lf",&index,&value)==2)
00302                 {
00303                         for(i=next_index;i<index;i++)
00304                                 output(i,0);
00305                         
00306                         output(index,value);
00307 
00308                         SKIP_ELEMENT
00309                         next_index=index+1;
00310                 }               
00311 
00312                 for(i=next_index;i<=max_index;i++)
00313                         output(i,0);
00314 
00315                 printf("\n");
00316         }
00317 
00318         if (new_num_nonzeros > num_nonzeros)
00319                 fprintf(stderr, 
00320                         "WARNING: original #nonzeros %ld\n"
00321                         "       > new      #nonzeros %ld\n"
00322                         "If feature values are non-negative and sparse, use -l 0 rather than the default -l -1\n",
00323                         num_nonzeros, new_num_nonzeros);
00324 
00325         free(line);
00326         free(feature_max);
00327         free(feature_min);
00328         fclose(fp);
00329         return 0;
00330 }
00331 
00332 char* readline(FILE *input)
00333 {
00334         int len;
00335         
00336         if(fgets(line,max_line_len,input) == NULL)
00337                 return NULL;
00338 
00339         while(strrchr(line,'\n') == NULL)
00340         {
00341                 max_line_len *= 2;
00342                 line = (char *) realloc(line, max_line_len);
00343                 len = (int) strlen(line);
00344                 if(fgets(line+len,max_line_len-len,input) == NULL)
00345                         break;
00346         }
00347         return line;
00348 }
00349 
00350 void output_target(double value)
00351 {
00352         if(y_scaling)
00353         {
00354                 if(value == y_min)
00355                         value = y_lower;
00356                 else if(value == y_max)
00357                         value = y_upper;
00358                 else value = y_lower + (y_upper-y_lower) *
00359                              (value - y_min)/(y_max-y_min);
00360         }
00361         printf("%g ",value);
00362 }
00363 
00364 void output(int index, double value)
00365 {
00366         /* skip single-valued attribute */
00367         if(feature_max[index] == feature_min[index])
00368                 return;
00369 
00370         if(value == feature_min[index])
00371                 value = lower;
00372         else if(value == feature_max[index])
00373                 value = upper;
00374         else
00375                 value = lower + (upper-lower) * 
00376                         (value-feature_min[index])/
00377                         (feature_max[index]-feature_min[index]);
00378 
00379         if(value != 0)
00380         {
00381                 printf("%d:%g ",index, value);
00382                 new_num_nonzeros++;
00383         }
00384 }
00385 
00386 int clean_up(FILE *fp_restore, FILE *fp, const char* msg)
00387 {
00388         fprintf(stderr, "%s", msg);
00389         free(line);
00390         free(feature_max);
00391         free(feature_min);
00392         fclose(fp);
00393         if (fp_restore)
00394                 fclose(fp_restore);
00395         return -1;
00396 }
00397 


target_obejct_detector
Author(s): CIR-KIT
autogenerated on Thu Jun 6 2019 20:19:57