00001 #include <float.h>
00002 #include <stdio.h>
00003 #include <stdlib.h>
00004 #include <ctype.h>
00005 #include <string.h>
00006
/*
 * Print the command-line usage summary to standard output and terminate
 * the process with exit status 1. Never returns.
 */
void exit_with_help()
{
	/* Kept as one literal so the help text reads exactly as it is printed. */
	static const char usage[] =
		"Usage: svm-scale [options] data_filename\n"
		"options:\n"
		"-l lower : x scaling lower limit (default -1)\n"
		"-u upper : x scaling upper limit (default +1)\n"
		"-y y_lower y_upper : y scaling limits (default: no y scaling)\n"
		"-s save_filename : save scaling parameters to save_filename\n"
		"-r restore_filename : restore scaling parameters from restore_filename\n";

	fputs(usage, stdout);
	exit(1);
}
00020
/* Buffer holding the line most recently read by readline(); allocated in
 * main() and enlarged by readline() when a line does not fit. */
char *line = NULL;
/* Current capacity of `line` in bytes; doubled by readline() as needed. */
int max_line_len = 1024;
/* Feature (x) scaling limits set by -l / -u or the restore file, and target
 * (y) scaling limits set by -y or the restore file. */
double lower=-1.0,upper=1.0,y_lower,y_upper;
/* Non-zero when target values are to be scaled (set by -y or by a restore
 * file that starts with a 'y' section). */
int y_scaling = 0;
/* Per-feature maximum and minimum, indexed by feature index; main() allocates
 * max_index+1 entries for each. */
double *feature_max;
double *feature_min;
/* Largest and smallest target value; they start at the extreme sentinels so
 * the first observed target replaces them. */
double y_max = -DBL_MAX;
double y_min = DBL_MAX;
/* Largest and smallest feature index encountered (main() resets them to 0 and
 * 1 before scanning). */
int max_index;
int min_index;
/* Number of index:value entries parsed from the input data in main()'s first
 * pass, and number of entries actually printed by output(). */
long int num_nonzeros = 0;
long int new_num_nonzeros = 0;

/* NOTE: both macros may evaluate an argument twice; do not pass expressions
 * with side effects. */
#define max(x,y) (((x)>(y))?(x):(y))
#define min(x,y) (((x)<(y))?(x):(y))

/* Print the (optionally scaled) target value followed by a space. */
void output_target(double value);
/* Print one scaled "index:value" entry, unless it is omitted (see definition). */
void output(int index, double value);
/* Read one whole line from `input` into the global `line`; NULL at end of input. */
char* readline(FILE *input);
00040
00041 int main(int argc,char **argv)
00042 {
00043 int i,index;
00044 FILE *fp, *fp_restore = NULL;
00045 char *save_filename = NULL;
00046 char *restore_filename = NULL;
00047
00048 for(i=1;i<argc;i++)
00049 {
00050 if(argv[i][0] != '-') break;
00051 ++i;
00052 switch(argv[i-1][1])
00053 {
00054 case 'l': lower = atof(argv[i]); break;
00055 case 'u': upper = atof(argv[i]); break;
00056 case 'y':
00057 y_lower = atof(argv[i]);
00058 ++i;
00059 y_upper = atof(argv[i]);
00060 y_scaling = 1;
00061 break;
00062 case 's': save_filename = argv[i]; break;
00063 case 'r': restore_filename = argv[i]; break;
00064 default:
00065 fprintf(stderr,"unknown option\n");
00066 exit_with_help();
00067 }
00068 }
00069
00070 if(!(upper > lower) || (y_scaling && !(y_upper > y_lower)))
00071 {
00072 fprintf(stderr,"inconsistent lower/upper specification\n");
00073 exit(1);
00074 }
00075
00076 if(restore_filename && save_filename)
00077 {
00078 fprintf(stderr,"cannot use -r and -s simultaneously\n");
00079 exit(1);
00080 }
00081
00082 if(argc != i+1)
00083 exit_with_help();
00084
00085 fp=fopen(argv[i],"r");
00086
00087 if(fp==NULL)
00088 {
00089 fprintf(stderr,"can't open file %s\n", argv[i]);
00090 exit(1);
00091 }
00092
00093 line = (char *) malloc(max_line_len*sizeof(char));
00094
00095 #define SKIP_TARGET\
00096 while(isspace(*p)) ++p;\
00097 while(!isspace(*p)) ++p;
00098
00099 #define SKIP_ELEMENT\
00100 while(*p!=':') ++p;\
00101 ++p;\
00102 while(isspace(*p)) ++p;\
00103 while(*p && !isspace(*p)) ++p;
00104
00105
00106
00107 max_index = 0;
00108 min_index = 1;
00109
00110 if(restore_filename)
00111 {
00112 int idx, c;
00113
00114 fp_restore = fopen(restore_filename,"r");
00115 if(fp_restore==NULL)
00116 {
00117 fprintf(stderr,"can't open file %s\n", restore_filename);
00118 exit(1);
00119 }
00120
00121 c = fgetc(fp_restore);
00122 if(c == 'y')
00123 {
00124 readline(fp_restore);
00125 readline(fp_restore);
00126 readline(fp_restore);
00127 }
00128 readline(fp_restore);
00129 readline(fp_restore);
00130
00131 while(fscanf(fp_restore,"%d %*f %*f\n",&idx) == 1)
00132 max_index = max(idx,max_index);
00133 rewind(fp_restore);
00134 }
00135
00136 while(readline(fp)!=NULL)
00137 {
00138 char *p=line;
00139
00140 SKIP_TARGET
00141
00142 while(sscanf(p,"%d:%*f",&index)==1)
00143 {
00144 max_index = max(max_index, index);
00145 min_index = min(min_index, index);
00146 SKIP_ELEMENT
00147 num_nonzeros++;
00148 }
00149 }
00150
00151 if(min_index < 1)
00152 fprintf(stderr,
00153 "WARNING: minimal feature index is %d, but indices should start from 1\n", min_index);
00154
00155 rewind(fp);
00156
00157 feature_max = (double *)malloc((max_index+1)* sizeof(double));
00158 feature_min = (double *)malloc((max_index+1)* sizeof(double));
00159
00160 if(feature_max == NULL || feature_min == NULL)
00161 {
00162 fprintf(stderr,"can't allocate enough memory\n");
00163 exit(1);
00164 }
00165
00166 for(i=0;i<=max_index;i++)
00167 {
00168 feature_max[i]=-DBL_MAX;
00169 feature_min[i]=DBL_MAX;
00170 }
00171
00172
00173 while(readline(fp)!=NULL)
00174 {
00175 char *p=line;
00176 int next_index=1;
00177 double target;
00178 double value;
00179
00180 sscanf(p,"%lf",&target);
00181 y_max = max(y_max,target);
00182 y_min = min(y_min,target);
00183
00184 SKIP_TARGET
00185
00186 while(sscanf(p,"%d:%lf",&index,&value)==2)
00187 {
00188 for(i=next_index;i<index;i++)
00189 {
00190 feature_max[i]=max(feature_max[i],0);
00191 feature_min[i]=min(feature_min[i],0);
00192 }
00193
00194 feature_max[index]=max(feature_max[index],value);
00195 feature_min[index]=min(feature_min[index],value);
00196
00197 SKIP_ELEMENT
00198 next_index=index+1;
00199 }
00200
00201 for(i=next_index;i<=max_index;i++)
00202 {
00203 feature_max[i]=max(feature_max[i],0);
00204 feature_min[i]=min(feature_min[i],0);
00205 }
00206 }
00207
00208 rewind(fp);
00209
00210
00211
00212 if(restore_filename)
00213 {
00214
00215 int idx, c;
00216 double fmin, fmax;
00217 int next_index = 1;
00218
00219 if((c = fgetc(fp_restore)) == 'y')
00220 {
00221 fscanf(fp_restore, "%lf %lf\n", &y_lower, &y_upper);
00222 fscanf(fp_restore, "%lf %lf\n", &y_min, &y_max);
00223 y_scaling = 1;
00224 }
00225 else
00226 ungetc(c, fp_restore);
00227
00228 if (fgetc(fp_restore) == 'x')
00229 {
00230 fscanf(fp_restore, "%lf %lf\n", &lower, &upper);
00231 while(fscanf(fp_restore,"%d %lf %lf\n",&idx,&fmin,&fmax)==3)
00232 {
00233 for(i = next_index;i<idx;i++)
00234 if(feature_min[i] != feature_max[i])
00235 fprintf(stderr,
00236 "WARNING: feature index %d appeared in file %s was not seen in the scaling factor file %s.\n",
00237 i, argv[argc-1], restore_filename);
00238
00239 feature_min[idx] = fmin;
00240 feature_max[idx] = fmax;
00241
00242 next_index = idx + 1;
00243 }
00244
00245 for(i=next_index;i<=max_index;i++)
00246 if(feature_min[i] != feature_max[i])
00247 fprintf(stderr,
00248 "WARNING: feature index %d appeared in file %s was not seen in the scaling factor file %s.\n",
00249 i, argv[argc-1], restore_filename);
00250 }
00251 fclose(fp_restore);
00252 }
00253
00254 if(save_filename)
00255 {
00256 FILE *fp_save = fopen(save_filename,"w");
00257 if(fp_save==NULL)
00258 {
00259 fprintf(stderr,"can't open file %s\n", save_filename);
00260 exit(1);
00261 }
00262 if(y_scaling)
00263 {
00264 fprintf(fp_save, "y\n");
00265 fprintf(fp_save, "%.16g %.16g\n", y_lower, y_upper);
00266 fprintf(fp_save, "%.16g %.16g\n", y_min, y_max);
00267 }
00268 fprintf(fp_save, "x\n");
00269 fprintf(fp_save, "%.16g %.16g\n", lower, upper);
00270 for(i=1;i<=max_index;i++)
00271 {
00272 if(feature_min[i]!=feature_max[i])
00273 fprintf(fp_save,"%d %.16g %.16g\n",i,feature_min[i],feature_max[i]);
00274 }
00275
00276 if(min_index < 1)
00277 fprintf(stderr,
00278 "WARNING: scaling factors with indices smaller than 1 are not stored to the file %s.\n", save_filename);
00279
00280 fclose(fp_save);
00281 }
00282
00283
00284 while(readline(fp)!=NULL)
00285 {
00286 char *p=line;
00287 int next_index=1;
00288 double target;
00289 double value;
00290
00291 sscanf(p,"%lf",&target);
00292 output_target(target);
00293
00294 SKIP_TARGET
00295
00296 while(sscanf(p,"%d:%lf",&index,&value)==2)
00297 {
00298 for(i=next_index;i<index;i++)
00299 output(i,0);
00300
00301 output(index,value);
00302
00303 SKIP_ELEMENT
00304 next_index=index+1;
00305 }
00306
00307 for(i=next_index;i<=max_index;i++)
00308 output(i,0);
00309
00310 printf("\n");
00311 }
00312
00313 if (new_num_nonzeros > num_nonzeros)
00314 fprintf(stderr,
00315 "WARNING: original #nonzeros %ld\n"
00316 " new #nonzeros %ld\n"
00317 "Use -l 0 if many original feature values are zeros\n",
00318 num_nonzeros, new_num_nonzeros);
00319
00320 free(line);
00321 free(feature_max);
00322 free(feature_min);
00323 fclose(fp);
00324 return 0;
00325 }
00326
00327 char* readline(FILE *input)
00328 {
00329 int len;
00330
00331 if(fgets(line,max_line_len,input) == NULL)
00332 return NULL;
00333
00334 while(strrchr(line,'\n') == NULL)
00335 {
00336 max_line_len *= 2;
00337 line = (char *) realloc(line, max_line_len);
00338 len = (int) strlen(line);
00339 if(fgets(line+len,max_line_len-len,input) == NULL)
00340 break;
00341 }
00342 return line;
00343 }
00344
00345 void output_target(double value)
00346 {
00347 if(y_scaling)
00348 {
00349 if(value == y_min)
00350 value = y_lower;
00351 else if(value == y_max)
00352 value = y_upper;
00353 else value = y_lower + (y_upper-y_lower) *
00354 (value - y_min)/(y_max-y_min);
00355 }
00356 printf("%g ",value);
00357 }
00358
00359 void output(int index, double value)
00360 {
00361
00362 if(feature_max[index] == feature_min[index])
00363 return;
00364
00365 if(value == feature_min[index])
00366 value = lower;
00367 else if(value == feature_max[index])
00368 value = upper;
00369 else
00370 value = lower + (upper-lower) *
00371 (value-feature_min[index])/
00372 (feature_max[index]-feature_min[index]);
00373
00374 if(value != 0)
00375 {
00376 printf("%d:%g ",index, value);
00377 new_num_nonzeros++;
00378 }
00379 }