00001 #include <float.h>
00002 #include <stdio.h>
00003 #include <stdlib.h>
00004 #include <ctype.h>
00005 #include <string.h>
00006
/*
 * Print the command-line usage summary to standard output and terminate
 * the process with exit status 1. Never returns.
 */
void exit_with_help()
{
	static const char usage_text[] =
		"Usage: svm-scale [options] data_filename\n"
		"options:\n"
		"-l lower : x scaling lower limit (default -1)\n"
		"-u upper : x scaling upper limit (default +1)\n"
		"-y y_lower y_upper : y scaling limits (default: no y scaling)\n"
		"-s save_filename : save scaling parameters to save_filename\n"
		"-r restore_filename : restore scaling parameters from restore_filename\n";

	/* The text contains no conversion specifiers, so it is written verbatim. */
	fputs(usage_text, stdout);
	exit(1);
}
00020
/* Line buffer shared with readline(); main() allocates it and readline()
 * grows it, doubling max_line_len each time a line does not fit. */
char *line = NULL;
int max_line_len = 1024;

/* Target range for scaled feature values (-l / -u options). */
double lower = -1.0;
double upper = 1.0;

/* Target range for scaled labels (-y option); only used when y_scaling != 0. */
double y_lower;
double y_upper;
int y_scaling = 0;

/* Per-feature observed extremes, indexed by feature index (0..max_index). */
double *feature_max;
double *feature_min;

/* Observed label extremes; start at the opposite ends of the double range
 * so the first label seen replaces both. */
double y_max = -DBL_MAX;
double y_min = DBL_MAX;

/* Largest and smallest feature index encountered. */
int max_index;
int min_index;

/* Element counts before and after scaling, compared in main() to warn
 * when scaling made the data less sparse. */
long int num_nonzeros = 0;
long int new_num_nonzeros = 0;

/* NOTE: both macros evaluate their arguments more than once. */
#define max(x,y) (((x) > (y)) ? (x) : (y))
#define min(x,y) (((x) < (y)) ? (x) : (y))

/* Helpers defined after main(). */
void output_target(double value);
void output(int index, double value);
char* readline(FILE *input);
int clean_up(FILE *fp_restore, FILE *fp, const char *msg);
00041
00042 int main(int argc,char **argv)
00043 {
00044 int i,index;
00045 FILE *fp, *fp_restore = NULL;
00046 char *save_filename = NULL;
00047 char *restore_filename = NULL;
00048
00049 for(i=1;i<argc;i++)
00050 {
00051 if(argv[i][0] != '-') break;
00052 ++i;
00053 switch(argv[i-1][1])
00054 {
00055 case 'l': lower = atof(argv[i]); break;
00056 case 'u': upper = atof(argv[i]); break;
00057 case 'y':
00058 y_lower = atof(argv[i]);
00059 ++i;
00060 y_upper = atof(argv[i]);
00061 y_scaling = 1;
00062 break;
00063 case 's': save_filename = argv[i]; break;
00064 case 'r': restore_filename = argv[i]; break;
00065 default:
00066 fprintf(stderr,"unknown option\n");
00067 exit_with_help();
00068 }
00069 }
00070
00071 if(!(upper > lower) || (y_scaling && !(y_upper > y_lower)))
00072 {
00073 fprintf(stderr,"inconsistent lower/upper specification\n");
00074 exit(1);
00075 }
00076
00077 if(restore_filename && save_filename)
00078 {
00079 fprintf(stderr,"cannot use -r and -s simultaneously\n");
00080 exit(1);
00081 }
00082
00083 if(argc != i+1)
00084 exit_with_help();
00085
00086 fp=fopen(argv[i],"r");
00087
00088 if(fp==NULL)
00089 {
00090 fprintf(stderr,"can't open file %s\n", argv[i]);
00091 exit(1);
00092 }
00093
00094 line = (char *) malloc(max_line_len*sizeof(char));
00095
00096 #define SKIP_TARGET\
00097 while(isspace(*p)) ++p;\
00098 while(!isspace(*p)) ++p;
00099
00100 #define SKIP_ELEMENT\
00101 while(*p!=':') ++p;\
00102 ++p;\
00103 while(isspace(*p)) ++p;\
00104 while(*p && !isspace(*p)) ++p;
00105
00106
00107
00108 max_index = 0;
00109 min_index = 1;
00110
00111 if(restore_filename)
00112 {
00113 int idx, c;
00114
00115 fp_restore = fopen(restore_filename,"r");
00116 if(fp_restore==NULL)
00117 {
00118 fprintf(stderr,"can't open file %s\n", restore_filename);
00119 exit(1);
00120 }
00121
00122 c = fgetc(fp_restore);
00123 if(c == 'y')
00124 {
00125 readline(fp_restore);
00126 readline(fp_restore);
00127 readline(fp_restore);
00128 }
00129 readline(fp_restore);
00130 readline(fp_restore);
00131
00132 while(fscanf(fp_restore,"%d %*f %*f\n",&idx) == 1)
00133 max_index = max(idx,max_index);
00134 rewind(fp_restore);
00135 }
00136
00137 while(readline(fp)!=NULL)
00138 {
00139 char *p=line;
00140
00141 SKIP_TARGET
00142
00143 while(sscanf(p,"%d:%*f",&index)==1)
00144 {
00145 max_index = max(max_index, index);
00146 min_index = min(min_index, index);
00147 SKIP_ELEMENT
00148 num_nonzeros++;
00149 }
00150 }
00151
00152 if(min_index < 1)
00153 fprintf(stderr,
00154 "WARNING: minimal feature index is %d, but indices should start from 1\n", min_index);
00155
00156 rewind(fp);
00157
00158 feature_max = (double *)malloc((max_index+1)* sizeof(double));
00159 feature_min = (double *)malloc((max_index+1)* sizeof(double));
00160
00161 if(feature_max == NULL || feature_min == NULL)
00162 {
00163 fprintf(stderr,"can't allocate enough memory\n");
00164 exit(1);
00165 }
00166
00167 for(i=0;i<=max_index;i++)
00168 {
00169 feature_max[i]=-DBL_MAX;
00170 feature_min[i]=DBL_MAX;
00171 }
00172
00173
00174 while(readline(fp)!=NULL)
00175 {
00176 char *p=line;
00177 int next_index=1;
00178 double target;
00179 double value;
00180
00181 if (sscanf(p,"%lf",&target) != 1)
00182 return clean_up(fp_restore, fp, "ERROR: failed to read labels\n");
00183 y_max = max(y_max,target);
00184 y_min = min(y_min,target);
00185
00186 SKIP_TARGET
00187
00188 while(sscanf(p,"%d:%lf",&index,&value)==2)
00189 {
00190 for(i=next_index;i<index;i++)
00191 {
00192 feature_max[i]=max(feature_max[i],0);
00193 feature_min[i]=min(feature_min[i],0);
00194 }
00195
00196 feature_max[index]=max(feature_max[index],value);
00197 feature_min[index]=min(feature_min[index],value);
00198
00199 SKIP_ELEMENT
00200 next_index=index+1;
00201 }
00202
00203 for(i=next_index;i<=max_index;i++)
00204 {
00205 feature_max[i]=max(feature_max[i],0);
00206 feature_min[i]=min(feature_min[i],0);
00207 }
00208 }
00209
00210 rewind(fp);
00211
00212
00213
00214 if(restore_filename)
00215 {
00216
00217 int idx, c;
00218 double fmin, fmax;
00219 int next_index = 1;
00220
00221 if((c = fgetc(fp_restore)) == 'y')
00222 {
00223 if(fscanf(fp_restore, "%lf %lf\n", &y_lower, &y_upper) != 2 ||
00224 fscanf(fp_restore, "%lf %lf\n", &y_min, &y_max) != 2)
00225 return clean_up(fp_restore, fp, "ERROR: failed to read scaling parameters\n");
00226 y_scaling = 1;
00227 }
00228 else
00229 ungetc(c, fp_restore);
00230
00231 if (fgetc(fp_restore) == 'x')
00232 {
00233 if(fscanf(fp_restore, "%lf %lf\n", &lower, &upper) != 2)
00234 return clean_up(fp_restore, fp, "ERROR: failed to read scaling parameters\n");
00235 while(fscanf(fp_restore,"%d %lf %lf\n",&idx,&fmin,&fmax)==3)
00236 {
00237 for(i = next_index;i<idx;i++)
00238 if(feature_min[i] != feature_max[i])
00239 fprintf(stderr,
00240 "WARNING: feature index %d appeared in file %s was not seen in the scaling factor file %s.\n",
00241 i, argv[argc-1], restore_filename);
00242
00243 feature_min[idx] = fmin;
00244 feature_max[idx] = fmax;
00245
00246 next_index = idx + 1;
00247 }
00248
00249 for(i=next_index;i<=max_index;i++)
00250 if(feature_min[i] != feature_max[i])
00251 fprintf(stderr,
00252 "WARNING: feature index %d appeared in file %s was not seen in the scaling factor file %s.\n",
00253 i, argv[argc-1], restore_filename);
00254 }
00255 fclose(fp_restore);
00256 }
00257
00258 if(save_filename)
00259 {
00260 FILE *fp_save = fopen(save_filename,"w");
00261 if(fp_save==NULL)
00262 {
00263 fprintf(stderr,"can't open file %s\n", save_filename);
00264 exit(1);
00265 }
00266 if(y_scaling)
00267 {
00268 fprintf(fp_save, "y\n");
00269 fprintf(fp_save, "%.16g %.16g\n", y_lower, y_upper);
00270 fprintf(fp_save, "%.16g %.16g\n", y_min, y_max);
00271 }
00272 fprintf(fp_save, "x\n");
00273 fprintf(fp_save, "%.16g %.16g\n", lower, upper);
00274 for(i=1;i<=max_index;i++)
00275 {
00276 if(feature_min[i]!=feature_max[i])
00277 fprintf(fp_save,"%d %.16g %.16g\n",i,feature_min[i],feature_max[i]);
00278 }
00279
00280 if(min_index < 1)
00281 fprintf(stderr,
00282 "WARNING: scaling factors with indices smaller than 1 are not stored to the file %s.\n", save_filename);
00283
00284 fclose(fp_save);
00285 }
00286
00287
00288 while(readline(fp)!=NULL)
00289 {
00290 char *p=line;
00291 int next_index=1;
00292 double target;
00293 double value;
00294
00295 if (sscanf(p,"%lf",&target) != 1)
00296 return clean_up(NULL, fp, "ERROR: failed to read labels\n");
00297 output_target(target);
00298
00299 SKIP_TARGET
00300
00301 while(sscanf(p,"%d:%lf",&index,&value)==2)
00302 {
00303 for(i=next_index;i<index;i++)
00304 output(i,0);
00305
00306 output(index,value);
00307
00308 SKIP_ELEMENT
00309 next_index=index+1;
00310 }
00311
00312 for(i=next_index;i<=max_index;i++)
00313 output(i,0);
00314
00315 printf("\n");
00316 }
00317
00318 if (new_num_nonzeros > num_nonzeros)
00319 fprintf(stderr,
00320 "WARNING: original #nonzeros %ld\n"
00321 " > new #nonzeros %ld\n"
00322 "If feature values are non-negative and sparse, use -l 0 rather than the default -l -1\n",
00323 num_nonzeros, new_num_nonzeros);
00324
00325 free(line);
00326 free(feature_max);
00327 free(feature_min);
00328 fclose(fp);
00329 return 0;
00330 }
00331
00332 char* readline(FILE *input)
00333 {
00334 int len;
00335
00336 if(fgets(line,max_line_len,input) == NULL)
00337 return NULL;
00338
00339 while(strrchr(line,'\n') == NULL)
00340 {
00341 max_line_len *= 2;
00342 line = (char *) realloc(line, max_line_len);
00343 len = (int) strlen(line);
00344 if(fgets(line+len,max_line_len-len,input) == NULL)
00345 break;
00346 }
00347 return line;
00348 }
00349
00350 void output_target(double value)
00351 {
00352 if(y_scaling)
00353 {
00354 if(value == y_min)
00355 value = y_lower;
00356 else if(value == y_max)
00357 value = y_upper;
00358 else value = y_lower + (y_upper-y_lower) *
00359 (value - y_min)/(y_max-y_min);
00360 }
00361 printf("%g ",value);
00362 }
00363
00364 void output(int index, double value)
00365 {
00366
00367 if(feature_max[index] == feature_min[index])
00368 return;
00369
00370 if(value == feature_min[index])
00371 value = lower;
00372 else if(value == feature_max[index])
00373 value = upper;
00374 else
00375 value = lower + (upper-lower) *
00376 (value-feature_min[index])/
00377 (feature_max[index]-feature_min[index]);
00378
00379 if(value != 0)
00380 {
00381 printf("%d:%g ",index, value);
00382 new_num_nonzeros++;
00383 }
00384 }
00385
00386 int clean_up(FILE *fp_restore, FILE *fp, const char* msg)
00387 {
00388 fprintf(stderr, "%s", msg);
00389 free(line);
00390 free(feature_max);
00391 free(feature_min);
00392 fclose(fp);
00393 if (fp_restore)
00394 fclose(fp_restore);
00395 return -1;
00396 }
00397