#include <float.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
#include <string.h>
00006
/*
 * Print the command-line usage summary to stdout and terminate the
 * process with exit status 1. Never returns.
 */
void exit_with_help()
{
	static const char usage[] =
		"Usage: svm-scale [options] data_filename\n"
		"options:\n"
		"-l lower : x scaling lower limit (default -1)\n"
		"-u upper : x scaling upper limit (default +1)\n"
		"-y y_lower y_upper : y scaling limits (default: no y scaling)\n"
		"-s save_filename : save scaling parameters to save_filename\n"
		"-r restore_filename : restore scaling parameters from restore_filename\n";

	fputs(usage, stdout);
	exit(1);
}
00020
/* Shared input-line buffer; filled (and grown on demand) by readline(). */
char *line = NULL;
/* Current capacity of `line` in bytes; doubled by readline() when a line does not fit. */
int max_line_len = 1024;

/* Target range for scaled feature values (-l / -u options). */
double lower = -1.0;
double upper = 1.0;
/* Target range for scaled labels; only meaningful when y_scaling is set. */
double y_lower;
double y_upper;
/* Non-zero when label scaling was requested (-y) or restored from file. */
int y_scaling = 0;

/* Per-feature observed maximum / minimum, indexed by feature index. */
double *feature_max;
double *feature_min;
/* Observed label maximum / minimum; start at the opposite extremes. */
double y_max = -DBL_MAX;
double y_min = DBL_MAX;

/* Largest feature index seen in the data (and restore file). */
int max_index;
/* Count of feature entries read from the input. */
long int num_nonzeros = 0;
/* Count of non-zero feature entries written to the output. */
long int new_num_nonzeros = 0;
00032
/*
 * Larger / smaller of two values. Function-like macros: each argument
 * may be evaluated twice, so do not pass expressions with side effects.
 */
#define max(x,y) ((x) > (y) ? (x) : (y))
#define min(x,y) ((x) < (y) ? (x) : (y))

/* Print one (optionally scaled) label followed by a space. */
void output_target(double value);

/* Print one scaled "index:value " pair, unless it is dropped. */
void output(int index, double value);

/* Read the next line of `input` into the global `line`; NULL at end of file. */
char *readline(FILE *input);
00039
00040 int main(int argc,char **argv)
00041 {
00042 int i,index;
00043 FILE *fp, *fp_restore = NULL;
00044 char *save_filename = NULL;
00045 char *restore_filename = NULL;
00046
00047 for(i=1;i<argc;i++)
00048 {
00049 if(argv[i][0] != '-') break;
00050 ++i;
00051 switch(argv[i-1][1])
00052 {
00053 case 'l': lower = atof(argv[i]); break;
00054 case 'u': upper = atof(argv[i]); break;
00055 case 'y':
00056 y_lower = atof(argv[i]);
00057 ++i;
00058 y_upper = atof(argv[i]);
00059 y_scaling = 1;
00060 break;
00061 case 's': save_filename = argv[i]; break;
00062 case 'r': restore_filename = argv[i]; break;
00063 default:
00064 fprintf(stderr,"unknown option\n");
00065 exit_with_help();
00066 }
00067 }
00068
00069 if(!(upper > lower) || (y_scaling && !(y_upper > y_lower)))
00070 {
00071 fprintf(stderr,"inconsistent lower/upper specification\n");
00072 exit(1);
00073 }
00074
00075 if(restore_filename && save_filename)
00076 {
00077 fprintf(stderr,"cannot use -r and -s simultaneously\n");
00078 exit(1);
00079 }
00080
00081 if(argc != i+1)
00082 exit_with_help();
00083
00084 fp=fopen(argv[i],"r");
00085
00086 if(fp==NULL)
00087 {
00088 fprintf(stderr,"can't open file %s\n", argv[i]);
00089 exit(1);
00090 }
00091
00092 line = (char *) malloc(max_line_len*sizeof(char));
00093
00094 #define SKIP_TARGET\
00095 while(isspace(*p)) ++p;\
00096 while(!isspace(*p)) ++p;
00097
00098 #define SKIP_ELEMENT\
00099 while(*p!=':') ++p;\
00100 ++p;\
00101 while(isspace(*p)) ++p;\
00102 while(*p && !isspace(*p)) ++p;
00103
00104
00105
00106 max_index = 0;
00107
00108 if(restore_filename)
00109 {
00110 int idx, c;
00111
00112 fp_restore = fopen(restore_filename,"r");
00113 if(fp_restore==NULL)
00114 {
00115 fprintf(stderr,"can't open file %s\n", restore_filename);
00116 exit(1);
00117 }
00118
00119 c = fgetc(fp_restore);
00120 if(c == 'y')
00121 {
00122 readline(fp_restore);
00123 readline(fp_restore);
00124 readline(fp_restore);
00125 }
00126 readline(fp_restore);
00127 readline(fp_restore);
00128
00129 while(fscanf(fp_restore,"%d %*f %*f\n",&idx) == 1)
00130 max_index = max(idx,max_index);
00131 rewind(fp_restore);
00132 }
00133
00134 while(readline(fp)!=NULL)
00135 {
00136 char *p=line;
00137
00138 SKIP_TARGET
00139
00140 while(sscanf(p,"%d:%*f",&index)==1)
00141 {
00142 max_index = max(max_index, index);
00143 SKIP_ELEMENT
00144 num_nonzeros++;
00145 }
00146 }
00147 rewind(fp);
00148
00149 feature_max = (double *)malloc((max_index+1)* sizeof(double));
00150 feature_min = (double *)malloc((max_index+1)* sizeof(double));
00151
00152 if(feature_max == NULL || feature_min == NULL)
00153 {
00154 fprintf(stderr,"can't allocate enough memory\n");
00155 exit(1);
00156 }
00157
00158 for(i=0;i<=max_index;i++)
00159 {
00160 feature_max[i]=-DBL_MAX;
00161 feature_min[i]=DBL_MAX;
00162 }
00163
00164
00165 while(readline(fp)!=NULL)
00166 {
00167 char *p=line;
00168 int next_index=1;
00169 double target;
00170 double value;
00171
00172 sscanf(p,"%lf",&target);
00173 y_max = max(y_max,target);
00174 y_min = min(y_min,target);
00175
00176 SKIP_TARGET
00177
00178 while(sscanf(p,"%d:%lf",&index,&value)==2)
00179 {
00180 for(i=next_index;i<index;i++)
00181 {
00182 feature_max[i]=max(feature_max[i],0);
00183 feature_min[i]=min(feature_min[i],0);
00184 }
00185
00186 feature_max[index]=max(feature_max[index],value);
00187 feature_min[index]=min(feature_min[index],value);
00188
00189 SKIP_ELEMENT
00190 next_index=index+1;
00191 }
00192
00193 for(i=next_index;i<=max_index;i++)
00194 {
00195 feature_max[i]=max(feature_max[i],0);
00196 feature_min[i]=min(feature_min[i],0);
00197 }
00198 }
00199
00200 rewind(fp);
00201
00202
00203
00204 if(restore_filename)
00205 {
00206
00207 int idx, c;
00208 double fmin, fmax;
00209
00210 if((c = fgetc(fp_restore)) == 'y')
00211 {
00212 fscanf(fp_restore, "%lf %lf\n", &y_lower, &y_upper);
00213 fscanf(fp_restore, "%lf %lf\n", &y_min, &y_max);
00214 y_scaling = 1;
00215 }
00216 else
00217 ungetc(c, fp_restore);
00218
00219 if (fgetc(fp_restore) == 'x') {
00220 fscanf(fp_restore, "%lf %lf\n", &lower, &upper);
00221 while(fscanf(fp_restore,"%d %lf %lf\n",&idx,&fmin,&fmax)==3)
00222 {
00223 if(idx<=max_index)
00224 {
00225 feature_min[idx] = fmin;
00226 feature_max[idx] = fmax;
00227 }
00228 }
00229 }
00230 fclose(fp_restore);
00231 }
00232
00233 if(save_filename)
00234 {
00235 FILE *fp_save = fopen(save_filename,"w");
00236 if(fp_save==NULL)
00237 {
00238 fprintf(stderr,"can't open file %s\n", save_filename);
00239 exit(1);
00240 }
00241 if(y_scaling)
00242 {
00243 fprintf(fp_save, "y\n");
00244 fprintf(fp_save, "%.16g %.16g\n", y_lower, y_upper);
00245 fprintf(fp_save, "%.16g %.16g\n", y_min, y_max);
00246 }
00247 fprintf(fp_save, "x\n");
00248 fprintf(fp_save, "%.16g %.16g\n", lower, upper);
00249 for(i=1;i<=max_index;i++)
00250 {
00251 if(feature_min[i]!=feature_max[i])
00252 fprintf(fp_save,"%d %.16g %.16g\n",i,feature_min[i],feature_max[i]);
00253 }
00254 fclose(fp_save);
00255 }
00256
00257
00258 while(readline(fp)!=NULL)
00259 {
00260 char *p=line;
00261 int next_index=1;
00262 double target;
00263 double value;
00264
00265 sscanf(p,"%lf",&target);
00266 output_target(target);
00267
00268 SKIP_TARGET
00269
00270 while(sscanf(p,"%d:%lf",&index,&value)==2)
00271 {
00272 for(i=next_index;i<index;i++)
00273 output(i,0);
00274
00275 output(index,value);
00276
00277 SKIP_ELEMENT
00278 next_index=index+1;
00279 }
00280
00281 for(i=next_index;i<=max_index;i++)
00282 output(i,0);
00283
00284 printf("\n");
00285 }
00286
00287 if (new_num_nonzeros > num_nonzeros)
00288 fprintf(stderr,
00289 "Warning: original #nonzeros %ld\n"
00290 " new #nonzeros %ld\n"
00291 "Use -l 0 if many original feature values are zeros\n",
00292 num_nonzeros, new_num_nonzeros);
00293
00294 free(line);
00295 free(feature_max);
00296 free(feature_min);
00297 fclose(fp);
00298 return 0;
00299 }
00300
00301 char* readline(FILE *input)
00302 {
00303 int len;
00304
00305 if(fgets(line,max_line_len,input) == NULL)
00306 return NULL;
00307
00308 while(strrchr(line,'\n') == NULL)
00309 {
00310 max_line_len *= 2;
00311 line = (char *) realloc(line, max_line_len);
00312 len = (int) strlen(line);
00313 if(fgets(line+len,max_line_len-len,input) == NULL)
00314 break;
00315 }
00316 return line;
00317 }
00318
00319 void output_target(double value)
00320 {
00321 if(y_scaling)
00322 {
00323 if(value == y_min)
00324 value = y_lower;
00325 else if(value == y_max)
00326 value = y_upper;
00327 else value = y_lower + (y_upper-y_lower) *
00328 (value - y_min)/(y_max-y_min);
00329 }
00330 printf("%g ",value);
00331 }
00332
00333 void output(int index, double value)
00334 {
00335
00336 if(feature_max[index] == feature_min[index])
00337 return;
00338
00339 if(value == feature_min[index])
00340 value = lower;
00341 else if(value == feature_max[index])
00342 value = upper;
00343 else
00344 value = lower + (upper-lower) *
00345 (value-feature_min[index])/
00346 (feature_max[index]-feature_min[index]);
00347
00348 if(value != 0)
00349 {
00350 printf("%d:%g ",index, value);
00351 new_num_nonzeros++;
00352 }
00353 }