ml_classifiers: svmutil.py Source File

Go to the documentation of this file.
00001 #!/usr/bin/env python
00002 
00003 from svm import *
00004 
def svm_read_problem(data_file_name):
    """
    svm_read_problem(data_file_name) -> [y, x]

    Read LIBSVM-format data from data_file_name and return labels y
    and data instances x.

    Each non-blank line has the form ``label index:value index:value ...``.
    y is a list of float labels; x is a list of dicts, one per instance,
    mapping each integer feature index to its float value. A line that
    holds only a label yields an empty dict. Blank lines are skipped.

    Raises ValueError if a label, an index or a value cannot be parsed,
    or if a feature is not written as a single ``index:value`` pair.
    """
    prob_y = []
    prob_x = []
    # The context manager closes the file even if a line fails to parse.
    with open(data_file_name) as data_file:
        for line in data_file:
            fields = line.split(None, 1)
            if not fields:
                # Blank line (e.g. an empty last line): nothing to read.
                continue
            # In case an instance with all zero features
            if len(fields) == 1:
                fields.append('')
            label, features = fields
            xi = {}
            for e in features.split():
                ind, val = e.split(":")
                xi[int(ind)] = float(val)
            prob_y.append(float(label))
            prob_x.append(xi)
    return (prob_y, prob_x)
00026 
def svm_load_model(model_file_name):
    """
    svm_load_model(model_file_name) -> model

    Load a LIBSVM model from the file model_file_name.

    The file name is encoded to bytes before it is handed to the
    library. When the library returns a false value, a message is
    printed and None is returned; otherwise the result is passed
    through toPyModel and returned.
    """
    raw_model = libsvm.svm_load_model(model_file_name.encode())
    if raw_model:
        return toPyModel(raw_model)
    print("can't open model file %s" % model_file_name)
    return None
00039 
def svm_save_model(model_file_name, model):
    """
    svm_save_model(model_file_name, model) -> None

    Write the LIBSVM model `model` to the file model_file_name.
    The file name is encoded to bytes before it is passed to the library.
    """
    encoded_name = model_file_name.encode()
    libsvm.svm_save_model(encoded_name, model)
00047 
def evaluations(ty, pv):
    """
    evaluations(ty, pv) -> (ACC, MSE, SCC)

    Calculate accuracy, mean squared error and squared correlation coefficient
    using the true values (ty) and predicted values (pv).

    ACC is the percentage of positions where the prediction equals the
    true value. SCC is float('nan') when its denominator is zero (for
    example when all true values or all predictions are identical).

    Raises ValueError if ty and pv differ in length, and
    ZeroDivisionError if they are empty.
    """
    if len(ty) != len(pv):
        raise ValueError("len(ty) must equal to len(pv)")
    total_correct = total_error = 0
    sumv = sumy = sumvv = sumyy = sumvy = 0
    for v, y in zip(pv, ty):
        if y == v:
            total_correct += 1
        total_error += (v - y) * (v - y)
        sumv += v
        sumy += y
        sumvv += v * v
        sumyy += y * y
        sumvy += v * y
    l = len(ty)
    ACC = 100.0 * total_correct / l
    MSE = total_error / l
    covariance = l * sumvy - sumv * sumy
    try:
        SCC = (covariance * covariance) / ((l * sumvv - sumv * sumv) * (l * sumyy - sumy * sumy))
    except ZeroDivisionError:
        # Only a zero denominator means "undefined"; anything else
        # (including KeyboardInterrupt) must propagate.
        SCC = float('nan')
    return (ACC, MSE, SCC)
00076 
def svm_train(arg1, arg2=None, arg3=None):
    """
    svm_train(y, x [, 'options']) -> model | ACC | MSE
    svm_train(prob [, 'options']) -> model | ACC | MSE
    svm_train(prob, param) -> model | ACC | MSE

    Train an SVM model from data (y, x) or an svm_problem prob using
    'options' or an svm_parameter param.
    If '-v' is specified in 'options' (i.e., cross validation)
    either accuracy (ACC) or mean-squared error (MSE) is returned.
    'options':
        -s svm_type : set type of SVM (default 0)
            0 -- C-SVC              (multi-class classification)
            1 -- nu-SVC             (multi-class classification)
            2 -- one-class SVM
            3 -- epsilon-SVR        (regression)
            4 -- nu-SVR             (regression)
        -t kernel_type : set type of kernel function (default 2)
            0 -- linear: u'*v
            1 -- polynomial: (gamma*u'*v + coef0)^degree
            2 -- radial basis function: exp(-gamma*|u-v|^2)
            3 -- sigmoid: tanh(gamma*u'*v + coef0)
            4 -- precomputed kernel (kernel values in training_set_file)
        -d degree : set degree in kernel function (default 3)
        -g gamma : set gamma in kernel function (default 1/num_features)
        -r coef0 : set coef0 in kernel function (default 0)
        -c cost : set the parameter C of C-SVC, epsilon-SVR, and nu-SVR (default 1)
        -n nu : set the parameter nu of nu-SVC, one-class SVM, and nu-SVR (default 0.5)
        -p epsilon : set the epsilon in loss function of epsilon-SVR (default 0.1)
        -m cachesize : set cache memory size in MB (default 100)
        -e epsilon : set tolerance of termination criterion (default 0.001)
        -h shrinking : whether to use the shrinking heuristics, 0 or 1 (default 1)
        -b probability_estimates : whether to train a SVC or SVR model for probability estimates, 0 or 1 (default 0)
        -wi weight : set the parameter C of class i to weight*C, for C-SVC (default 1)
        -v n: n-fold cross validation mode
        -q : quiet mode (no outputs)

    Raises TypeError when the arguments match none of the three call
    forms (in the first form both y and x must be a list or tuple).
    Raises ValueError when a precomputed-kernel instance does not start
    with a valid 0:sample_serial_number entry, or when the library's
    parameter check reports an error.
    """
    prob, param = None, None
    if isinstance(arg1, (list, tuple)):
        # Explicit check rather than `assert`, which `python -O` strips.
        if not isinstance(arg2, (list, tuple)):
            raise TypeError("Wrong types for the arguments")
        y, x, options = arg1, arg2, arg3
        param = svm_parameter(options)
        prob = svm_problem(y, x, isKernel=(param.kernel_type == PRECOMPUTED))
    elif isinstance(arg1, svm_problem):
        prob = arg1
        if isinstance(arg2, svm_parameter):
            param = arg2
        else:
            param = svm_parameter(arg2)
    if prob is None or param is None:
        raise TypeError("Wrong types for the arguments")

    if param.kernel_type == PRECOMPUTED:
        # The first node of every instance must be 0:serial_number with
        # the serial number in 1..prob.n.
        for xi in prob.x_space:
            idx, val = xi[0].index, xi[0].value
            if idx != 0:
                raise ValueError('Wrong input format: first column must be 0:sample_serial_number')
            if val <= 0 or val > prob.n:
                raise ValueError('Wrong input format: sample_serial_number out of range')

    # A gamma of 0 is replaced by the documented default 1/num_features.
    if param.gamma == 0 and prob.n > 0:
        param.gamma = 1.0 / prob.n
    libsvm.svm_set_print_string_function(param.print_func)
    err_msg = libsvm.svm_check_parameter(prob, param)
    if err_msg:
        raise ValueError('Error: %s' % err_msg)

    if param.cross_validation:
        l, nr_fold = prob.l, param.nr_fold
        # Output buffer that the library fills with one value per instance.
        target = (c_double * l)()
        libsvm.svm_cross_validation(prob, param, nr_fold, target)
        ACC, MSE, SCC = evaluations(prob.y[:l], target[:l])
        if param.svm_type in [EPSILON_SVR, NU_SVR]:
            print("Cross Validation Mean squared error = %g" % MSE)
            print("Cross Validation Squared correlation coefficient = %g" % SCC)
            return MSE
        else:
            print("Cross Validation Accuracy = %g%%" % ACC)
            return ACC
    else:
        m = libsvm.svm_train(prob, param)
        m = toPyModel(m)

        # If prob is destroyed, data including SVs pointed by m can remain.
        m.x_space = prob.x_space
        return m
00163 
def svm_predict(y, x, m, options=""):
    """
    svm_predict(y, x, m [, "options"]) -> (p_labels, p_acc, p_vals)

    Predict data (y, x) with the SVM model m.
    "options":
        -b probability_estimates: whether to predict probability estimates,
            0 or 1 (default 0); for one-class SVM only 0 is supported.
        -q : quiet mode (no outputs).

    y holds the true labels and is used only to compute p_acc; x is an
    iterable of instances to predict.

    The return tuple contains
    p_labels: a list of predicted labels
    p_acc: a tuple including  accuracy (for classification), mean-squared
           error, and squared correlation coefficient (for regression).
    p_vals: a list of decision values or probability estimates (if '-b 1'
            is specified). If k is the number of classes, for decision values,
            each element includes results of predicting k(k-1)/2 binary-class
            SVMs. For probabilities, each element contains k values indicating
            the probability that the testing instance is in each class.
            Note that the order of classes here is the same as 'model.label'
            field in the model structure.

    Raises ValueError for an unknown option, for '-b' without a value,
    or when '-b 1' is requested from a model without probability
    information.
    """

    def info(s):
        print(s)

    predict_probability = 0
    argv = options.split()
    i = 0
    while i < len(argv):
        if argv[i] == '-b':
            i += 1
            if i >= len(argv):
                # '-b' was the last token: report it like any other bad
                # option instead of failing with IndexError.
                raise ValueError("Wrong options")
            predict_probability = int(argv[i])
        elif argv[i] == '-q':
            info = print_null
        else:
            raise ValueError("Wrong options")
        i += 1

    svm_type = m.get_svm_type()
    is_prob_model = m.is_probability_model()
    nr_class = m.get_nr_class()
    is_kernel = (m.param.kernel_type == PRECOMPUTED)
    pred_labels = []
    pred_values = []

    if predict_probability:
        if not is_prob_model:
            raise ValueError("Model does not support probabiliy estimates")

        if svm_type in [NU_SVR, EPSILON_SVR]:
            info("Prob. model for test data: target value = predicted value + z,\n"
                 "z: Laplace distribution e^(-|z|/sigma)/(2sigma),sigma=%g" % m.get_svr_probability())
            # Regression has no per-class probabilities to report.
            nr_class = 0

        prob_estimates = (c_double * nr_class)()
        for xi in x:
            xi, _ = gen_svm_nodearray(xi, isKernel=is_kernel)
            label = libsvm.svm_predict_probability(m, xi, prob_estimates)
            values = prob_estimates[:nr_class]
            pred_labels.append(label)
            pred_values.append(values)
    else:
        if is_prob_model:
            info("Model supports probability estimates, but disabled in predicton.")
        # One-class and regression models have a single decision value;
        # every classifier, nu-SVC included, has k(k-1)/2 of them, so the
        # buffer must be sized accordingly.
        if svm_type in (ONE_CLASS, EPSILON_SVR, NU_SVR):
            nr_classifier = 1
        else:
            nr_classifier = nr_class * (nr_class - 1) // 2
        dec_values = (c_double * nr_classifier)()
        for xi in x:
            xi, _ = gen_svm_nodearray(xi, isKernel=is_kernel)
            label = libsvm.svm_predict_values(m, xi, dec_values)
            if nr_class == 1:
                values = [1]
            else:
                values = dec_values[:nr_classifier]
            pred_labels.append(label)
            pred_values.append(values)

    ACC, MSE, SCC = evaluations(y, pred_labels)
    l = len(y)
    if svm_type in [EPSILON_SVR, NU_SVR]:
        info("Mean squared error = %g (regression)" % MSE)
        info("Squared correlation coefficient = %g (regression)" % SCC)
    else:
        # ACC is a float percentage; round so float error cannot turn the
        # recovered count of correct predictions into count-1.
        info("Accuracy = %g%% (%d/%d) (classification)" % (ACC, int(round(l * ACC / 100)), l))

    return pred_labels, (ACC, MSE, SCC), pred_values
00252