00001
00002
00003 import pdb
00004 from svm import *
00005
def svm_read_problem(data_file_name):
    """
    svm_read_problem(data_file_name) -> [y, x]

    Read LIBSVM-format data from data_file_name and return labels y
    and data instances x.

    Each non-blank line is parsed as a label followed by whitespace-separated
    ``index:value`` pairs.  The label and every value are converted with
    float(), every index with int().

    Returns a tuple (y, x) where y is a list of float labels and x is a list
    of dicts mapping feature index -> feature value (one dict per line, in
    file order).

    Raises ValueError if a label, index or value cannot be converted, or if
    a feature token does not consist of exactly one ``index:value`` pair.
    """
    prob_y = []
    prob_x = []
    # Use a context manager so the file handle is closed even when a
    # malformed line raises part-way through the file.
    with open(data_file_name) as data_file:
        for line in data_file:
            fields = line.split(None, 1)
            # A blank (or whitespace-only) line carries no instance; skip it
            # rather than failing on the label/features unpacking below.
            if not fields:
                continue
            # A line holding only a label is an instance with no features.
            if len(fields) == 1:
                fields.append('')
            label, features = fields
            xi = {}
            for e in features.split():
                ind, val = e.split(":")
                xi[int(ind)] = float(val)
            prob_y.append(float(label))
            prob_x.append(xi)
    return (prob_y, prob_x)
00027
def svm_load_model(model_file_name):
    """
    svm_load_model(model_file_name) -> model

    Ask libsvm to load the model stored in model_file_name.

    On success the loaded object is passed through toPyModel() and the
    result is returned.  If libsvm hands back a false value (e.g. a null
    pointer), a message is printed and None is returned instead.
    """
    loaded = libsvm.svm_load_model(model_file_name)
    if loaded:
        return toPyModel(loaded)
    # Loading failed: report it and signal the failure with None.
    print("can't open model file %s" % model_file_name)
    return None
00040
def svm_save_model(model_file_name, model):
    """
    svm_save_model(model_file_name, model) -> None

    Write the LIBSVM model `model` to the file model_file_name by
    delegating to libsvm.svm_save_model.  Whatever the library call
    returns is discarded; this function always returns None.
    """
    libsvm.svm_save_model(model_file_name, model)
00048
def evaluations(ty, pv):
    """
    evaluations(ty, pv) -> (ACC, MSE, SCC)

    Calculate accuracy, mean squared error and squared correlation coefficient
    using the true values (ty) and predicted values (pv).

    ACC is the percentage (0-100) of positions where the prediction equals
    the true value exactly.  MSE is the mean of the squared differences.
    SCC is NaN when it is undefined (for example when ty or pv is constant,
    which makes the denominator zero).

    For empty input all three metrics are undefined and (NaN, NaN, NaN) is
    returned.

    Raises ValueError if ty and pv have different lengths.
    """
    if len(ty) != len(pv):
        raise ValueError("len(ty) must equal to len(pv)")
    l = len(ty)
    if l == 0:
        # No samples: every metric would divide by zero.
        nan = float('nan')
        return (nan, nan, nan)
    total_correct = total_error = 0
    sumv = sumy = sumvv = sumyy = sumvy = 0
    for v, y in zip(pv, ty):
        if y == v:
            total_correct += 1
        total_error += (v - y) * (v - y)
        sumv += v
        sumy += y
        sumvv += v * v
        sumyy += y * y
        sumvy += v * y
    # Divide by a float so integer inputs are not truncated by integer
    # division on interpreters where int / int yields an int.
    n = float(l)
    ACC = 100.0 * total_correct / n
    MSE = total_error / n
    try:
        cov = l * sumvy - sumv * sumy
        SCC = float(cov * cov) / ((l * sumvv - sumv * sumv) * (l * sumyy - sumy * sumy))
    except ArithmeticError:
        # Zero denominator (or numeric overflow): the coefficient is undefined.
        SCC = float('nan')
    return (ACC, MSE, SCC)
00077
def svm_train(arg1, arg2=None, arg3=None):
    """
    svm_train(y, x [, 'options']) -> model | ACC | MSE
    svm_train(prob, [, 'options']) -> model | ACC | MSE
    svm_train(prob, param) -> model | ACC| MSE

    Train an SVM model from data (y, x) or an svm_problem prob using
    'options' or an svm_parameter param.
    If '-v' is specified in 'options' (i.e., cross validation)
    either accuracy (ACC) or mean-squared error (MSE) is returned.
    'options':
        -s svm_type : set type of SVM (default 0)
            0 -- C-SVC
            1 -- nu-SVC
            2 -- one-class SVM
            3 -- epsilon-SVR
            4 -- nu-SVR
        -t kernel_type : set type of kernel function (default 2)
            0 -- linear: u'*v
            1 -- polynomial: (gamma*u'*v + coef0)^degree
            2 -- radial basis function: exp(-gamma*|u-v|^2)
            3 -- sigmoid: tanh(gamma*u'*v + coef0)
            4 -- precomputed kernel (kernel values in training_set_file)
        -d degree : set degree in kernel function (default 3)
        -g gamma : set gamma in kernel function (default 1/num_features)
        -r coef0 : set coef0 in kernel function (default 0)
        -c cost : set the parameter C of C-SVC, epsilon-SVR, and nu-SVR (default 1)
        -n nu : set the parameter nu of nu-SVC, one-class SVM, and nu-SVR (default 0.5)
        -p epsilon : set the epsilon in loss function of epsilon-SVR (default 0.1)
        -m cachesize : set cache memory size in MB (default 100)
        -e epsilon : set tolerance of termination criterion (default 0.001)
        -h shrinking : whether to use the shrinking heuristics, 0 or 1 (default 1)
        -b probability_estimates : whether to train a SVC or SVR model for probability estimates, 0 or 1 (default 0)
        -wi weight : set the parameter C of class i to weight*C, for C-SVC (default 1)
        -v n: n-fold cross validation mode
        -q : quiet mode (no outputs)

    Raises TypeError if the arguments match none of the call forms above,
    and ValueError if a precomputed-kernel problem is malformed or libsvm's
    parameter check reports an error.
    """
    prob, param = None, None

    if isinstance(arg1, (list, tuple)):
        # (y, x [, options]) form.  Validate explicitly instead of with
        # `assert`, which is stripped when Python runs with -O.
        if not isinstance(arg2, (list, tuple)):
            raise TypeError("Wrong types for the arguments")
        y, x, options = arg1, arg2, arg3
        prob = svm_problem(y, x)
        param = svm_parameter(options)
    elif isinstance(arg1, svm_problem):
        prob = arg1
        if isinstance(arg2, svm_parameter):
            param = arg2
        else:
            # arg2 is treated as an options value (or None).
            param = svm_parameter(arg2)
    if prob is None or param is None:
        raise TypeError("Wrong types for the arguments")

    if param.kernel_type == PRECOMPUTED:
        # For a precomputed kernel every instance must start with
        # 0:sample_serial_number, with the serial number in (0, prob.n].
        for xi in prob.x_space:
            first = xi[0]
            if first.index != 0:
                raise ValueError('Wrong input format: first column must be 0:sample_serial_number')
            if first.value <= 0 or first.value > prob.n:
                raise ValueError('Wrong input format: sample_serial_number out of range')

    # gamma == 0 means "not set": fall back to 1/prob.n
    # (documented above as 1/num_features).
    if param.gamma == 0 and prob.n > 0:
        param.gamma = 1.0 / prob.n
    libsvm.svm_set_print_string_function(param.print_func)
    err_msg = libsvm.svm_check_parameter(prob, param)
    if err_msg:
        raise ValueError('Error: %s' % err_msg)

    if param.cross_validation:
        l, nr_fold = prob.l, param.nr_fold
        # Output buffer filled by libsvm with one prediction per instance.
        target = (c_double * l)()
        libsvm.svm_cross_validation(prob, param, nr_fold, target)
        ACC, MSE, SCC = evaluations(prob.y[:l], target[:l])
        if param.svm_type in [EPSILON_SVR, NU_SVR]:
            print("Cross Validation Mean squared error = %g" % MSE)
            print("Cross Validation Squared correlation coefficient = %g" % SCC)
            return MSE
        else:
            print("Cross Validation Accuracy = %g%%" % ACC)
            return ACC
    else:
        m = libsvm.svm_train(prob, param)
        m = toPyModel(m)

        # NOTE(review): presumably this keeps the training data referenced by
        # the model alive even after `prob` goes away -- confirm.
        m.x_space = prob.x_space
        return m
00165
def svm_predict(y, x, m, options=""):
    """
    svm_predict(y, x, m [, "options"]) -> (p_labels, p_acc, p_vals)

    Predict data (y, x) with the SVM model m.
    "options":
        -b probability_estimates: whether to predict probability estimates,
            0 or 1 (default 0); for one-class SVM only 0 is supported.

    The return tuple contains
    p_labels: a list of predicted labels
    p_acc: a tuple including  accuracy (for classification), mean-squared
           error, and squared correlation coefficient (for regression).
    p_vals: a list of decision values or probability estimates (if '-b 1'
            is specified). If k is the number of classes, for decision values,
            each element includes results of predicting k(k-1)/2 binary-class
            SVMs. For probabilities, each element contains k values indicating
            the probability that the testing instance is in each class.
            Note that the order of classes here is the same as 'model.label'
            field in the model structure.

    Raises ValueError for an unknown option, for '-b' without a value, or
    when '-b 1' is requested but the model has no probability information.
    """
    predict_probability = 0
    argv = options.split()
    i = 0
    while i < len(argv):
        if argv[i] == '-b':
            i += 1
            # '-b' must be followed by its value; report a bad option string
            # instead of failing with an IndexError.
            if i >= len(argv):
                raise ValueError("Wrong options")
            predict_probability = int(argv[i])
        else:
            raise ValueError("Wrong options")
        i += 1

    svm_type = m.get_svm_type()
    is_prob_model = m.is_probability_model()
    nr_class = m.get_nr_class()
    pred_labels = []
    pred_values = []

    if predict_probability:
        if not is_prob_model:
            raise ValueError("Model does not support probabiliy estimates")

        if svm_type in [NU_SVR, EPSILON_SVR]:
            print("Prob. model for test data: target value = predicted value + z,\n"
                  "z: Laplace distribution e^(-|z|/sigma)/(2sigma),sigma=%g" % m.get_svr_probability())
            # Regression has no per-class probabilities to report.
            nr_class = 0

        prob_estimates = (c_double * nr_class)()
        for xi in x:
            xi, _ = gen_svm_nodearray(xi)
            label = libsvm.svm_predict_probability(m, xi, prob_estimates)
            values = prob_estimates[:nr_class]
            pred_labels += [label]
            pred_values += [values]
    else:
        if is_prob_model:
            print("Model supports probability estimates, but disabled in predicton.")
        # One-class and regression models yield a single decision value;
        # classifiers (C-SVC and nu-SVC) yield k(k-1)/2 pairwise values, so
        # nu-SVC must NOT be listed here or the buffer would be too small.
        if svm_type in (ONE_CLASS, EPSILON_SVR, NU_SVR):
            nr_classifier = 1
        else:
            nr_classifier = nr_class*(nr_class-1)//2
        dec_values = (c_double * nr_classifier)()
        for xi in x:
            xi, _ = gen_svm_nodearray(xi)
            label = libsvm.svm_predict_values(m, xi, dec_values)
            values = dec_values[:nr_classifier]
            pred_labels += [label]
            pred_values += [values]

    ACC, MSE, SCC = evaluations(y, pred_labels)
    l = len(y)
    if svm_type in [EPSILON_SVR, NU_SVR]:
        print("Mean squared error = %g (regression)" % MSE)
        print("Squared correlation coefficient = %g (regression)" % SCC)
    else:
        # Recover the correct-count from the percentage; round rather than
        # truncate so float error cannot under-report by one.  With no
        # instances the count is simply 0.
        if l:
            nr_correct = int(round(l*ACC/100))
        else:
            nr_correct = 0
        print("Accuracy = %g%% (%d/%d) (classification)" % (ACC, nr_correct, l))

    return pred_labels, (ACC, MSE, SCC), pred_values
00245