checkdata.py
Go to the documentation of this file.
00001 #!/usr/bin/env python
00002 
00003 #
00004 # A format checker for LIBSVM
00005 #
00006 
00007 #
00008 # Copyright (c) 2007, Rong-En Fan
00009 #
00010 # All rights reserved.
00011 #
00012 # This program is distributed under the same license of the LIBSVM package.
00013 # 
00014 
00015 from sys import argv, exit
00016 import os.path
00017 
def err(line_no, msg):
        """Print a problem report for one line of the dataset to stdout.

        line_no -- number of the dataset line the message refers to
        msg     -- description of what is wrong with that line
        """
        report = "line {0}: {1}".format(line_no, msg)
        print(report)
00020 
# works like float() but does not accept nan and inf
def my_float(x):
        """Convert the string x to a finite float.

        Returns the parsed value.

        Raises ValueError when x is not a valid float literal, when it
        spells nan or inf in any letter case (including "infinity"), or
        when it is a numeric literal so large that float() turns it into
        infinity (for example "1e999").
        """
        lowered = x.lower()
        if "nan" in lowered or "inf" in lowered:
                raise ValueError("nan and inf are not accepted: {0!r}".format(x))

        value = float(x)

        # float() silently maps out-of-range literals such as "1e999" to inf.
        # The textual check above cannot see that, so inspect the result too.
        if value in (float("inf"), float("-inf")):
                raise ValueError("value overflows to inf: {0!r}".format(x))

        return value
00027 
def _check_label(line_no, label):
        """Report a malformed label via err(); return True if it is valid."""
        # A comma marks the multi-label form: every comma-separated part
        # must itself be a valid number.
        if ',' in label:
                try:
                        for part in label.split(','):
                                my_float(part)
                except ValueError:
                        err(line_no, "label {0} is not a valid multi-label form".format(label))
                        return False
                return True

        try:
                my_float(label)
        except ValueError:
                err(line_no, "label {0} is not a number".format(label))
                return False
        return True


def _check_features(line_no, features):
        """Report malformed features via err(); return True if all are valid.

        features is the list of whitespace-separated tokens that follow the
        label on one dataset line.
        """
        all_ok = True
        prev_index = -1
        for i, feature in enumerate(features):
                try:
                        (index, value) = feature.split(':')
                        index = int(index)
                        my_float(value)
                except ValueError:
                        # wrong number of ':' parts, non-integer index or
                        # non-numeric value
                        err(line_no, "feature '{0}' not an <index>:<value> pair, <index> integer, <value> real number ".format(feature))
                        all_ok = False
                        continue

                # precomputed kernel's index starts from 0 and LIBSVM
                # checks it. Hence, don't treat index 0 as an error.
                if index < 0:
                        err(line_no, "feature index must be positive; wrong feature {0}".format(feature))
                        all_ok = False
                elif index <= prev_index:
                        # prev_index starts at -1, so this branch cannot be
                        # reached for the first feature and features[i - 1]
                        # always names the preceding token.
                        err(line_no, "feature indices must be in an ascending order, previous/current features {0} {1}".format(features[i - 1], feature))
                        all_ok = False
                prev_index = index
        return all_ok


def main():
        """Check the dataset file named on the command line, line by line.

        Expects exactly one command-line argument, the dataset path. Prints
        a usage message and exits with status 1 when the argument count is
        wrong, and prints a "not found" message and exits with status 1
        when the path does not exist.

        For every line, problems with the trailing newline, the label and
        the features are printed through err(). A summary is printed at the
        end.

        Returns 1 if at least one line had an error, otherwise 0.
        """
        if len(argv) != 2:
                print("Usage: {0} dataset".format(argv[0]))
                exit(1)

        dataset = argv[1]

        if not os.path.exists(dataset):
                print("dataset {0} not found".format(dataset))
                exit(1)

        error_line_count = 0
        # The with-statement closes the file even if checking is interrupted.
        with open(dataset, 'r') as data_file:
                for line_no, line in enumerate(data_file, 1):
                        line_ok = True

                        # each line must end with a newline character
                        if not line.endswith('\n'):
                                err(line_no, "missing a newline character in the end")
                                line_ok = False

                        nodes = line.split()

                        # check label (the first token, if there is one)
                        if not nodes:
                                err(line_no, "missing label, perhaps an empty line?")
                                line_ok = False
                        elif not _check_label(line_no, nodes[0]):
                                line_ok = False

                        # check features (every token after the label)
                        if not _check_features(line_no, nodes[1:]):
                                line_ok = False

                        if not line_ok:
                                error_line_count += 1

        if error_line_count > 0:
                print("Found {0} lines with error.".format(error_line_count))
                return 1

        print("No error.")
        return 0
00106 
# Script entry point: run the checker and hand its return value to exit()
# as the process exit status.
if __name__ == "__main__":
        status = main()
        exit(status)


ml_classifiers
Author(s): Scott Niekum
autogenerated on Fri Jan 3 2014 11:30:23