Go to the documentation of this file.00001 
00002 
00003 
00004 
00005 
00006 
00007 
00008 
00009 
00010 
00011 
00012 
00013 
00014 
00015 from sys import argv, exit
00016 import os.path
00017 
def err(line_no, msg):
    """Report a problem found on dataset line *line_no* on standard output.

    line_no -- number of the offending line, as counted by the caller
    msg     -- human-readable description of the problem
    """
    report = "line {0}: {1}".format(line_no, msg)
    print(report)
00020 
00021 
def my_float(x):
    """Convert the string *x* to a finite float, rejecting loose spellings.

    The built-in float() is more permissive than a dataset checker wants:
    it accepts "nan", "inf"/"infinity", digit-grouping underscores such as
    "1_000", and literals so large that they overflow to infinity.  All of
    these are refused here.

    x -- text of a single number (a label or a feature value)

    Returns the parsed float.
    Raises ValueError if *x* is not a plain, finite real number.
    """
    lowered = x.lower()
    if "nan" in lowered or "inf" in lowered:
        raise ValueError("non-finite value: {0!r}".format(x))

    # float() honours PEP 515 underscores ("1_000" -> 1000.0); a data file
    # must not rely on that Python-only spelling.
    if "_" in x:
        raise ValueError("digit-grouping underscore in: {0!r}".format(x))

    value = float(x)

    # A literal such as "1e999" parses without error but yields infinity,
    # which is exactly what the spelling check above is meant to keep out.
    if value in (float("inf"), float("-inf")):
        raise ValueError("value out of range: {0!r}".format(x))

    return value
00027 
def _check_line(line_no, line):
    """Validate one raw dataset line, reporting every problem through err().

    line_no -- line number used in the reported messages
    line    -- the line exactly as read from the file (newline included)

    Returns True if at least one problem was reported, False otherwise.
    """
    has_error = False

    # Every line, including the last one, must end with a newline.
    if not line.endswith('\n'):
        err(line_no, "missing a newline character in the end")
        has_error = True

    nodes = line.split()
    if not nodes:
        err(line_no, "missing label, perhaps an empty line?")
        return True

    label, features = nodes[0], nodes[1:]

    # The label is either a single number or a comma-separated list of them.
    if ',' in label:
        try:
            for part in label.split(','):
                my_float(part)
        except ValueError:
            err(line_no, "label {0} is not a valid multi-label form".format(label))
            has_error = True
    else:
        try:
            my_float(label)
        except ValueError:
            err(line_no, "label {0} is not a number".format(label))
            has_error = True

    # Features are <index>:<value> pairs with strictly increasing indices.
    prev_index = -1
    for i, node in enumerate(features):
        try:
            index, value = node.split(':')
            index = int(index)
            my_float(value)
        except ValueError:
            err(line_no, "feature '{0}' not an <index>:<value> pair, <index> integer, <value> real number ".format(node))
            has_error = True
            continue

        if index < 0:
            err(line_no, "feature index must be positive; wrong feature {0}".format(node))
            has_error = True
        elif index <= prev_index:
            # Cannot trigger for i == 0 (prev_index starts at -1), so
            # features[i - 1] always names the preceding feature.
            err(line_no, "feature indices must be in an ascending order, previous/current features {0} {1}".format(features[i - 1], node))
            has_error = True
        prev_index = index

    return has_error


def main():
    """Check the dataset file named on the command line, line by line.

    Expects exactly one command-line argument, the path of the dataset.
    Calls exit(1) after printing a message if the argument count is wrong
    or the file does not exist.

    Returns 1 if any line contained an error, 0 if the file is clean; a
    summary is printed in both cases.
    """
    if len(argv) != 2:
        print("Usage: {0} dataset".format(argv[0]))
        exit(1)

    dataset = argv[1]

    if not os.path.exists(dataset):
        print("dataset {0} not found".format(dataset))
        exit(1)

    error_line_count = 0
    # The with-block guarantees the file is closed even if checking aborts.
    with open(dataset, 'r') as data_file:
        for line_no, line in enumerate(data_file, 1):
            if _check_line(line_no, line):
                error_line_count += 1

    if error_line_count > 0:
        print("Found {0} lines with error.".format(error_line_count))
        return 1

    print("No error.")
    return 0
00106 
# Run the checker only when executed as a script; main()'s return value
# becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())