Go to the documentation of this file.00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015 from sys import argv, exit
00016 import os.path
00017
def err(line_no, msg):
    """Report one problem found in the dataset.

    Args:
        line_no: Number of the offending line in the dataset file.
        msg: Human-readable description of the problem.

    Prints ``line <line_no>: <msg>`` to standard output.
    """
    print("line %d: %s" % (line_no, msg))
00020
00021
def my_float(x):
    """Parse the text token ``x`` as a finite floating-point number.

    Args:
        x: String to convert.

    Returns:
        The parsed value as a ``float``.

    Raises:
        ValueError: If ``x`` contains "nan" or "inf" (in any letter case),
            is not a valid float literal, or overflows to infinity.
    """
    lowered = x.lower()
    # float() itself accepts "nan", "inf" and "infinity", so reject them here.
    if "nan" in lowered or "inf" in lowered:
        raise ValueError("non-finite value: %r" % (x,))

    value = float(x)
    # A literal such as "1e999" contains neither marker but still becomes inf.
    if value in (float("inf"), float("-inf")):
        raise ValueError("value out of range: %r" % (x,))
    return value
00027
def _check_label(line_no, nodes):
    """Remove the leading label token from ``nodes`` and validate it.

    Returns True when the label is a number or a comma-separated list of
    numbers; otherwise reports the problem through err() and returns False.
    """
    try:
        label = nodes.pop(0)
    except IndexError:
        err(line_no, "missing label, perhaps an empty line?")
        return False

    if ',' in label:
        # Multi-label form: every comma-separated part must be a number.
        try:
            for part in label.split(','):
                my_float(part)
        except ValueError:
            err(line_no, "label %s is not a valid multi-label form" % label)
            return False
    else:
        try:
            my_float(label)
        except ValueError:
            err(line_no, "label %s is not a number" % label)
            return False
    return True


def _check_features(line_no, nodes):
    """Validate every ``<index>:<value>`` token in ``nodes``.

    Each problem is reported through err(). Returns True only when all
    tokens are well formed and their indices are strictly ascending.
    """
    all_ok = True
    prev_index = -1
    for pos, node in enumerate(nodes):
        try:
            # Unpacking fails with ValueError unless there is exactly one ':'.
            index_text, value_text = node.split(':')
            index = int(index_text)
            my_float(value_text)
        except ValueError:
            err(line_no, "feature '%s' not an <index>:<value> pair, <index> integer, <value> real number " % node)
            all_ok = False
            continue

        # Index 0 is accepted; only negative indices are rejected here.
        if index < 0:
            err(line_no, "feature index must be positive; wrong feature %s" % node)
            all_ok = False
        elif index <= prev_index:
            # "<=" so that a repeated index is reported too, not only a
            # decreasing one.
            err(line_no, "feature indices must be in an ascending order, previous/current features %s %s" % (nodes[pos - 1], node))
            all_ok = False
        prev_index = index
    return all_ok


def main():
    """Check the dataset file named on the command line.

    Every line must end with a newline and consist of a label (a number, or
    a comma-separated list of numbers) followed by whitespace-separated
    ``<index>:<value>`` pairs with non-negative, strictly ascending integer
    indices and finite real values.

    Returns:
        0 when no line has an error, 1 otherwise. A summary is printed in
        both cases.

    Raises:
        SystemExit: With status 1 when the argument count is wrong or the
            dataset path does not exist.
    """
    if len(argv) != 2:
        print("Usage: %s dataset" % (argv[0]))
        exit(1)

    dataset = argv[1]

    if not os.path.exists(dataset):
        print("dataset %s not found" % (dataset))
        exit(1)

    error_line_count = 0
    # The context manager closes the file even if checking is interrupted.
    with open(dataset, 'r') as data_file:
        for line_no, line in enumerate(data_file, 1):
            line_ok = True

            # Only the last line of a file can lack the trailing newline.
            if not line.endswith('\n'):
                err(line_no, "missing a newline character in the end")
                line_ok = False

            nodes = line.split()

            # Both checks always run so every problem on the line is
            # reported; the label check must come first because it removes
            # the label token from nodes.
            if not _check_label(line_no, nodes):
                line_ok = False
            if not _check_features(line_no, nodes):
                line_ok = False

            if not line_ok:
                error_line_count += 1

    if error_line_count > 0:
        print("Found %d lines with error." % (error_line_count))
        return 1

    print("No error.")
    return 0
00106
# Run the checker only when this file is executed as a script; the value
# returned by main() becomes the process exit status.
if __name__ == "__main__":
    exit(main())