checkdata.py
Go to the documentation of this file.
#!/usr/bin/env python

#
# A format checker for LIBSVM
#

#
# Copyright (c) 2007, Rong-En Fan
#
# All rights reserved.
#
# This program is distributed under the same license of the LIBSVM package.
#

import os.path
from sys import argv, exit
17 
def err(line_no, msg):
    """Report a problem found on a given line of the dataset.

    Prints ``line <line_no>: <msg>`` to standard output.

    Args:
        line_no: 1-based number of the offending line.
        msg: Human-readable description of the problem.
    """
    print("line {0}: {1}".format(line_no, msg))

def my_float(x):
    """Convert the string ``x`` to a float, refusing nan and inf spellings.

    Works like ``float()`` except that any text containing "nan" or "inf"
    (case-insensitive, so "NaN", "-Inf" and "infinity" are all covered) is
    rejected instead of being converted.

    Args:
        x: Text to convert.

    Returns:
        The parsed floating point value.

    Raises:
        ValueError: If ``x`` spells nan/inf, or is not a number that
            ``float()`` accepts.
    """
    # Lower-case once so both substring checks are case-insensitive.
    lowered = x.lower()
    if "nan" in lowered or "inf" in lowered:
        raise ValueError("nan and inf are not accepted: {0!r}".format(x))

    return float(x)

def _check_label(line_no, nodes):
    """Validate and consume the label token at the front of ``nodes``.

    The label is popped from ``nodes`` so that only feature tokens remain.
    A label containing commas is treated as a multi-label list in which
    every element must be a number; otherwise the label itself must be a
    number. Problems are reported through ``err``.

    Returns:
        True if the label is missing or malformed, False if it is valid.
    """
    if not nodes:
        err(line_no, "missing label, perhaps an empty line?")
        return True

    label = nodes.pop(0)
    is_multi_label = ',' in label
    try:
        # Without a comma, split() yields the label itself as the only part.
        for part in label.split(','):
            my_float(part)
    except ValueError:
        if is_multi_label:
            err(line_no, "label {0} is not a valid multi-label form".format(label))
        else:
            err(line_no, "label {0} is not a number".format(label))
        return True

    return False


def _check_features(line_no, nodes):
    """Validate the ``<index>:<value>`` feature tokens of one line.

    Each token must split into exactly an integer index and a real value,
    the index must not be negative, and indices must be strictly ascending.
    Problems are reported through ``err``.

    Returns:
        True if at least one feature token is invalid, False otherwise.
    """
    found_error = False
    prev_index = -1
    for i, node in enumerate(nodes):
        try:
            # Unpacking fails with ValueError unless there is exactly one ':'.
            (index, value) = node.split(':')
            index = int(index)
            my_float(value)
        except ValueError:
            err(line_no, "feature '{0}' not an <index>:<value> pair, <index> integer, <value> real number ".format(node))
            found_error = True
            continue

        # precomputed kernel's index starts from 0 and LIBSVM
        # checks it. Hence, don't treat index 0 as an error.
        if index < 0:
            err(line_no, "feature index must be positive; wrong feature {0}".format(node))
            found_error = True
        elif index <= prev_index:
            # i > 0 here: the first token can never be <= the initial -1.
            err(line_no, "feature indices must be in an ascending order, previous/current features {0} {1}".format(nodes[i-1], node))
            found_error = True
        prev_index = index

    return found_error


def main():
    """Check that the dataset named on the command line is in LIBSVM format.

    Expects exactly one command-line argument, the dataset path. Exits with
    status 1 (after printing a message) if the argument count is wrong or
    the path does not exist. Otherwise every line is checked for a trailing
    newline, a valid label and valid features; each problem is printed, and
    a summary is printed at the end.

    Returns:
        1 if any line contained an error, 0 otherwise.
    """
    if len(argv) != 2:
        print("Usage: {0} dataset".format(argv[0]))
        exit(1)

    dataset = argv[1]

    if not os.path.exists(dataset):
        print("dataset {0} not found".format(dataset))
        exit(1)

    error_line_count = 0
    # The context manager guarantees the file is closed, even on an error.
    with open(dataset, 'r') as data_file:
        for line_no, line in enumerate(data_file, 1):
            line_error = False

            # each line must end with a newline character
            if not line.endswith('\n'):
                err(line_no, "missing a newline character in the end")
                line_error = True

            nodes = line.split()

            # The label check runs first because it removes the label
            # token, leaving only feature tokens for the feature check.
            if _check_label(line_no, nodes):
                line_error = True
            if _check_features(line_no, nodes):
                line_error = True

            if line_error:
                error_line_count += 1

    if error_line_count > 0:
        print("Found {0} lines with error.".format(error_line_count))
        return 1

    print("No error.")
    return 0

if __name__ == "__main__":
    # Use main()'s return value (0 = no errors, 1 = errors found) as the
    # process exit status.
    exit(main())
def err(line_no, msg)
Definition: checkdata.py:18
def main()
Definition: checkdata.py:28
def my_float(x)
Definition: checkdata.py:22


ml_classifiers
Author(s): Scott Niekum , Joshua Whitley
autogenerated on Mon Feb 28 2022 22:46:49