subset.py
Go to the documentation of this file.
1 #!/usr/bin/env python
2 from sys import argv, exit, stdout, stderr
3 from random import randint
4 
# Module-level settings, assigned from the command line by the option parser.
# A `global` statement at module scope has no effect, so the two values that
# have no sensible default are initialised to None instead of being left
# undefined.
method = 0               # selection method: 0 = stratified, 1 = random
n = None                 # requested subset size (taken from argv)
dataset_filename = None  # path of the input dataset (taken from argv)
subset_filename = ""     # where the subset is written; "" means stdout
rest_filename = ""       # where unselected lines are written; "" means nowhere
10 
def exit_with_help():
    """Print the command-line usage text to stdout and exit with status 1.

    The program name shown in the usage line is taken from ``argv[0]``.
    """
    print("""\
Usage: {0} [options] dataset number [output1] [output2]

This script selects a subset of the given dataset.

options:
-s method : method of selection (default 0)
 0 -- stratified selection (classification only)
 1 -- random selection

output1 : the subset (optional)
output2 : rest of the data (optional)
If output1 is omitted, the subset will be printed on the screen.""".format(argv[0]))
    exit(1)
26 
def process_options():
    """Parse ``argv`` into the module-level settings.

    Accepted form: ``[-s method] dataset number [output1] [output2]``.
    Assigns the globals ``method``, ``n``, ``dataset_filename``,
    ``subset_filename`` and ``rest_filename``.

    Calls ``exit_with_help()`` (which prints the usage text and exits) when
    fewer than two arguments are given, when ``-s`` has no value, when the
    method is not 0 or 1, or when the dataset/number pair is missing after
    the options.  A non-integer method or number raises ``ValueError``.
    """
    global method, n
    global dataset_filename, subset_filename, rest_filename

    argc = len(argv)
    if argc < 3:
        exit_with_help()

    # Consume leading "-..." options; the first non-option starts the
    # positional arguments.  Options other than "-s" are skipped.
    i = 1
    while i < argc:
        # startswith() also copes with an empty-string argument.
        if not argv[i].startswith("-"):
            break
        if argv[i] == "-s":
            i += 1
            if i >= argc:
                # "-s" was the last argument, so there is no value to read.
                exit_with_help()
            method = int(argv[i])
            if method < 0 or method > 1:
                print("Unknown selection method {0}".format(method))
                exit_with_help()
        i += 1

    # Both the dataset path and the subset size are mandatory.
    if i + 1 >= argc:
        exit_with_help()

    dataset_filename = argv[i]
    n = int(argv[i + 1])
    if i + 2 < argc:
        subset_filename = argv[i + 2]
    if i + 3 < argc:
        rest_filename = argv[i + 3]
53 
def _choose(count, k):
    """Return ``count`` booleans with at most ``k`` of them True, chosen at random.

    Positions are visited in order; a position is taken when a uniform draw
    over the positions not yet visited is below the number still wanted.
    """
    flags = []
    for visited in range(count):
        take = randint(0, count - visited - 1) < k
        if take:
            k -= 1
        flags.append(take)
    return flags


def main():
    """Select a subset of the dataset's lines and write it out.

    Parses the command line, reads the leading numeric label of every line
    of ``dataset_filename``, marks lines as selected according to ``method``
    (0: per-label quotas with at least one line per label; 1: ``n`` lines
    from the whole file), then copies selected lines to ``subset_filename``
    (or stdout when it is "") and the remaining lines to ``rest_filename``
    when one was given.  A warning is written to stderr when a label had to
    be bumped up to one selected line.
    """
    process_options()

    # First pass: the first whitespace-separated token of a line is its label.
    with open(dataset_filename, 'r') as f:
        labels = [float(line.split()[0]) for line in f]
    total = len(labels)

    selected = [False] * total
    warning = 0
    if method == 0 and total > 0:  # stratified
        # Line positions ordered by label, so equal labels form contiguous runs.
        order = sorted(range(total), key=labels.__getitem__)
        begin = 0
        for end in range(1, total + 1):
            if end < total and labels[order[end]] == labels[order[begin]]:
                continue
            # order[begin:end] is one complete class; its quota is its
            # proportional share of n.
            k = end * n // total - begin * n // total
            # at least one instance per class
            if k == 0:
                k = 1
                warning += 1
            for pos, take in zip(order[begin:end], _choose(end - begin, k)):
                selected[pos] = take
            begin = end
    elif method == 1:  # random
        selected = _choose(total, n)

    # Second pass: route every line to the subset or to the rest.
    subset_out = stdout
    rest_out = None
    try:
        if subset_filename != "":
            subset_out = open(subset_filename, 'w')
        if rest_filename != "":
            rest_out = open(rest_filename, 'w')
        with open(dataset_filename, 'r') as src:
            for take, line in zip(selected, src):
                if take:
                    subset_out.write(line)
                elif rest_out is not None:
                    rest_out.write(line)
    finally:
        # Close only what was opened here; stdout stays usable for the caller.
        if subset_out is not stdout:
            subset_out.close()
        if rest_out is not None:
            rest_out.close()

    if warning > 0:
        stderr.write("""\
Warning:
1. You may have regression data. Please use -s 1.
2. Classification data unbalanced or too small. We select at least 1 per class.
 The subset thus contains {0} instances.
""".format(n+warning))
145 
# Run the tool only when executed as a script, so that importing this module
# (for tests or documentation tooling) does not parse argv or touch files.
if __name__ == "__main__":
    main()
def process_options()
Definition: subset.py:27
def exit_with_help()
Definition: subset.py:11
def main()
Definition: subset.py:54


haf_grasping
Author(s): David Fischinger
autogenerated on Mon Jun 10 2019 13:28:43