2 from sys
import argv, exit, stdout, stderr
3 from random
import randint
7 global dataset_filename
13 Usage: {0} [options] dataset number [output1] [output2] 15 This script selects a subset of the given dataset. 18 -s method : method of selection (default 0) 19 0 -- stratified selection (classification only) 22 output1 : the subset (optional) 23 output2 : rest of the data (optional) 24 If output1 is omitted, the subset will be printed on the screen.""".format(argv[0]))
29 global dataset_filename, subset_filename, rest_filename
42 if method < 0
or method > 1:
43 print(
"Unknown selection method {0}".format(method))
47 dataset_filename = argv[i]
50 subset_filename = argv[i+2]
52 rest_filename = argv[i+3]
56 def __init__(self, label, index, selected):
59 self.selected = selected
66 f = open(dataset_filename,
'r') 68 labels.append(Label(float((line.split())[0]), i, 0))
74 if subset_filename !=
"":
75 file1 = open(subset_filename,
'w')
79 if rest_filename !=
"":
81 file2 = open(rest_filename,
'w')
86 labels.sort(key =
lambda x: x.label)
88 label_end = labels[l-1].label + 1
89 labels.append(Label(label_end, l, 0))
92 label = labels[begin].label
94 new_label = labels[i].label
95 if new_label != label:
97 k = i*n//l - begin*n//l
101 warning = warning + 1
102 for j
in range(nr_class):
103 if randint(0, nr_class-j-1) < k:
104 labels[begin+j].selected = 1
111 if randint(0,l-i-1) < k:
112 labels[i].selected = 1
119 labels.sort(key =
lambda x: int(x.index))
121 f = open(dataset_filename,
'r') 123 if labels[i].selected == 1:
133 1. You may have regression data. Please use -s 1. 134 2. Classification data unbalanced or too small. We select at least 1 per class. 135 The subset thus contains {0} instances. 136 """.format(n+warning))