3 import os, sys, math, random
4 from collections
import defaultdict
6 if sys.version_info[0] >= 3:
11 Usage: {0} [options] dataset subset_size [output1] [output2] 13 This script randomly selects a subset of the dataset. 16 -s method : method of selection (default 0) 17 0 -- stratified selection (classification only) 20 output1 : the subset (optional) 21 output2 : rest of the data (optional) 22 If output1 is omitted, the subset will be printed on the screen.""".format(argv[0]))
32 subset_file = sys.stdout
42 if method
not in [0,1]:
43 print(
"Unknown selection method {0}".format(method))
48 subset_size = int(argv[i+1])
50 subset_file = open(argv[i+2],
'w')
52 rest_file = open(argv[i+3],
'w')
54 return dataset, subset_size, method, subset_file, rest_file
57 l = sum(1
for line
in open(dataset,
'r')) 58 return sorted(random.sample(
xrange(l), subset_size))
61 labels = [line.split(
None,1)[0]
for line
in open(dataset)]
62 label_linenums = defaultdict(list)
63 for i, label
in enumerate(labels):
64 label_linenums[label] += [i]
67 remaining = subset_size
72 for label
in sorted(label_linenums, key=
lambda x: len(label_linenums[x])):
73 linenums = label_linenums[label]
74 label_size = len(linenums)
76 s = int(
min(remaining,
max(1, math.ceil(label_size*(float(subset_size)/l)))))
79 Error: failed to have at least one instance per class 80 1. You may have regression data. 81 2. Your classification data is unbalanced or too small. 86 ret += [linenums[i]
for i
in random.sample(
xrange(label_size), s)]
90 dataset, subset_size, method, subset_file, rest_file =
process_options(argv)
101 dataset = open(dataset,
'r') 102 prev_selected_linenum = -1 103 for i
in xrange(len(selected_lines)):
104 for cnt
in xrange(selected_lines[i]-prev_selected_linenum-1):
105 line = dataset.readline()
107 rest_file.write(line)
108 subset_file.write(dataset.readline())
109 prev_selected_linenum = selected_lines[i]
114 rest_file.write(line)
118 if __name__ ==
'__main__':
def process_options(argv)
def stratified_selection(dataset, subset_size)
def random_selection(dataset, subset_size)