python logbert.py train
produces this error
Output:
device cpu
features logkey:True time: False
mask ratio 0.65
arguments Namespace(mode='train')
Save options parameters
Loading vocab output/hdfs/vocab.pkl
vocab Size: 17
Loading Train Dataset
before filtering short session
train size 1918
valid size 213
========================================
100%|██████████| 2131/2131 [00:00<00:00, 564307.21it/s]
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
[<ipython-input-4-4393a2a735e9>](https://localhost:8080/#) in <module>
----> 1 LogBert("train")
4 frames
[<ipython-input-2-698dc9901810>](https://localhost:8080/#) in LogBert(cmd_mode)
87
88 if args.mode == 'train':
---> 89 Trainer(options).train()
90
91 elif args.mode == 'predict':
[/content/bert_pytorch/train_log.py](https://localhost:8080/#) in train(self)
60
61 print("\nLoading Train Dataset")
---> 62 logkey_train, logkey_valid, time_train, time_valid = generate_train_valid(self.output_path + "train", window_size=self.window_size,
63 adaptive_window=self.adaptive_window,
64 valid_size=self.valid_ratio,
[/content/bert_pytorch/dataset/sample.py](https://localhost:8080/#) in generate_train_valid(data_path, window_size, adaptive_window, sample_ratio, valid_size, output_path, scale, scale_path, seq_len, min_len)
91 time_seq_pairs = np.array(time_seq_pairs)
92
---> 93 logkey_trainset, logkey_validset, time_trainset, time_validset = train_test_split(logkey_seq_pairs,
94 time_seq_pairs,
95 test_size=test_size,
[/usr/local/lib/python3.8/dist-packages/sklearn/model_selection/_split.py](https://localhost:8080/#) in train_test_split(test_size, train_size, random_state, shuffle, stratify, *arrays)
2422
2423 n_samples = _num_samples(arrays[0])
-> 2424 n_train, n_test = _validate_shuffle_split(
2425 n_samples, test_size, train_size, default_test_size=0.25
2426 )
[/usr/local/lib/python3.8/dist-packages/sklearn/model_selection/_split.py](https://localhost:8080/#) in _validate_shuffle_split(n_samples, test_size, train_size, default_test_size)
2045 and (test_size <= 0 or test_size >= 1)
2046 ):
-> 2047 raise ValueError(
2048 "test_size={0} should be either positive and smaller"
2049 " than the number of samples {1} or a float in the "
ValueError: test_size=213 should be either positive and smaller than the number of samples 0 or a float in the (0, 1) range
and after investigating a bit into the packages, there seems to a problem with assigning the test_size in sample.py in bert_pytorch/dataset/...
test size parameter needs to be given as float value between 0 and 1, but this assigns the no.of samples in test size itself as argument to train_test_split()
part of code that causes this prob (marked with ## symbol nearby)
def generate_train_valid(data_path, window_size=20, adaptive_window=True,
sample_ratio=1, valid_size=0.1, output_path=None,
scale=None, scale_path=None, seq_len=None, min_len=0):
with open(data_path, 'r') as f:
data_iter = f.readlines()
num_session = int(len(data_iter) * sample_ratio)
# only even number of samples, or drop_last=True in DataLoader API
# coz in parallel computing in CUDA, odd number of samples reports issue when merging the result
# num_session += num_session % 2
test_size = int(min(num_session, len(data_iter)) * valid_size)
# only even number of samples
# test_size += test_size % 2
print("before filtering short session")
print("train size ", int(num_session - test_size))
print("valid size ", int(test_size)) ## notice here it is no.of samples, an int value obviously greater than 1
print("="*40)
logkey_seq_pairs = []
time_seq_pairs = []
session = 0
for line in tqdm(data_iter):
if session >= num_session:
break
session += 1
logkeys, times = fixed_window(line, window_size, adaptive_window, seq_len, min_len)
logkey_seq_pairs += logkeys
time_seq_pairs += times
logkey_seq_pairs = np.array(logkey_seq_pairs)
time_seq_pairs = np.array(time_seq_pairs)
logkey_trainset, logkey_validset, time_trainset, time_validset = train_test_split(logkey_seq_pairs,
time_seq_pairs,
test_size=test_size, ## here it is passed as argument as it is with changing to be in range(0,1)
random_state=1234)
I think this should solve the problem (i don't know the exact work of the package, but I think this this could be a minor fix and just wanted to make sure its correct...):
def generate_train_valid(data_path, window_size=20, adaptive_window=True,
sample_ratio=1, valid_size=0.1, output_path=None,
scale=None, scale_path=None, seq_len=None, min_len=0):
with open(data_path, 'r') as f:
data_iter = f.readlines()
num_session = int(len(data_iter) * sample_ratio)
# only even number of samples, or drop_last=True in DataLoader API
# coz in parallel computing in CUDA, odd number of samples reports issue when merging the result
# num_session += num_session % 2
test_size = int(min(num_session, len(data_iter)) * valid_size)
# only even number of samples
# test_size += test_size % 2
valid_size = round(test_size/num_session,3)
# update split size
print("before filtering short session")
print("train size ", int(num_session - test_size))
print("valid size ", int(test_size))
print("="*40)
logkey_seq_pairs = []
time_seq_pairs = []
session = 0
for line in tqdm(data_iter):
if session >= num_session:
break
session += 1
logkeys, times = fixed_window(line, window_size, adaptive_window, seq_len, min_len)
logkey_seq_pairs += logkeys
time_seq_pairs += times
logkey_seq_pairs = np.array(logkey_seq_pairs)
time_seq_pairs = np.array(time_seq_pairs)
logkey_trainset, logkey_validset, time_trainset, time_validset = train_test_split(logkey_seq_pairs,
time_seq_pairs,
test_size=valid_size,
random_state=1234)