-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtokenizer.py
45 lines (34 loc) · 1.04 KB
/
tokenizer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
from tokenizers import ByteLevelBPETokenizer
import os
def tokenizer_main() -> "ByteLevelBPETokenizer":
    """
    Train a byte-level BPE tokenizer and save its model files to disk.

    The function also sets the ``CUDA_DEVICE_ORDER`` and
    ``CUDA_VISIBLE_DEVICES`` environment variables of the current process,
    and creates the ``tokenizer`` output directory if it does not exist.

    Parameters:
    -----------
    None

    Returns:
    -----------
    tokenizer: the trained ByteLevelBPETokenizer
    """
    # The return annotation is quoted so it is not evaluated when the
    # function is defined.
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    # Note: device index 2 is not part of this list.
    os.environ["CUDA_VISIBLE_DEVICES"] = "0, 1, 3, 4, 5"

    output_dir = "tokenizer"
    # exist_ok=True replaces the racy "check exists, then mkdir" sequence.
    os.makedirs(output_dir, exist_ok=True)

    # Path to the file/files
    paths = []  # Assign your paths

    # Tokenizer
    tokenizer = ByteLevelBPETokenizer()
    tokenizer.train(
        files=paths,
        vocab_size=52_000,
        min_frequency=2,
        special_tokens=[
            "<s>",
            "<pad>",
            "</s>",
            "<unk>",
            "<mask>",
        ],
    )

    # Save files to disk
    tokenizer.save_model(output_dir)
    return tokenizer