small format tweaks, get rid of prints in tokenizer

Author: Andrej Karpathy
Date:   2023-07-23 17:09:23 +00:00
parent deb3818db9
commit 5baaf9df06
2 changed files with 3 additions and 8 deletions


@@ -27,8 +27,5 @@ for line in proc.stdout:
     last = dec
 t1 = time.time()
-print('\n---\n')
-print(enc.decode(tokens))
-print(f"achieved tok/s: {len(tokens) / (t1 - t0)}")
+print(f"\nachieved tok/s: {len(tokens) / (t1 - t0)}")
 proc.wait()
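
For context: the hunk above is the tail of a small wrapper script that streams token ids out of the model binary, decodes them incrementally, and reports throughput when the stream ends. A minimal sketch of that pattern follows; the binary name, model path, and one-id-per-line output format are assumptions, not taken from this commit.

# Minimal sketch (assumed context, not part of this diff): stream token ids
# from a subprocess, decode incrementally, report tokens/sec at the end.
import subprocess
import time

from tokenizer import Tokenizer  # the class edited in the second file below

enc = Tokenizer()
proc = subprocess.Popen(["./run", "model.bin"],  # binary/model names assumed
                        stdout=subprocess.PIPE, text=True)

t0 = time.time()
tokens = []
last = ''
for line in proc.stdout:
    tokens.append(int(line.strip()))      # one token id per line (assumed)
    dec = enc.decode(tokens)
    print(dec[len(last):], end='', flush=True)  # emit only the new suffix
    last = dec
t1 = time.time()
print(f"\nachieved tok/s: {len(tokens) / (t1 - t0)}")
proc.wait()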


@@ -15,16 +15,14 @@ class Tokenizer:
         model_path = TOKENIZER_MODEL
         assert os.path.isfile(model_path), model_path
         self.sp_model = SentencePieceProcessor(model_file=model_path)
-        print(f"Loaded SentencePiece model from {model_path}")
+        #print(f"Loaded SentencePiece model from {model_path}")
         # BOS / EOS token IDs
         self.n_words: int = self.sp_model.vocab_size()
         self.bos_id: int = self.sp_model.bos_id()
         self.eos_id: int = self.sp_model.eos_id()
         self.pad_id: int = self.sp_model.pad_id()
-        print(
-            f"#words: {self.n_words} - BOS ID: {self.bos_id} - EOS ID: {self.eos_id}"
-        )
+        #print(f"#words: {self.n_words} - BOS ID: {self.bos_id} - EOS ID: {self.eos_id}")
         assert self.sp_model.vocab_size() == self.sp_model.get_piece_size()

     def encode(self, s: str, bos: bool, eos: bool) -> List[int]:
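
The encode signature in the trailing context takes explicit bos/eos flags. A minimal sketch of how such a method is typically implemented on top of SentencePiece; the body below is an assumption, not taken from this commit.

# Minimal sketch (assumed, not from this diff): wrap the raw SentencePiece
# ids with the special BOS/EOS ids depending on the flags. Meant to live as
# a method on the Tokenizer class shown above.
from typing import List

def encode(self, s: str, bos: bool, eos: bool) -> List[int]:
    assert type(s) is str
    t = self.sp_model.encode(s)  # list of token ids from SentencePiece
    if bos:
        t = [self.bos_id] + t
    if eos:
        t = t + [self.eos_id]
    return t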