mirror of
https://github.com/trholding/llama2.c.git
synced 2026-02-06 11:26:53 +00:00
small format tweaks, get rid of prints in tokenizer
This commit is contained in:
parent
deb3818db9
commit
5baaf9df06
@ -27,8 +27,5 @@ for line in proc.stdout:
|
||||
last = dec
|
||||
t1 = time.time()
|
||||
|
||||
print('\n---\n')
|
||||
print(enc.decode(tokens))
|
||||
|
||||
print(f"achieved tok/s: {len(tokens) / (t1 - t0)}")
|
||||
print(f"\nachieved tok/s: {len(tokens) / (t1 - t0)}")
|
||||
proc.wait()
|
||||
|
||||
@ -15,16 +15,14 @@ class Tokenizer:
|
||||
model_path = TOKENIZER_MODEL
|
||||
assert os.path.isfile(model_path), model_path
|
||||
self.sp_model = SentencePieceProcessor(model_file=model_path)
|
||||
print(f"Loaded SentencePiece model from {model_path}")
|
||||
#print(f"Loaded SentencePiece model from {model_path}")
|
||||
|
||||
# BOS / EOS token IDs
|
||||
self.n_words: int = self.sp_model.vocab_size()
|
||||
self.bos_id: int = self.sp_model.bos_id()
|
||||
self.eos_id: int = self.sp_model.eos_id()
|
||||
self.pad_id: int = self.sp_model.pad_id()
|
||||
print(
|
||||
f"#words: {self.n_words} - BOS ID: {self.bos_id} - EOS ID: {self.eos_id}"
|
||||
)
|
||||
#print(f"#words: {self.n_words} - BOS ID: {self.bos_id} - EOS ID: {self.eos_id}")
|
||||
assert self.sp_model.vocab_size() == self.sp_model.get_piece_size()
|
||||
|
||||
def encode(self, s: str, bos: bool, eos: bool) -> List[int]:
|
||||
|
||||
Loading…
Reference in New Issue
Block a user