fix sample.py to account for the earlier tokenizer changes

This commit is contained in:
Andrej Karpathy 2023-08-15 02:33:01 +00:00
parent a9a0628c92
commit fe2de68688

View File

@@ -51,11 +51,16 @@ if compile:
print("Compiling the model...")
model = torch.compile(model) # requires PyTorch 2.0 (optional)
# load the tokenizer, either provided, or attempt to find it
# load the tokenizer
vocab_source = checkpoint_dict.get("vocab_source", "llama2")
vocab_size = gptconf.vocab_size
if tokenizer:
# a specific tokenizer is provided, use it
tokenizer_model = tokenizer
else:
tokenizer_model = get_tokenizer_model_path(vocab_size=gptconf.vocab_size)
# let's try to find the tokenizer model automatically. bit gross here...
query_vocab_size = 0 if vocab_source == "llama2" else vocab_size
tokenizer_model = get_tokenizer_model_path(vocab_size=query_vocab_size)
enc = Tokenizer(tokenizer_model=tokenizer_model)
# encode the beginning of the prompt