diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml new file mode 100644 index 0000000..7e6474d --- /dev/null +++ b/.github/workflows/build.yml @@ -0,0 +1,193 @@ +name: Continuous Integration + +on: + push: + branches: + - master + paths: ['.github/workflows/**', '**/Makefile', '**/*.c', '**/*.h', '**/*.py'] + pull_request: + types: [opened, synchronize, reopened] + paths: ['**/Makefile', '**/*.c', '**/*.h', '**/*.py'] + # for manual triggering + workflow_dispatch: + +env: + BRANCH_NAME: ${{ github.head_ref || github.ref_name }} + +jobs: + # check basic builds to avoid breaking changes + ubuntu-focal-make: + runs-on: ubuntu-latest + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v3 + + - name: Dependencies + id: depends + run: | + sudo apt-get update + sudo apt-get install build-essential -y + + - name: Set up Python 3.10 + uses: actions/setup-python@v3 + with: + python-version: "3.10" + + - name: Pip setup + run: | + python -m pip install --upgrade pip + if [ -f requirements.txt ]; then pip install -r requirements.txt; fi + + - name: Build + id: make_build + run: | + make + + - name: Build runfast + id: make_build_runfast + run: | + make runfast + + - name: Test with pytest + run: | + pytest + + macOS-latest-make: + runs-on: macos-latest + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v3 + + - name: Dependencies + id: depends + continue-on-error: true + run: | + brew update + + - name: Set up Python 3.10 + uses: actions/setup-python@v3 + with: + python-version: "3.10" + + - name: Pip setup + run: | + python -m pip install --upgrade pip + if [ -f requirements.txt ]; then pip install -r requirements.txt; fi + + - name: Build clang + id: make_build_clang + run: | + make run CC=clang + + - name: Build + id: make_build + run: | + make + + - name: Build runfast + id: make_build_runfast + run: | + make runfast + + - name: Test with pytest + run: pytest + + + + + windows-latest-make: + runs-on: windows-latest + + strategy: + fail-fast: false #necessary, otherwise the matrix breaks + matrix: + arch: + - amd64 + - amd64_x86 + - amd64_arm64 + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v3 + + - name: Setup MSBuild + uses: microsoft/setup-msbuild@v1 + + - name: Setup MSVC ${{ matrix.arch }} + uses: ilammy/msvc-dev-cmd@v1 + with: + arch: ${{ matrix.arch }} + + - name: Set up Python 3.10 + if: matrix.arch != 'amd64_arm64' + uses: actions/setup-python@v3 + with: + python-version: "3.10" + + - name: Pip setup + if: matrix.arch != 'amd64_arm64' + run: | + python -m pip install --upgrade pip + if (Test-Path requirements.txt) { + pip install -r requirements.txt + } + + - name: Build ${{ matrix.arch }} + id: build_msvc + run: | + .\build_msvc.bat + + #cross-comiled, cannot be run on host + - name: Test with pytest + if: matrix.arch != 'amd64_arm64' + run: pytest + + windows-latest-mingw: + runs-on: windows-latest + + defaults: + run: + shell: msys2 {0} + + strategy: + matrix: + include: + - { sys: mingw64, env: x86_64 } + + steps: + - name: Checkout + id: checkout + uses: actions/checkout@v3 + + - uses: msys2/setup-msys2@v2 + id: setup-msys2 + with: + msystem: ${{ matrix.sys }} + install: mingw-w64-${{matrix.env}}-gcc make + + - name: Build ${{ matrix.sys }} ${{ matrix.env }} + id: build_mingw + run: | + make win64 + + - name: Set up Python 3.10 + uses: actions/setup-python@v3 + with: + python-version: "3.10" + + - name: Pip setup + shell: powershell + run: | + python -m pip install --upgrade pip + if (Test-Path requirements.txt) { 
+ pip install -r requirements.txt + } + + - name: Test with pytest + shell: powershell + run: pytest diff --git a/configurator.py b/configurator.py new file mode 100644 index 0000000..a8bba95 --- /dev/null +++ b/configurator.py @@ -0,0 +1,47 @@ +""" +Poor Man's Configurator. Probably a terrible idea. Example usage: +$ python train.py config/override_file.py --batch_size=32 +this will first run config/override_file.py, then override batch_size to 32 + +The code in this file will be run as follows from e.g. train.py: +>>> exec(open('configurator.py').read()) + +So it's not a Python module, it's just shuttling this code away from train.py +The code in this script then overrides the globals() + +I know people are not going to love this, I just really dislike configuration +complexity and having to prepend config. to every single variable. If someone +comes up with a better simple Python solution I am all ears. +""" + +import sys +from ast import literal_eval + +for arg in sys.argv[1:]: + if '=' not in arg: + # assume it's the name of a config file + assert not arg.startswith('--') + config_file = arg + print(f"Overriding config with {config_file}:") + with open(config_file) as f: + print(f.read()) + exec(open(config_file).read()) + else: + # assume it's a --key=value argument + assert arg.startswith('--') + key, val = arg.split('=') + key = key[2:] + if key in globals(): + try: + # attempt to eval it it (e.g. if bool, number, or etc) + attempt = literal_eval(val) + except (SyntaxError, ValueError): + # if that goes wrong, just use the string + attempt = val + # ensure the types match ok + assert type(attempt) == type(globals()[key]) + # cross fingers + print(f"Overriding: {key} = {attempt}") + globals()[key] = attempt + else: + raise ValueError(f"Unknown config key: {key}") diff --git a/export_meta_llama_bin.py b/export_meta_llama_bin.py new file mode 100644 index 0000000..4e42197 --- /dev/null +++ b/export_meta_llama_bin.py @@ -0,0 +1,112 @@ +""" +This script exports the Llama 2 weights in llama2c.bin format. +""" +import os +import sys +import struct +from pathlib import Path +import json + +import torch + +from model import precompute_freqs_cis + + +def export(p, state_dict, filepath='model.bin'): + """export the model weights in fp32 into .bin file to be read from C""" + f = open(filepath, 'wb') + + def serialize(key): + print(f"writing {key}...") + t = state_dict[key].contiguous().view(-1).type(torch.float32).numpy() + f.write(memoryview(t)) + del state_dict[key] + + # first write out the header + hidden_dim = state_dict['layers.0.feed_forward.w1.weight'].shape[0] + p['vocab_size'] = 32000 + p['max_seq_len'] = 2048 + + n_kv_heads = p.get('n_kv_heads') or p['n_heads'] + header = struct.pack( + 'iiiiiii', + p['dim'], hidden_dim, p['n_layers'], p['n_heads'], + n_kv_heads, -p['vocab_size'], p['max_seq_len'] + ) + # NOTE ABOVE: -ve vocab_size is indicating that the classifier weights are present + # in the checkpoint and should be loaded. 
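+    # (Layout note for readers of the .bin format: the header packed above is seven
+    # native 32-bit ints, 28 bytes total, in the order dim, hidden_dim, n_layers, n_heads,
+    # n_kv_heads, vocab_size, max_seq_len. A Python reader could recover it with
+    # struct.unpack('iiiiiii', f.read(28)) and take abs() of vocab_size, since only the
+    # sign carries the shared-classifier flag.)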
+ f.write(header) + + # next write out the embedding weights + print("writing tok_embeddings...") + serialize('tok_embeddings.weight') + + # now all the layers + # attention weights + for i in range(p['n_layers']): serialize(f'layers.{i}.attention_norm.weight') + for i in range(p['n_layers']): serialize(f'layers.{i}.attention.wq.weight') + for i in range(p['n_layers']): serialize(f'layers.{i}.attention.wk.weight') + for i in range(p['n_layers']): serialize(f'layers.{i}.attention.wv.weight') + for i in range(p['n_layers']): serialize(f'layers.{i}.attention.wo.weight') + # ffn weights + for i in range(p['n_layers']): serialize(f'layers.{i}.ffn_norm.weight') + for i in range(p['n_layers']): serialize(f'layers.{i}.feed_forward.w1.weight') + for i in range(p['n_layers']): serialize(f'layers.{i}.feed_forward.w2.weight') + for i in range(p['n_layers']): serialize(f'layers.{i}.feed_forward.w3.weight') + + # final rmsnorm + serialize('norm.weight') + # freqs_cos, freqs_sin + freqs_cos, freqs_sin = precompute_freqs_cis(p['dim'] // p['n_heads'], p['max_seq_len'] * 2) + state_dict['freqs_cos'] = freqs_cos[:p['max_seq_len']] + state_dict['freqs_sin'] = freqs_sin[:p['max_seq_len']] + serialize('freqs_cos') + serialize('freqs_sin') + + # finally write the output weights + serialize('output.weight') + + f.close() + print(f"wrote {filepath}") + + +def concat_weights(models): + state_dict = {} + for name in list(models[0]): + tensors = [model[name] for model in models] + if len(tensors) == 1 or len(tensors[0].shape) == 1: + state_dict[name] = tensors[0] + continue + is_axis_1 = ( + name.startswith('tok_embeddings.') + or name.endswith('.attention.wo.weight') + or name.endswith('.feed_forward.w2.weight') + ) + axis = 1 if is_axis_1 else 0 + state_dict[name] = torch.cat(tensors, dim=axis) + for model in models: + del model[name] + return state_dict + + +def load_and_export(model_path, output_path): + params_path = os.path.join(model_path, 'params.json') + with open(params_path) as f: + params = json.load(f) + print(params) + + model_paths = sorted(list(Path(model_path).glob('consolidated.*.pth'))) + models = [torch.load(p, map_location='cpu') for p in model_paths] + state_dict = concat_weights(models) + del models + export(params, state_dict, output_path) + + +if __name__ == '__main__': + if len(sys.argv) == 1: + print('[Llama model folder path] [output path]') + exit() + + model_path = sys.argv[1] + output_path = sys.argv[2] + load_and_export(model_path, output_path) diff --git a/export_meta_llama_hf_bin.py b/export_meta_llama_hf_bin.py new file mode 100644 index 0000000..e3a8c73 --- /dev/null +++ b/export_meta_llama_hf_bin.py @@ -0,0 +1,113 @@ +""" +This script exports the Llama 2 weights in llama2c.bin format. 
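+
+Usage (paths below are examples only):
+    python export_meta_llama_hf_bin.py /path/to/llama-2-7b llama2_7b.bin
+
+The model folder is expected to contain params.json plus the consolidated.*.pth shard(s),
+with the weights keyed by the Hugging Face module names handled below.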
+""" +import os +import sys +import struct +from pathlib import Path +import json + +import torch + +from model import precompute_freqs_cis + + +def export(p, state_dict, filepath='model.bin'): + """export the model weights in fp32 into .bin file to be read from C""" + f = open(filepath, 'wb') + + def serialize(key): + print(f"writing {key}...") + t = state_dict[key].contiguous().view(-1).type(torch.float32).numpy() + f.write(memoryview(t)) + del state_dict[key] + + # first write out the header + hidden_dim = state_dict['model.layers.0.mlp.gate_proj.weight'].shape[0] + p['vocab_size'] = 32000 + p['max_seq_len'] = 2048 + + n_kv_heads = p.get('n_kv_heads') or p['n_heads'] + header = struct.pack( + 'iiiiiii', + p['dim'], hidden_dim, p['n_layers'], p['n_heads'], + n_kv_heads, -p['vocab_size'], p['max_seq_len'] + ) + # NOTE ABOVE: -ve vocab_size is indicating that the classifier weights are present + # in the checkpoint and should be loaded. + f.write(header) + + # next write out the embedding weights + print("writing tok_embeddings...") + serialize('model.embed_tokens.weight') + + # now all the layers + # attention weights + for i in range(p['n_layers']): serialize(f'model.layers.{i}.input_layernorm.weight') + for i in range(p['n_layers']): serialize(f'model.layers.{i}.self_attn.q_proj.weight') + for i in range(p['n_layers']): serialize(f'model.layers.{i}.self_attn.k_proj.weight') + for i in range(p['n_layers']): serialize(f'model.layers.{i}.self_attn.v_proj.weight') + for i in range(p['n_layers']): serialize(f'model.layers.{i}.self_attn.o_proj.weight') + # ffn weights + for i in range(p['n_layers']): serialize(f'model.layers.{i}.post_attention_layernorm.weight') + for i in range(p['n_layers']): serialize(f'model.layers.{i}.mlp.gate_proj.weight') + for i in range(p['n_layers']): serialize(f'model.layers.{i}.mlp.down_proj.weight') + for i in range(p['n_layers']): serialize(f'model.layers.{i}.mlp.up_proj.weight') + + # final rmsnorm + serialize('model.norm.weight') + # freqs_cos, freqs_sin + freqs_cos, freqs_sin = precompute_freqs_cis(p['dim'] // p['n_heads'], p['max_seq_len'] * 2) + state_dict['freqs_cos'] = freqs_cos[:p['max_seq_len']] + state_dict['freqs_sin'] = freqs_sin[:p['max_seq_len']] + # check if this requires addtional conversion + serialize('freqs_cos') + serialize('freqs_sin') + + # finally write the output weights + serialize('lm_head.weight') + + f.close() + print(f"wrote {filepath}") + + +def concat_weights(models): + state_dict = {} + for name in list(models[0]): + tensors = [model[name] for model in models] + if len(tensors) == 1 or len(tensors[0].shape) == 1: + state_dict[name] = tensors[0] + continue + is_axis_1 = ( + name.startswith('model.embed_tokens.weight') + or name.endswith('.self_attn.o_proj.weight') + or name.endswith('.mlp.down_proj.weight') + ) + axis = 1 if is_axis_1 else 0 + state_dict[name] = torch.cat(tensors, dim=axis) + for model in models: + del model[name] + return state_dict + + +def load_and_export(model_path, output_path): + params_path = os.path.join(model_path, 'params.json') + with open(params_path) as f: + params = json.load(f) + print(params) + + model_paths = sorted(list(Path(model_path).glob('consolidated.*.pth'))) + models = [torch.load(p, map_location='cpu') for p in model_paths] + state_dict = concat_weights(models) + del models + export(params, state_dict, output_path) + + +if __name__ == '__main__': + if len(sys.argv) == 1: + print('[Llama model folder path] [output path]') + exit() + + model_path = sys.argv[1] + output_path = sys.argv[2] + 
load_and_export(model_path, output_path) diff --git a/model.py b/model.py new file mode 100644 index 0000000..c8c82a9 --- /dev/null +++ b/model.py @@ -0,0 +1,392 @@ +import math +import struct +import inspect +from dataclasses import dataclass +from typing import Any, Optional, Tuple + +import numpy as np +import torch +import torch.nn.functional as F +from torch import nn + +@dataclass +class ModelArgs: + # default hyperparameters for the Llama 7B model + dim: int = 4096 + n_layers: int = 32 + n_heads: int = 32 + n_kv_heads: Optional[int] = None + vocab_size: int = 32000 + multiple_of: int = 256 # MLP hidden layer size will be multiple of + norm_eps: float = 1e-5 + max_seq_len: int = 2048 + dropout: float = 0.0 + + +class RMSNorm(torch.nn.Module): + def __init__(self, dim: int, eps: float): + super().__init__() + self.eps = eps + self.weight = nn.Parameter(torch.ones(dim)) + + def _norm(self, x): + return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps) + + def forward(self, x): + output = self._norm(x.float()).type_as(x) + return output * self.weight + + +def precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0): + freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim)) + t = torch.arange(end, device=freqs.device) # type: ignore + freqs = torch.outer(t, freqs).float() # type: ignore + freqs_cos = torch.cos(freqs) # real part + freqs_sin = torch.sin(freqs) # imaginary part + return freqs_cos, freqs_sin + +def reshape_for_broadcast(freqs_cis: torch.Tensor, x: torch.Tensor): + ndim = x.ndim + assert 0 <= 1 < ndim + assert freqs_cis.shape == (x.shape[1], x.shape[-1]) + shape = [d if i == 1 or i == ndim - 1 else 1 for i, d in enumerate(x.shape)] + return freqs_cis.view(shape) + +def apply_rotary_emb( + xq: torch.Tensor, + xk: torch.Tensor, + freqs_cos: torch.Tensor, + freqs_sin: torch.Tensor +) -> Tuple[torch.Tensor, torch.Tensor]: + + # reshape xq and xk to match the complex representation + xq_r, xq_i = xq.float().reshape(xq.shape[:-1] + (-1, 2)).unbind(-1) + xk_r, xk_i = xk.float().reshape(xk.shape[:-1] + (-1, 2)).unbind(-1) + + # reshape freqs_cos and freqs_sin for broadcasting + freqs_cos = reshape_for_broadcast(freqs_cos, xq_r) + freqs_sin = reshape_for_broadcast(freqs_sin, xq_r) + + # apply rotation using real numbers + xq_out_r = xq_r * freqs_cos - xq_i * freqs_sin + xq_out_i = xq_r * freqs_sin + xq_i * freqs_cos + xk_out_r = xk_r * freqs_cos - xk_i * freqs_sin + xk_out_i = xk_r * freqs_sin + xk_i * freqs_cos + + # flatten last two dimensions + xq_out = torch.stack([xq_out_r, xq_out_i], dim=-1).flatten(3) + xk_out = torch.stack([xk_out_r, xk_out_i], dim=-1).flatten(3) + + return xq_out.type_as(xq), xk_out.type_as(xk) + +def repeat_kv(x: torch.Tensor, n_rep: int) -> torch.Tensor: + """torch.repeat_interleave(x, dim=2, repeats=n_rep)""" + bs, slen, n_kv_heads, head_dim = x.shape + if n_rep == 1: + return x + return ( + x[:, :, :, None, :] + .expand(bs, slen, n_kv_heads, n_rep, head_dim) + .reshape(bs, slen, n_kv_heads * n_rep, head_dim) + ) + +class Attention(nn.Module): + def __init__(self, args: ModelArgs): + super().__init__() + self.n_kv_heads = args.n_heads if args.n_kv_heads is None else args.n_kv_heads + assert args.n_heads % self.n_kv_heads == 0 + model_parallel_size = 1 + self.n_local_heads = args.n_heads // model_parallel_size + self.n_local_kv_heads = self.n_kv_heads // model_parallel_size + self.n_rep = self.n_local_heads // self.n_local_kv_heads + self.head_dim = args.dim // args.n_heads + self.wq = nn.Linear(args.dim, 
args.n_heads * self.head_dim, bias=False) + self.wk = nn.Linear(args.dim, self.n_kv_heads * self.head_dim, bias=False) + self.wv = nn.Linear(args.dim, self.n_kv_heads * self.head_dim, bias=False) + self.wo = nn.Linear(args.n_heads * self.head_dim, args.dim, bias=False) + self.attn_dropout = nn.Dropout(args.dropout) + self.resid_dropout = nn.Dropout(args.dropout) + self.dropout = args.dropout + + # use flash attention or a manual implementation? + self.flash = hasattr(torch.nn.functional, 'scaled_dot_product_attention') + if not self.flash: + print("WARNING: using slow attention. Flash Attention requires PyTorch >= 2.0") + mask = torch.full((1, 1, args.max_seq_len, args.max_seq_len), float("-inf")) + mask = torch.triu(mask, diagonal=1) + self.register_buffer("mask", mask) + + def forward( + self, + x: torch.Tensor, + freqs_cos: torch.Tensor, + freqs_sin: torch.Tensor, + ): + bsz, seqlen, _ = x.shape + + # QKV + xq, xk, xv = self.wq(x), self.wk(x), self.wv(x) + xq = xq.view(bsz, seqlen, self.n_local_heads, self.head_dim) + xk = xk.view(bsz, seqlen, self.n_local_kv_heads, self.head_dim) + xv = xv.view(bsz, seqlen, self.n_local_kv_heads, self.head_dim) + + # RoPE relative positional embeddings + xq, xk = apply_rotary_emb(xq, xk, freqs_cos, freqs_sin) + + # grouped multiquery attention: expand out keys and values + xk = repeat_kv(xk, self.n_rep) # (bs, seqlen, n_local_heads, head_dim) + xv = repeat_kv(xv, self.n_rep) # (bs, seqlen, n_local_heads, head_dim) + + # make heads into a batch dimension + xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim) + xk = xk.transpose(1, 2) + xv = xv.transpose(1, 2) + + # flash implementation + if self.flash: + output = torch.nn.functional.scaled_dot_product_attention(xq, xk, xv, attn_mask=None, dropout_p=self.dropout if self.training else 0.0, is_causal=True) + else: + # manual implementation + scores = torch.matmul(xq, xk.transpose(2, 3)) / math.sqrt(self.head_dim) + assert hasattr(self, 'mask') + scores = scores + self.mask[:, :, :seqlen, :seqlen] # (bs, n_local_heads, seqlen, cache_len + seqlen) + scores = F.softmax(scores.float(), dim=-1).type_as(xq) + scores = self.attn_dropout(scores) + output = torch.matmul(scores, xv) # (bs, n_local_heads, seqlen, head_dim) + + # restore time as batch dimension and concat heads + output = output.transpose(1, 2).contiguous().view(bsz, seqlen, -1) + + # final projection into the residual stream + output = self.wo(output) + output = self.resid_dropout(output) + return output + + +class FeedForward(nn.Module): + def __init__(self, dim: int, hidden_dim: int, multiple_of: int, dropout: float): + super().__init__() + hidden_dim = int(2 * hidden_dim / 3) + hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of) + self.w1 = nn.Linear(dim, hidden_dim, bias=False) + self.w2 = nn.Linear(hidden_dim, dim, bias=False) + self.w3 = nn.Linear(dim, hidden_dim, bias=False) + self.dropout = nn.Dropout(dropout) + + def forward(self, x): + return self.dropout(self.w2(F.silu(self.w1(x)) * self.w3(x))) + + +class TransformerBlock(nn.Module): + def __init__(self, layer_id: int, args: ModelArgs): + super().__init__() + self.n_heads = args.n_heads + self.dim = args.dim + self.head_dim = args.dim // args.n_heads + self.attention = Attention(args) + self.feed_forward = FeedForward( + dim=args.dim, + hidden_dim=4 * args.dim, + multiple_of=args.multiple_of, + dropout=args.dropout, + ) + self.layer_id = layer_id + self.attention_norm = RMSNorm(args.dim, eps=args.norm_eps) + self.ffn_norm = RMSNorm(args.dim, 
eps=args.norm_eps) + + def forward(self, x, freqs_cos, freqs_sin): + h = x + self.attention.forward(self.attention_norm(x), freqs_cos, freqs_sin) + out = h + self.feed_forward.forward(self.ffn_norm(h)) + return out + + +class Transformer(nn.Module): + last_loss: Optional[torch.Tensor] + + def __init__(self, params: ModelArgs): + super().__init__() + self.params = params + self.vocab_size = params.vocab_size + self.n_layers = params.n_layers + + self.tok_embeddings = nn.Embedding(params.vocab_size, params.dim) + self.dropout = nn.Dropout(params.dropout) + self.layers = torch.nn.ModuleList() + for layer_id in range(params.n_layers): + self.layers.append(TransformerBlock(layer_id, params)) + self.norm = RMSNorm(params.dim, eps=params.norm_eps) + self.output = nn.Linear(params.dim, params.vocab_size, bias=False) + + # share the unembedding parameters with the embedding parameters + self.tok_embeddings.weight = self.output.weight # https://paperswithcode.com/method/weight-tying + + # some useful precompute for the RoPE relative positional embeddings + freqs_cos, freqs_sin = precompute_freqs_cis(self.params.dim // self.params.n_heads, self.params.max_seq_len) + self.register_buffer("freqs_cos", freqs_cos, persistent=False) + self.register_buffer("freqs_sin", freqs_sin, persistent=False) + + # init all weights + self.apply(self._init_weights) + # apply special scaled init to the residual projections, per GPT-2 paper + for pn, p in self.named_parameters(): + if pn.endswith('w3.weight') or pn.endswith('wo.weight'): + torch.nn.init.normal_(p, mean=0.0, std=0.02/math.sqrt(2 * params.n_layers)) + + # Initialize attribute for the loss of the last forward call. This will be set if the forward is called with a targets tensor. + self.last_loss = None + + def _init_weights(self, module): + if isinstance(module, nn.Linear): + torch.nn.init.normal_(module.weight, mean=0.0, std=0.02) + if module.bias is not None: + torch.nn.init.zeros_(module.bias) + elif isinstance(module, nn.Embedding): + torch.nn.init.normal_(module.weight, mean=0.0, std=0.02) + + def forward(self, tokens: torch.Tensor, targets: Optional[torch.Tensor] = None) -> torch.Tensor: + _bsz, seqlen = tokens.shape + h = self.tok_embeddings(tokens) + h = self.dropout(h) + freqs_cos = self.freqs_cos[:seqlen] + freqs_sin = self.freqs_sin[:seqlen] + + for layer in self.layers: + h = layer(h, freqs_cos, freqs_sin) + h = self.norm(h) + + if targets is not None: + # if we are given some desired targets also calculate the loss + logits = self.output(h) + self.last_loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1) + else: + # inference-time mini-optimization: only forward the output on the very last position + logits = self.output(h[:, [-1], :]) # note: using list [-1] to preserve the time dim + self.last_loss = None + + return logits + + def configure_optimizers(self, weight_decay, learning_rate, betas, device_type): + # start with all of the candidate parameters + param_dict = {pn: p for pn, p in self.named_parameters()} + # filter out those that do not require grad + param_dict = {pn: p for pn, p in param_dict.items() if p.requires_grad} + # create optim groups. Any parameters that is 2D will be weight decayed, otherwise no. + # i.e. all weight tensors in matmuls + embeddings decay, all biases and layernorms don't. 
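+        # (p.dim() >= 2 selects the matmul weight matrices and the embedding table; the 1-D
+        # RMSNorm gains, and any biases if they existed, land in the no-decay group.)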
+ decay_params = [p for n, p in param_dict.items() if p.dim() >= 2] + nodecay_params = [p for n, p in param_dict.items() if p.dim() < 2] + optim_groups = [ + {'params': decay_params, 'weight_decay': weight_decay}, + {'params': nodecay_params, 'weight_decay': 0.0} + ] + num_decay_params = sum(p.numel() for p in decay_params) + num_nodecay_params = sum(p.numel() for p in nodecay_params) + print(f"num decayed parameter tensors: {len(decay_params)}, with {num_decay_params:,} parameters") + print(f"num non-decayed parameter tensors: {len(nodecay_params)}, with {num_nodecay_params:,} parameters") + # Create AdamW optimizer and use the fused version if it is available + fused_available = 'fused' in inspect.signature(torch.optim.AdamW).parameters + use_fused = fused_available and device_type == 'cuda' + extra_args = dict(fused=True) if use_fused else dict() + optimizer = torch.optim.AdamW(optim_groups, lr=learning_rate, betas=betas, **extra_args) + print(f"using fused AdamW: {use_fused}") + + return optimizer + + def estimate_mfu(self, fwdbwd_per_iter, dt): + """ estimate model flops utilization (MFU) in units of A100 bfloat16 peak FLOPS """ + # first estimate the number of flops we do per iteration. + # see PaLM paper Appendix B as ref: https://arxiv.org/abs/2204.02311 + N = sum(p.numel() for p in self.parameters()) + cfg = self.params + L, H, Q, T = cfg.n_layers, cfg.n_heads, cfg.dim//cfg.n_heads, cfg.max_seq_len + flops_per_token = 6*N + 12*L*H*Q*T + flops_per_fwdbwd = flops_per_token * T + flops_per_iter = flops_per_fwdbwd * fwdbwd_per_iter + # express our flops throughput as ratio of A100 bfloat16 peak flops + flops_achieved = flops_per_iter * (1.0/dt) # per second + flops_promised = 312e12 # A100 GPU bfloat16 peak flops is 312 TFLOPS + mfu = flops_achieved / flops_promised + return mfu + + @torch.inference_mode() + def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None): + """ + Take a conditioning sequence of indices idx (LongTensor of shape (b,t)) and complete + the sequence max_new_tokens times, feeding the predictions back into the model each time. + Most likely you'll want to make sure to be in model.eval() mode of operation for this. + Also note this is a super inefficient version of sampling with no key/value cache. 
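+
+        Minimal usage sketch (token id 1 is the SentencePiece BOS id used elsewhere in this repo):
+            idx = torch.tensor([[1]], dtype=torch.long)
+            out = model.generate(idx, max_new_tokens=64, temperature=0.8, top_k=50)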
+ """ + for _ in range(max_new_tokens): + # if the sequence context is growing too long we must crop it at block_size + idx_cond = idx if idx.size(1) <= self.params.max_seq_len else idx[:, -self.params.max_seq_len:] + # forward the model to get the logits for the index in the sequence + logits = self(idx_cond) + logits = logits[:, -1, :] # crop to just the final time step + if temperature == 0.0: + # "sample" the single most likely index + _, idx_next = torch.topk(logits, k=1, dim=-1) + else: + # pluck the logits at the final step and scale by desired temperature + logits = logits / temperature + # optionally crop the logits to only the top k options + if top_k is not None: + v, _ = torch.topk(logits, min(top_k, logits.size(-1))) + logits[logits < v[:, [-1]]] = -float('Inf') + # apply softmax to convert logits to (normalized) probabilities + probs = F.softmax(logits, dim=-1) + idx_next = torch.multinomial(probs, num_samples=1) + # append sampled index to the running sequence and continue + idx = torch.cat((idx, idx_next), dim=1) + + return idx + + def export(self, filepath='model.bin'): + """export the model weights in fp32 into .bin file to be read from C""" + f = open(filepath, 'wb') + + def serialize(t): + d = t.detach().cpu().view(-1).numpy().astype(np.float32) + b = struct.pack(f'{len(d)}f', *d) + f.write(b) + + # first write out the header + hidden_dim = self.layers[0].feed_forward.w1.weight.shape[0] + p = self.params + n_kv_heads = p.n_heads if p.n_kv_heads is None else p.n_kv_heads + header = struct.pack('iiiiiii', p.dim, hidden_dim, p.n_layers, p.n_heads, + n_kv_heads, p.vocab_size, p.max_seq_len) + f.write(header) + + # next write out the embedding weights + serialize(self.tok_embeddings.weight) + + # now all the layers + # attention weights + for layer in self.layers: + serialize(layer.attention_norm.weight) + for layer in self.layers: + serialize(layer.attention.wq.weight) + for layer in self.layers: + serialize(layer.attention.wk.weight) + for layer in self.layers: + serialize(layer.attention.wv.weight) + for layer in self.layers: + serialize(layer.attention.wo.weight) + # ffn weights + for layer in self.layers: + serialize(layer.ffn_norm.weight) + for layer in self.layers: + serialize(layer.feed_forward.w1.weight) + for layer in self.layers: + serialize(layer.feed_forward.w2.weight) + for layer in self.layers: + serialize(layer.feed_forward.w3.weight) + # final rmsnorm + serialize(self.norm.weight) + # note: no need to write final classifier weights due to weight sharing + # freqs_cis + serialize(self.freqs_cos[:p.max_seq_len]) + serialize(self.freqs_sin[:p.max_seq_len]) + + # write to binary file + f.close() + print(f"wrote {filepath}") diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..7187a73 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,7 @@ +numpy==1.23.5 +pytest==7.4.0 +Requests==2.31.0 +sentencepiece==0.1.99 +torch==2.0.1 +tqdm==4.64.1 +wandb==0.15.5 diff --git a/run.ipynb b/run.ipynb new file mode 100644 index 0000000..ac57593 --- /dev/null +++ b/run.ipynb @@ -0,0 +1,130 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "HLdoj4cz-xal" + }, + "source": [ + "# Run.c\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/karpathy/llama2.c/blob/master/run.ipynb)\n", + "\n", + "More details can be found in the [README.md](README.md) ." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Une3Ozlnu1B7" + }, + "outputs": [], + "source": [ + "#@title Clone Project\n", + "\n", + "!git clone https://github.com/karpathy/llama2.c.git\n", + "%cd llama2.c" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#@title Build\n", + "\n", + "!make runfast" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "thm0ZBrtSgoC" + }, + "outputs": [], + "source": [ + "#@title Pick Your Model\n", + "\n", + "#@markdown Choose model\n", + "model = \"stories15M\" #@param [\"stories15M\", \"stories42M\", \"stories110M\"]\n", + "\n", + "download_url = \"\"\n", + "\n", + "if(model == \"stories15M\"):\n", + " download_url = \"https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.bin\"\n", + "if(model == \"stories42M\"):\n", + " download_url = \"https://huggingface.co/karpathy/tinyllamas/resolve/main/stories42M.bin\"\n", + "if(model == \"stories110M\"):\n", + " download_url = \"https://huggingface.co/karpathy/tinyllamas/resolve/main/stories110M.bin\"\n", + "\n", + "print(f\"download_url: {download_url}\")\n", + "\n", + "!wget $download_url\n", + "\n", + "model_file = model + \".bin\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "OgAc3KjuT-NM" + }, + "outputs": [], + "source": [ + "#@title Generate Stories\n", + "\n", + "# Generate args\n", + "max_token = 256 #@param {type:\"slider\", min:32, max:1024, step:32}\n", + "temperature = 0.8 #@param {type:\"slider\", min:0.0, max:1, step:0.05}\n", + "top_p = 0.9 #@param {type:\"slider\", min:0.0, max:1.0, step:0.05}\n", + "prompt = \"One day, Lily met a Shoggoth\" #@param {type:\"string\"}\n", + "\n", + "print(f\"model: {model_file}, max_token: {max_token}, temperature: {temperature}, top_p: {top_p}, prompt: {prompt}\")\n", + "print(f\"----------------------------\\n\")\n", + "\n", + "cmd = f'./run {model_file} -t {temperature} -p {top_p} -n {max_token} -i \"{prompt}\"'\n", + "!{cmd}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#@title Run Meta's Llama 2 models\n", + "\n", + "#@markdown input your huggingface [access token](https://huggingface.co/settings/tokens) to download Meta's Llama 2 models.\n", + "\n", + "from huggingface_hub import snapshot_download\n", + "\n", + "token = \"replace your huggingface access token\" #@param {type:\"string\"}\n", + "path = snapshot_download(repo_id=\"meta-llama/Llama-2-7b\",cache_dir=\"Llama-2-7b\", use_auth_token=token)\n", + "\n", + "!python export_meta_llama_bin.py $path llama2_7b.bin\n", + "\n", + "print(\"./run llama2_7b.bin\\n\")\n", + "!./run llama2_7b.bin" + ] + } + ], + "metadata": { + "colab": { + "private_outputs": true, + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/sample.py b/sample.py new file mode 100644 index 0000000..d2f56ea --- /dev/null +++ b/sample.py @@ -0,0 +1,79 @@ +""" +Sample from the trained model with PyTorch +""" +import os +import pickle +from contextlib import nullcontext +import torch +from model import ModelArgs, Transformer +from tokenizer import Tokenizer + +from tinystories import get_tokenizer_model_path + +# ----------------------------------------------------------------------------- +checkpoint = 'out/ckpt.pt' +start = "" # or 
"<|endoftext|>" or etc. Can also specify a file, use as: "FILE:prompt.txt" +num_samples = 1 # number of samples to draw +max_new_tokens = 100 # number of tokens generated in each sample +temperature = 1.0 # 1.0 = no change, < 1.0 = less random, > 1.0 = more random, in predictions +top_k = 300 # retain only the top_k most likely tokens, clamp others to have 0 probability +tokenizer = "" # override the tokenizer model path +seed = 1337 +device = 'cuda' if torch.cuda.is_available() else 'cpu' # examples: 'cpu', 'cuda', 'cuda:0', 'cuda:1', etc. +#dtype = 'bfloat16' if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else 'float16' # 'float32' or 'bfloat16' or 'float16' +dtype = "float32" +compile = False # use PyTorch 2.0 to compile the model to be faster +exec(open('configurator.py').read()) # overrides from command line or config file +# ----------------------------------------------------------------------------- + +torch.manual_seed(seed) +torch.cuda.manual_seed(seed) +torch.backends.cuda.matmul.allow_tf32 = True # allow tf32 on matmul +torch.backends.cudnn.allow_tf32 = True # allow tf32 on cudnn +device_type = 'cuda' if 'cuda' in device else 'cpu' # for later use in torch.autocast +ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype] +ctx = nullcontext() if device_type == 'cpu' else torch.amp.autocast(device_type=device_type, dtype=ptdtype) + +# init from a model saved in a specific directory +checkpoint_dict = torch.load(checkpoint, map_location=device) +gptconf = ModelArgs(**checkpoint_dict['model_args']) +model = Transformer(gptconf) +state_dict = checkpoint_dict['model'] +unwanted_prefix = '_orig_mod.' +for k,v in list(state_dict.items()): + if k.startswith(unwanted_prefix): + state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k) +model.load_state_dict(state_dict, strict=False) + +model.eval() +model.to(device) +if compile: + print("Compiling the model...") + model = torch.compile(model) # requires PyTorch 2.0 (optional) + +# load the tokenizer +vocab_source = checkpoint_dict.get("vocab_source", "llama2") +vocab_size = gptconf.vocab_size +if tokenizer: + # a specific tokenizer is provided, use it + tokenizer_model = tokenizer +else: + # let's try to find the tokenizer model automatically. bit gross here... + query_vocab_size = 0 if vocab_source == "llama2" else vocab_size + tokenizer_model = get_tokenizer_model_path(vocab_size=query_vocab_size) +enc = Tokenizer(tokenizer_model=tokenizer_model) + +# encode the beginning of the prompt +if start.startswith('FILE:'): + with open(start[5:], 'r', encoding='utf-8') as f: + start = f.read() +start_ids = enc.encode(start, bos=True, eos=False) +x = (torch.tensor(start_ids, dtype=torch.long, device=device)[None, ...]) + +# run generation +with torch.no_grad(): + with ctx: + for k in range(num_samples): + y = model.generate(x, max_new_tokens, temperature=temperature, top_k=top_k) + print(enc.decode(y[0].tolist())) + print('---------------') diff --git a/save_torchscript.py b/save_torchscript.py new file mode 100755 index 0000000..af3a299 --- /dev/null +++ b/save_torchscript.py @@ -0,0 +1,66 @@ +#!/usr/bin/env python +"""Saves the model as a TorchScript. 
+ +Usage examples: + ./save_torchscript.py + ./save_torchscript.py --dim=300 + ./save_torchscript.py --gzip_output=True --zero_params=True + +The resulting file can be loaded in C++ code and then used for training or +inference with: + #include + torch::jit::Module module = torch::jit::load("model.pt") + +Note that the serialized model includes the initial parameters and with the default +ModelArgs the file is 59M and gzips down to 55M. If you want to serialize/distribute +the model parameters separately you can zero out the parameters before saving it and +it will gzip down to 780K. +""" +import gzip +import os +import shutil +from inspect import signature + +import torch + +from model import ModelArgs, Transformer + +# Model args config +dim = 288 +n_layers = 6 +n_heads = 6 +n_kv_heads = n_heads +multiple_of = 32 +max_seq_len = 256 +dropout = 0.0 +vocab_size = 32000 +norm_eps = 1e-5 +# Save config +model_path = "model.pt" +zero_params = False +gzip_output = False +# Allow config overrides +exec(open("configurator.py").read()) + + +def main() -> None: + model_args = {k: globals()[k] for k in signature(ModelArgs).parameters} + model = Transformer(ModelArgs(**model_args)) + + # If requested zero params before saving the model. This is useful in + # conjunction with gzip_output. + if zero_params: + for p in model.parameters(): + p.detach().zero_() + + torch.jit.save(torch.jit.script(model), model_path) + + if gzip_output: + with open(model_path, "rb") as f_in: + with gzip.open(f"{model_path}.gz", "wb") as f_out: + shutil.copyfileobj(f_in, f_out) + os.unlink(model_path) + + +if __name__ == "__main__": + main() diff --git a/tinystories.py b/tinystories.py new file mode 100644 index 0000000..690cb02 --- /dev/null +++ b/tinystories.py @@ -0,0 +1,274 @@ +""" +Download, preprocess and serve the TinyStories dataset as a DataLoader. 
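+
+Typical flow, mirroring the CLI documented at the bottom of this file:
+    python tinystories.py download
+    python tinystories.py train_vocab --vocab_size=2048   # only when training a custom tokenizer
+    python tinystories.py pretokenize --vocab_size=2048   # omit --vocab_size to use the Llama 2 tokenizer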
+""" + +import argparse +import glob +import json +import os +import random +from typing import List +from concurrent.futures import ProcessPoolExecutor +from functools import partial + +import numpy as np +import requests +import torch +import torch.distributed as dist +from tqdm import tqdm + +from tokenizer import Tokenizer + +DATA_CACHE_DIR = "data" + +def download_file(url: str, fname: str, chunk_size=1024): + """Helper function to download a file from a given url""" + resp = requests.get(url, stream=True) + total = int(resp.headers.get("content-length", 0)) + with open(fname, "wb") as file, tqdm( + desc=fname, + total=total, + unit="iB", + unit_scale=True, + unit_divisor=1024, + ) as bar: + for data in resp.iter_content(chunk_size=chunk_size): + size = file.write(data) + bar.update(size) + + +def download(): + """Downloads the TinyStories dataset to DATA_CACHE_DIR""" + os.makedirs(DATA_CACHE_DIR, exist_ok=True) + + # download the TinyStories dataset, unless it's already downloaded + data_url = "https://huggingface.co/datasets/roneneldan/TinyStories/resolve/main/TinyStories_all_data.tar.gz" + data_filename = os.path.join(DATA_CACHE_DIR, "TinyStories_all_data.tar.gz") + if not os.path.exists(data_filename): + print(f"Downloading {data_url} to {data_filename}...") + download_file(data_url, data_filename) + else: + print(f"{data_filename} already exists, skipping download...") + + # unpack the tar.gz file into all the data shards (json files) + data_dir = os.path.join(DATA_CACHE_DIR, "TinyStories_all_data") + if not os.path.exists(data_dir): + os.makedirs(data_dir, exist_ok=True) + print(f"Unpacking {data_filename}...") + os.system(f"tar -xzf {data_filename} -C {data_dir}") + else: + print(f"{data_dir} already exists, skipping unpacking...") + + # print a single example just for debugging and such + shard_filenames = sorted(glob.glob(os.path.join(data_dir, "*.json"))) + with open(shard_filenames[0], "r") as f: + data = json.load(f) + print("Download done.") + print(f"Number of shards: {len(shard_filenames)}") + print(f"Example story:\n{data[0]}") + +def train_vocab(vocab_size): + """ + Trains a custom sentencepiece tokenizer on the TinyStories dataset. + The custom tokenizer files will be saved in DATA_CACHE_DIR/tok{N} directories, + where N is the vocab size. This is also where the pretok .bin files will go. + """ + assert vocab_size > 0, "Vocab size must be positive" + + # output file prefix path for sentencepiece + prefix = os.path.join(DATA_CACHE_DIR, f"tok{vocab_size}") + + # how many shards we'll use for vocab training, kept low for efficiency + num_shards = 10 + + # 1) export a large chunk of text as a single text file tiny.txt + tiny_file = os.path.join(DATA_CACHE_DIR, "tiny.txt") + data_dir = os.path.join(DATA_CACHE_DIR, "TinyStories_all_data") + shard_filenames = sorted(glob.glob(os.path.join(data_dir, "*.json"))) + + print(f"Writing temporary file {tiny_file} with {num_shards} shards...") + with open(tiny_file, "w") as of: + for shard in tqdm(shard_filenames[:num_shards]): + with open(shard, "r") as f: + data = json.load(f) + for example in data: + text = example["story"] + text = text.strip() + of.write(text + "\n") + print(f"Size is: {os.path.getsize(tiny_file) / 1024 / 1024:.2f} MB") + + # 2) run the train_vocab.sh script that trains the sentencepiece model + print("Will now train the vocab with:") + cmd = f"bash train_vocab.sh {tiny_file} {prefix} {vocab_size}" + print(cmd) + print("OK? 
[y/N] ") + dec = input() + if dec.lower() != "y": + print("Exiting...") + return + os.system(cmd) + + # 3) optional cleanup, ask the user if they'd like to delete tiny.txt + dec = input(f"Delete the temporary file {tiny_file}? [y/N] ") + if dec.lower() == "y": + os.remove(tiny_file) + print(f"Deleted {tiny_file}") + + print(f"Trained tokenizer is in {prefix}.model") + print("Done.") + + +def process_shard(args, vocab_size): + shard_id, shard = args + tokenizer_model = get_tokenizer_model_path(vocab_size) + enc = Tokenizer(tokenizer_model) + with open(shard, "r") as f: + data = json.load(f) + all_tokens = [] + for example in tqdm(data, position=shard_id): + text = example["story"] + text = text.strip() # get rid of leading/trailing whitespace + tokens = enc.encode(text, bos=True, eos=False) # encode the text, use BOS + all_tokens.extend(tokens) + # convert to uint16 nparray + all_tokens = np.array(all_tokens, dtype=np.uint16) + # calculate the output filename + if vocab_size == 0: + # if we're using Llama 2, just save the tokenized file in the same dir + tokenized_filename = shard.replace(".json", ".bin") + else: + # save .bin files into a new tok{N} directory + bin_dir = os.path.join(DATA_CACHE_DIR, f"tok{vocab_size}") + shard_basename = os.path.basename(shard) + bin_basename = shard_basename.replace(".json", ".bin") + tokenized_filename = os.path.join(bin_dir, bin_basename) + # write the bytes + with open(tokenized_filename, "wb") as f: + f.write(all_tokens.tobytes()) + # calculate the average sequence length (they are separated by BOS=1) + avg_seq_len = all_tokens.size / ((all_tokens == 1).sum()) + print(f"Saved {tokenized_filename}, average seqlen: {avg_seq_len:.2f}") + + +def pretokenize(vocab_size): + # iterate the shards and tokenize all of them one by one + data_dir = os.path.join(DATA_CACHE_DIR, "TinyStories_all_data") + shard_filenames = sorted(glob.glob(os.path.join(data_dir, "*.json"))) + if vocab_size > 0: + # .bin files will be saved into tok{N} directory, create it once here + bin_dir = os.path.join(DATA_CACHE_DIR, f"tok{vocab_size}") + os.makedirs(bin_dir, exist_ok=True) + + # process all the shards in a process pool + fun = partial(process_shard, vocab_size=vocab_size) + with ProcessPoolExecutor() as executor: + executor.map(fun, enumerate(shard_filenames)) + print("Done.") + + +class PretokDataset(torch.utils.data.IterableDataset): + """Loads pretokenized examples from disk and yields them as PyTorch tensors.""" + + def __init__(self, split, max_seq_len, vocab_size, vocab_source): + super().__init__() + self.split = split + self.max_seq_len = max_seq_len + self.vocab_size = vocab_size + self.vocab_source = vocab_source + + def __iter__(self): + # get worker info within a DataLoader + worker_info = torch.utils.data.get_worker_info() + worker_id = worker_info.id if worker_info else 0 + # get DDP rank info + rank = dist.get_rank() if dist.is_initialized() else 0 + # combine the worker_id and worker_rank to create a unique seed for rng + seed = 42 + worker_id + 1337 * rank + rng = random.Random(seed) + print(f"Created a PretokDataset with rng seed {seed}") + if self.vocab_source == "llama2": + # the .bin files are right along the .json files + bin_dir = os.path.join(DATA_CACHE_DIR, "TinyStories_all_data") + shard_filenames = sorted(glob.glob(os.path.join(bin_dir, "*.bin"))) + elif self.vocab_source == "custom": + # the .bin files are in tok{N} directory + bin_dir = os.path.join(DATA_CACHE_DIR, f"tok{self.vocab_size}") + shard_filenames = sorted(glob.glob(os.path.join(bin_dir, 
"*.bin"))) + # train/test split. let's use only shard 0 for test split, rest train + shard_filenames = shard_filenames[1:] if self.split == "train" else shard_filenames[:1] + while True: + rng.shuffle(shard_filenames) + for shard in shard_filenames: + # open the dataset for reading but keep it on disk with memmap + m = np.memmap(shard, dtype=np.uint16, mode="r") + num_batches = len(m) // self.max_seq_len + num_batches -= 1 # drop the last partial batch + assert num_batches > 0, "this shard is way too small? investigate." + ixs = list(range(num_batches)) + rng.shuffle(ixs) + for ix in ixs: + start = ix * self.max_seq_len + end = start + self.max_seq_len + 1 + # calling .astype will copy the data into a new numpy array, now in RAM + chunk = torch.from_numpy((m[start:end]).astype(np.int64)) + x = chunk[:-1] + y = chunk[1:] + yield x, y + +# ----------------------------------------------------------------------------- +# public interface functions + +def get_tokenizer_model_path(vocab_size): + """ + Returns path to the sentencepiece tokenizer model for a given vocab size + vocab_size = 0 designates the default Llama 2 tokenizer, in that case + None is returned. + """ + if vocab_size == 0: + return None + else: + return os.path.join(DATA_CACHE_DIR, f"tok{vocab_size}.model") + +class Task: + + @staticmethod + def iter_batches(batch_size, device, num_workers=0, **dataset_kwargs): + ds = PretokDataset(**dataset_kwargs) + dl = torch.utils.data.DataLoader( + ds, batch_size=batch_size, pin_memory=True, num_workers=num_workers + ) + for x, y in dl: + x = x.to(device, non_blocking=True) + y = y.to(device, non_blocking=True) + yield x, y + +# ----------------------------------------------------------------------------- +# CLI for constructing the dataset + +if __name__ == "__main__": + """ + These stages are designed to be run in order. + + To tokenize data with the Llama 2 tokenizer: + python tinystories.py download + python tinystories.py pretokenize + + To tokenize data with a custom tokenizer we train ourselves with sentencepiece, e.g.: + python tinystories.py download + python tinystories.py train_vocab --vocab_size=2048 + python tinystories.py pretokenize --vocab_size=2048 + """ + parser = argparse.ArgumentParser() + parser.add_argument("stage", type=str, choices=["download", "pretokenize", "train_vocab"]) + parser.add_argument("--vocab_size", type=int, default=0, help="pretokenization vocab size. 0 = use Llama 2 tokenizer.") + args = parser.parse_args() + + # depending on the stage call the appropriate function + if args.stage == "download": + download() + elif args.stage == "train_vocab": + train_vocab(vocab_size=args.vocab_size) + elif args.stage == "pretokenize": + pretokenize(vocab_size=args.vocab_size) + else: + raise ValueError(f"Unknown stage {args.stage}") diff --git a/tokenizer.py b/tokenizer.py new file mode 100644 index 0000000..f3c0cc3 --- /dev/null +++ b/tokenizer.py @@ -0,0 +1,78 @@ +# Taken from llama code and lightly modified +# Copyright (c) Meta Platforms, Inc. and affiliates. +# This software may be used and distributed according to the terms of the Llama 2 Community License Agreement. 
+ +import os +import struct +import argparse +from typing import List + +from sentencepiece import SentencePieceProcessor + +TOKENIZER_MODEL = "tokenizer.model" # the llama sentencepiece tokenizer model + +class Tokenizer: + def __init__(self, tokenizer_model=None): + model_path = tokenizer_model if tokenizer_model else TOKENIZER_MODEL + assert os.path.isfile(model_path), model_path + self.sp_model = SentencePieceProcessor(model_file=model_path) + self.model_path = model_path + + # BOS / EOS token IDs + self.n_words: int = self.sp_model.vocab_size() + self.bos_id: int = self.sp_model.bos_id() + self.eos_id: int = self.sp_model.eos_id() + self.pad_id: int = self.sp_model.pad_id() + #print(f"#words: {self.n_words} - BOS ID: {self.bos_id} - EOS ID: {self.eos_id}") + assert self.sp_model.vocab_size() == self.sp_model.get_piece_size() + + def encode(self, s: str, bos: bool, eos: bool) -> List[int]: + assert type(s) is str + t = self.sp_model.encode(s) + if bos: + t = [self.bos_id] + t + if eos: + t = t + [self.eos_id] + return t + + def decode(self, t: List[int]) -> str: + return self.sp_model.decode(t) + + def export(self): + + # get all the tokens (postprocessed) and their scores as floats + tokens, scores = [], [] + for i in range(self.n_words): + + # decode the token and light postprocessing + t = self.sp_model.id_to_piece(i) + s = self.sp_model.get_score(i) + if i == self.bos_id: + t = '\n\n' + elif i == self.eos_id: + t = '\n\n' + t = t.replace('▁', ' ') # sentencepiece uses this character as whitespace + b = t.encode('utf-8') # bytes of this token, utf-8 encoded + + tokens.append(b) + scores.append(s) + + # record the max token length + max_token_length = max(len(t) for t in tokens) + + # write to a binary file + # the tokenizer.bin file is the same as .model file, but .bin + tokenizer_bin = self.model_path.replace('.model', '.bin') + with open(tokenizer_bin, 'wb') as f: + f.write(struct.pack("I", max_token_length)) + for bytes, score in zip(tokens, scores): + f.write(struct.pack("fI", score, len(bytes))) + f.write(bytes) + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("-t", "--tokenizer-model", type=str, help="optional path to custom tokenizer ") + args = parser.parse_args() + + t = Tokenizer(args.tokenizer_model) + t.export() diff --git a/train.py b/train.py new file mode 100644 index 0000000..b1972dc --- /dev/null +++ b/train.py @@ -0,0 +1,342 @@ +""" +This training script can be run both on a single gpu in debug mode, +and also in a larger training run with distributed data parallel (ddp). 
+ +To run on a single GPU small debug run, example: +$ python -m train.py --compile=False --eval_iters=10 --batch_size=8 + +To run with DDP on 4 gpus on 1 node, example: +$ torchrun --standalone --nproc_per_node=4 train.py + +To run with DDP on 4 gpus across 2 nodes, example: +- Run on the first (master) node with example IP 123.456.123.456: +$ torchrun --nproc_per_node=8 --nnodes=2 --node_rank=0 --master_addr=123.456.123.456 --master_port=1234 train.py +- Run on the worker node: +$ torchrun --nproc_per_node=8 --nnodes=2 --node_rank=1 --master_addr=123.456.123.456 --master_port=1234 train.py +(If your cluster does not have Infiniband interconnect prepend NCCL_IB_DISABLE=1) +""" + +import math +import os +import time +from contextlib import nullcontext +from datetime import datetime +from functools import partial + +import torch +from model import Transformer, ModelArgs +from torch.distributed import destroy_process_group, init_process_group +from torch.nn.parallel import DistributedDataParallel as DDP + +from tinystories import Task + +# ----------------------------------------------------------------------------- +# I/O +out_dir = "out" +eval_interval = 2000 +log_interval = 1 +eval_iters = 100 +eval_only = False # if True, script exits right after the first eval +always_save_checkpoint = False # if True, always save a checkpoint after each eval +init_from = "scratch" # 'scratch' or 'resume' +# wandb logging +wandb_log = False # disabled by default +wandb_project = "llamac" +wandb_run_name = "run" + datetime.now().strftime("%Y_%m_%d_%H_%M_%S") +# data +batch_size = 128 # if gradient_accumulation_steps > 1, this is the micro-batch size +max_seq_len = 256 +vocab_source = "llama2" # llama2|custom; use Lllama 2 vocab from Meta, or custom trained +vocab_size = 32000 # the Llama 2 tokenizer has 32K tokens +# model +dim = 288 +n_layers = 6 +n_heads = 6 +n_kv_heads = 6 +multiple_of = 32 +dropout = 0.0 +# adamw optimizer +gradient_accumulation_steps = 4 # used to simulate larger batch sizes +learning_rate = 5e-4 # max learning rate +max_iters = 100000 # total number of training iterations +weight_decay = 1e-1 +beta1 = 0.9 +beta2 = 0.95 +grad_clip = 1.0 # clip gradients at this value, or disable if == 0.0 +# learning rate decay settings +decay_lr = True # whether to decay the learning rate +warmup_iters = 1000 # how many steps to warm up for +# system +device = "cuda" # examples: 'cpu', 'cuda', 'cuda:0', 'cuda:1' etc., or try 'mps' on macbooks +dtype = "bfloat16" # float32|bfloat16|float16 +compile = True # use PyTorch 2.0 to compile the model to be faster +# ----------------------------------------------------------------------------- +config_keys = [ + k + for k, v in globals().items() + if not k.startswith("_") and isinstance(v, (int, float, bool, str)) +] +exec(open("configurator.py").read()) # overrides from command line or config file +config = {k: globals()[k] for k in config_keys} # will be useful for logging +# ----------------------------------------------------------------------------- + +# fixing some hyperparams to sensible defaults +lr_decay_iters = max_iters # should be ~= max_iters per Chinchilla +min_lr = 0.0 # minimum learning rate, should be ~= learning_rate/10 per Chinchilla + +# validating checks +assert vocab_source in ["llama2", "custom"] +assert vocab_source == "custom" or vocab_size == 32000, "The vocab from Meta has 32K tokens" + +# various inits, derived attributes, I/O setup +ddp = int(os.environ.get("RANK", -1)) != -1 # is this a ddp run? 
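+# (torchrun sets RANK, LOCAL_RANK and WORLD_SIZE in each process's environment; when they are
+# absent, e.g. under a plain `python train.py`, RANK defaults to -1 and we take the
+# single-process path below)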
+if ddp: + init_process_group(backend="nccl") + ddp_rank = int(os.environ["RANK"]) + ddp_local_rank = int(os.environ["LOCAL_RANK"]) + ddp_world_size = int(os.environ["WORLD_SIZE"]) + device = f"cuda:{ddp_local_rank}" + torch.cuda.set_device(device) + master_process = ddp_rank == 0 # this process will do logging, checkpointing etc. + seed_offset = ddp_rank # each process gets a different seed + # world_size number of processes will be training simultaneously, so we can scale + # down the desired gradient accumulation iterations per process proportionally + assert gradient_accumulation_steps % ddp_world_size == 0 + gradient_accumulation_steps //= ddp_world_size +else: + # if not ddp, we are running on a single gpu, and one process + master_process = True + seed_offset = 0 + ddp_world_size = 1 +tokens_per_iter = gradient_accumulation_steps * ddp_world_size * batch_size * max_seq_len +if master_process: + print(f"tokens per iteration will be: {tokens_per_iter:,}") + print(f"breaks down as: {gradient_accumulation_steps} grad accum steps * {ddp_world_size} processes * {batch_size} batch size * {max_seq_len} max seq len") + +if master_process: + os.makedirs(out_dir, exist_ok=True) +torch.manual_seed(1337 + seed_offset) +torch.backends.cuda.matmul.allow_tf32 = True # allow tf32 on matmul +torch.backends.cudnn.allow_tf32 = True # allow tf32 on cudnn +device_type = "cuda" if "cuda" in device else "cpu" # for later use in torch.autocast +# note: float16 data type will automatically use a GradScaler +ptdtype = {"float32": torch.float32, "bfloat16": torch.bfloat16, "float16": torch.float16}[dtype] +ctx = ( + nullcontext() + if device_type == "cpu" + else torch.amp.autocast(device_type=device_type, dtype=ptdtype) +) + +# task-specific setup +iter_batches = partial( + Task.iter_batches, + batch_size=batch_size, + max_seq_len=max_seq_len, + vocab_size=vocab_size, + vocab_source=vocab_source, + device=device, + num_workers=0, +) + +# init these up here, can override if init_from='resume' (i.e. from a checkpoint) +iter_num = 0 +best_val_loss = 1e9 + +# model init +model_args = dict( + dim=dim, + n_layers=n_layers, + n_heads=n_heads, + n_kv_heads=n_kv_heads, + vocab_size=vocab_size, + multiple_of=multiple_of, + max_seq_len=max_seq_len, + dropout=dropout, +) # start with model_args from command line +if init_from == "scratch": + # init a new model from scratch + print("Initializing a new model from scratch") + gptconf = ModelArgs(**model_args) + model = Transformer(gptconf) +elif init_from == "resume": + print(f"Resuming training from {out_dir}") + # resume training from a checkpoint. + ckpt_path = os.path.join(out_dir, "ckpt.pt") + checkpoint = torch.load(ckpt_path, map_location=device) + checkpoint_model_args = checkpoint["model_args"] + # force these config attributes to be equal otherwise we can't even resume training + # the rest of the attributes (e.g. dropout) can stay as desired from command line + for k in ["dim", "n_layers", "n_heads", "n_kv_heads", "vocab_size", "multiple_of", "max_seq_len"]: + model_args[k] = checkpoint_model_args[k] + # create the model + gptconf = ModelArgs(**model_args) + model = Transformer(gptconf) + state_dict = checkpoint["model"] + # fix the keys of the state dictionary :( + # honestly no idea how checkpoints sometimes get this prefix, have to debug more + unwanted_prefix = "_orig_mod." 
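+    # (most likely source of the prefix: torch.compile wraps the network in an OptimizedModule
+    # that keeps the original model under `_orig_mod`, so checkpoints saved from a compiled,
+    # non-DDP model carry `_orig_mod.` on every key; stripping it lets a plain model load them)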
+    for k, v in list(state_dict.items()):
+        if k.startswith(unwanted_prefix):
+            state_dict[k[len(unwanted_prefix) :]] = state_dict.pop(k)
+    model.load_state_dict(state_dict)
+    iter_num = checkpoint["iter_num"]
+    best_val_loss = checkpoint["best_val_loss"]
+model.to(device)
+
+# initialize a GradScaler. If enabled=False scaler is a no-op
+scaler = torch.cuda.amp.GradScaler(enabled=(dtype == "float16"))
+
+# optimizer
+optimizer = model.configure_optimizers(weight_decay, learning_rate, (beta1, beta2), device_type)
+if init_from == "resume" and "optimizer" in checkpoint:
+    optimizer.load_state_dict(checkpoint["optimizer"])
+checkpoint = None  # free up memory
+
+# compile the model
+if compile:
+    print("compiling the model... (takes a ~minute)")
+    unoptimized_model = model
+    model = torch.compile(model)  # requires PyTorch 2.0
+
+# wrap model into DDP container
+if ddp:
+    # Ignore the `freqs_cis` buffer so that DDP does not broadcast it at
+    # construction time since NCCL does not support `ComplexFloat`
+    prefix = "_orig_mod." if compile else ""
+    model._ddp_params_and_buffers_to_ignore = {prefix + "freqs_cis"}
+    model = DDP(model, device_ids=[ddp_local_rank])
+
+# helps estimate an arbitrarily accurate loss over either split using many batches
+@torch.no_grad()
+def estimate_loss():
+    out = {}
+    model.eval()
+    for split in ["train", "val"]:
+        batch_iter = iter_batches(split=split)
+        losses = torch.zeros(eval_iters)  # keep on CPU
+        for k in range(eval_iters):
+            X, Y = next(batch_iter)
+            with ctx:
+                logits = model(X, Y)
+                loss = raw_model.last_loss
+            losses[k] = loss.item()
+        out[split] = losses.mean()
+    model.train()
+    return out
+
+# learning rate decay scheduler (cosine with warmup)
+def get_lr(it):
+    # 1) linear warmup for warmup_iters steps
+    if it < warmup_iters:
+        return learning_rate * it / warmup_iters
+    # 2) if it > lr_decay_iters, return min learning rate
+    if it > lr_decay_iters:
+        return min_lr
+    # 3) in between, use cosine decay down to min learning rate
+    decay_ratio = (it - warmup_iters) / (lr_decay_iters - warmup_iters)
+    assert 0 <= decay_ratio <= 1
+    coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))  # coeff ranges 0..1
+    return min_lr + coeff * (learning_rate - min_lr)
+
+# logging
+if wandb_log and master_process:
+    import wandb
+    wandb.init(project=wandb_project, name=wandb_run_name, config=config)
+
+# training loop
+train_batch_iter = iter_batches(split="train")
+X, Y = next(train_batch_iter)  # fetch the very first batch
+t0 = time.time()
+local_iter_num = 0  # number of iterations in the lifetime of this process
+raw_model = model.module if ddp else model  # unwrap DDP container if needed
+running_mfu = -1.0
+while True:
+    # determine and set the learning rate for this iteration
+    lr = get_lr(iter_num) if decay_lr else learning_rate
+    for param_group in optimizer.param_groups:
+        param_group["lr"] = lr
+
+    # evaluate the loss on train/val sets and write checkpoints
+    if iter_num % eval_interval == 0 and master_process:
+        losses = estimate_loss()
+        print(f"step {iter_num}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
+        if wandb_log:
+            try:
+                wandb.log(
+                    {
+                        "iter": iter_num,
+                        "tokens": iter_num * tokens_per_iter,
+                        "loss/train": losses["train"],
+                        "loss/val": losses["val"],
+                        "lr": lr,
+                        "mfu": running_mfu * 100,  # convert to percentage
+                    }
+                )
+            except Exception as e:
+                print(f"logging to wandb failed: {e}")
+        if losses["val"] < best_val_loss or always_save_checkpoint:
+            best_val_loss = losses["val"]
+            if iter_num > 0:
+                checkpoint = {
+                    "model":
raw_model.state_dict(),
+                    "optimizer": optimizer.state_dict(),
+                    "model_args": model_args,
+                    "iter_num": iter_num,
+                    "best_val_loss": best_val_loss,
+                    "config": config,
+                }
+                print(f"saving checkpoint to {out_dir}")
+                torch.save(checkpoint, os.path.join(out_dir, "ckpt.pt"))
+                raw_model.export(os.path.join(out_dir, "model.bin"))
+    if iter_num == 0 and eval_only:
+        break
+
+    # forward backward update, with optional gradient accumulation to simulate larger batch size
+    # and using the GradScaler if data type is float16
+    for micro_step in range(gradient_accumulation_steps):
+        if ddp:
+            # in DDP training we only need to sync gradients at the last micro step.
+            # the official way to do this is with the model.no_sync() context manager, but
+            # I really dislike that it bloats the code and forces us to repeat ourselves;
+            # looking at the source of that context manager, it just toggles this variable
+            model.require_backward_grad_sync = micro_step == gradient_accumulation_steps - 1
+        with ctx:
+            logits = model(X, Y)
+            loss = raw_model.last_loss
+            loss = loss / gradient_accumulation_steps
+        # immediately async prefetch next batch while model is doing the forward pass on the GPU
+        X, Y = next(train_batch_iter)
+        # backward pass, with gradient scaling if training in fp16
+        scaler.scale(loss).backward()
+    # clip the gradient
+    if grad_clip != 0.0:
+        scaler.unscale_(optimizer)
+        torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
+    # step the optimizer and scaler if training in fp16
+    scaler.step(optimizer)
+    scaler.update()
+    # flush the gradients as soon as we can, no need for this memory anymore
+    optimizer.zero_grad(set_to_none=True)
+
+    # timing and logging
+    t1 = time.time()
+    dt = t1 - t0
+    t0 = t1
+    if iter_num % log_interval == 0 and master_process:
+        # get loss as float, scale up due to the divide above. note: this is a CPU-GPU sync point
+        lossf = loss.item() * gradient_accumulation_steps
+        if local_iter_num >= 5:  # let the training loop settle a bit
+            mfu = raw_model.estimate_mfu(batch_size * gradient_accumulation_steps, dt)
+            running_mfu = mfu if running_mfu == -1.0 else 0.9 * running_mfu + 0.1 * mfu
+        print(
+            f"{iter_num} | loss {lossf:.4f} | lr {lr:e} | {dt*1000:.2f}ms | mfu {running_mfu*100:.2f}%"
+        )
+    iter_num += 1
+    local_iter_num += 1
+
+    # termination conditions
+    if iter_num > max_iters:
+        break
+
+if ddp:
+    destroy_process_group()
diff --git a/train_vocab.sh b/train_vocab.sh
new file mode 100755
index 0000000..7803af8
--- /dev/null
+++ b/train_vocab.sh
@@ -0,0 +1,126 @@
+#!/bin/bash
+
+# Trains a sentencepiece tokenizer model on the given data, my best-effort
+# attempt to replicate how Meta trained their Llama 2 tokenizer.
+
+# usage: $ train_vocab.sh <input_file> <model_prefix> <vocab_size>
+# example:
+# ./train_vocab.sh tiny.txt tokenizer_tiny 1024
+# requirements:
+# install https://github.com/google/sentencepiece
+
+# check if the correct number of arguments is provided
+if [ $# -ne 3 ]; then
+    echo "Usage: $0 <input_file> <model_prefix> <vocab_size>"
+    exit 1
+fi
+
+# assign command-line arguments to variables
+input=$1
+model_prefix=$2
+vocab_size=$3
+
+# check if input file exists
+if [ ! -f "$input" ]; then
+    echo "Usage: $0 <input_file> <model_prefix> <vocab_size>"
+    echo "input '$input' not found."
+    exit 1
+fi
+
+# check if vocab_size is a positive integer
+if ! [[ "$vocab_size" =~ ^[0-9]+$ ]] || [ "$vocab_size" -lt 1 ]; then
+    echo "Usage: $0 <input_file> <model_prefix> <vocab_size>"
+    echo "vocab_size must be a positive integer."
+    exit 1
+fi
+
+# Print the processed inputs
+echo "Input: $input"
+echo "Model Prefix: $model_prefix"
+echo "Vocabulary Size: $vocab_size"
+
+# train a sentencepiece tokenizer model
+# the Llama 2 config can be printed as follows:
+
+# import sentencepiece.sentencepiece_model_pb2
+# mp = sentencepiece.sentencepiece_model_pb2.ModelProto()
+# mp.ParseFromString(open("tokenizer.model", "rb").read())
+# print(mp.trainer_spec)
+# print(mp.normalizer_spec)
+
+# this gives:
+
+# trainer_spec {
+#   input: "/large_experiments/theorem/datasets/MERGED/all.test1.merged"
+#   model_prefix: "spm_model_32k_200M_charcov099995_allowWSO__v2"
+#   model_type: BPE
+#   vocab_size: 32000
+#   self_test_sample_size: 0
+#   input_format: "text"
+#   character_coverage: 0.9999499917030334
+#   input_sentence_size: 200000000
+#   seed_sentencepiece_size: 1000000
+#   shrinking_factor: 0.75
+#   num_threads: 80
+#   num_sub_iterations: 2
+#   max_sentence_length: 4192
+#   shuffle_input_sentence: true
+#   max_sentencepiece_length: 16
+#   split_by_unicode_script: true
+#   split_by_whitespace: true
+#   split_by_number: true
+#   treat_whitespace_as_suffix: false
+#   split_digits: true
+#   allow_whitespace_only_pieces: true
+#   vocabulary_output_piece_score: true
+#   hard_vocab_limit: true
+#   use_all_vocab: false
+#   byte_fallback: true
+#   required_chars: ""
+#   unk_id: 0
+#   bos_id: 1
+#   eos_id: 2
+#   pad_id: -1
+#   unk_surface: " \342\201\207 "
+#   unk_piece: "<unk>"
+#   bos_piece: "<s>"
+#   eos_piece: "</s>"
+#   pad_piece: "<pad>"
+#   train_extremely_large_corpus: false
+#   enable_differential_privacy: false
+#   differential_privacy_noise_level: 0.0
+#   differential_privacy_clipping_threshold: 0
+# }
+# normalizer_spec {
+#   name: "identity"
+#   precompiled_charsmap: ""
+#   add_dummy_prefix: true
+#   remove_extra_whitespaces: false
+#   normalization_rule_tsv: ""
+# }
+
+# let's now use spm_train to train this model
+# options docs: https://github.com/google/sentencepiece/blob/master/doc/options.md
+
+# we'll depart from the Llama 2 settings in a few places:
+# character_coverage -> 1.0
+
+# other important notes:
+# --split_digits is true, per the paper
+# --allow_whitespace_only_pieces is true, default in spm is false
+# --byte_fallback is true, default in spm is false
+# --normalization_rule_name is identity, default in spm is nmt_nfkc
+
+spm_train --input="$input" \
+    --model_prefix="$model_prefix" \
+    --model_type=bpe \
+    --vocab_size="$vocab_size" \
+    --self_test_sample_size=0 \
+    --input_format="text" \
+    --character_coverage=1.0 \
+    --num_threads="$(nproc)" \
+    --split_digits=true \
+    --allow_whitespace_only_pieces=true \
+    --byte_fallback=true \
+    --unk_surface=" \342\201\207 " \
+    --normalization_rule_name=identity \