# Model Properties Table
| | n_params | n_layers | d_model | n_heads | act_fn | n_ctx | d_vocab | d_head | d_mlp |
|---|---|---|---|---|---|---|---|---|---|
| gpt2-small | 85M | 12 | 768 | 12 | gelu | 1024 | 50257 | 64 | 3072 |
| gpt2-medium | 302M | 24 | 1024 | 16 | gelu | 1024 | 50257 | 64 | 4096 |
| gpt2-large | 708M | 36 | 1280 | 20 | gelu | 1024 | 50257 | 64 | 5120 |
| gpt2-xl | 1.5B | 48 | 1600 | 25 | gelu | 1024 | 50257 | 64 | 6400 |
| distillgpt2 | 42M | 6 | 768 | 12 | gelu | 1024 | 50257 | 64 | 3072 |
| opt-125m | 85M | 12 | 768 | 12 | relu | 2048 | 50272 | 64 | 3072 |
| opt-1.3b | 1.2B | 24 | 2048 | 32 | relu | 2048 | 50272 | 64 | 8192 |
| opt-2.7b | 2.5B | 32 | 2560 | 32 | relu | 2048 | 50272 | 80 | 10240 |
| opt-6.7b | 6.4B | 32 | 4096 | 32 | relu | 2048 | 50272 | 128 | 16384 |
| opt-13b | 13B | 40 | 5120 | 40 | relu | 2048 | 50272 | 128 | 20480 |
| opt-30b | 30B | 48 | 7168 | 56 | relu | 2048 | 50272 | 128 | 28672 |
| opt-66b | 65B | 64 | 9216 | 72 | relu | 2048 | 50272 | 128 | 36864 |
| gpt-neo-125M | 85M | 12 | 768 | 12 | gelu | 2048 | 50257 | 64 | 3072 |
| gpt-neo-1.3B | 1.2B | 24 | 2048 | 16 | gelu | 2048 | 50257 | 128 | 8192 |
| gpt-neo-2.7B | 2.5B | 32 | 2560 | 20 | gelu | 2048 | 50257 | 128 | 10240 |
| gpt-j-6B | 5.6B | 28 | 4096 | 16 | gelu | 2048 | 50400 | 256 | 16384 |
| gpt-neox-20b | 20B | 44 | 6144 | 64 | gelu | 2048 | 50432 | 96 | 24576 |
| stanford-gpt2-small-a | 85M | 12 | 768 | 12 | gelu | 1024 | 50257 | 64 | 3072 |
| stanford-gpt2-small-b | 85M | 12 | 768 | 12 | gelu | 1024 | 50257 | 64 | 3072 |
| stanford-gpt2-small-c | 85M | 12 | 768 | 12 | gelu | 1024 | 50257 | 64 | 3072 |
| stanford-gpt2-small-d | 85M | 12 | 768 | 12 | gelu | 1024 | 50257 | 64 | 3072 |
| stanford-gpt2-small-e | 85M | 12 | 768 | 12 | gelu | 1024 | 50257 | 64 | 3072 |
| stanford-gpt2-medium-a | 302M | 24 | 1024 | 16 | gelu | 1024 | 50257 | 64 | 4096 |
| stanford-gpt2-medium-b | 302M | 24 | 1024 | 16 | gelu | 1024 | 50257 | 64 | 4096 |
| stanford-gpt2-medium-c | 302M | 24 | 1024 | 16 | gelu | 1024 | 50257 | 64 | 4096 |
| stanford-gpt2-medium-d | 302M | 24 | 1024 | 16 | gelu | 1024 | 50257 | 64 | 4096 |
| stanford-gpt2-medium-e | 302M | 24 | 1024 | 16 | gelu | 1024 | 50257 | 64 | 4096 |
| pythia-14m | 1.2M | 6 | 128 | 4 | gelu | 2048 | 50304 | 32 | 512 |
| pythia-31m | 4.7M | 6 | 256 | 8 | gelu | 2048 | 50304 | 32 | 1024 |
| pythia-70m | 19M | 6 | 512 | 8 | gelu | 2048 | 50304 | 64 | 2048 |
| pythia-160m | 85M | 12 | 768 | 12 | gelu | 2048 | 50304 | 64 | 3072 |
| pythia-410m | 302M | 24 | 1024 | 16 | gelu | 2048 | 50304 | 64 | 4096 |
| pythia-1b | 805M | 16 | 2048 | 8 | gelu | 2048 | 50304 | 256 | 8192 |
| pythia-1.4b | 1.2B | 24 | 2048 | 16 | gelu | 2048 | 50304 | 128 | 8192 |
| pythia-2.8b | 2.5B | 32 | 2560 | 32 | gelu | 2048 | 50304 | 80 | 10240 |
| pythia-6.9b | 6.4B | 32 | 4096 | 32 | gelu | 2048 | 50432 | 128 | 16384 |
| pythia-12b | 11B | 36 | 5120 | 40 | gelu | 2048 | 50688 | 128 | 20480 |
| pythia-70m-deduped | 19M | 6 | 512 | 8 | gelu | 2048 | 50304 | 64 | 2048 |
| pythia-160m-deduped | 85M | 12 | 768 | 12 | gelu | 2048 | 50304 | 64 | 3072 |
| pythia-410m-deduped | 302M | 24 | 1024 | 16 | gelu | 2048 | 50304 | 64 | 4096 |
| pythia-1b-deduped | 805M | 16 | 2048 | 8 | gelu | 2048 | 50304 | 256 | 8192 |
| pythia-1.4b-deduped | 1.2B | 24 | 2048 | 16 | gelu | 2048 | 50304 | 128 | 8192 |
| pythia-2.8b-deduped | 2.5B | 32 | 2560 | 32 | gelu | 2048 | 50304 | 80 | 10240 |
| pythia-6.9b-deduped | 6.4B | 32 | 4096 | 32 | gelu | 2048 | 50432 | 128 | 16384 |
| pythia-12b-deduped | 11B | 36 | 5120 | 40 | gelu | 2048 | 50688 | 128 | 20480 |
| pythia-70m-v0 | 19M | 6 | 512 | 8 | gelu | 2048 | 50304 | 64 | 2048 |
| pythia-160m-v0 | 85M | 12 | 768 | 12 | gelu | 2048 | 50304 | 64 | 3072 |
| pythia-410m-v0 | 302M | 24 | 1024 | 16 | gelu | 2048 | 50304 | 64 | 4096 |
| pythia-1b-v0 | 805M | 16 | 2048 | 8 | gelu | 2048 | 50304 | 256 | 8192 |
| pythia-1.4b-v0 | 1.2B | 24 | 2048 | 16 | gelu | 2048 | 50304 | 128 | 8192 |
| pythia-2.8b-v0 | 2.5B | 32 | 2560 | 32 | gelu | 2048 | 50304 | 80 | 10240 |
| pythia-6.9b-v0 | 6.4B | 32 | 4096 | 32 | gelu | 2048 | 50432 | 128 | 16384 |
| pythia-12b-v0 | 11B | 36 | 5120 | 40 | gelu | 2048 | 50688 | 128 | 20480 |
| pythia-70m-deduped-v0 | 19M | 6 | 512 | 8 | gelu | 2048 | 50304 | 64 | 2048 |
| pythia-160m-deduped-v0 | 85M | 12 | 768 | 12 | gelu | 2048 | 50304 | 64 | 3072 |
| pythia-410m-deduped-v0 | 302M | 24 | 1024 | 16 | gelu | 2048 | 50304 | 64 | 4096 |
| pythia-1b-deduped-v0 | 805M | 16 | 2048 | 8 | gelu | 2048 | 50304 | 256 | 8192 |
| pythia-1.4b-deduped-v0 | 1.2B | 24 | 2048 | 16 | gelu | 2048 | 50304 | 128 | 8192 |
| pythia-2.8b-deduped-v0 | 2.5B | 32 | 2560 | 32 | gelu | 2048 | 50304 | 80 | 10240 |
| pythia-6.9b-deduped-v0 | 6.4B | 32 | 4096 | 32 | gelu | 2048 | 50432 | 128 | 16384 |
| pythia-12b-deduped-v0 | 11B | 36 | 5120 | 40 | gelu | 2048 | 50688 | 128 | 20480 |
| pythia-160m-seed1 | 85M | 12 | 768 | 12 | gelu | 2048 | 50304 | 64 | 3072 |
| pythia-160m-seed2 | 85M | 12 | 768 | 12 | gelu | 2048 | 50304 | 64 | 3072 |
| pythia-160m-seed3 | 85M | 12 | 768 | 12 | gelu | 2048 | 50304 | 64 | 3072 |
| solu-1l-pile | 13M | 1 | 1024 | 16 | solu | 1024 | 50278 | 64 | 4096 |
| solu-2l-pile | 13M | 2 | 736 | 11 | solu | 1024 | 50278 | 64 | 2944 |
| solu-4l-pile | 13M | 4 | 512 | 8 | solu | 1024 | 50278 | 64 | 2048 |
| solu-6l-pile | 42M | 6 | 768 | 12 | solu | 1024 | 50278 | 64 | 3072 |
| solu-8l-pile | 101M | 8 | 1024 | 16 | solu | 1024 | 50278 | 64 | 4096 |
| solu-10l-pile | 197M | 10 | 1280 | 20 | solu | 1024 | 50278 | 64 | 5120 |
| solu-12l-pile | 340M | 12 | 1536 | 24 | solu | 1024 | 50278 | 64 | 6144 |
| solu-1l | 3.1M | 1 | 512 | 8 | solu | 1024 | 48262 | 64 | 2048 |
| solu-2l | 6.3M | 2 | 512 | 8 | solu | 1024 | 48262 | 64 | 2048 |
| solu-3l | 9.4M | 3 | 512 | 8 | solu | 1024 | 48262 | 64 | 2048 |
| solu-4l | 13M | 4 | 512 | 8 | solu | 1024 | 48262 | 64 | 2048 |
| solu-6l | 42M | 6 | 768 | 12 | solu | 1024 | 48262 | 64 | 3072 |
| solu-8l | 101M | 8 | 1024 | 16 | solu | 1024 | 48262 | 64 | 4096 |
| solu-10l | 197M | 10 | 1280 | 20 | solu | 1024 | 48262 | 64 | 5120 |
| solu-12l | 340M | 12 | 1536 | 24 | solu | 1024 | 48262 | 64 | 6144 |
| gelu-1l | 3.1M | 1 | 512 | 8 | gelu | 1024 | 48262 | 64 | 2048 |
| gelu-2l | 6.3M | 2 | 512 | 8 | gelu | 1024 | 48262 | 64 | 2048 |
| gelu-3l | 9.4M | 3 | 512 | 8 | gelu | 1024 | 48262 | 64 | 2048 |
| gelu-4l | 13M | 4 | 512 | 8 | gelu | 1024 | 48262 | 64 | 2048 |
| attn-only-1l | 1.0M | 1 | 512 | 8 | attn_only | 1024 | 48262 | 64 | 2048 |
| attn-only-2l | 2.1M | 2 | 512 | 8 | attn_only | 1024 | 48262 | 64 | 2048 |
| attn-only-3l | 3.1M | 3 | 512 | 8 | attn_only | 1024 | 48262 | 64 | 2048 |
| attn-only-4l | 4.2M | 4 | 512 | 8 | attn_only | 1024 | 48262 | 64 | 2048 |
| attn-only-2l-demo | 2.1M | 2 | 512 | 8 | attn_only | 1024 | 50277 | 64 | 2048 |
| solu-1l-wiki | 3.1M | 1 | 512 | 8 | solu | 1024 | 48262 | 64 | 2048 |
| solu-4l-wiki | 13M | 4 | 512 | 8 | solu | 1024 | 48262 | 64 | 2048 |
| redwood_attn_2l | 524K | 2 | 256 | 8 | attn_only | 2048 | 50259 | 32 | -1 |
| llama-7b | 5.0B | 32 | 4096 | 32 | silu | 2048 | 32000 | 128 | 11008 |
| llama-13b | 9.9B | 40 | 5120 | 40 | silu | 2048 | 32000 | 128 | 13824 |
| llama-30b | 25B | 60 | 6656 | 52 | silu | 2048 | 32000 | 128 | 17920 |
| llama-65b | 50B | 80 | 8192 | 64 | silu | 2048 | 32000 | 128 | 22016 |
| Llama-2-7b | 5.0B | 32 | 4096 | 32 | silu | 4096 | 32000 | 128 | 11008 |
| Llama-2-7b-chat | 5.0B | 32 | 4096 | 32 | silu | 4096 | 32000 | 128 | 11008 |
| Llama-2-13b | 9.9B | 40 | 5120 | 40 | silu | 4096 | 32000 | 128 | 13824 |
| Llama-2-13b-chat | 9.9B | 40 | 5120 | 40 | silu | 4096 | 32000 | 128 | 13824 |
| othello-gpt | 25M | 8 | 512 | 8 | gelu | 59 | 61 | 64 | 2048 |
| bert-base-cased | 85M | 12 | 768 | 12 | gelu | 512 | 28996 | 64 | 3072 |
| tiny-stories-1M | 393K | 8 | 64 | 16 | gelu | 2048 | 50257 | 4 | 256 |
| tiny-stories-3M | 1.6M | 8 | 128 | 16 | gelu | 2048 | 50257 | 8 | 512 |
| tiny-stories-8M | 6.3M | 8 | 256 | 16 | gelu | 2048 | 50257 | 16 | 1024 |
| tiny-stories-28M | 25M | 8 | 512 | 16 | gelu | 2048 | 50257 | 32 | 2048 |
| tiny-stories-33M | 28M | 4 | 768 | 16 | gelu | 2048 | 50257 | 48 | 3072 |
| tiny-stories-instruct-1M | 393K | 8 | 64 | 16 | gelu | 2048 | 50257 | 4 | 256 |
| tiny-stories-instruct-3M | 1.6M | 8 | 128 | 16 | gelu | 2048 | 50257 | 8 | 512 |
| tiny-stories-instruct-8M | 6.3M | 8 | 256 | 16 | gelu | 2048 | 50257 | 16 | 1024 |
| tiny-stories-instruct-28M | 25M | 8 | 512 | 16 | gelu | 2048 | 50257 | 32 | 2048 |
| tiny-stories-instruct-33M | 28M | 4 | 768 | 16 | gelu | 2048 | 50257 | 48 | 3072 |
| tiny-stories-1L-21M | 13M | 1 | 1024 | 16 | gelu | 2048 | 50257 | 64 | 4096 |
| tiny-stories-2L-33M | 25M | 2 | 1024 | 16 | gelu | 2048 | 50257 | 64 | 4096 |
| tiny-stories-instruct-1L-21M | 13M | 1 | 1024 | 16 | gelu | 2048 | 50257 | 64 | 4096 |
| tiny-stories-instruct-2L-33M | 25M | 2 | 1024 | 16 | gelu | 2048 | 50257 | 64 | 4096 |
| stablelm-base-alpha-3b | 3.2B | 16 | 4096 | 32 | gelu | 4096 | 50688 | 128 | 16384 |
| stablelm-base-alpha-7b | 7.2B | 16 | 6144 | 48 | gelu | 4096 | 50432 | 128 | 24576 |
| stablelm-tuned-alpha-3b | 3.2B | 16 | 4096 | 32 | gelu | 4096 | 50688 | 128 | 16384 |
| stablelm-tuned-alpha-7b | 7.2B | 16 | 6144 | 48 | gelu | 4096 | 50432 | 128 | 24576 |
| santacoder | 1.2B | 24 | 2048 | 16 | gelu | 2048 | 49280 | 128 | 8192 |