Model Properties Table

model_name

n_params

n_layers

d_model

n_heads

act_fn

n_ctx

d_vocab

d_head

d_mlp

gpt2-small

85M

12

768

12

gelu

1024

50257

64

3072

gpt2-medium

302M

24

1024

16

gelu

1024

50257

64

4096

gpt2-large

708M

36

1280

20

gelu

1024

50257

64

5120

gpt2-xl

1.5B

48

1600

25

gelu

1024

50257

64

6400

distillgpt2

42M

6

768

12

gelu

1024

50257

64

3072

opt-125m

85M

12

768

12

relu

2048

50272

64

3072

opt-1.3b

1.2B

24

2048

32

relu

2048

50272

64

8192

opt-2.7b

2.5B

32

2560

32

relu

2048

50272

80

10240

opt-6.7b

6.4B

32

4096

32

relu

2048

50272

128

16384

opt-13b

13B

40

5120

40

relu

2048

50272

128

20480

opt-30b

30B

48

7168

56

relu

2048

50272

128

28672

opt-66b

65B

64

9216

72

relu

2048

50272

128

36864

gpt-neo-125M

85M

12

768

12

gelu

2048

50257

64

3072

gpt-neo-1.3B

1.2B

24

2048

16

gelu

2048

50257

128

8192

gpt-neo-2.7B

2.5B

32

2560

20

gelu

2048

50257

128

10240

gpt-j-6B

5.6B

28

4096

16

gelu

2048

50400

256

16384

gpt-neox-20b

20B

44

6144

64

gelu

2048

50432

96

24576

stanford-gpt2-small-a

85M

12

768

12

gelu

1024

50257

64

3072

stanford-gpt2-small-b

85M

12

768

12

gelu

1024

50257

64

3072

stanford-gpt2-small-c

85M

12

768

12

gelu

1024

50257

64

3072

stanford-gpt2-small-d

85M

12

768

12

gelu

1024

50257

64

3072

stanford-gpt2-small-e

85M

12

768

12

gelu

1024

50257

64

3072

stanford-gpt2-medium-a

302M

24

1024

16

gelu

1024

50257

64

4096

stanford-gpt2-medium-b

302M

24

1024

16

gelu

1024

50257

64

4096

stanford-gpt2-medium-c

302M

24

1024

16

gelu

1024

50257

64

4096

stanford-gpt2-medium-d

302M

24

1024

16

gelu

1024

50257

64

4096

stanford-gpt2-medium-e

302M

24

1024

16

gelu

1024

50257

64

4096

pythia-14m

1.2M

6

128

4

gelu

2048

50304

32

512

pythia-31m

4.7M

6

256

8

gelu

2048

50304

32

1024

pythia-70m

19M

6

512

8

gelu

2048

50304

64

2048

pythia-160m

85M

12

768

12

gelu

2048

50304

64

3072

pythia-410m

302M

24

1024

16

gelu

2048

50304

64

4096

pythia-1b

805M

16

2048

8

gelu

2048

50304

256

8192

pythia-1.4b

1.2B

24

2048

16

gelu

2048

50304

128

8192

pythia-2.8b

2.5B

32

2560

32

gelu

2048

50304

80

10240

pythia-6.9b

6.4B

32

4096

32

gelu

2048

50432

128

16384

pythia-12b

11B

36

5120

40

gelu

2048

50688

128

20480

pythia-70m-deduped

19M

6

512

8

gelu

2048

50304

64

2048

pythia-160m-deduped

85M

12

768

12

gelu

2048

50304

64

3072

pythia-410m-deduped

302M

24

1024

16

gelu

2048

50304

64

4096

pythia-1b-deduped

805M

16

2048

8

gelu

2048

50304

256

8192

pythia-1.4b-deduped

1.2B

24

2048

16

gelu

2048

50304

128

8192

pythia-2.8b-deduped

2.5B

32

2560

32

gelu

2048

50304

80

10240

pythia-6.9b-deduped

6.4B

32

4096

32

gelu

2048

50432

128

16384

pythia-12b-deduped

11B

36

5120

40

gelu

2048

50688

128

20480

pythia-70m-v0

19M

6

512

8

gelu

2048

50304

64

2048

pythia-160m-v0

85M

12

768

12

gelu

2048

50304

64

3072

pythia-410m-v0

302M

24

1024

16

gelu

2048

50304

64

4096

pythia-1b-v0

805M

16

2048

8

gelu

2048

50304

256

8192

pythia-1.4b-v0

1.2B

24

2048

16

gelu

2048

50304

128

8192

pythia-2.8b-v0

2.5B

32

2560

32

gelu

2048

50304

80

10240

pythia-6.9b-v0

6.4B

32

4096

32

gelu

2048

50432

128

16384

pythia-12b-v0

11B

36

5120

40

gelu

2048

50688

128

20480

pythia-70m-deduped-v0

19M

6

512

8

gelu

2048

50304

64

2048

pythia-160m-deduped-v0

85M

12

768

12

gelu

2048

50304

64

3072

pythia-410m-deduped-v0

302M

24

1024

16

gelu

2048

50304

64

4096

pythia-1b-deduped-v0

805M

16

2048

8

gelu

2048

50304

256

8192

pythia-1.4b-deduped-v0

1.2B

24

2048

16

gelu

2048

50304

128

8192

pythia-2.8b-deduped-v0

2.5B

32

2560

32

gelu

2048

50304

80

10240

pythia-6.9b-deduped-v0

6.4B

32

4096

32

gelu

2048

50432

128

16384

pythia-12b-deduped-v0

11B

36

5120

40

gelu

2048

50688

128

20480

pythia-160m-seed1

85M

12

768

12

gelu

2048

50304

64

3072

pythia-160m-seed2

85M

12

768

12

gelu

2048

50304

64

3072

pythia-160m-seed3

85M

12

768

12

gelu

2048

50304

64

3072

solu-1l-pile

13M

1

1024

16

solu

1024

50278

64

4096

solu-2l-pile

13M

2

736

11

solu

1024

50278

64

2944

solu-4l-pile

13M

4

512

8

solu

1024

50278

64

2048

solu-6l-pile

42M

6

768

12

solu

1024

50278

64

3072

solu-8l-pile

101M

8

1024

16

solu

1024

50278

64

4096

solu-10l-pile

197M

10

1280

20

solu

1024

50278

64

5120

solu-12l-pile

340M

12

1536

24

solu

1024

50278

64

6144

solu-1l

3.1M

1

512

8

solu

1024

48262

64

2048

solu-2l

6.3M

2

512

8

solu

1024

48262

64

2048

solu-3l

9.4M

3

512

8

solu

1024

48262

64

2048

solu-4l

13M

4

512

8

solu

1024

48262

64

2048

solu-6l

42M

6

768

12

solu

1024

48262

64

3072

solu-8l

101M

8

1024

16

solu

1024

48262

64

4096

solu-10l

197M

10

1280

20

solu

1024

48262

64

5120

solu-12l

340M

12

1536

24

solu

1024

48262

64

6144

gelu-1l

3.1M

1

512

8

gelu

1024

48262

64

2048

gelu-2l

6.3M

2

512

8

gelu

1024

48262

64

2048

gelu-3l

9.4M

3

512

8

gelu

1024

48262

64

2048

gelu-4l

13M

4

512

8

gelu

1024

48262

64

2048

attn-only-1l

1.0M

1

512

8

attn_only

1024

48262

64

2048

attn-only-2l

2.1M

2

512

8

attn_only

1024

48262

64

2048

attn-only-3l

3.1M

3

512

8

attn_only

1024

48262

64

2048

attn-only-4l

4.2M

4

512

8

attn_only

1024

48262

64

2048

attn-only-2l-demo

2.1M

2

512

8

attn_only

1024

50277

64

2048

solu-1l-wiki

3.1M

1

512

8

solu

1024

48262

64

2048

solu-4l-wiki

13M

4

512

8

solu

1024

48262

64

2048

redwood_attn_2l

524K

2

256

8

attn_only

2048

50259

32

-1

llama-7b

5.0B

32

4096

32

silu

2048

32000

128

11008

llama-13b

9.9B

40

5120

40

silu

2048

32000

128

13824

llama-30b

25B

60

6656

52

silu

2048

32000

128

17920

llama-65b

50B

80

8192

64

silu

2048

32000

128

22016

Llama-2-7b

5.0B

32

4096

32

silu

4096

32000

128

11008

Llama-2-7b-chat

5.0B

32

4096

32

silu

4096

32000

128

11008

Llama-2-13b

9.9B

40

5120

40

silu

4096

32000

128

13824

Llama-2-13b-chat

9.9B

40

5120

40

silu

4096

32000

128

13824

othello-gpt

25M

8

512

8

gelu

59

61

64

2048

bert-base-cased

85M

12

768

12

gelu

512

28996

64

3072

tiny-stories-1M

393K

8

64

16

gelu

2048

50257

4

256

tiny-stories-3M

1.6M

8

128

16

gelu

2048

50257

8

512

tiny-stories-8M

6.3M

8

256

16

gelu

2048

50257

16

1024

tiny-stories-28M

25M

8

512

16

gelu

2048

50257

32

2048

tiny-stories-33M

28M

4

768

16

gelu

2048

50257

48

3072

tiny-stories-instruct-1M

393K

8

64

16

gelu

2048

50257

4

256

tiny-stories-instruct-3M

1.6M

8

128

16

gelu

2048

50257

8

512

tiny-stories-instruct-8M

6.3M

8

256

16

gelu

2048

50257

16

1024

tiny-stories-instruct-28M

25M

8

512

16

gelu

2048

50257

32

2048

tiny-stories-instruct-33M

28M

4

768

16

gelu

2048

50257

48

3072

tiny-stories-1L-21M

13M

1

1024

16

gelu

2048

50257

64

4096

tiny-stories-2L-33M

25M

2

1024

16

gelu

2048

50257

64

4096

tiny-stories-instruct-1L-21M

13M

1

1024

16

gelu

2048

50257

64

4096

tiny-stories-instruct-2L-33M

25M

2

1024

16

gelu

2048

50257

64

4096

stablelm-base-alpha-3b

3.2B

16

4096

32

gelu

4096

50688

128

16384

stablelm-base-alpha-7b

7.2B

16

6144

48

gelu

4096

50432

128

24576

stablelm-tuned-alpha-3b

3.2B

16

4096

32

gelu

4096

50688

128

16384

stablelm-tuned-alpha-7b

7.2B

16

6144

48

gelu

4096

50432

128

24576

santacoder

1.2B

24

2048

16

gelu

2048

49280

128

8192