Commit 03e6575

Using head_dim instead of override scalar value for the 9b model.
1 parent a88abd5 commit 03e6575


gemma/config.py

Lines changed: 0 additions & 1 deletion

@@ -117,7 +117,6 @@ def get_config_for_9b() -> GemmaConfig:
         head_dim=256,
         attn_types=[AttentionType.LOCAL_SLIDING, AttentionType.GLOBAL] * 21,
         sliding_window_size=4096,
-        query_pre_attn_scalar=224,  # hidden_size / num_attention_heads
     )
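The practical effect of dropping the override: the query pre-attention scalar falls back to `head_dim`, so attention logits for the 9B model are scaled by `1/sqrt(256)` instead of `1/sqrt(224)`. A minimal sketch of that fallback logic follows; `query_scale` is an illustrative helper, not the actual gemma library API, and the fallback-to-`head_dim` behavior is an assumption based on the commit message.

```python
import math
from typing import Optional

def query_scale(head_dim: int, query_pre_attn_scalar: Optional[int] = None) -> float:
    # Hypothetical sketch: queries are multiplied by scalar**-0.5 before
    # the attention dot product. With no override, the scalar is head_dim,
    # which is what this commit makes the 9B config fall back to.
    scalar = query_pre_attn_scalar if query_pre_attn_scalar is not None else head_dim
    return scalar ** -0.5

# Before the commit: explicit override of 224 (hidden_size / num_attention_heads).
before = query_scale(head_dim=256, query_pre_attn_scalar=224)  # 1 / sqrt(224)
# After the commit: the scale is derived from head_dim=256.
after = query_scale(head_dim=256)  # 1 / sqrt(256) == 0.0625

print(before, after)
```

A small numerical nudge (about a 6% difference in logit scale), but it keeps the 9B config consistent with deriving the scale from `head_dim` rather than carrying a hand-computed constant.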