
Commit 01b6226 (1 parent: 41a79e0)

Add i18n variant of so400m model w/ weights. Add two in1k fine-tunes of the original so400m 384x384, but at 378x378 (better matches patch14).
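As a quick aside (not part of the commit), the "(better matches patch14)" note comes down to divisibility: 378 is an exact multiple of the 14-pixel patch size, so a 378x378 input tiles cleanly into a 27x27 patch grid, while 384 leaves 6 pixels over per side. A tiny check in Python:

patch = 14
for res in (384, 378):
    grid, rem = divmod(res, patch)
    print(f"{res}x{res}: {grid}x{grid} patches, {rem} leftover pixels per side")
# 384x384: 27x27 patches, 6 leftover pixels per side
# 378x378: 27x27 patches, 0 leftover pixels per side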


timm/models/vision_transformer.py

Lines changed: 77 additions & 0 deletions
@@ -1841,6 +1841,16 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]:
         hf_hub_id='timm/ViT-SO400M-14-SigLIP',
         hf_hub_filename='open_clip_pytorch_model.bin',
         num_classes=0),
+    'vit_so400m_patch16_siglip_256.webli': _cfg(
+        hf_hub_id='timm/ViT-SO400M-16-SigLIP-i18n-256',
+        hf_hub_filename='open_clip_pytorch_model.bin',
+        input_size=(3, 256, 256),
+        num_classes=0),
+    'vit_so400m_patch14_siglip_378.webli': _cfg(
+        hf_hub_id='timm/ViT-SO400M-14-SigLIP-384',
+        hf_hub_filename='open_clip_pytorch_model.bin',
+        input_size=(3, 378, 378),
+        num_classes=0),
     'vit_so400m_patch14_siglip_384.webli': _cfg(
         hf_hub_id='timm/ViT-SO400M-14-SigLIP-384',
         hf_hub_filename='open_clip_pytorch_model.bin',
@@ -1890,6 +1900,16 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]:
         hf_hub_filename='paligemma-3b-pt-224.npz',
         custom_load='hf',
         num_classes=0),
+    'vit_so400m_patch16_siglip_gap_256.webli': _cfg(
+        hf_hub_id='timm/ViT-SO400M-16-SigLIP-i18n-256',
+        hf_hub_filename='open_clip_pytorch_model.bin',
+        input_size=(3, 256, 256),
+        num_classes=0),
+    'vit_so400m_patch14_siglip_gap_378.webli': _cfg(
+        hf_hub_id='timm/ViT-SO400M-14-SigLIP-384',
+        hf_hub_filename='open_clip_pytorch_model.bin',
+        input_size=(3, 378, 378), crop_pct=1.0,
+        num_classes=0),
     'vit_so400m_patch14_siglip_gap_384.webli': _cfg(
         hf_hub_id='timm/ViT-SO400M-14-SigLIP-384',
         hf_hub_filename='open_clip_pytorch_model.bin',
@@ -1914,6 +1934,17 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]:
         input_size=(3, 896, 896), crop_pct=1.0,
         num_classes=0),

+    'vit_so400m_patch14_siglip_378.webli_ft_in1k': _cfg(
+        # hf_hub_id='timm/',
+        # file='vit_so400m_p14_378_map-8.pth',
+        input_size=(3, 378, 378), crop_pct=1.0, crop_mode='squash',
+    ),
+    'vit_so400m_patch14_siglip_gap_378.webli_ft_in1k': _cfg(
+        # hf_hub_id='timm/',
+        # file='vit_so400m_p14_378_gap-8.pth',
+        input_size=(3, 378, 378), crop_pct=1.0, crop_mode='squash',
+    ),
+
     'vit_xsmall_patch16_clip_224.tinyclip_yfcc15m': _cfg(
         hf_hub_id='timm/',
         hf_hub_filename='open_clip_pytorch_model.bin',
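For context on the two new *_ft_in1k entries above: crop_pct=1.0 with crop_mode='squash' tells timm's data pipeline to resize straight to 378x378 with no center crop. The weight references are still commented out in this commit, so the sketch below builds the model without pretrained weights; it assumes a recent timm that exposes resolve_model_data_config (not something introduced by this commit):

import timm
from timm.data import resolve_model_data_config, create_transform

# Weights are still commented out in this commit, so skip pretrained loading.
model = timm.create_model('vit_so400m_patch14_siglip_378.webli_ft_in1k', pretrained=False)

# The cfg above should resolve to a 378x378 eval transform that squashes
# (resizes without center crop), since crop_pct=1.0 and crop_mode='squash'.
data_cfg = resolve_model_data_config(model)
transform = create_transform(**data_cfg, is_training=False)
print(data_cfg)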
@@ -2935,6 +2966,28 @@ def vit_so400m_patch14_siglip_224(pretrained: bool = False, **kwargs) -> VisionTransformer:
     return model


+@register_model
+def vit_so400m_patch16_siglip_256(pretrained: bool = False, **kwargs) -> VisionTransformer:
+    # this is a corrected variant of the 384 with a res properly divisible by patch size (no padding/truncation)
+    model_args = dict(
+        patch_size=16, embed_dim=1152, depth=27, num_heads=16, mlp_ratio=3.7362, class_token=False, global_pool='map',
+    )
+    model = _create_vision_transformer(
+        'vit_so400m_patch16_siglip_256', pretrained=pretrained, **dict(model_args, **kwargs))
+    return model
+
+
+@register_model
+def vit_so400m_patch14_siglip_378(pretrained: bool = False, **kwargs) -> VisionTransformer:
+    # this is a corrected variant of the 384 with a res properly divisible by patch size (no padding/truncation)
+    model_args = dict(
+        patch_size=14, embed_dim=1152, depth=27, num_heads=16, mlp_ratio=3.7362, class_token=False, global_pool='map',
+    )
+    model = _create_vision_transformer(
+        'vit_so400m_patch14_siglip_378', pretrained=pretrained, **dict(model_args, **kwargs))
+    return model
+
+
 @register_model
 def vit_so400m_patch14_siglip_384(pretrained: bool = False, **kwargs) -> VisionTransformer:
     model_args = dict(
@@ -3023,6 +3076,30 @@ def vit_so400m_patch14_siglip_gap_224(pretrained: bool = False, **kwargs) -> VisionTransformer:
     return model


+@register_model
+def vit_so400m_patch16_siglip_gap_256(pretrained: bool = False, **kwargs) -> VisionTransformer:
+    """ A SigLIP variant of ViT with global average pooling (GAP) instead of attention pooling (MAP)."""
+    model_args = dict(
+        patch_size=16, embed_dim=1152, depth=27, num_heads=16, mlp_ratio=3.7362,
+        class_token=False, global_pool='avg', fc_norm=False,
+    )
+    model = _create_vision_transformer(
+        'vit_so400m_patch16_siglip_gap_256', pretrained=pretrained, **dict(model_args, **kwargs))
+    return model
+
+
+@register_model
+def vit_so400m_patch14_siglip_gap_378(pretrained: bool = False, **kwargs) -> VisionTransformer:
+    """ A SigLIP variant of ViT with global average pooling (GAP) instead of attention pooling (MAP)."""
+    model_args = dict(
+        patch_size=14, embed_dim=1152, depth=27, num_heads=16, mlp_ratio=3.7362,
+        class_token=False, global_pool='avg', fc_norm=False,
+    )
+    model = _create_vision_transformer(
+        'vit_so400m_patch14_siglip_gap_378', pretrained=pretrained, **dict(model_args, **kwargs))
+    return model
+
+
 @register_model
 def vit_so400m_patch14_siglip_gap_384(pretrained: bool = False, **kwargs) -> VisionTransformer:
     """ A SigLIP variant of ViT with global average pooling (GAP) instead of attention pooling (MAP)."""
