
Commit 6ee638a

Merge pull request #2299 from huggingface/siglip_update
Add i18n variant of so400m model w/ weights. Add two in1k fine-tunes
2 parents 41a79e0 + d9321b0 commit 6ee638a
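
This merge registers several new pretrained tags (the *.webli_i18n SigLIP i18n weights, 378x378 patch14 variants, and two *_ft_in1k ImageNet-1k fine-tunes) plus four new model entrypoints. A minimal usage sketch, assuming a timm build that includes this commit; the model/tag names are taken from the diff below, everything else is illustrative:

import timm

# List SigLIP models that ship pretrained weights; with this commit installed the
# listing should include the new i18n and 378px tags added below.
print(timm.list_models('*siglip*', pretrained=True))

# Instantiate one of the newly registered variants; num_classes=0 returns the bare
# image tower (pooled features), matching the num_classes=0 default in the cfgs.
model = timm.create_model('vit_so400m_patch16_siglip_256.webli_i18n', pretrained=True, num_classes=0)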


timm/models/vision_transformer.py

Lines changed: 85 additions & 0 deletions
@@ -1817,6 +1817,11 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]:
         hf_hub_filename='open_clip_pytorch_model.bin',
         input_size=(3, 256, 256),
         num_classes=0),
+    'vit_base_patch16_siglip_256.webli_i18n': _cfg(
+        hf_hub_id='timm/ViT-B-16-SigLIP-i18n-256',
+        hf_hub_filename='open_clip_pytorch_model.bin',
+        input_size=(3, 256, 256),
+        num_classes=0),
     'vit_base_patch16_siglip_384.webli': _cfg(
         hf_hub_id='timm/ViT-B-16-SigLIP-384',
         hf_hub_filename='open_clip_pytorch_model.bin',
@@ -1841,6 +1846,16 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]:
         hf_hub_id='timm/ViT-SO400M-14-SigLIP',
         hf_hub_filename='open_clip_pytorch_model.bin',
         num_classes=0),
+    'vit_so400m_patch16_siglip_256.webli_i18n': _cfg(
+        hf_hub_id='timm/ViT-SO400M-16-SigLIP-i18n-256',
+        hf_hub_filename='open_clip_pytorch_model.bin',
+        input_size=(3, 256, 256),
+        num_classes=0),
+    'vit_so400m_patch14_siglip_378.webli': _cfg(
+        hf_hub_id='timm/ViT-SO400M-14-SigLIP-384',
+        hf_hub_filename='open_clip_pytorch_model.bin',
+        input_size=(3, 378, 378),
+        num_classes=0),
     'vit_so400m_patch14_siglip_384.webli': _cfg(
         hf_hub_id='timm/ViT-SO400M-14-SigLIP-384',
         hf_hub_filename='open_clip_pytorch_model.bin',
@@ -1856,6 +1871,11 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]:
         hf_hub_filename='open_clip_pytorch_model.bin',
         input_size=(3, 256, 256),
         num_classes=0),
+    'vit_base_patch16_siglip_gap_256.webli_i18n': _cfg(
+        hf_hub_id='timm/ViT-B-16-SigLIP-i18n-256',
+        hf_hub_filename='open_clip_pytorch_model.bin',
+        input_size=(3, 256, 256),
+        num_classes=0),
     'vit_base_patch16_siglip_gap_384.webli': _cfg(
         hf_hub_id='timm/ViT-B-16-SigLIP-384',
         hf_hub_filename='open_clip_pytorch_model.bin',
@@ -1890,6 +1910,16 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]:
         hf_hub_filename='paligemma-3b-pt-224.npz',
         custom_load='hf',
         num_classes=0),
+    'vit_so400m_patch16_siglip_gap_256.webli_i18n': _cfg(
+        hf_hub_id='timm/ViT-SO400M-16-SigLIP-i18n-256',
+        hf_hub_filename='open_clip_pytorch_model.bin',
+        input_size=(3, 256, 256),
+        num_classes=0),
+    'vit_so400m_patch14_siglip_gap_378.webli': _cfg(
+        hf_hub_id='timm/ViT-SO400M-14-SigLIP-384',
+        hf_hub_filename='open_clip_pytorch_model.bin',
+        input_size=(3, 378, 378), crop_pct=1.0,
+        num_classes=0),
     'vit_so400m_patch14_siglip_gap_384.webli': _cfg(
         hf_hub_id='timm/ViT-SO400M-14-SigLIP-384',
         hf_hub_filename='open_clip_pytorch_model.bin',
@@ -1914,6 +1944,15 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]:
         input_size=(3, 896, 896), crop_pct=1.0,
         num_classes=0),

+    'vit_so400m_patch14_siglip_378.webli_ft_in1k': _cfg(
+        hf_hub_id='timm/',
+        input_size=(3, 378, 378), crop_pct=1.0, crop_mode='squash',
+    ),
+    'vit_so400m_patch14_siglip_gap_378.webli_ft_in1k': _cfg(
+        hf_hub_id='timm/',
+        input_size=(3, 378, 378), crop_pct=1.0, crop_mode='squash',
+    ),
+
     'vit_xsmall_patch16_clip_224.tinyclip_yfcc15m': _cfg(
         hf_hub_id='timm/',
         hf_hub_filename='open_clip_pytorch_model.bin',
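
The new *_ft_in1k entries carry their eval-time preprocessing (378x378 input, crop_pct=1.0, crop_mode='squash') in the pretrained cfg. A short sketch of how those fields are typically consumed on the user side with timm's data helpers; a sketch under the assumption of a recent timm release, not part of the diff:

import timm
from timm.data import resolve_model_data_config, create_transform

model = timm.create_model('vit_so400m_patch14_siglip_378.webli_ft_in1k', pretrained=True)

# Pull input_size / crop_pct / crop_mode etc. out of the model's pretrained cfg
# and build the matching eval transform.
data_config = resolve_model_data_config(model)
transform = create_transform(**data_config, is_training=False)
print(data_config)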
@@ -2935,6 +2974,28 @@ def vit_so400m_patch14_siglip_224(pretrained: bool = False, **kwargs) -> VisionTransformer:
     return model


+@register_model
+def vit_so400m_patch16_siglip_256(pretrained: bool = False, **kwargs) -> VisionTransformer:
+    # this is a corrected variant of the 384 with a res properly divisible by patch size (no padding/truncation)
+    model_args = dict(
+        patch_size=16, embed_dim=1152, depth=27, num_heads=16, mlp_ratio=3.7362, class_token=False, global_pool='map',
+    )
+    model = _create_vision_transformer(
+        'vit_so400m_patch16_siglip_256', pretrained=pretrained, **dict(model_args, **kwargs))
+    return model
+
+
+@register_model
+def vit_so400m_patch14_siglip_378(pretrained: bool = False, **kwargs) -> VisionTransformer:
+    # this is a corrected variant of the 384 with a res properly divisible by patch size (no padding/truncation)
+    model_args = dict(
+        patch_size=14, embed_dim=1152, depth=27, num_heads=16, mlp_ratio=3.7362, class_token=False, global_pool='map',
+    )
+    model = _create_vision_transformer(
+        'vit_so400m_patch14_siglip_378', pretrained=pretrained, **dict(model_args, **kwargs))
+    return model
+
+
 @register_model
 def vit_so400m_patch14_siglip_384(pretrained: bool = False, **kwargs) -> VisionTransformer:
     model_args = dict(
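
The "corrected variant" comments above come down to grid arithmetic: 384 is not evenly divisible by a patch size of 14, so the original 384px SO400M models need padding/truncation of the patch grid, while 378 = 14 x 27 and 256 = 16 x 16 divide exactly. A quick check (plain arithmetic, not part of the diff):

for res, patch in [(384, 14), (378, 14), (256, 16)]:
    grid, rem = divmod(res, patch)
    print(f"{res}px / patch {patch} -> {grid} x {grid} tokens, remainder {rem}")
# 384/14 leaves a remainder of 6; 378/14 and 256/16 divide exactly.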
@@ -3023,6 +3084,30 @@ def vit_so400m_patch14_siglip_gap_224(pretrained: bool = False, **kwargs) -> VisionTransformer:
     return model


+@register_model
+def vit_so400m_patch16_siglip_gap_256(pretrained: bool = False, **kwargs) -> VisionTransformer:
+    """ A SigLIP variant of ViT with global average pooling (GAP) instead of attention pooling (MAP)."""
+    model_args = dict(
+        patch_size=16, embed_dim=1152, depth=27, num_heads=16, mlp_ratio=3.7362,
+        class_token=False, global_pool='avg', fc_norm=False,
+    )
+    model = _create_vision_transformer(
+        'vit_so400m_patch16_siglip_gap_256', pretrained=pretrained, **dict(model_args, **kwargs))
+    return model
+
+
+@register_model
+def vit_so400m_patch14_siglip_gap_378(pretrained: bool = False, **kwargs) -> VisionTransformer:
+    """ A SigLIP variant of ViT with global average pooling (GAP) instead of attention pooling (MAP)."""
+    model_args = dict(
+        patch_size=14, embed_dim=1152, depth=27, num_heads=16, mlp_ratio=3.7362,
+        class_token=False, global_pool='avg', fc_norm=False,
+    )
+    model = _create_vision_transformer(
+        'vit_so400m_patch14_siglip_gap_378', pretrained=pretrained, **dict(model_args, **kwargs))
+    return model
+
+
 @register_model
 def vit_so400m_patch14_siglip_gap_384(pretrained: bool = False, **kwargs) -> VisionTransformer:
     """ A SigLIP variant of ViT with global average pooling (GAP) instead of attention pooling (MAP)."""
