@@ -1817,6 +1817,11 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]:
         hf_hub_filename='open_clip_pytorch_model.bin',
         input_size=(3, 256, 256),
         num_classes=0),
+    'vit_base_patch16_siglip_256.webli_i18n': _cfg(
+        hf_hub_id='timm/ViT-B-16-SigLIP-i18n-256',
+        hf_hub_filename='open_clip_pytorch_model.bin',
+        input_size=(3, 256, 256),
+        num_classes=0),
     'vit_base_patch16_siglip_384.webli': _cfg(
         hf_hub_id='timm/ViT-B-16-SigLIP-384',
         hf_hub_filename='open_clip_pytorch_model.bin',
@@ -1841,6 +1846,16 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]:
         hf_hub_id='timm/ViT-SO400M-14-SigLIP',
         hf_hub_filename='open_clip_pytorch_model.bin',
         num_classes=0),
+    'vit_so400m_patch16_siglip_256.webli_i18n': _cfg(
+        hf_hub_id='timm/ViT-SO400M-16-SigLIP-i18n-256',
+        hf_hub_filename='open_clip_pytorch_model.bin',
+        input_size=(3, 256, 256),
+        num_classes=0),
+    'vit_so400m_patch14_siglip_378.webli': _cfg(
+        hf_hub_id='timm/ViT-SO400M-14-SigLIP-384',
+        hf_hub_filename='open_clip_pytorch_model.bin',
+        input_size=(3, 378, 378),
+        num_classes=0),
     'vit_so400m_patch14_siglip_384.webli': _cfg(
         hf_hub_id='timm/ViT-SO400M-14-SigLIP-384',
         hf_hub_filename='open_clip_pytorch_model.bin',
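The i18n entries above are new pretrained tags pointing at multilingual WebLI SigLIP checkpoints that already exist on the HF hub, so they should surface through the normal registry once this merges. A minimal sketch of how to confirm that, assuming a timm build containing this diff and the wildcard filtering of timm.list_pretrained:

import timm

# the new .webli_i18n and 378 tags sit next to the existing .webli ones
print(timm.list_pretrained('vit_so400m_patch16_siglip_256*'))
# expected to include: 'vit_so400m_patch16_siglip_256.webli_i18n'
print(timm.list_pretrained('vit_so400m_patch14_siglip_378*'))
# expected to include: 'vit_so400m_patch14_siglip_378.webli'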
@@ -1856,6 +1871,11 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]:
         hf_hub_filename='open_clip_pytorch_model.bin',
         input_size=(3, 256, 256),
         num_classes=0),
+    'vit_base_patch16_siglip_gap_256.webli_i18n': _cfg(
+        hf_hub_id='timm/ViT-B-16-SigLIP-i18n-256',
+        hf_hub_filename='open_clip_pytorch_model.bin',
+        input_size=(3, 256, 256),
+        num_classes=0),
     'vit_base_patch16_siglip_gap_384.webli': _cfg(
         hf_hub_id='timm/ViT-B-16-SigLIP-384',
         hf_hub_filename='open_clip_pytorch_model.bin',
@@ -1890,6 +1910,16 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]:
         hf_hub_filename='paligemma-3b-pt-224.npz',
         custom_load='hf',
         num_classes=0),
+    'vit_so400m_patch16_siglip_gap_256.webli_i18n': _cfg(
+        hf_hub_id='timm/ViT-SO400M-16-SigLIP-i18n-256',
+        hf_hub_filename='open_clip_pytorch_model.bin',
+        input_size=(3, 256, 256),
+        num_classes=0),
+    'vit_so400m_patch14_siglip_gap_378.webli': _cfg(
+        hf_hub_id='timm/ViT-SO400M-14-SigLIP-384',
+        hf_hub_filename='open_clip_pytorch_model.bin',
+        input_size=(3, 378, 378), crop_pct=1.0,
+        num_classes=0),
     'vit_so400m_patch14_siglip_gap_384.webli': _cfg(
         hf_hub_id='timm/ViT-SO400M-14-SigLIP-384',
         hf_hub_filename='open_clip_pytorch_model.bin',
@@ -1914,6 +1944,15 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]:
         input_size=(3, 896, 896), crop_pct=1.0,
         num_classes=0),

+    'vit_so400m_patch14_siglip_378.webli_ft_in1k': _cfg(
+        hf_hub_id='timm/',
+        input_size=(3, 378, 378), crop_pct=1.0, crop_mode='squash',
+    ),
+    'vit_so400m_patch14_siglip_gap_378.webli_ft_in1k': _cfg(
+        hf_hub_id='timm/',
+        input_size=(3, 378, 378), crop_pct=1.0, crop_mode='squash',
+    ),
+
     'vit_xsmall_patch16_clip_224.tinyclip_yfcc15m': _cfg(
         hf_hub_id='timm/',
         hf_hub_filename='open_clip_pytorch_model.bin',
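The two webli_ft_in1k tags are ImageNet-1k classification fine-tunes (no num_classes=0 override), and their eval preprocessing uses crop_pct=1.0 with crop_mode='squash', i.e. the image is resized straight to 378x378 rather than center-cropped. A minimal usage sketch, assuming the weights are actually published under these tags and using timm's standard data-config helpers:

import timm
from timm.data import resolve_model_data_config, create_transform

model = timm.create_model('vit_so400m_patch14_siglip_378.webli_ft_in1k', pretrained=True)
model.eval()

cfg = resolve_model_data_config(model)       # picks up input_size / crop_pct / crop_mode from the _cfg entry
transform = create_transform(**cfg, is_training=False)
print(cfg['input_size'], cfg['crop_pct'], cfg.get('crop_mode'))
# expected: (3, 378, 378) 1.0 'squash'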
@@ -2935,6 +2974,28 @@ def vit_so400m_patch14_siglip_224(pretrained: bool = False, **kwargs) -> VisionTransformer:
     return model


+@register_model
+def vit_so400m_patch16_siglip_256(pretrained: bool = False, **kwargs) -> VisionTransformer:
+    # this is a corrected variant of the 384 with a res properly divisible by patch size (no padding/truncation)
+    model_args = dict(
+        patch_size=16, embed_dim=1152, depth=27, num_heads=16, mlp_ratio=3.7362, class_token=False, global_pool='map',
+    )
+    model = _create_vision_transformer(
+        'vit_so400m_patch16_siglip_256', pretrained=pretrained, **dict(model_args, **kwargs))
+    return model
+
+
+@register_model
+def vit_so400m_patch14_siglip_378(pretrained: bool = False, **kwargs) -> VisionTransformer:
+    # this is a corrected variant of the 384 with a res properly divisible by patch size (no padding/truncation)
+    model_args = dict(
+        patch_size=14, embed_dim=1152, depth=27, num_heads=16, mlp_ratio=3.7362, class_token=False, global_pool='map',
+    )
+    model = _create_vision_transformer(
+        'vit_so400m_patch14_siglip_378', pretrained=pretrained, **dict(model_args, **kwargs))
+    return model
+
+
 @register_model
 def vit_so400m_patch14_siglip_384(pretrained: bool = False, **kwargs) -> VisionTransformer:
     model_args = dict(
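The comment in both new entrypoints states the motivation: 384 is not an exact multiple of 14 (384 = 14 * 27 + 6), so the SO400M/14 model at 384 cannot tile the input evenly, whereas 378 = 14 * 27 and 256 = 16 * 16 give exact patch grids. A quick check, passing img_size explicitly and assuming the patch_embed.grid_size attribute of timm's VisionTransformer (random init, no weight download needed):

import timm

m378 = timm.create_model('vit_so400m_patch14_siglip_378', pretrained=False, img_size=378)
m256 = timm.create_model('vit_so400m_patch16_siglip_256', pretrained=False, img_size=256)
print(m378.patch_embed.grid_size)  # expected (27, 27): 378 / 14 == 27 exactly
print(m256.patch_embed.grid_size)  # expected (16, 16): 256 / 16 == 16 exactly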
@@ -3023,6 +3084,30 @@ def vit_so400m_patch14_siglip_gap_224(pretrained: bool = False, **kwargs) -> VisionTransformer:
     return model


+@register_model
+def vit_so400m_patch16_siglip_gap_256(pretrained: bool = False, **kwargs) -> VisionTransformer:
+    """ A SigLIP variant of ViT with global average pooling (GAP) instead of attention pooling (MAP)."""
+    model_args = dict(
+        patch_size=16, embed_dim=1152, depth=27, num_heads=16, mlp_ratio=3.7362,
+        class_token=False, global_pool='avg', fc_norm=False,
+    )
+    model = _create_vision_transformer(
+        'vit_so400m_patch16_siglip_gap_256', pretrained=pretrained, **dict(model_args, **kwargs))
+    return model
+
+
+@register_model
+def vit_so400m_patch14_siglip_gap_378(pretrained: bool = False, **kwargs) -> VisionTransformer:
+    """ A SigLIP variant of ViT with global average pooling (GAP) instead of attention pooling (MAP)."""
+    model_args = dict(
+        patch_size=14, embed_dim=1152, depth=27, num_heads=16, mlp_ratio=3.7362,
+        class_token=False, global_pool='avg', fc_norm=False,
+    )
+    model = _create_vision_transformer(
+        'vit_so400m_patch14_siglip_gap_378', pretrained=pretrained, **dict(model_args, **kwargs))
+    return model
+
+
 @register_model
 def vit_so400m_patch14_siglip_gap_384(pretrained: bool = False, **kwargs) -> VisionTransformer:
     """ A SigLIP variant of ViT with global average pooling (GAP) instead of attention pooling (MAP)."""