@@ -1841,6 +1841,16 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]:
         hf_hub_id='timm/ViT-SO400M-14-SigLIP',
         hf_hub_filename='open_clip_pytorch_model.bin',
         num_classes=0),
+    'vit_so400m_patch16_siglip_256.webli': _cfg(
+        hf_hub_id='timm/ViT-SO400M-16-SigLIP-i18n-256',
+        hf_hub_filename='open_clip_pytorch_model.bin',
+        input_size=(3, 256, 256),
+        num_classes=0),
+    'vit_so400m_patch14_siglip_378.webli': _cfg(
+        hf_hub_id='timm/ViT-SO400M-14-SigLIP-384',
+        hf_hub_filename='open_clip_pytorch_model.bin',
+        input_size=(3, 378, 378),
+        num_classes=0),
     'vit_so400m_patch14_siglip_384.webli': _cfg(
         hf_hub_id='timm/ViT-SO400M-14-SigLIP-384',
         hf_hub_filename='open_clip_pytorch_model.bin',
@@ -1890,6 +1900,16 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]:
         hf_hub_filename='paligemma-3b-pt-224.npz',
         custom_load='hf',
         num_classes=0),
+    'vit_so400m_patch16_siglip_gap_256.webli': _cfg(
+        hf_hub_id='timm/ViT-SO400M-16-SigLIP-i18n-256',
+        hf_hub_filename='open_clip_pytorch_model.bin',
+        input_size=(3, 256, 256),
+        num_classes=0),
+    'vit_so400m_patch14_siglip_gap_378.webli': _cfg(
+        hf_hub_id='timm/ViT-SO400M-14-SigLIP-384',
+        hf_hub_filename='open_clip_pytorch_model.bin',
+        input_size=(3, 378, 378), crop_pct=1.0,
+        num_classes=0),
     'vit_so400m_patch14_siglip_gap_384.webli': _cfg(
         hf_hub_id='timm/ViT-SO400M-14-SigLIP-384',
         hf_hub_filename='open_clip_pytorch_model.bin',
@@ -1914,6 +1934,17 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]:
         input_size=(3, 896, 896), crop_pct=1.0,
         num_classes=0),
 
+    'vit_so400m_patch14_siglip_378.webli_ft_in1k': _cfg(
+        # hf_hub_id='timm/',
+        # file='vit_so400m_p14_378_map-8.pth',
+        input_size=(3, 378, 378), crop_pct=1.0, crop_mode='squash',
+    ),
+    'vit_so400m_patch14_siglip_gap_378.webli_ft_in1k': _cfg(
+        # hf_hub_id='timm/',
+        # file='vit_so400m_p14_378_gap-8.pth',
+        input_size=(3, 378, 378), crop_pct=1.0, crop_mode='squash',
+    ),
+
     'vit_xsmall_patch16_clip_224.tinyclip_yfcc15m': _cfg(
         hf_hub_id='timm/',
         hf_hub_filename='open_clip_pytorch_model.bin',
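
A minimal usage sketch for the new pretrained tags added above, assuming a timm install that already contains these `default_cfgs` entries and access to the Hugging Face Hub; the model name plus the tag after the dot selects the entry, and `input_size` / `num_classes=0` come from the config. The 1152-dim output assumes the SoViT-400M embed width used by these models.

import timm
import torch

# Build the new 256px SO400M SigLIP image tower from its 'webli' tag
# (assumes the entries added in this patch are present in the installed timm).
model = timm.create_model('vit_so400m_patch16_siglip_256.webli', pretrained=True)
model = model.eval()

# Preprocessing implied by the pretrained config (256x256 input, etc.).
data_cfg = timm.data.resolve_model_data_config(model)
transform = timm.data.create_transform(**data_cfg, is_training=False)

with torch.no_grad():
    x = torch.randn(1, *data_cfg['input_size'])  # stand-in for a transformed image
    embedding = model(x)                         # num_classes=0 -> pooled embedding
print(embedding.shape)                           # expected: torch.Size([1, 1152])
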
@@ -2935,6 +2966,28 @@ def vit_so400m_patch14_siglip_224(pretrained: bool = False, **kwargs) -> VisionT
     return model
 
 
+@register_model
+def vit_so400m_patch16_siglip_256(pretrained: bool = False, **kwargs) -> VisionTransformer:
+    # this is a corrected variant of the 384 with a res properly divisible by patch size (no padding/truncation)
+    model_args = dict(
+        patch_size=16, embed_dim=1152, depth=27, num_heads=16, mlp_ratio=3.7362, class_token=False, global_pool='map',
+    )
+    model = _create_vision_transformer(
+        'vit_so400m_patch16_siglip_256', pretrained=pretrained, **dict(model_args, **kwargs))
+    return model
+
+
+@register_model
+def vit_so400m_patch14_siglip_378(pretrained: bool = False, **kwargs) -> VisionTransformer:
+    # this is a corrected variant of the 384 with a res properly divisible by patch size (no padding/truncation)
+    model_args = dict(
+        patch_size=14, embed_dim=1152, depth=27, num_heads=16, mlp_ratio=3.7362, class_token=False, global_pool='map',
+    )
+    model = _create_vision_transformer(
+        'vit_so400m_patch14_siglip_378', pretrained=pretrained, **dict(model_args, **kwargs))
+    return model
+
+
 @register_model
 def vit_so400m_patch14_siglip_384(pretrained: bool = False, **kwargs) -> VisionTransformer:
     model_args = dict(
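
The two entrypoints above register the SoViT-400M trunk (1152-dim, 27 blocks) with `global_pool='map'`, i.e. an attention-pooling head, at resolutions that tile evenly by patch size (256/16 = 16, 378/14 = 27), so no padding or position-embedding truncation is needed. A quick inspection sketch; attribute names (`embed_dim`, `blocks`, `patch_embed.grid_size`, `attn_pool`) assume the current timm `VisionTransformer` layout and may differ in other versions.

import timm

# Inspect the 378px MAP-pooled variant without downloading weights.
m = timm.create_model('vit_so400m_patch14_siglip_378', pretrained=False, num_classes=0)
print(m.embed_dim, len(m.blocks))   # expected: 1152 27
print(m.patch_embed.grid_size)      # 378 / 14 = 27 -> (27, 27), evenly tiled
print(type(m.attn_pool).__name__)   # attention-pool (MAP) head module
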
@@ -3023,6 +3076,30 @@ def vit_so400m_patch14_siglip_gap_224(pretrained: bool = False, **kwargs) -> Vis
     return model
 
 
+@register_model
+def vit_so400m_patch16_siglip_gap_256(pretrained: bool = False, **kwargs) -> VisionTransformer:
+    """ A SigLIP variant of ViT with global average pooling (GAP) instead of attention pooling (MAP)."""
+    model_args = dict(
+        patch_size=16, embed_dim=1152, depth=27, num_heads=16, mlp_ratio=3.7362,
+        class_token=False, global_pool='avg', fc_norm=False,
+    )
+    model = _create_vision_transformer(
+        'vit_so400m_patch16_siglip_gap_256', pretrained=pretrained, **dict(model_args, **kwargs))
+    return model
+
+
+@register_model
+def vit_so400m_patch14_siglip_gap_378(pretrained: bool = False, **kwargs) -> VisionTransformer:
+    """ A SigLIP variant of ViT with global average pooling (GAP) instead of attention pooling (MAP)."""
+    model_args = dict(
+        patch_size=14, embed_dim=1152, depth=27, num_heads=16, mlp_ratio=3.7362,
+        class_token=False, global_pool='avg', fc_norm=False,
+    )
+    model = _create_vision_transformer(
+        'vit_so400m_patch14_siglip_gap_378', pretrained=pretrained, **dict(model_args, **kwargs))
+    return model
+
+
 @register_model
 def vit_so400m_patch14_siglip_gap_384(pretrained: bool = False, **kwargs) -> VisionTransformer:
     """ A SigLIP variant of ViT with global average pooling (GAP) instead of attention pooling (MAP)."""