
Commit 790decc

Add more pali(2) weights. Switch the rest of the models adapting open_clip weights over to their own weight instances.

1 parent 01cf0f7 commit 790decc

File tree

4 files changed: +178 -107 lines

timm/models/byobnet.py

Lines changed: 8 additions & 24 deletions
@@ -2282,107 +2282,91 @@ def _cfgr(url='', **kwargs):
     # original attention pool head variants
     'resnet50_clip.openai': _cfgr(
         hf_hub_id='timm/',
-        hf_hub_filename='open_clip_pytorch_model.bin',
         num_classes=1024, mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
         fixed_input_size=True, input_size=(3, 224, 224), pool_size=(7, 7),
         classifier='head.proj',
     ),
     'resnet101_clip.openai': _cfgr(
         hf_hub_id='timm/',
-        hf_hub_filename='open_clip_pytorch_model.bin',
         num_classes=512, mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
         fixed_input_size=True, input_size=(3, 224, 224), pool_size=(7, 7),
         classifier='head.proj',
     ),
     'resnet50x4_clip.openai': _cfgr(
         hf_hub_id='timm/',
-        hf_hub_filename='open_clip_pytorch_model.bin',
         num_classes=640, mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
         fixed_input_size=True, input_size=(3, 288, 288), pool_size=(9, 9),
         classifier='head.proj',
     ),
     'resnet50x16_clip.openai': _cfgr(
         hf_hub_id='timm/',
-        hf_hub_filename='open_clip_pytorch_model.bin',
         num_classes=768, mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
         fixed_input_size=True, input_size=(3, 384, 384), pool_size=(12, 12),
         classifier='head.proj',
     ),
     'resnet50x64_clip.openai': _cfgr(
         hf_hub_id='timm/',
-        hf_hub_filename='open_clip_pytorch_model.bin',
         num_classes=1024, mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
         fixed_input_size=True, input_size=(3, 448, 448), pool_size=(14, 14),
         classifier='head.proj',
     ),
     'resnet50_clip.cc12m': _cfgr(
         hf_hub_id='timm/',
-        hf_hub_filename='open_clip_pytorch_model.bin',
         num_classes=1024, mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
         fixed_input_size=True, input_size=(3, 224, 224), pool_size=(7, 7),
         classifier='head.proj',
     ),
     'resnet50_clip.yfcc15m': _cfgr(
         hf_hub_id='timm/',
-        hf_hub_filename='open_clip_pytorch_model.bin',
         num_classes=1024, mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
         fixed_input_size=True, input_size=(3, 224, 224), pool_size=(7, 7),
         classifier='head.proj',
     ),
     'resnet101_clip.yfcc15m': _cfgr(
         hf_hub_id='timm/',
-        hf_hub_filename='open_clip_pytorch_model.bin',
         num_classes=512, mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
         fixed_input_size=True, input_size=(3, 224, 224), pool_size=(7, 7),
         classifier='head.proj',
     ),

     # avg-pool w/ optional standard classifier head variants
     'resnet50_clip_gap.openai': _cfgr(
-        hf_hub_id='timm/resnet50_clip.openai',
-        hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         num_classes=0, mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
         input_size=(3, 224, 224), pool_size=(7, 7),
     ),
     'resnet101_clip_gap.openai': _cfgr(
-        hf_hub_id='timm/resnet101_clip.openai',
-        hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         num_classes=0, mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
         input_size=(3, 224, 224), pool_size=(7, 7),
     ),
     'resnet50x4_clip_gap.openai': _cfgr(
-        hf_hub_id='timm/resnet50x4_clip.openai',
-        hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         num_classes=0, mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
         input_size=(3, 288, 288), pool_size=(9, 9),
     ),
     'resnet50x16_clip_gap.openai': _cfgr(
-        hf_hub_id='timm/resnet50x16_clip.openai',
-        hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         num_classes=0, mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
         input_size=(3, 384, 384), pool_size=(12, 12),
     ),
     'resnet50x64_clip_gap.openai': _cfgr(
-        hf_hub_id='timm/resnet50x64_clip.openai',
-        hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         num_classes=0, mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
         input_size=(3, 448, 448), pool_size=(14, 14),
     ),
     'resnet50_clip_gap.cc12m': _cfgr(
-        hf_hub_id='timm/resnet50_clip.cc12m',
-        hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         num_classes=0, mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
         input_size=(3, 224, 224), pool_size=(7, 7),
     ),
     'resnet50_clip_gap.yfcc15m': _cfgr(
-        hf_hub_id='timm/resnet50_clip.yfcc15m',
-        hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         num_classes=0, mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
         input_size=(3, 224, 224), pool_size=(7, 7),
     ),
     'resnet101_clip_gap.yfcc15m': _cfgr(
-        hf_hub_id='timm/resnet101_clip.yfcc15m',
-        hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         num_classes=0, mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
         input_size=(3, 224, 224), pool_size=(7, 7),
     ),
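A note on the `hf_hub_id='timm/'` shorthand used throughout: timm's `generate_default_cfgs` auto-completes a bare `'timm/'` hub id with the model's own `name.tag`, so each entry above resolves to its own Hugging Face repo (e.g. `timm/resnet50_clip_gap.openai`) instead of remapping the shared `open_clip_pytorch_model.bin`. A minimal usage sketch, assuming the converted weights are published under those names:

import timm
import torch

# Attention-pool CLIP image tower; the bare 'timm/' hub id resolves to
# the repo named after the model, i.e. 'timm/resnet50_clip.openai'.
model = timm.create_model('resnet50_clip.openai', pretrained=True).eval()

# Global-avg-pool variant, now backed by its own weight instance
# ('timm/resnet50_clip_gap.openai') rather than the open_clip .bin file.
gap = timm.create_model('resnet50_clip_gap.openai', pretrained=True).eval()

x = torch.randn(1, 3, 224, 224)
print(model(x).shape)  # (1, 1024) via the 'head.proj' classifier
print(gap(x).shape)    # pooled features with num_classes=0, e.g. (1, 2048)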

timm/models/eva.py

Lines changed: 21 additions & 14 deletions
@@ -912,45 +912,52 @@ def _cfg(url='', **kwargs):
     # EVA01 and EVA02 CLIP image towers
     'eva_giant_patch14_clip_224.laion400m': _cfg(
         # hf_hub_id='QuanSun/EVA-CLIP', hf_hub_filename='EVA01_CLIP_g_14_plus_psz14_s11B.pt',
-        hf_hub_id='timm/eva_giant_patch14_clip_224.laion400m_s11b_b41k',  # float16 weights
-        hf_hub_filename='open_clip_pytorch_model.bin',
+        # hf_hub_id='timm/eva_giant_patch14_clip_224.laion400m_s11b_b41k',  # float16 weights
+        # hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         num_classes=1024,
     ),
     'eva_giant_patch14_clip_224.merged2b': _cfg(
         # hf_hub_id='QuanSun/EVA-CLIP', hf_hub_filename='EVA01_CLIP_g_14_plus_psz14_s11B.pt',
-        hf_hub_id='timm/eva_giant_patch14_plus_clip_224.merged2b_s11b_b114k',  # float16 weights
-        hf_hub_filename='open_clip_pytorch_model.bin',
+        # hf_hub_id='timm/eva_giant_patch14_plus_clip_224.merged2b_s11b_b114k',  # float16 weights
+        # hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         num_classes=1024,
     ),
     'eva02_base_patch16_clip_224.merged2b': _cfg(
         # hf_hub_id='QuanSun/EVA-CLIP', hf_hub_filename='EVA02_CLIP_L_psz14_s4B.pt',
-        hf_hub_id='timm/eva02_base_patch16_clip_224.merged2b_s8b_b131k',  # float16 weights
-        hf_hub_filename='open_clip_pytorch_model.bin',
+        # hf_hub_id='timm/eva02_base_patch16_clip_224.merged2b_s8b_b131k',  # float16 weights
+        # hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         num_classes=512,
     ),
     'eva02_large_patch14_clip_224.merged2b': _cfg(
         # hf_hub_id='QuanSun/EVA-CLIP', hf_hub_filename='EVA02_CLIP_L_psz14_s4B.pt',
-        hf_hub_id='timm/eva02_large_patch14_clip_224.merged2b_s4b_b131k',  # float16 weights
-        hf_hub_filename='open_clip_pytorch_model.bin',
+        # hf_hub_id='timm/eva02_large_patch14_clip_224.merged2b_s4b_b131k',  # float16 weights
+        # hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         num_classes=768,
     ),
     'eva02_large_patch14_clip_336.merged2b': _cfg(
         # hf_hub_id='QuanSun/EVA-CLIP', hf_hub_filename='EVA02_CLIP_L_psz14_s4B.pt',
-        hf_hub_id='timm/eva02_large_patch14_clip_336.merged2b_s6b_b61k',  # float16 weights
-        hf_hub_filename='open_clip_pytorch_model.bin',
+        # hf_hub_id='timm/eva02_large_patch14_clip_336.merged2b_s6b_b61k',  # float16 weights
+        # hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         input_size=(3, 336, 336), crop_pct=1.0,
         num_classes=768,
     ),
     'eva02_enormous_patch14_clip_224.laion2b': _cfg(
         # hf_hub_id='QuanSun/EVA-CLIP', hf_hub_filename='EVA02_CLIP_E_psz14_plus_s9B.pt',
-        hf_hub_id='timm/eva02_enormous_patch14_clip_224.laion2b_s4b_b115k',  # float16 weights
-        hf_hub_filename='open_clip_pytorch_model.bin',
+        # hf_hub_id='timm/eva02_enormous_patch14_clip_224.laion2b_s4b_b115k',  # float16 weights
+        # hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         num_classes=1024,
     ),
     'eva02_enormous_patch14_clip_224.laion2b_plus': _cfg(
         # hf_hub_id='QuanSun/EVA-CLIP', hf_hub_filename='EVA02_CLIP_E_psz14_plus_s9B.pt',
-        hf_hub_id='timm/eva02_enormous_patch14_plus_clip_224.laion2b_s9b_b144k',  # bfloat16 weights
-        hf_hub_filename='open_clip_pytorch_model.bin',
+        # hf_hub_id='timm/eva02_enormous_patch14_plus_clip_224.laion2b_s9b_b144k',  # bfloat16 weights
+        # hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         num_classes=1024,
     ),
     'eva02_enormous_patch14_clip_224.pretrain': _cfg(
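Same pattern as byobnet: each EVA CLIP tower now points at its own timm hub instance, with the old open_clip sources (and their float16/bfloat16 storage notes) kept as comments. A quick sanity-check sketch, assuming the repo names follow the `timm/<model>.<tag>` expansion:

import timm
import torch

# EVA02-L/14 CLIP image tower; 'timm/' expands to
# 'timm/eva02_large_patch14_clip_224.merged2b' (assumed published repo).
model = timm.create_model('eva02_large_patch14_clip_224.merged2b', pretrained=True).eval()

x = torch.randn(1, 3, 224, 224)
out = model(x)
print(out.shape)  # (1, 768), matching num_classes=768 in the cfg above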

timm/models/hieradet_sam2.py

Lines changed: 43 additions & 22 deletions
@@ -530,26 +530,47 @@ def _cfg(url='', **kwargs):
 
 
 default_cfgs = generate_default_cfgs({
-    "sam2_hiera_tiny.r224": _cfg(
-        hf_hub_id='facebook/sam2-hiera-tiny',
-        hf_hub_filename='sam2_hiera_tiny.pt',
-        input_size=(3, 224, 224), pool_size=(7, 7),
-    ),  # FIXME reduced res for testing
-    "sam2_hiera_tiny.r896": _cfg(
-        hf_hub_id='facebook/sam2-hiera-tiny',
-        hf_hub_filename='sam2_hiera_tiny.pt',
+    "sam2_hiera_tiny.fb_r896": _cfg(
+        # hf_hub_id='facebook/sam2-hiera-tiny',
+        # hf_hub_filename='sam2_hiera_tiny.pt',
+        hf_hub_id='timm/',
     ),
-    "sam2_hiera_small": _cfg(
-        hf_hub_id='facebook/sam2-hiera-small',
-        hf_hub_filename='sam2_hiera_small.pt',
+    "sam2_hiera_tiny.fb_r896_2pt1": _cfg(
+        # hf_hub_id='facebook/sam2.1-hiera-tiny',
+        # hf_hub_filename='sam2.1_hiera_tiny.pt',
+        hf_hub_id='timm/',
     ),
-    "sam2_hiera_base_plus": _cfg(
-        hf_hub_id='facebook/sam2-hiera-base-plus',
-        hf_hub_filename='sam2_hiera_base_plus.pt',
+    "sam2_hiera_small.fb_r896": _cfg(
+        # hf_hub_id='facebook/sam2-hiera-small',
+        # hf_hub_filename='sam2_hiera_small.pt',
+        hf_hub_id='timm/',
     ),
-    "sam2_hiera_large": _cfg(
-        hf_hub_id='facebook/sam2-hiera-large',
-        hf_hub_filename='sam2_hiera_large.pt',
+    "sam2_hiera_small.fb_r896_2pt1": _cfg(
+        # hf_hub_id='facebook/sam2.1-hiera-small',
+        # hf_hub_filename='sam2.1_hiera_small.pt',
+        hf_hub_id='timm/',
+    ),
+    "sam2_hiera_base_plus.fb_r896": _cfg(
+        # hf_hub_id='facebook/sam2-hiera-base-plus',
+        # hf_hub_filename='sam2_hiera_base_plus.pt',
+        hf_hub_id='timm/',
+    ),
+    "sam2_hiera_base_plus.fb_r896_2pt1": _cfg(
+        # hf_hub_id='facebook/sam2.1-hiera-base-plus',
+        # hf_hub_filename='sam2.1_hiera_base_plus.pt',
+        hf_hub_id='timm/',
+    ),
+    "sam2_hiera_large.fb_r1024": _cfg(
+        # hf_hub_id='facebook/sam2-hiera-large',
+        # hf_hub_filename='sam2_hiera_large.pt',
+        hf_hub_id='timm/',
+        min_input_size=(3, 256, 256),
+        input_size=(3, 1024, 1024), pool_size=(32, 32),
+    ),
+    "sam2_hiera_large.fb_r1024_2pt1": _cfg(
+        # hf_hub_id='facebook/sam2.1-hiera-large',
+        # hf_hub_filename='sam2.1_hiera_large.pt',
+        hf_hub_id='timm/',
         min_input_size=(3, 256, 256),
         input_size=(3, 1024, 1024), pool_size=(32, 32),
     ),

@@ -578,11 +599,11 @@ def checkpoint_filter_fn(state_dict, model=None, prefix=''):
 def _create_hiera_det(variant: str, pretrained: bool = False, **kwargs) -> HieraDet:
     out_indices = kwargs.pop('out_indices', 4)
     checkpoint_prefix = ''
-    if 'sam2' in variant:
-        # SAM2 pretrained weights have no classifier or final norm-layer (`head.norm`)
-        # This is workaround loading with num_classes=0 w/o removing norm-layer.
-        kwargs.setdefault('pretrained_strict', False)
-        checkpoint_prefix = 'image_encoder.trunk.'
+    # if 'sam2' in variant:
+    #     # SAM2 pretrained weights have no classifier or final norm-layer (`head.norm`)
+    #     # This is workaround loading with num_classes=0 w/o removing norm-layer.
+    #     kwargs.setdefault('pretrained_strict', False)
+    #     checkpoint_prefix = 'image_encoder.trunk.'
     return build_model_with_cfg(
         HieraDet,
         variant,
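With the weights re-hosted under timm names, the `image_encoder.trunk.` prefix strip and the `pretrained_strict=False` workaround are commented out above, which suggests the re-published checkpoints are already remapped to trunk-only, timm-style state dicts. A hedged usage sketch (the `fb_r896` tag comes from the cfgs above; feature extraction via `out_indices` follows the `_create_hiera_det` handling, assumed to be wired through `feature_cfg`):

import timm
import torch

# SAM2 Hiera-tiny trunk; 'timm/' resolves to 'timm/sam2_hiera_tiny.fb_r896'.
# features_only relies on the out_indices=4 default in _create_hiera_det.
model = timm.create_model(
    'sam2_hiera_tiny.fb_r896',
    pretrained=True,
    features_only=True,
).eval()

x = torch.randn(1, 3, 896, 896)
for fmap in model(x):
    print(fmap.shape)  # one multi-scale feature map per Hiera stage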
