File tree Expand file tree Collapse file tree 5 files changed +38
-14
lines changed
examples/models/llama3_2_vision Expand file tree Collapse file tree 5 files changed +38
-14
lines changed Original file line number Diff line number Diff line change 72
72
conda activate "${CONDA_ENV}"
73
73
74
74
MODEL_NAME=${{ matrix.model }}
75
+ # Install requirements for llama vision
76
+ if [[ "$MODEL_NAME" == "llama3_2_vision_encoder" ]]; then
77
+ bash examples/models/llama3_2_vision/install_requirements.sh
78
+ fi
75
79
BUILD_TOOL=${{ matrix.build-tool }}
76
80
BACKEND=${{ matrix.backend }}
77
81
DEMO_BACKEND_DELEGATION=${{ matrix.demo_backend_delegation }}
Original file line number Diff line number Diff line change 58
58
bash .ci/scripts/setup-conda.sh
59
59
# Setup MacOS dependencies as there is no Docker support on MacOS atm
60
60
PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/setup-macos.sh "${BUILD_TOOL}"
61
- # Build and test xecutorch
61
+ # Install requirements for llama vision
62
+ if [[ "$MODEL_NAME" == "llama3_2_vision_encoder" ]]; then
63
+ ${CONDA_RUN} bash examples/models/llama3_2_vision/install_requirements.sh
64
+ fi
65
+ # Build and test executorch
62
66
PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/test_model.sh "${MODEL_NAME}" "${BUILD_TOOL}" "${BACKEND}" "${DEMO_BACKEND_DELEGATION}"
63
67
64
68
test-custom-ops-macos :
Original file line number Diff line number Diff line change
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ from .vision_encoder import FlamingoVisionEncoderModel , VisionEncoderConfig
8
+
9
+ __all__ = [
10
+ "FlamingoVisionEncoderModel" ,
11
+ "VisionEncoderConfig" ,
12
+ ]
Original file line number Diff line number Diff line change 16
16
)
17
17
from torchtune .models .llama3_2_vision ._component_builders import llama3_2_vision_encoder
18
18
19
- max_seq_len = 8192
20
- in_channels = 3
21
- tile_size = 560
22
- max_num_tiles = 4
23
- # how many tokens per image generated by the vision encoder
24
- tokens_per_image = 6404
25
- # how many images to cache in the kv cache in cross attention
26
- kv_cache_image_num = 1
27
- # maximum number of tokens generated by encoder and thus stored in the kv cache in cross attention
28
- encoder_max_seq_len = tokens_per_image * kv_cache_image_num
29
-
30
19
31
20
@dataclass
32
21
class VisionEncoderConfig :
@@ -42,11 +31,26 @@ class VisionEncoderConfig:
42
31
in_channels : int = 3
43
32
44
33
34
+ # Reduced-size demo config for CI testing purposes (clip_num_layers = 6, num_layers_projection = 4)
35
+ demo_config : VisionEncoderConfig = VisionEncoderConfig (
36
+ patch_size = 14 ,
37
+ num_heads = 8 ,
38
+ clip_embed_dim = 768 ,
39
+ clip_num_layers = 6 ,
40
+ clip_hidden_states = [1 , 3 , 5 ],
41
+ decoder_embed_dim = 1024 ,
42
+ num_layers_projection = 4 ,
43
+ tile_size = 224 ,
44
+ max_num_tiles = 4 ,
45
+ in_channels = 3 ,
46
+ )
47
+
48
+
45
49
class FlamingoVisionEncoderModel (EagerModelBase ):
46
50
def __init__ (self , config : Optional [VisionEncoderConfig ] = None ):
47
51
super ().__init__ ()
48
52
if config is None :
49
- config = VisionEncoderConfig ()
53
+ config = demo_config
50
54
self .config = config
51
55
self .model = llama3_2_vision_encoder (
52
56
patch_size = config .patch_size ,
Original file line number Diff line number Diff line change @@ -17,7 +17,7 @@ addopts =
17
17
# examples
18
18
examples/models/llama/tests
19
19
examples/models/llama3_2_vision/preprocess
20
- # examples/models/llama3_2_vision/vision_encoder/test TODO: enable this
20
+ examples/models/llama3_2_vision/vision_encoder/test
21
21
# examples/models/llava/test TODO: enable this
22
22
# exir
23
23
exir/_serialize/test
You can’t perform that action at this time.
0 commit comments