Skip to content

Commit 0a9598b

Browse files
authored
[llama-mm] Fix vision encoder model test (#6842)
* [llama-mm] Fix vision encoder model test Summary: As titled. We need a smaller model config to make it work on CI jobs. Test Plan: Reviewers: Subscribers: Tasks: Tags: * Fix pull.yml and trunk.yml Summary: Test Plan: Reviewers: Subscribers: Tasks: Tags: * Remove torchao Summary: Test Plan: Reviewers: Subscribers: Tasks: Tags: * Small fixes Summary: Test Plan: Reviewers: Subscribers: Tasks: Tags: * Add torchao back Summary: Test Plan: Reviewers: Subscribers: Tasks: Tags:
1 parent 21eecff commit 0a9598b

File tree

5 files changed

+38
-14
lines changed

5 files changed

+38
-14
lines changed

.github/workflows/pull.yml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,10 @@ jobs:
7272
conda activate "${CONDA_ENV}"
7373
7474
MODEL_NAME=${{ matrix.model }}
75+
# Install requirements for llama vision
76+
if [[ "$MODEL_NAME" == "llama3_2_vision_encoder" ]]; then
77+
bash examples/models/llama3_2_vision/install_requirements.sh
78+
fi
7579
BUILD_TOOL=${{ matrix.build-tool }}
7680
BACKEND=${{ matrix.backend }}
7781
DEMO_BACKEND_DELEGATION=${{ matrix.demo_backend_delegation }}

.github/workflows/trunk.yml

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,11 @@ jobs:
5858
bash .ci/scripts/setup-conda.sh
5959
# Setup MacOS dependencies as there is no Docker support on MacOS atm
6060
PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/setup-macos.sh "${BUILD_TOOL}"
61-
# Build and test xecutorch
61+
# Install requirements for llama vision
62+
if [[ "$MODEL_NAME" == "llama3_2_vision_encoder" ]]; then
63+
${CONDA_RUN} bash examples/models/llama3_2_vision/install_requirements.sh
64+
fi
65+
# Build and test executorch
6266
PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/test_model.sh "${MODEL_NAME}" "${BUILD_TOOL}" "${BACKEND}" "${DEMO_BACKEND_DELEGATION}"
6367
6468
test-custom-ops-macos:
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
# Copyright (c) Meta Platforms, Inc. and affiliates.
2+
# All rights reserved.
3+
#
4+
# This source code is licensed under the BSD-style license found in the
5+
# LICENSE file in the root directory of this source tree.
6+
7+
from .vision_encoder import FlamingoVisionEncoderModel, VisionEncoderConfig
8+
9+
__all__ = [
10+
"FlamingoVisionEncoderModel",
11+
"VisionEncoderConfig",
12+
]

examples/models/llama3_2_vision/vision_encoder/model.py

Lines changed: 16 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -16,17 +16,6 @@
1616
)
1717
from torchtune.models.llama3_2_vision._component_builders import llama3_2_vision_encoder
1818

19-
max_seq_len = 8192
20-
in_channels = 3
21-
tile_size = 560
22-
max_num_tiles = 4
23-
# how many tokens per image generated by the vision encoder
24-
tokens_per_image = 6404
25-
# how many images to cache in the kv cache in cross attention
26-
kv_cache_image_num = 1
27-
# maximum number of tokens generated by encoder and thus stored in the kv cache in cross attention
28-
encoder_max_seq_len = tokens_per_image * kv_cache_image_num
29-
3019

3120
@dataclass
3221
class VisionEncoderConfig:
@@ -42,11 +31,26 @@ class VisionEncoderConfig:
4231
in_channels: int = 3
4332

4433

34+
# Reduced-size config (6 CLIP layers, 4 projection layers) for CI testing purposes
35+
demo_config: VisionEncoderConfig = VisionEncoderConfig(
36+
patch_size=14,
37+
num_heads=8,
38+
clip_embed_dim=768,
39+
clip_num_layers=6,
40+
clip_hidden_states=[1, 3, 5],
41+
decoder_embed_dim=1024,
42+
num_layers_projection=4,
43+
tile_size=224,
44+
max_num_tiles=4,
45+
in_channels=3,
46+
)
47+
48+
4549
class FlamingoVisionEncoderModel(EagerModelBase):
4650
def __init__(self, config: Optional[VisionEncoderConfig] = None):
4751
super().__init__()
4852
if config is None:
49-
config = VisionEncoderConfig()
53+
config = demo_config
5054
self.config = config
5155
self.model = llama3_2_vision_encoder(
5256
patch_size=config.patch_size,

pytest.ini

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ addopts =
1717
# examples
1818
examples/models/llama/tests
1919
examples/models/llama3_2_vision/preprocess
20-
# examples/models/llama3_2_vision/vision_encoder/test TODO: enable this
20+
examples/models/llama3_2_vision/vision_encoder/test
2121
# examples/models/llava/test TODO: enable this
2222
# exir
2323
exir/_serialize/test

0 commit comments

Comments
 (0)