Skip to content

[llama-mm] Fix vision encoder model test #6842

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Nov 14, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .github/workflows/pull.yml
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,10 @@ jobs:
conda activate "${CONDA_ENV}"

MODEL_NAME=${{ matrix.model }}
# Install requirements for llama vision
if [[ "$MODEL_NAME" == "llama3_2_vision_encoder" ]]; then
bash examples/models/llama3_2_vision/install_requirements.sh
fi
BUILD_TOOL=${{ matrix.build-tool }}
BACKEND=${{ matrix.backend }}
DEMO_BACKEND_DELEGATION=${{ matrix.demo_backend_delegation }}
Expand Down
6 changes: 5 additions & 1 deletion .github/workflows/trunk.yml
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,11 @@ jobs:
bash .ci/scripts/setup-conda.sh
# Setup MacOS dependencies as there is no Docker support on MacOS atm
PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/setup-macos.sh "${BUILD_TOOL}"
# Build and test xecutorch
# Install requirements for llama vision
if [[ "$MODEL_NAME" == "llama3_2_vision_encoder" ]]; then
${CONDA_RUN} bash examples/models/llama3_2_vision/install_requirements.sh
fi
# Build and test executorch
PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/test_model.sh "${MODEL_NAME}" "${BUILD_TOOL}" "${BACKEND}" "${DEMO_BACKEND_DELEGATION}"

test-custom-ops-macos:
Expand Down
12 changes: 12 additions & 0 deletions examples/models/llama3_2_vision/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

from .vision_encoder import FlamingoVisionEncoderModel, VisionEncoderConfig

__all__ = [
"FlamingoVisionEncoderModel",
"VisionEncoderConfig",
]
28 changes: 16 additions & 12 deletions examples/models/llama3_2_vision/vision_encoder/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,17 +16,6 @@
)
from torchtune.models.llama3_2_vision._component_builders import llama3_2_vision_encoder

max_seq_len = 8192
in_channels = 3
tile_size = 560
max_num_tiles = 4
# how many tokens per image generated by the vision encoder
tokens_per_image = 6404
# how many images to cache in the kv cache in cross attention
kv_cache_image_num = 1
# maximum number of tokens generated by encoder and thus stored in the kv cache in cross attention
encoder_max_seq_len = tokens_per_image * kv_cache_image_num


@dataclass
class VisionEncoderConfig:
Expand All @@ -42,11 +31,26 @@ class VisionEncoderConfig:
in_channels: int = 3


# Reduced-size encoder config used for CI testing only (the full model is
# much larger). NOTE(review): a previous comment said "8 layers", but this
# config sets clip_num_layers=6 — 6 CLIP layers with hidden states tapped
# at layers 1, 3 and 5; num_heads=8 is the attention-head count, not the
# layer count.
demo_config: VisionEncoderConfig = VisionEncoderConfig(
    patch_size=14,
    num_heads=8,
    clip_embed_dim=768,
    clip_num_layers=6,
    # Indices of CLIP layers whose hidden states are fed to the projection.
    clip_hidden_states=[1, 3, 5],
    decoder_embed_dim=1024,
    num_layers_projection=4,
    tile_size=224,
    max_num_tiles=4,
    in_channels=3,  # RGB input
)


class FlamingoVisionEncoderModel(EagerModelBase):
def __init__(self, config: Optional[VisionEncoderConfig] = None):
super().__init__()
if config is None:
config = VisionEncoderConfig()
config = demo_config
self.config = config
self.model = llama3_2_vision_encoder(
patch_size=config.patch_size,
Expand Down
2 changes: 1 addition & 1 deletion pytest.ini
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ addopts =
# examples
examples/models/llama/tests
examples/models/llama3_2_vision/preprocess
# examples/models/llama3_2_vision/vision_encoder/test TODO: enable this
examples/models/llama3_2_vision/vision_encoder/test
# examples/models/llava/test TODO: enable this
# exir
exir/_serialize/test
Expand Down
Loading