Add SGLang embedding worker example (worker, graph, config, request protocol)

ishandhanani · ishandhanani · commit 1dfb1a8fbd67 · 2025-06-11T21:43:00.000Z
diff --git a/examples/sglang/components/embedding_worker.py b/examples/sglang/components/embedding_worker.py
@@ -0,0 +1,106 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Using SGLang and Dynamo to serve embedding models!
+"""
+
+import asyncio
+import logging
+import random
+import socket
+from typing import Any
+
+import sglang as sgl
+from utils.protocol import EmbeddingRequest
+from utils.sglang import parse_sglang_args
+
+from dynamo.llm import ModelType, register_llm
+from dynamo.sdk import async_on_start, depends, dynamo_context, endpoint, service
+
+logger = logging.getLogger(__name__)
+
+
+@service(
+    dynamo={
+        "namespace": "dynamo",
+    },
+    resources={"gpu": 1},
+    workers=1,
+)
+class SGLangEmbeddingWorker:
+    """Dynamo service that wraps an ``sgl.Engine`` to answer embedding requests."""
+
+    def __init__(self):
+        """Parse the SGLang arguments for this service and create the engine."""
+        self.engine_args = parse_sglang_args(self.__class__.__name__, "")
+        self.engine = sgl.Engine(server_args=self.engine_args)
+        logger.info("SGLangEmbeddingWorker initialized")
+
+    @async_on_start
+    async def async_init(self):
+        """Register this worker's ``generate`` endpoint as an embedding model."""
+        runtime = dynamo_context["runtime"]
+        logger.info("Registering LLM for discovery")
+        comp_ns, comp_name = SGLangEmbeddingWorker.dynamo_address()  # type: ignore
+        generate_endpoint = (
+            runtime.namespace(comp_ns).component(comp_name).endpoint("generate")
+        )
+        await register_llm(
+            ModelType.Embedding,
+            generate_endpoint,
+            self.engine_args.model_path,
+            self.engine_args.served_model_name,
+        )
+
+    @endpoint()
+    async def generate(self, request: EmbeddingRequest):
+        """Encode ``request.input`` and yield one OpenAI-style embedding response.
+
+        Raises:
+            ValueError: if ``request.input`` is neither a string nor a list.
+        """
+        if not isinstance(request.input, (str, list)):
+            raise ValueError(f"Invalid input type: {type(request.input)}")
+
+        # NOTE(review): int-list inputs are passed as ``prompt`` too -- confirm support.
+        encoded = await self.engine.async_encode(prompt=request.input)
+        yield self._transform_response(encoded, request.model)
+
+    def _transform_response(self, ret, model_name):
+        """Transform an SGLang encode result into the OpenAI embedding format.
+
+        Args:
+            ret: one result dict or a list of them; each item is read for
+                ``"embedding"`` and ``meta_info["prompt_tokens"]``.
+            model_name: value echoed back in the ``"model"`` field.
+        """
+        if not isinstance(ret, list):
+            ret = [ret]
+
+        embedding_objects = [
+            {"object": "embedding", "embedding": item["embedding"], "index": idx}
+            for idx, item in enumerate(ret)
+        ]
+        prompt_tokens = sum(item["meta_info"]["prompt_tokens"] for item in ret)
+        return {
+            "object": "list",
+            "data": embedding_objects,
+            "model": model_name,
+            "usage": {
+                "prompt_tokens": prompt_tokens,
+                "total_tokens": prompt_tokens,
+            },
+        }
diff --git a/examples/sglang/components/frontend.py b/examples/sglang/components/frontend.py
@@ -18,6 +18,7 @@
 from pathlib import Path
 
 from components.worker import SGLangWorker
+from components.embedding_worker import SGLangEmbeddingWorker
 from fastapi import FastAPI
 from pydantic import BaseModel
 
@@ -57,6 +58,7 @@ class FrontendConfig(BaseModel):
 )
 class Frontend:
     worker = depends(SGLangWorker)
+    embedding_worker = depends(SGLangEmbeddingWorker)
 
     def __init__(self):
         """Initialize Frontend service with HTTP server and model configuration."""
diff --git a/examples/sglang/configs/embedding.yaml b/examples/sglang/configs/embedding.yaml
@@ -0,0 +1,16 @@
+# Service configuration for the Frontend and SGLangEmbeddingWorker components.
+Frontend:
+  served_model_name: e5
+  endpoint: SGLangEmbeddingWorker.generate
+  port: 8000
+SGLangEmbeddingWorker:
+  model-path: intfloat/e5-base-v2
+  served-model-name: e5
+  is-embedding: true
+  tp: 1
+  trust-remote-code: true
+  json-model-override-args: '{"get_embedding": true, "chat_template": ""}'
+  ServiceArgs:
+    workers: 1
+    resources:
+      gpu: 1
diff --git a/examples/sglang/graphs/embedding.py b/examples/sglang/graphs/embedding.py
@@ -0,0 +1,20 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from components.frontend import Frontend
+from components.embedding_worker import SGLangEmbeddingWorker
+
+# Link the Frontend service to the embedding worker for this graph.
+Frontend.link(SGLangEmbeddingWorker)
diff --git a/examples/sglang/utils/protocol.py b/examples/sglang/utils/protocol.py
@@ -13,7 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import List, Optional
+from typing import List, Optional, Union, Literal
 
 from pydantic import BaseModel, Field
 
@@ -60,3 +60,19 @@ class DisaggPreprocessedRequest(BaseModel):
     bootstrap_host: str
     bootstrap_port: int
     bootstrap_room: int
+
+
+# Accepted ``EmbeddingRequest.input`` shapes (ints presumably token ids -- confirm).
+EmbeddingInput = Union[str, List[str], List[int], List[List[int]]]
+EncodingFormat = Literal["float", "base64"]
+
+
+class EmbeddingRequest(BaseModel):
+    """Embedding request payload (fields appear to mirror the OpenAI embeddings API)."""
+
+    model: str
+    input: EmbeddingInput
+    encoding_format: Optional[EncodingFormat] = None
+    user: Optional[str] = None
+    # Only supported in text-embedding-3 and later models from OpenAI.
+    dimensions: Optional[int] = None