pytorch
diff --git a/README.md b/README.md
Lines changed: 9 additions & 9 deletions
diff --git a/browser/browser.py b/browser/browser.py
Lines changed: 81 additions & 33 deletions
diff --git a/browser/chat_in_browser.py b/browser/chat_in_browser.py
Lines changed: 0 additions & 107 deletions
diff --git a/browser/static/css/style.css b/browser/static/css/style.css
Lines changed: 0 additions & 96 deletions
diff --git a/browser/templates/chat.html b/browser/templates/chat.html
Lines changed: 0 additions & 27 deletions
@@ -123,22 +123,22 @@ For more information run `python3 torchchat.py generate --help`
 
 ### Browser
 This mode provides access to the model via the browser's localhost.
+
+Launch an interactive chat with your model. Running the command below will automatically open a new tab in your browser. [Streamlit](https://streamlit.io/) should already have been installed by the `install_requirements.sh` script.
+```
+streamlit run torchchat.py -- browser <model_name> <model_args>
+```
+
+For example, to quantize and chat with Llama 3:
 [skip default]: begin
 ```
-python3 torchchat.py browser llama3
+streamlit run torchchat.py -- browser llama3 --quantize '{"precision": {"dtype":"float16"}, "executor":{"accelerator":"cpu"}}' --max-new-tokens 256 --compile
 ```
 [skip default]: end
 
 
-*Running on http://127.0.0.1:5000* should be printed out on the
- terminal. Click the link or go to
- [http://127.0.0.1:5000](http://127.0.0.1:5000) on your browser to
- start interacting with it.
 
-Enter some text in the input box, then hit the enter key or click the
-“SEND” button. After a second or two, the text you entered together
-with the generated text will be displayed. Repeat to have a
-conversation.
+
 
 
 
 
@@ -4,40 +4,88 @@
 # This source code is licensed under the license found in the
 # LICENSE file in the root directory of this source tree.
 
-import subprocess
-import sys
+import time
+
+import streamlit as st
+from api.api import CompletionRequest, OpenAiApiGenerator
+
+from build.builder import BuilderArgs, TokenizerArgs
+
+from generate import GeneratorArgs
 
 
 def main(args):
+    builder_args = BuilderArgs.from_args(args)
+    speculative_builder_args = BuilderArgs.from_speculative_args(args)
+    tokenizer_args = TokenizerArgs.from_args(args)
+    generator_args = GeneratorArgs.from_args(args)
+    generator_args.chat_mode = False
+
+    @st.cache_resource
+    def initialize_generator() -> OpenAiApiGenerator:
+        return OpenAiApiGenerator(
+            builder_args,
+            speculative_builder_args,
+            tokenizer_args,
+            generator_args,
+            args.profile,
+            args.quantize,
+            args.draft_quantize,
+        )
+
+    gen = initialize_generator()
+
+    st.title("torchchat")
+
+    # Initialize chat history
+    if "messages" not in st.session_state:
+        st.session_state.messages = []
+
+    # Display chat messages from history on app rerun
+    for message in st.session_state.messages:
+        with st.chat_message(message["role"]):
+            st.markdown(message["content"])
+
+    # Accept user input
+    if prompt := st.chat_input("What is up?"):
+        # Add user message to chat history
+        st.session_state.messages.append({"role": "user", "content": prompt})
+        # Display user message in chat message container
+        with st.chat_message("user"):
+            st.markdown(prompt)
+
+        # Display assistant response in chat message container
+        with st.chat_message("assistant"), st.status(
+            "Generating... ", expanded=True
+        ) as status:
+
+            req = CompletionRequest(
+                model=gen.builder_args.checkpoint_path,
+                prompt=prompt,
+                temperature=generator_args.temperature,
+                messages=[],
+            )
+
+            def unwrap(completion_generator):
+                start = time.time()
+                tokcount = 0
+                for chunk_response in completion_generator:
+                    content = chunk_response.choices[0].delta.content
+                    if not gen.is_llama3_model or content not in set(
+                        gen.tokenizer.special_tokens.keys()
+                    ):
+                        yield content
+                    if content == gen.tokenizer.eos_id():
+                        yield "."
+                    tokcount += 1
+                status.update(
+                    label="Done, averaged {:.2f} tokens/second".format(
+                        tokcount / (time.time() - start)
+                    ),
+                    state="complete",
+                )
+
+            response = st.write_stream(unwrap(gen.completion(req)))
 
-    # Directory Containing the server file "chat_in_browser.py"
-    server_dir = "browser"
-
-    # Look for port from cmd args. Default to 5000 if not found.
-    port = 5000
-    i = 2
-    while i < len(sys.argv):
-        if sys.argv[i] == "--port":
-            if i + 1 < len(sys.argv):
-                # Extract the value and remove '--port' and the value from sys.argv
-                port = sys.argv[i + 1]
-                del sys.argv[i : i + 2]
-                break
-        else:
-            i += 1
-
-    # Construct arguments for the flask app minus 'browser' command
-    # plus '--chat'
-    args_plus_chat = ["'{}'".format(s) for s in sys.argv[1:] if s != "browser"] + [
-        '"--chat"'
-    ]
-    formatted_args = ", ".join(args_plus_chat)
-    command = [
-        "flask",
-        "--app",
-        f"{server_dir}/chat_in_browser:create_app(" + formatted_args + ")",
-        "run",
-        "--port",
-        f"{port}",
-    ]
-    subprocess.run(command)
+        # Add assistant response to chat history
+        st.session_state.messages.append({"role": "assistant", "content": response})