From 0d47c43a98a46146e694ff69918a1d3f6551e385 Mon Sep 17 00:00:00 2001 From: ochafik Date: Sat, 27 Apr 2024 23:11:34 +0100 Subject: [PATCH 01/68] gguf: add GGUFReader.read_field(field) method + read template example --- gguf-py/examples/read_template.py | 15 +++++++++++++++ gguf-py/gguf/gguf_reader.py | 15 +++++++++++++++ gguf-py/scripts/gguf-dump.py | 18 ++---------------- 3 files changed, 32 insertions(+), 16 deletions(-) create mode 100644 gguf-py/examples/read_template.py diff --git a/gguf-py/examples/read_template.py b/gguf-py/examples/read_template.py new file mode 100644 index 0000000000000..34a998ae5fee4 --- /dev/null +++ b/gguf-py/examples/read_template.py @@ -0,0 +1,15 @@ +#!/usr/bin/env python3 +import sys +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).parent.parent)) +from gguf.gguf_reader import GGUFReader + +if __name__ == '__main__': + if len(sys.argv) < 2: + print("Usage: read_template.py ") + sys.exit(1) + gguf_file_path = sys.argv[1] + + reader = GGUFReader(gguf_file_path) + print(reader.read_field(reader.fields['tokenizer.chat_template'])) diff --git a/gguf-py/gguf/gguf_reader.py b/gguf-py/gguf/gguf_reader.py index 33afac552ca75..8b7035aaca5b2 100644 --- a/gguf-py/gguf/gguf_reader.py +++ b/gguf-py/gguf/gguf_reader.py @@ -120,6 +120,21 @@ def __init__(self, path: os.PathLike[str] | str, mode: Literal['r' | 'r+' | 'c'] # Fetch a key/value metadata field by key. def get_field(self, key: str) -> Union[ReaderField, None]: return self.fields.get(key, None) + + def read_field(self, field): + if not field.types: + return None + if field.types[:1] == [GGUFValueType.ARRAY]: + itype = field.types[-1] + if itype == GGUFValueType.STRING: + return [str(bytes(field.parts[idx]), encoding="utf-8") for idx in field.data] + else: + return [pv for idx in field.data for pv in field.parts[idx].tolist()] + elif field.types[0] == GGUFValueType.STRING: + return str(bytes(field.parts[-1]), encoding="utf-8") + else: + assert(field.types[0] in self.gguf_scalar_to_np) + return field.parts[-1].tolist()[0] # Fetch a tensor from the list by index. 
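# A minimal usage sketch of the new read_field() helper (kept as a comment block;
# it assumes a local GGUF file and the standard metadata keys named below, and the
# path is a placeholder). read_field() returns plain Python values for string, array
# and scalar fields alike, so callers no longer need to decode field.parts themselves:
#
#     from gguf.gguf_reader import GGUFReader
#
#     reader = GGUFReader("some-model.gguf")
#     arch = reader.read_field(reader.fields["general.architecture"])      # -> str
#     n_ctx = reader.read_field(reader.fields[f"{arch}.context_length"])   # -> int
#     tokens = reader.read_field(reader.fields["tokenizer.ggml.tokens"])   # -> list[str]
#     print(arch, n_ctx, len(tokens))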
def get_tensor(self, idx: int) -> ReaderTensor: diff --git a/gguf-py/scripts/gguf-dump.py b/gguf-py/scripts/gguf-dump.py index dbf8915089275..1b594ca4b138c 100755 --- a/gguf-py/scripts/gguf-dump.py +++ b/gguf-py/scripts/gguf-dump.py @@ -41,11 +41,7 @@ def dump_metadata(reader: GGUFReader, args: argparse.Namespace) -> None: pretty_type = str(field.types[-1].name) print(f' {n:5}: {pretty_type:10} | {len(field.data):8} | {field.name}', end = '') if len(field.types) == 1: - curr_type = field.types[0] - if curr_type == GGUFValueType.STRING: - print(' = {0}'.format(repr(str(bytes(field.parts[-1]), encoding='utf8')[:60])), end = '') - elif field.types[0] in reader.gguf_scalar_to_np: - print(' = {0}'.format(field.parts[-1][0]), end = '') + print(' = {0}'.format(repr(reader.read_field(field))[:60]), end = '') print() if args.no_tensors: return @@ -75,17 +71,7 @@ def dump_metadata_json(reader: GGUFReader, args: argparse.Namespace) -> None: metadata[field.name] = curr if field.types[:1] == [GGUFValueType.ARRAY]: curr["array_types"] = [t.name for t in field.types][1:] - if not args.json_array: - continue - itype = field.types[-1] - if itype == GGUFValueType.STRING: - curr["value"] = [str(bytes(field.parts[idx]), encoding="utf-8") for idx in field.data] - else: - curr["value"] = [pv for idx in field.data for pv in field.parts[idx].tolist()] - elif field.types[0] == GGUFValueType.STRING: - curr["value"] = str(bytes(field.parts[-1]), encoding="utf-8") - else: - curr["value"] = field.parts[-1].tolist()[0] + curr["value"] = reader.read_field(field) if not args.no_tensors: for idx, tensor in enumerate(reader.tensors): tensors[tensor.name] = { From 0d1d46ef1de18f3dfe2f2cfcc3e15ae540a9ea82 Mon Sep 17 00:00:00 2001 From: ochafik Date: Mon, 8 Apr 2024 20:10:15 +0100 Subject: [PATCH 02/68] grammars: add troubleshooting section to readme --- grammars/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/grammars/README.md b/grammars/README.md index c924e8d46e5cb..5c210de4cdcc3 100644 --- a/grammars/README.md +++ b/grammars/README.md @@ -98,4 +98,4 @@ Grammars currently have performance gotchas (see https://github.com/ggerganov/ll A common pattern is to allow repetitions of a pattern `x` up to N times. -While semantically correct, the syntax `x? x? x?.... x?` (with N repetitions) will result in extremely slow inference. Instead, you can write `(x (x (x ... (x)?...)?)?)?` (w/ N-deep nesting) +While semantically correct, the syntax `x? x? x?.... x?` (with N repetitions) will result in extremely slow inference. Instead, you can write `(x (x (x ... 
(x)?...)?)?)?` (w/ N-deep nesting) \ No newline at end of file From 63d13245e1668b01533765e00958c19b27df29fc Mon Sep 17 00:00:00 2001 From: ochafik Date: Mon, 25 Mar 2024 23:57:25 +0000 Subject: [PATCH 03/68] server.py: hacky code --- examples/openai/README.md | 53 ++++++ examples/openai/__main__.py | 8 + examples/openai/api.py | 27 +++ examples/openai/chat_format.py | 59 +++++++ examples/openai/gguf_kvs.py | 20 +++ examples/openai/llama_cpp_server_api.py | 28 +++ examples/openai/requirements.txt | 7 + examples/openai/server.py | 215 ++++++++++++++++++++++++ examples/openai/ts_converter.py | 85 ++++++++++ 9 files changed, 502 insertions(+) create mode 100644 examples/openai/README.md create mode 100644 examples/openai/__main__.py create mode 100644 examples/openai/api.py create mode 100644 examples/openai/chat_format.py create mode 100644 examples/openai/gguf_kvs.py create mode 100644 examples/openai/llama_cpp_server_api.py create mode 100644 examples/openai/requirements.txt create mode 100644 examples/openai/server.py create mode 100644 examples/openai/ts_converter.py diff --git a/examples/openai/README.md b/examples/openai/README.md new file mode 100644 index 0000000000000..47c9c67cc633e --- /dev/null +++ b/examples/openai/README.md @@ -0,0 +1,53 @@ +# examples.openai: OpenAI API-compatible server + +A simple Python server that sits above the C++ [../server](examples/server) and offers improved OAI compatibility. + +## Usage + +```bash +python -m examples.openai -m some-model.gguf + + +``` + +## Features + +The new examples/openai/server.py: + +- Uses llama.cpp C++ server as a backend (spawns it or connects to existing) + +- Uses actual jinja2 chat templates read from the models + +- Supports grammar-constrained output for both JSON response format and tool calls + +- Tool calling “works” w/ all models (even non-specialized ones like Mixtral 7x8B) + + - Optimised support for Functionary & Nous Hermes, easy to extend to other tool-calling fine-tunes + +## TODO + +- Embedding endpoint w/ distinct server subprocess + +- Automatic/manual session caching + + - Spawns the main C++ CLI under the hood + + - Support precaching long prompts from CLI + + - Instant incremental inference in long threads + +- Improve examples/agent: + + - Interactive agent CLI that auto-discovers tools from OpenAPI endpoints + + - Script that wraps any Python source as a container-sandboxed OpenAPI endpoint (allowing running ~unsafe code w/ tools) + + - Basic memory / RAG / python interpreter tools + +- Follow-ups + + - Remove OAI support from server + + - Remove non-Python json schema to grammar converters + + - Reach out to frameworks to advertise new option. 
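As a rough usage sketch for the server described above (assuming the default `localhost:8080` from `server.py`, and using only the request fields defined in `examples/openai/api.py`; `httpx` is what `server.py` itself uses, but any HTTP client works):

```py
# Minimal, illustrative client for the /v1/chat/completions endpoint exposed by
# examples/openai/server.py (default host/port assumed; model and message values are placeholders).
import httpx

response = httpx.post(
    "http://localhost:8080/v1/chat/completions",
    json={
        "model": "some-model.gguf",
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "What is the weather like in Glasgow?"},
        ],
    },
    timeout=None,
)
print(response.json())
```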
diff --git a/examples/openai/__main__.py b/examples/openai/__main__.py new file mode 100644 index 0000000000000..5204826b2dc21 --- /dev/null +++ b/examples/openai/__main__.py @@ -0,0 +1,8 @@ + +from jsonargparse import CLI + +from examples.openai.server import main + +if __name__ == "__main__": + CLI(main) + diff --git a/examples/openai/api.py b/examples/openai/api.py new file mode 100644 index 0000000000000..b883ecec41cbb --- /dev/null +++ b/examples/openai/api.py @@ -0,0 +1,27 @@ +from typing import Any, Optional +from pydantic import BaseModel, Json + +class Message(BaseModel): + role: str + content: str + +class ToolFunction(BaseModel): + name: str + description: str + parameters: Any + +class Tool(BaseModel): + type: str + function: ToolFunction + +class ResponseFormat(BaseModel): + type: str + json_schema: Optional[Any] = None + +class ChatCompletionRequest(BaseModel): + model: str + tools: Optional[list[Tool]] = None + messages: list[Message] + response_format: Optional[ResponseFormat] = None + temperature: float = 1.0 + stream: bool = False diff --git a/examples/openai/chat_format.py b/examples/openai/chat_format.py new file mode 100644 index 0000000000000..bb7d0c94c60f6 --- /dev/null +++ b/examples/openai/chat_format.py @@ -0,0 +1,59 @@ +from enum import StrEnum +import jinja2 + +from examples.openai.gguf_kvs import GGUFKeyValues, Keys + +def raise_exception(msg: str): + raise Exception(msg) + +class ToolStyle(StrEnum): + # https://cookbook.openai.com/examples/how_to_call_functions_with_chat_models + DEFAULT="Default", + # https://github.com/MeetKai/functionary + # TODO: look at https://github.com/ggerganov/llama.cpp/pull/5695 + # https://github.com/MeetKai/functionary/blob/main/functionary/prompt_template/prompt_template_v2.py + FUNCTIONARY_V2="Functionary V2", + # https://github.com/NousResearch/Hermes-Function-Calling + NOUS_RESEARCH_HERMES="Nous-Research-Hermes-Function-Calling", + +class ChatFormat: #(BaseModel): + def __init__(self, template: str, eos_token: str, bos_token: str): + env = jinja2.Environment(loader=jinja2.BaseLoader(), trim_blocks=True, lstrip_blocks=True) + self.template = env.from_string(template) + self.eos_token = eos_token + self.bos_token = bos_token + + self.strict_user_assistant_alternation = "{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception" in template + + if "<|recipient|>' + tool_call['function']['name']" in template: + self.tool_style = ToolStyle.FUNCTIONARY_V2 + else: + self.tool_style = ToolStyle.DEFAULT + + + def __str__(self): + return f"ChatFormat(template={self.template}, eos_token={self.eos_token}, bos_token={self.bos_token})" + + + @staticmethod + def from_gguf(metadata: GGUFKeyValues): + return ChatFormat( + template = metadata[Keys.Tokenizer.CHAT_TEMPLATE], + bos_token = metadata[Keys.Tokenizer.BOS_ID], + eos_token = metadata[Keys.Tokenizer.EOS_ID]) + # @staticmethod + # def from_gguf(model: Path): + # reader = GGUFReader(model.as_posix()) + # return ChatFormat( + # template = reader.fields[Keys.Tokenizer.CHAT_TEMPLATE].read(), + # bos_token = reader.fields[Keys.Tokenizer.BOS_ID].read(), + # eos_token = reader.fields[Keys.Tokenizer.EOS_ID].read()) + + def render(self, messages: list[dict], add_generation_prompt: bool, omit_bos: bool = False): + return self.template.render( + messages=messages, + eos_token=self.eos_token, + bos_token='' if omit_bos else self.bos_token, + raise_exception=raise_exception, + add_generation_prompt=add_generation_prompt, + ) diff --git a/examples/openai/gguf_kvs.py 
b/examples/openai/gguf_kvs.py new file mode 100644 index 0000000000000..2eba427b33eec --- /dev/null +++ b/examples/openai/gguf_kvs.py @@ -0,0 +1,20 @@ +from pathlib import Path +import sys + +sys.path.insert(0, str(Path(__file__).parent.parent.parent / "gguf-py")) + +from gguf.gguf_reader import GGUFReader +from gguf.constants import Keys + +class GGUFKeyValues: + def __init__(self, model: Path): + reader = GGUFReader(model.as_posix()) + self.fields = reader.fields + def __getitem__(self, key: str): + if '{arch}' in key: + key = key.replace('{arch}', self[Keys.General.ARCHITECTURE]) + return self.fields[key].read() + def __contains__(self, key: str): + return key in self.fields + def keys(self): + return self.fields.keys() diff --git a/examples/openai/llama_cpp_server_api.py b/examples/openai/llama_cpp_server_api.py new file mode 100644 index 0000000000000..93690072826d2 --- /dev/null +++ b/examples/openai/llama_cpp_server_api.py @@ -0,0 +1,28 @@ +from typing import Optional +from pydantic import BaseModel, Json + +class LlamaCppServerCompletionRequest(BaseModel): + prompt: str + stream: Optional[bool] = None + cache_prompt: Optional[bool] = None + n_predict: Optional[int] = None + top_k: Optional[int] = None + top_p: Optional[float] = None + min_p: Optional[float] = None + tfs_z: Optional[float] = None + typical_p: Optional[float] = None + temperature: Optional[float] = None + dynatemp_range: Optional[float] = None + dynatemp_exponent: Optional[float] = None + repeat_last_n: Optional[int] = None + repeat_penalty: Optional[float] = None + frequency_penalty: Optional[float] = None + presence_penalty: Optional[float] = None + mirostat: Optional[bool] = None + mirostat_tau: Optional[float] = None + mirostat_eta: Optional[float] = None + penalize_nl: Optional[bool] = None + n_keep: Optional[int] = None + seed: Optional[int] = None + grammar: Optional[str] = None + json_schema: Optional[Json] = None \ No newline at end of file diff --git a/examples/openai/requirements.txt b/examples/openai/requirements.txt new file mode 100644 index 0000000000000..219fda4178b5e --- /dev/null +++ b/examples/openai/requirements.txt @@ -0,0 +1,7 @@ +fastapi[all] +gguf +jinja2 +jsonargparse +pydantic +sse-starlette +uvicorn[all] \ No newline at end of file diff --git a/examples/openai/server.py b/examples/openai/server.py new file mode 100644 index 0000000000000..db075bddb6fe2 --- /dev/null +++ b/examples/openai/server.py @@ -0,0 +1,215 @@ +import json, sys, subprocess, atexit +from pathlib import Path + +# sys.path.insert(0, str(Path(__file__).parent.parent)) + +from examples.openai.llama_cpp_server_api import LlamaCppServerCompletionRequest +from examples.json_schema_to_grammar import SchemaConverter + +from typing import Optional +import httpx +from fastapi import Depends, FastAPI, Request, Response +from starlette.responses import StreamingResponse +from fastapi.responses import JSONResponse +from jsonargparse import CLI + +from examples.openai.ts_converter import SchemaToTypeScriptConverter +from examples.openai.gguf_kvs import GGUFKeyValues, Keys +from examples.openai.api import Message, Tool, ToolFunction, ResponseFormat, ChatCompletionRequest +from examples.openai.chat_format import ChatFormat, ToolStyle + +def _add_system_prompt(messages: list['Message'], system_prompt: str): + # TODO: add to last system message, or create a new one just before the last user message + system_message = next(((i, m) for i, m in enumerate(messages) if m.role == "system"), None) + if system_message is not None: + (i, m) = 
system_message + messages[i].content = m.content + '\n' + system_prompt + else: + messages.insert(0, Message(role="system", content=system_prompt)) + return messages + +def main( + model: Path = Path("/Users/ochafik/AI/Models/Hermes-2-Pro-Mistral-7B.Q8_0.gguf"), + host: str = "localhost", + port: int = 8080, + main_server_endpoint: Optional[str] = None, + main_server_host: str = "localhost", + main_server_port: Optional[int] = 8081, +): + import uvicorn + + metadata = GGUFKeyValues(model) + context_length = metadata[Keys.LLM.CONTEXT_LENGTH] + chat_format = ChatFormat.from_gguf(metadata) + print(chat_format) + + if not main_server_endpoint: + server_process = subprocess.Popen([ + "./server", "-m", model, + "--host", main_server_host, "--port", f'{main_server_port}', + '-ctk', 'q4_0', '-ctv', 'f16', + "-c", f"8192", + # "-c", f"{context_length}", + ]) + atexit.register(server_process.kill) + main_server_endpoint = f"http://{main_server_host}:{main_server_port}" + + app = FastAPI() + + @app.post("/v1/chat/completions") + async def chat_completions(request: Request, chat_request: ChatCompletionRequest): + headers = { + "Content-Type": "application/json", + "Authorization": request.headers.get("Authorization"), + } + + if chat_request.response_format is not None: + assert chat_request.response_format.type == "json_object", f"Unsupported response format: {chat_request.response_format.type}" + response_schema = chat_request.response_format.json_schema or {} + else: + response_schema = None + + messages = chat_request.messages + parser=None + grammar=None + + converter = SchemaConverter(prop_order={}, allow_fetch=False, dotall=False, raw_pattern=False) + + response_rule = converter.visit(response_schema, "response") if response_schema else None + + + delimiter = '<%$[SAMPLE]$%>' + empty_prompt = chat_format.render([], add_generation_prompt=True) + planted_prompt = chat_format.render([{"role": "assistant", "content": delimiter}], add_generation_prompt=False) + assert planted_prompt.startswith(empty_prompt), f"Planted prompt does not start with empty prompt: {planted_prompt} vs {empty_prompt}" + [prefix, suffix] = planted_prompt[len(empty_prompt):].split(delimiter) + + if chat_request.tools: + if chat_format.tool_style in (ToolStyle.DEFAULT, ToolStyle.NOUS_RESEARCH_HERMES): + messages = _add_system_prompt(messages, '\n'.join([ + 'Here are the tools available:', + '', + *(tool.model_dump_json() for tool in chat_request.tools), + '', + ])) + + tool_rules = [ + converter.visit( + dict( + type="object", + properties=dict( + name=dict(const=tool.function.name), + arguments=tool.function.parameters, + ), + required=['name', 'arguments'] + ), + f'{tool.function.name}-tool-call' + ) + for tool in chat_request.tools + ] + + # Constrain the output to be a non-tool-call message (constrained to a JSON schema or not) + # OR a tool-call message respecting the schema of any of the tools + converter._add_rule( + "root", + converter._format_literal(prefix) + " (" + + (response_rule or converter.not_literal("")) + " | " + + converter._format_literal("") + " (" + + ' | '.join(tool_rules) + + ") " + converter._format_literal("") + + ") " + converter._format_literal(suffix)) + grammar = converter.format_grammar() + + def parse(s: str): + if ''.startswith(s): + if s.startswith('') and s.endswith('' + suffix): + s = s[len(''):-len('' + suffix)] + return {"role": "assistant", "tool_calls": [json.loads(s)]} + return None + else: + return {"role": "assistant", "content": s} + + parser = parse + + elif chat_format.tool_style 
== ToolStyle.FUNCTIONARY_V2: + + ts_converter = SchemaToTypeScriptConverter() + + messages = _add_system_prompt(messages, '\n'.join([ + '// Supported function definitions that should be called when necessary.' + 'namespace functions {', + *[ + '// ' + tool.function.description.replace('\n', '\n// ') + '\n' + '' + 'type ' + tool.function.name + ' = (_: ' + ts_converter.visit(tool.function.parameters) + ") => any;\n" + for tool in chat_request.tools + ], + '} // namespace functions', + ])) + + # Only allowing a single tool call at a time for now. + # Note that if there were more, they'd be separated by a '<|from|>assistant' literal + converter._add_rule( + "root", + converter._format_literal(prefix) + " (" + + (response_rule or converter.not_literal("<|recipient|>")) + " | " + + (' | '.join( + converter._format_literal(f"<|recipient|>{tool.function.name}\n<|content|>") + " " + + converter.visit(tool.function.parameters, tool.function.name + '-args') + for tool in chat_request.tools + )) + + ") " + + ") " + converter._format_literal(suffix)) + grammar = converter.format_grammar() + else: + raise NotImplementedError(f'Unsupported tool_style: {chat_format.tool_style}') + + elif response_schema: + converter._add_rule('root', response_rule) + grammar = converter.format_grammar() + + def parse(s): + if s.endswith(suffix): + s = s[:-len(suffix)] + return {"role": "assistant", "content": s} + return None + + parser = parse + + if chat_format.strict_user_assistant_alternation: + print("TODO: merge system messages into user messages") + # new_messages = [] + + # TODO: Test whether the template supports formatting tool_calls + + prompt = chat_format.render(messages, add_generation_prompt=True) + # print(prompt) + # print(grammar) + print(json.dumps(dict( + prompt=prompt, + stream=chat_request.stream, + grammar=grammar, + ), indent=2)) + async with httpx.AsyncClient() as client: + response = await client.post( + f"{main_server_endpoint}/completions", + json=LlamaCppServerCompletionRequest( + prompt=prompt, + stream=chat_request.stream, + n_predict=100, + grammar=grammar, + ).model_dump(), + headers=headers, + timeout=None) + + return StreamingResponse(generate_chunks(response), media_type="text/event-stream") if chat_request.stream \ + else JSONResponse(response.json()) + + async def generate_chunks(response): + async for chunk in response.aiter_bytes(): + yield chunk + + uvicorn.run(app, host=host, port=port) + +if __name__ == "__main__": + CLI(main) + diff --git a/examples/openai/ts_converter.py b/examples/openai/ts_converter.py new file mode 100644 index 0000000000000..d018118cbab8d --- /dev/null +++ b/examples/openai/ts_converter.py @@ -0,0 +1,85 @@ +from typing import Any, List, Set, Tuple, Union +from jsonargparse import CLI + +class SchemaToTypeScriptConverter: + # TODO: comments for arguments! + # // Get the price of a particular car model + # type get_car_price = (_: { + # // The name of the car model. + # car_name: string, + # }) => any; + + # // get the weather of a location + # type get_weather = (_: { + # // where to get weather. 
+ # location: string, + # }) => any; + def _build_object_rule(self, properties: List[Tuple[str, Any]], required: Set[str], name: str, additional_properties: Union[bool, Any]): + return "{" + ', '.join( + f'{prop_name}{"" if prop_name in required else "?"}: {self.visit(prop_schema)}' + for prop_name, prop_schema in properties + ) + "}" + + def visit(self, schema: dict): + def print_constant(v): + return json.dumps(v) + + schema_type = schema.get('type') + schema_format = schema.get('format') + + if 'oneOf' in schema or 'anyOf' in schema: + return '|'.join(self.visit(s) for s in schema.get('oneOf') or schema.get('anyOf')) + + elif isinstance(schema_type, list): + return '|'.join(self.visit({'type': t}) for t in schema_type) + + elif 'const' in schema: + return print_constant(schema['const']) + + elif 'enum' in schema: + return '|'.join((print_constant(v) for v in schema['enum'])) + + elif schema_type in (None, 'object') and \ + ('properties' in schema or \ + ('additionalProperties' in schema and schema['additionalProperties'] is not True)): + required = set(schema.get('required', [])) + properties = list(schema.get('properties', {}).items()) + return self._build_object_rule(properties, required, schema.get('additionalProperties')) + + elif schema_type in (None, 'object') and 'allOf' in schema: + required = set() + properties = [] + def add_component(comp_schema, is_required): + if (ref := comp_schema.get('$ref')) is not None: + comp_schema = self._refs[ref] + + if 'properties' in comp_schema: + for prop_name, prop_schema in comp_schema['properties'].items(): + properties.append((prop_name, prop_schema)) + if is_required: + required.add(prop_name) + + for t in schema['allOf']: + if 'anyOf' in t: + for tt in t['anyOf']: + add_component(tt, is_required=False) + else: + add_component(t, is_required=True) + + return self._build_object_rule(properties, required, additional_properties=[]) + + elif schema_type in (None, 'array') and ('items' in schema or 'prefixItems' in schema): + items = schema.get('items') or schema['prefixItems'] + if isinstance(items, list): + return '[' + ', '.join(self.visit(item) for item in items) + '][]' + else: + return self.visit(items) + '[]' + + elif schema_type in (None, 'string') and schema_format == 'date-time': + return 'Date' + + elif (schema_type == 'object') or (len(schema) == 0): + return 'any' + + else: + return 'number' if schema_type == 'integer' else schema_type From ffc74360e211777b32e418a8c0e54f088d82f77d Mon Sep 17 00:00:00 2001 From: ochafik Date: Tue, 26 Mar 2024 01:26:45 +0000 Subject: [PATCH 04/68] agents: scripts to run scripts as sandboxed fastapi servers --- .gitmodules | 3 + examples/agents/README.md | 15 +++++ examples/agents/fastify-requirements.txt | 5 ++ examples/agents/fastify.py | 63 ++++++++++++++++++ examples/agents/hermes_function_calling | 1 + examples/agents/requirements.txt | 3 + examples/agents/run_sandboxed_tools.sh | 82 ++++++++++++++++++++++++ 7 files changed, 172 insertions(+) create mode 100644 examples/agents/README.md create mode 100644 examples/agents/fastify-requirements.txt create mode 100644 examples/agents/fastify.py create mode 160000 examples/agents/hermes_function_calling create mode 100644 examples/agents/requirements.txt create mode 100755 examples/agents/run_sandboxed_tools.sh diff --git a/.gitmodules b/.gitmodules index b7e8b8ff2f64e..9d262566cf47b 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,6 @@ [submodule "kompute"] path = kompute url = https://github.com/nomic-ai/kompute.git +[submodule 
"examples/agents/hermes_function_calling"] + path = examples/agents/hermes_function_calling + url = https://github.com/NousResearch/Hermes-Function-Calling diff --git a/examples/agents/README.md b/examples/agents/README.md new file mode 100644 index 0000000000000..eb743d0c11c04 --- /dev/null +++ b/examples/agents/README.md @@ -0,0 +1,15 @@ + +Edit `examples/agents/hermes_function_calling/utils.py`: + +```py +log_folder = os.environ.get('LOG_FOLDER', os.path.join(script_dir, "inference_logs")) +``` + +Then run: + +```bash +REQUIREMENTS_FILE=<( cat examples/agents/hermes_function_calling/requirements.txt | grep -vE "bitsandbytes|flash-attn" ) \ + examples/agents/run_sandboxed_tools.sh \ + examples/agents/hermes_function_calling/functions.py \ + -e LOG_FOLDER=/data/inference_logs +``` \ No newline at end of file diff --git a/examples/agents/fastify-requirements.txt b/examples/agents/fastify-requirements.txt new file mode 100644 index 0000000000000..abd7fe8d1f62f --- /dev/null +++ b/examples/agents/fastify-requirements.txt @@ -0,0 +1,5 @@ +fastapi[all] +pydantic +sse-starlette +uvicorn[all] +typer[all] \ No newline at end of file diff --git a/examples/agents/fastify.py b/examples/agents/fastify.py new file mode 100644 index 0000000000000..48df2bfda75f1 --- /dev/null +++ b/examples/agents/fastify.py @@ -0,0 +1,63 @@ +''' + Binds the functions of a python script as a FastAPI server. + + This is useful in combination w/ the examples/agent/run_sandboxed_tools.sh +''' +import os, sys, typing, importlib.util +from anyio import Path +import fastapi, uvicorn +import typer + +# from langchain_core.tools import BaseTool + +def load_source_as_module(source): + i = 0 + while (module_name := f'mod_{i}') in sys.modules: + i += 1 + + spec = importlib.util.spec_from_file_location(module_name, source) + module = importlib.util.module_from_spec(spec) + sys.modules[module_name] = module + spec.loader.exec_module(module) + return module + +def bind_functions(app, module): + for k in dir(module): + if k.startswith('_'): + continue + if k == k.capitalize(): + continue + v = getattr(module, k) + if not callable(v) or isinstance(v, typing.Type): + continue + if not hasattr(v, '__annotations__'): + continue + + vt = type(v) + if vt.__module__ == 'langchain_core.tools' and vt.__name__.endswith('Tool') and hasattr(v, 'func') and callable(v.func): + v = v.func + + print(f'INFO: Binding /{k}') + try: + app.post(k)(v) + except Exception as e: + print(f'WARNING: Failed to bind /{k}\n\t{e}') + +def main(files: typing.List[str], host: str = '0.0.0.0', port: int = 8000): + app = fastapi.FastAPI() + + for f in files: + if f.endswith('.py'): + sys.path.insert(0, str(Path(f).parent)) + + module = load_source_as_module(f) + else: + module = importlib.import_module(f) + + bind_functions(app, module) + + uvicorn.run(app, host=host, port=port) + +if __name__ == '__main__': + typer.run(main) + diff --git a/examples/agents/hermes_function_calling b/examples/agents/hermes_function_calling new file mode 160000 index 0000000000000..b4f757e27d87f --- /dev/null +++ b/examples/agents/hermes_function_calling @@ -0,0 +1 @@ +Subproject commit b4f757e27d87f4ab408f706f482c25a8e1508d59 diff --git a/examples/agents/requirements.txt b/examples/agents/requirements.txt new file mode 100644 index 0000000000000..2ff0ce927fcbc --- /dev/null +++ b/examples/agents/requirements.txt @@ -0,0 +1,3 @@ +jsonargparse +pydantic +typer[all] \ No newline at end of file diff --git a/examples/agents/run_sandboxed_tools.sh b/examples/agents/run_sandboxed_tools.sh new 
file mode 100755 index 0000000000000..eb8eb252e395e --- /dev/null +++ b/examples/agents/run_sandboxed_tools.sh @@ -0,0 +1,82 @@ +#!/bin/bash +# +# Runs a Python script in a sandboxed environment and makes its functions available as a web service. +# +# git submodule add https://github.com/NousResearch/Hermes-Function-Calling examples/agents/hermes_function_calling +# python examples/agents/fastify.py examples/agents/hermes_function_calling/functions.py +# REQUIREMENTS_FILE=<( cat examples/agents/hermes_function_calling/requirements.txt | grep -vE "bitsandbytes|flash-attn" ) examples/agents/run_sandboxed_tools.sh examples/agents/hermes_function_calling/functions.py -e LOG_FOLDER=/data/inference_logs +set -euo pipefail + +script="$( realpath "$1" )" +script_folder="$(dirname "$script")" +shift 1 + +function cleanup { + rm -rf "$BUILD_DIR" + echo "Deleted $BUILD_DIR" +} +trap cleanup EXIT +BUILD_DIR=$(mktemp -d) +DATA_DIR="${DATA_DIR:-$HOME/.llama.cpp/sandbox}" +SCRIPT_DIR=$( cd "$(dirname "$0")" ; pwd ) + +REQUIREMENTS_FILE="${REQUIREMENTS_FILE:-}" +if [[ -z "$REQUIREMENTS_FILE" && -f "$script_folder/requirements.txt" ]]; then + REQUIREMENTS_FILE="$script_folder/requirements.txt" +fi +if [[ -n "$REQUIREMENTS_FILE" ]]; then + cp "$REQUIREMENTS_FILE" "$BUILD_DIR/script-requirements.txt" +else + touch $BUILD_DIR/script-requirements.txt +fi + +echo "INFO: using DATA_DIR: $DATA_DIR" + +cp \ + "$SCRIPT_DIR/fastify-requirements.txt" \ + "$SCRIPT_DIR/fastify.py" \ + "$BUILD_DIR" + +mkdir -p "$DATA_DIR" + +PORT=${PORT:-8088} + +# BASE_IMAGE=pytorch/pytorch:latest +# BASE_IMAGE=python:3.10-slim +BASE_IMAGE=python:3.11-slim +# torch +# FROM nvidia/cuda:12.1.1-runtime-ubuntu20.04 +# RUN apt-get update && \ +# apt-get install -y python3-pip python3-dev && \ +# rm -rf /var/lib/apt/lists/* + +echo " + FROM $BASE_IMAGE + RUN apt-get update + RUN apt-get install -y gcc python3-dev git cmake + RUN pip install --upgrade pip + RUN pip install packaging numpy + RUN mkdir /src /data + + # Copy resources in increasing likelihood of change, to keep as much as possible cached + COPY fastify-requirements.txt /root + RUN pip install -r /root/fastify-requirements.txt + COPY script-requirements.txt /root + RUN pip install -r /root/script-requirements.txt + COPY fastify.py /root + + WORKDIR /data + # ENTRYPOINT uvicorn fastify:app --reload + ENTRYPOINT PYTHONPATH=/src python /root/fastify.py --port=$PORT '/src/$( basename "$script" )' +" | docker build "$BUILD_DIR" -f - -t llama.cpp/tools-base + +echo "#" +echo "# Binding $script to http://localhost:$PORT/" +echo "#" +set -x +docker run \ + "$@" \ + --mount "type=bind,source=$( realpath "$script_folder" ),target=/src,readonly" \ + --mount "type=bind,source=$( realpath "$DATA_DIR" ),target=/data" \ + -p "$PORT:$PORT" \ + -it llama.cpp/tools-base \ No newline at end of file From d5d9993679b090092b65b080b06e2a793daa19c7 Mon Sep 17 00:00:00 2001 From: ochafik Date: Tue, 26 Mar 2024 20:58:03 +0000 Subject: [PATCH 05/68] server.py: default tools work! 
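With the default tools prompt style, a tool call emitted by the model is parsed into the `Message` / `ToolCall` pydantic models updated in `examples/openai/api.py` below. A rough sketch of that parsed shape (the tool name and argument values are only illustrative):

```py
# Illustrative only: the assistant message shape returned once a tool call is parsed,
# following the Message / ToolCall models from examples/openai/api.py.
from examples.openai.api import Message, ToolCall

message = Message(
    role="assistant",
    content=None,
    tool_calls=[
        ToolCall(
            name="get_current_weather",
            arguments={"location": "Glasgow, UK", "format": "celsius"},
        )
    ],
)
print(message.model_dump_json(indent=2))
```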
--- .gitmodules | 3 - examples/agents/README.md | 15 -- examples/agents/hermes_function_calling | 1 - examples/agents/requirements.txt | 3 - examples/openai/README.md | 34 ++- examples/openai/api.py | 9 +- examples/openai/chat_format.py | 59 ----- .../fastify-requirements.txt | 0 examples/{agents => openai}/fastify.py | 2 - examples/openai/prompt1.txt | 43 ++++ examples/openai/prompting.py | 242 ++++++++++++++++++ examples/openai/requirements.txt | 4 +- .../{agents => openai}/run_sandboxed_tools.sh | 6 +- examples/openai/server.py | 166 +++--------- examples/openai/test.sh | 79 ++++++ 15 files changed, 446 insertions(+), 220 deletions(-) delete mode 100644 examples/agents/README.md delete mode 160000 examples/agents/hermes_function_calling delete mode 100644 examples/agents/requirements.txt delete mode 100644 examples/openai/chat_format.py rename examples/{agents => openai}/fastify-requirements.txt (100%) rename examples/{agents => openai}/fastify.py (97%) create mode 100644 examples/openai/prompt1.txt create mode 100644 examples/openai/prompting.py rename examples/{agents => openai}/run_sandboxed_tools.sh (93%) create mode 100755 examples/openai/test.sh diff --git a/.gitmodules b/.gitmodules index 9d262566cf47b..b7e8b8ff2f64e 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,6 +1,3 @@ [submodule "kompute"] path = kompute url = https://github.com/nomic-ai/kompute.git -[submodule "examples/agents/hermes_function_calling"] - path = examples/agents/hermes_function_calling - url = https://github.com/NousResearch/Hermes-Function-Calling diff --git a/examples/agents/README.md b/examples/agents/README.md deleted file mode 100644 index eb743d0c11c04..0000000000000 --- a/examples/agents/README.md +++ /dev/null @@ -1,15 +0,0 @@ - -Edit `examples/agents/hermes_function_calling/utils.py`: - -```py -log_folder = os.environ.get('LOG_FOLDER', os.path.join(script_dir, "inference_logs")) -``` - -Then run: - -```bash -REQUIREMENTS_FILE=<( cat examples/agents/hermes_function_calling/requirements.txt | grep -vE "bitsandbytes|flash-attn" ) \ - examples/agents/run_sandboxed_tools.sh \ - examples/agents/hermes_function_calling/functions.py \ - -e LOG_FOLDER=/data/inference_logs -``` \ No newline at end of file diff --git a/examples/agents/hermes_function_calling b/examples/agents/hermes_function_calling deleted file mode 160000 index b4f757e27d87f..0000000000000 --- a/examples/agents/hermes_function_calling +++ /dev/null @@ -1 +0,0 @@ -Subproject commit b4f757e27d87f4ab408f706f482c25a8e1508d59 diff --git a/examples/agents/requirements.txt b/examples/agents/requirements.txt deleted file mode 100644 index 2ff0ce927fcbc..0000000000000 --- a/examples/agents/requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ -jsonargparse -pydantic -typer[all] \ No newline at end of file diff --git a/examples/openai/README.md b/examples/openai/README.md index 47c9c67cc633e..078b52bbadff3 100644 --- a/examples/openai/README.md +++ b/examples/openai/README.md @@ -1,15 +1,45 @@ -# examples.openai: OpenAI API-compatible server +# examples.openai: OpenAI API-compatible server + agent / tools examples A simple Python server that sits above the C++ [../server](examples/server) and offers improved OAI compatibility. 
## Usage +Run a simple test: + +```bash +# Spawns a Python server (which spawns a C++ Server) then hits it w/ a tool-calling request +examples/openai/test.sh +``` + +To simply run the Python server (+ C++ server under the hood): + ```bash -python -m examples.openai -m some-model.gguf +python -m examples.openai +``` +## Tools usage (WIP) +```bash +git clone https://github.com/NousResearch/Hermes-Function-Calling examples/openai/hermes_function_calling ``` +Then edit `examples/agents/hermes_function_calling/utils.py`: + +```py +log_folder = os.environ.get('LOG_FOLDER', os.path.join(script_dir, "inference_logs")) +``` + +Then run tools in a sandbox: + +```bash +REQUIREMENTS_FILE=<( cat examples/agents/hermes_function_calling/requirements.txt | grep -vE "bitsandbytes|flash-attn" ) \ + examples/agents/run_sandboxed_tools.sh \ + examples/agents/hermes_function_calling/functions.py \ + -e LOG_FOLDER=/data/inference_logs +``` + +TODO: reactor that reads OpenAPI definitions and does the tool calling + ## Features The new examples/openai/server.py: diff --git a/examples/openai/api.py b/examples/openai/api.py index b883ecec41cbb..c44c6bfd1c4c4 100644 --- a/examples/openai/api.py +++ b/examples/openai/api.py @@ -1,9 +1,14 @@ -from typing import Any, Optional +from typing import Any, Dict, Optional from pydantic import BaseModel, Json +class ToolCall(BaseModel): + name: str + arguments: Dict[str, Any] + class Message(BaseModel): role: str - content: str + content: Optional[str] + tool_calls: Optional[list[ToolCall]] = None class ToolFunction(BaseModel): name: str diff --git a/examples/openai/chat_format.py b/examples/openai/chat_format.py deleted file mode 100644 index bb7d0c94c60f6..0000000000000 --- a/examples/openai/chat_format.py +++ /dev/null @@ -1,59 +0,0 @@ -from enum import StrEnum -import jinja2 - -from examples.openai.gguf_kvs import GGUFKeyValues, Keys - -def raise_exception(msg: str): - raise Exception(msg) - -class ToolStyle(StrEnum): - # https://cookbook.openai.com/examples/how_to_call_functions_with_chat_models - DEFAULT="Default", - # https://github.com/MeetKai/functionary - # TODO: look at https://github.com/ggerganov/llama.cpp/pull/5695 - # https://github.com/MeetKai/functionary/blob/main/functionary/prompt_template/prompt_template_v2.py - FUNCTIONARY_V2="Functionary V2", - # https://github.com/NousResearch/Hermes-Function-Calling - NOUS_RESEARCH_HERMES="Nous-Research-Hermes-Function-Calling", - -class ChatFormat: #(BaseModel): - def __init__(self, template: str, eos_token: str, bos_token: str): - env = jinja2.Environment(loader=jinja2.BaseLoader(), trim_blocks=True, lstrip_blocks=True) - self.template = env.from_string(template) - self.eos_token = eos_token - self.bos_token = bos_token - - self.strict_user_assistant_alternation = "{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception" in template - - if "<|recipient|>' + tool_call['function']['name']" in template: - self.tool_style = ToolStyle.FUNCTIONARY_V2 - else: - self.tool_style = ToolStyle.DEFAULT - - - def __str__(self): - return f"ChatFormat(template={self.template}, eos_token={self.eos_token}, bos_token={self.bos_token})" - - - @staticmethod - def from_gguf(metadata: GGUFKeyValues): - return ChatFormat( - template = metadata[Keys.Tokenizer.CHAT_TEMPLATE], - bos_token = metadata[Keys.Tokenizer.BOS_ID], - eos_token = metadata[Keys.Tokenizer.EOS_ID]) - # @staticmethod - # def from_gguf(model: Path): - # reader = GGUFReader(model.as_posix()) - # return ChatFormat( - # template = 
reader.fields[Keys.Tokenizer.CHAT_TEMPLATE].read(), - # bos_token = reader.fields[Keys.Tokenizer.BOS_ID].read(), - # eos_token = reader.fields[Keys.Tokenizer.EOS_ID].read()) - - def render(self, messages: list[dict], add_generation_prompt: bool, omit_bos: bool = False): - return self.template.render( - messages=messages, - eos_token=self.eos_token, - bos_token='' if omit_bos else self.bos_token, - raise_exception=raise_exception, - add_generation_prompt=add_generation_prompt, - ) diff --git a/examples/agents/fastify-requirements.txt b/examples/openai/fastify-requirements.txt similarity index 100% rename from examples/agents/fastify-requirements.txt rename to examples/openai/fastify-requirements.txt diff --git a/examples/agents/fastify.py b/examples/openai/fastify.py similarity index 97% rename from examples/agents/fastify.py rename to examples/openai/fastify.py index 48df2bfda75f1..8846a3823cbc7 100644 --- a/examples/agents/fastify.py +++ b/examples/openai/fastify.py @@ -8,8 +8,6 @@ import fastapi, uvicorn import typer -# from langchain_core.tools import BaseTool - def load_source_as_module(source): i = 0 while (module_name := f'mod_{i}') in sys.modules: diff --git a/examples/openai/prompt1.txt b/examples/openai/prompt1.txt new file mode 100644 index 0000000000000..afae47380a46a --- /dev/null +++ b/examples/openai/prompt1.txt @@ -0,0 +1,43 @@ +<|im_start|>system +Role: + You are a function calling AI agent with self-recursion. + You can call only one function at a time and analyse data you get from function response. + You are provided with function signatures within XML tags. + The current date is: March 25, 2024. + +Objective: + You may use agentic frameworks for reasoning and planning to help with user query. + Please call a function and wait for function results to be provided to you in the next iteration. + Don't make assumptions about what values to plug into function arguments. + Once you have called a function, results will be fed back to you within XML tags. + Don't make assumptions about tool results if XML tags are not present since function hasn't been executed yet. + Analyze the data once you get the results and call another function. + At each iteration please continue adding the your analysis to previous summary. + Your final response should directly answer the user query with an anlysis or summary of the results of function calls. + +Tools: + Here are the available tools: + + {"type":"function","function":{"name":"get_current_weather","description":"Get the current weather","parameters":{"type":"object","properties":{"location":{"type":"string","description":"The city and state, e.g. San Francisco, CA"},"format":{"type":"string","enum":["celsius","fahrenheit"],"description":"The temperature unit to use. Infer this from the users location."}},"required":["location","format"]}}} + {"type":"function","function":{"name":"get_n_day_weather_forecast","description":"Get an N-day weather forecast","parameters":{"type":"object","properties":{"location":{"type":"string","description":"The city and state, e.g. San Francisco, CA"},"format":{"type":"string","enum":["celsius","fahrenheit"],"description":"The temperature unit to use. 
Infer this from the users location."},"num_days":{"type":"integer","description":"The number of days to forecast"}},"required":["location","format","num_days"]}}} + + If the provided function signatures doesn't have the function you must call, you may write executable python code in markdown syntax and call code_interpreter() function as follows: + + {"arguments": {"code_markdown": , "name": "code_interpreter"}} + + Make sure that the json object above with code markdown block is parseable with json.loads() and the XML block with XML ElementTree. + +Instructions: + At the very first turn you don't have so you shouldn't not make up the results. + Please keep a running summary with analysis of previous function results and summaries from previous iterations. + Do not stop calling functions until the task has been accomplished or you've reached max iteration of 10. + Calling multiple functions at once can overload the system and increase cost so call one function at a time please. + If you plan to continue with analysis, always call another function. + For each function call return a valid json object (using doulbe quotes) with function name and arguments within XML tags as follows: + + {"arguments": , "name": } + +<|im_end|> +<|im_start|>user +what is the weather going to be like in San Francisco and Glasgow over the next 4 days (temperature in celsius for both)<|im_end|> +<|im_start|>assistant \ No newline at end of file diff --git a/examples/openai/prompting.py b/examples/openai/prompting.py new file mode 100644 index 0000000000000..71912fed50040 --- /dev/null +++ b/examples/openai/prompting.py @@ -0,0 +1,242 @@ +from enum import Enum +import jinja2 +import json +from pathlib import Path +import sys +from typing import Optional, Tuple, Callable +from typeguard import typechecked + +from examples.json_schema_to_grammar import SchemaConverter +from examples.openai.api import Tool, Message +from examples.openai.gguf_kvs import GGUFKeyValues, Keys +from examples.openai.ts_converter import SchemaToTypeScriptConverter + +@typechecked +def raise_exception(msg: str): + raise Exception(msg) + +class ChatFormat: + def __init__(self, template: str, eos_token: str, bos_token: str): + env = jinja2.Environment(loader=jinja2.BaseLoader(), trim_blocks=True, lstrip_blocks=True) + self.template = env.from_string(template) + self.eos_token = eos_token + self.bos_token = bos_token + + self.strict_user_assistant_alternation = "{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception" in template + + if "<|recipient|>' + tool_call['function']['name']" in template: + self.tool_style = ToolsPromptStyle.TYPESCRIPT_FUNCTIONARY_V2 + else: + self.tool_style = ToolsPromptStyle.TOOLS_LONG + + def __str__(self): + return f"ChatFormat(template={self.template}, eos_token={self.eos_token}, bos_token={self.bos_token})" + + @staticmethod + def from_gguf(metadata: GGUFKeyValues): + return ChatFormat( + template = metadata[Keys.Tokenizer.CHAT_TEMPLATE], + bos_token = metadata[Keys.Tokenizer.BOS_ID], + eos_token = metadata[Keys.Tokenizer.EOS_ID]) + + def render(self, messages: list[dict], add_generation_prompt: bool, omit_bos: bool = False): + return self.template.render( + messages=messages, + eos_token=self.eos_token, + bos_token='' if omit_bos else self.bos_token, + raise_exception=raise_exception, + add_generation_prompt=add_generation_prompt, + ) + +# While the API will be usable with a generic tools usage like OpenAI, +# (see 
https://cookbook.openai.com/examples/how_to_call_functions_with_chat_models), +# each model may need specific prompting (and/or constrained output, +# especially for models not fine-tuned for tool usage / function calling). +class ToolsPromptStyle(Enum): + # Short prompt w/ schemas + TOOLS_SHORT = 1 + + # Longer prompt w/ schemas + TOOLS_LONG = 2 + + # Large prompt for https://huggingface.co/NousResearch/Hermes-2-Pro-Mistral-7B + # Requires: + # - git clone https://github.com/NousResearch/Hermes-Function-Calling examples/openai/hermes_function_calling + # - Set large context length as their prompts are super long + TOOLS_HERMES_2_PRO = 3 + + # Short prompt w/ TypeScript definitions for https://github.com/MeetKai/functionary + # https://github.com/MeetKai/functionary/blob/main/functionary/prompt_template/prompt_template_v2.py + # Note: see this prior attempt to support Functionary: https://github.com/ggerganov/llama.cpp/pull/5695 + TYPESCRIPT_FUNCTIONARY_V2 = 4 + +@typechecked +def make_tools_prompt(chat_format: ChatFormat, tools: list[Tool], indent=2) -> Message: + + if chat_format.tool_style == ToolsPromptStyle.TOOLS_SHORT: + return Message( + role="system", + content='\n'.join([ + 'Here are the tools available:', + '', + *(json.dumps(tool.model_dump(), indent=indent) for tool in tools), + '', + ]) + ) + + elif chat_format.tool_style == ToolsPromptStyle.TOOLS_LONG: + return Message( + role="system", + content='\n'.join([ + '''You are a function calling AI model. You are provided with function signatures within XML tags.''', + '''You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools:''', + '''''', + *(json.dumps(tool.model_dump(), indent=indent) for tool in tools), + '''''', + '', + '''Use the following json schema for each tool call you will make: {"properties": {"arguments": {"title": "Arguments", "type": "object"}, "name": {"title": "Name", "type": "string"}}, "required": ["arguments", "name"], "title": "FunctionCall", "type": "object"}''', + '', + '''For each function call return a json object with function name and arguments within XML tags as follows:''', + '''''', + '''{"arguments": , "name": }''', + '''''', + ]) + ) + + elif chat_format.tool_style == ToolsPromptStyle.TYPESCRIPT_FUNCTIONARY_V2: + ts_converter = SchemaToTypeScriptConverter() + + return Message( + role="system", + content='\n'.join([ + '// Supported function definitions that should be called when necessary.' 
+ 'namespace functions {', + *[ + '// ' + tool.function.description.replace('\n', '\n// ') + '\n' + '' + 'type ' + tool.function.name + ' = (_: ' + ts_converter.visit(tool.function.parameters) + ") => any;\n" + for tool in tools + ], + '} // namespace functions', + ]) + ) + + elif chat_format.tool_style == ToolsPromptStyle.TOOLS_HERMES_2_PRO: + # Hackily import https://github.com/NousResearch/Hermes-Function-Calling + path = str(Path(__file__).parent / "hermes_function_calling") + if path not in sys.path: sys.path.insert(0, path) + try: + from examples.openai.hermes_function_calling.prompter import PromptManager + except ImportError: + raise ImportError(f"Please `git clone https://github.com/NousResearch/Hermes-Function-Calling {path}`") + + prompt = PromptManager().generate_prompt(user_prompt=[], tools=[json.dumps(tool) for tool in tools]) + assert len(prompt) == 1 and prompt[0]["role"] == "system" + return Message(**prompt[0]) + + else: + raise ValueError(f"Unsupported tool call style: {chat_format.tool_style}") + +@typechecked +def _outputs_tool_call_tags(style: ToolsPromptStyle) -> bool: + return style in ( + ToolsPromptStyle.TOOLS_SHORT, + ToolsPromptStyle.TOOLS_LONG, + ToolsPromptStyle.TOOLS_HERMES_2_PRO, + ) + +@typechecked +def make_grammar(chat_format: ChatFormat, tools: list[Tool], response_schema: Optional[dict], indent=2) -> Tuple[Optional[str], Callable[[str], Optional[Message]]]: + + converter = SchemaConverter(prop_order={}, allow_fetch=False, dotall=False, raw_pattern=False) + + response_rule = converter.visit(response_schema, "response") if response_schema else None + + delimiter = '<%$[SAMPLE]$%>' + empty_prompt = chat_format.render([], add_generation_prompt=True).strip() + planted_prompt = chat_format.render([{"role": "assistant", "content": delimiter}], add_generation_prompt=False).strip() + assert planted_prompt.startswith(empty_prompt), f"Planted prompt does not start with empty prompt: {planted_prompt} vs {empty_prompt}" + [prefix, suffix] = planted_prompt[len(empty_prompt):].split(delimiter) + + if tools: + if _outputs_tool_call_tags(chat_format.tool_style): + tool_rules = [ + converter.visit( + dict( + type="object", + properties=dict( + name=dict(const=tool.function.name), + arguments=tool.function.parameters, + ), + required=['name', 'arguments'] + ), + f'{tool.function.name}-tool-call' + ) + for tool in tools + ] + + # Constrain the output to be a non-tool-call message (constrained to a JSON schema or not) + # OR a tool-call message respecting the schema of any of the tools + converter._add_rule( + "root", + converter._format_literal(prefix) + " (" + + (response_rule or converter.not_literal("")) + " | " + + converter._format_literal("") + " (" + + ' | '.join(tool_rules) + + ") " + converter._format_literal("") + + ")") # + converter._format_literal(suffix)) + + @typechecked + def parse(s: str) -> Optional[Message]: + ls = s.lstrip() + if ''.startswith(ls) or ls.startswith(''): + if ls.startswith('') and ls.endswith('' + suffix): + tool_call = ls[len(''):-len('' + suffix)] + return Message(role="assistant", content=None, tool_calls=[json.loads(tool_call)]) + return None + else: + return Message(role="assistant", content=s) + + return (converter.format_grammar(), parse) + + elif chat_format.tool_style == ToolsPromptStyle.TYPESCRIPT_FUNCTIONARY_V2: + # Only allowing a single tool call at a time for now. 
+ # Note that if there were more, they'd be separated by a '<|from|>assistant' literal + converter._add_rule( + "root", + converter._format_literal(prefix) + " (" + + (response_rule or converter.not_literal("<|recipient|>")) + " | " + + (' | '.join( + converter._format_literal(f"<|recipient|>{tool.function.name}\n<|content|>") + " " + + converter.visit(tool.function.parameters, tool.function.name + '-args') + for tool in tools + )) + + ") " + + ")") # + converter._format_literal(suffix)) + + @typechecked + def parse(s: str) -> Optional[Message]: + raise NotImplementedError(f'TODO: parse tool_style {chat_format.tool_style}: {s}') + + return (converter.format_grammar(), parse) + + elif response_schema: + converter._add_rule("root", response_rule + ' ' + converter._format_literal(suffix)) + + @typechecked + def parse(s: str) -> Optional[Message]: + if response_rule.endswith(suffix): + return Message(role="assistant", content=s[:-len(suffix)]) + + return (converter.format_grammar(), parse) + + else: + converter._add_rule("root", converter._format_literal(prefix) + ' ' + converter._format_literal(suffix)) + + @typechecked + def parse(s: str) -> Optional[Message]: + if s.endswith(suffix): + return Message(role="assistant", content=s[:-len(suffix)]) + return None + + return (None, parse) + diff --git a/examples/openai/requirements.txt b/examples/openai/requirements.txt index 219fda4178b5e..b092bf19f9ba7 100644 --- a/examples/openai/requirements.txt +++ b/examples/openai/requirements.txt @@ -1,7 +1,7 @@ fastapi[all] gguf jinja2 -jsonargparse pydantic sse-starlette -uvicorn[all] \ No newline at end of file +uvicorn[all] +typer[all] \ No newline at end of file diff --git a/examples/agents/run_sandboxed_tools.sh b/examples/openai/run_sandboxed_tools.sh similarity index 93% rename from examples/agents/run_sandboxed_tools.sh rename to examples/openai/run_sandboxed_tools.sh index eb8eb252e395e..88e61f568ad14 100755 --- a/examples/agents/run_sandboxed_tools.sh +++ b/examples/openai/run_sandboxed_tools.sh @@ -2,9 +2,9 @@ # # Runs a Python script in a sandboxed environment and makes its functions available as a web service. 
# -# git submodule add https://github.com/NousResearch/Hermes-Function-Calling examples/agents/hermes_function_calling -# python examples/agents/fastify.py examples/agents/hermes_function_calling/functions.py -# REQUIREMENTS_FILE=<( cat examples/agents/hermes_function_calling/requirements.txt | grep -vE "bitsandbytes|flash-attn" ) examples/agents/run_sandboxed_tools.sh examples/agents/hermes_function_calling/functions.py -e LOG_FOLDER=/data/inference_logs +# git submodule add https://github.com/NousResearch/Hermes-Function-Calling examples/openai/hermes_function_calling +# python examples/openai/fastify.py examples/openai/hermes_function_calling/functions.py +# REQUIREMENTS_FILE=<( cat examples/openai/hermes_function_calling/requirements.txt | grep -vE "bitsandbytes|flash-attn" ) examples/agents/run_sandboxed_tools.sh examples/agents/hermes_function_calling/functions.py -e LOG_FOLDER=/data/inference_logs set -euo pipefail script="$( realpath "$1" )" diff --git a/examples/openai/server.py b/examples/openai/server.py index db075bddb6fe2..1a639e3a6e283 100644 --- a/examples/openai/server.py +++ b/examples/openai/server.py @@ -1,35 +1,39 @@ +# https://gist.github.com/ochafik/a3d4a5b9e52390544b205f37fb5a0df3 +# pip install "fastapi[all]" "uvicorn[all]" sse-starlette jsonargparse jinja2 pydantic + import json, sys, subprocess, atexit from pathlib import Path -# sys.path.insert(0, str(Path(__file__).parent.parent)) +sys.path.insert(0, str(Path(__file__).parent.parent.parent)) from examples.openai.llama_cpp_server_api import LlamaCppServerCompletionRequest -from examples.json_schema_to_grammar import SchemaConverter +from examples.openai.gguf_kvs import GGUFKeyValues, Keys +from examples.openai.api import Message, ChatCompletionRequest +from examples.openai.prompting import ChatFormat, make_grammar, make_tools_prompt -from typing import Optional +from fastapi import FastAPI, Request +from fastapi.responses import JSONResponse import httpx -from fastapi import Depends, FastAPI, Request, Response from starlette.responses import StreamingResponse -from fastapi.responses import JSONResponse -from jsonargparse import CLI +from typing import Annotated, Optional +import typer +from typeguard import typechecked -from examples.openai.ts_converter import SchemaToTypeScriptConverter -from examples.openai.gguf_kvs import GGUFKeyValues, Keys -from examples.openai.api import Message, Tool, ToolFunction, ResponseFormat, ChatCompletionRequest -from examples.openai.chat_format import ChatFormat, ToolStyle - -def _add_system_prompt(messages: list['Message'], system_prompt: str): +@typechecked +def _add_system_prompt(messages: list[Message], system_prompt: Message) -> list[Message]: + assert system_prompt.role == "system" # TODO: add to last system message, or create a new one just before the last user message system_message = next(((i, m) for i, m in enumerate(messages) if m.role == "system"), None) if system_message is not None: (i, m) = system_message - messages[i].content = m.content + '\n' + system_prompt + return messages[:i] + [Message(role="system", content=m.content + '\n' + system_prompt.content)] + messages[i+1:] else: - messages.insert(0, Message(role="system", content=system_prompt)) - return messages + return [Message(role="system", content=system_prompt)] + messages def main( - model: Path = Path("/Users/ochafik/AI/Models/Hermes-2-Pro-Mistral-7B.Q8_0.gguf"), + model: Annotated[Optional[Path], typer.Option("--model", "-m")] = "models/7B/ggml-model-f16.gguf", + # model: Path = 
Path("/Users/ochafik/AI/Models/Hermes-2-Pro-Mistral-7B.Q8_0.gguf"), + # model_url: Annotated[Optional[str], typer.Option("--model-url", "-mu")] = None, host: str = "localhost", port: int = 8080, main_server_endpoint: Optional[str] = None, @@ -48,7 +52,7 @@ def main( "./server", "-m", model, "--host", main_server_host, "--port", f'{main_server_port}', '-ctk', 'q4_0', '-ctv', 'f16', - "-c", f"8192", + "-c", f"{2*8192}", # "-c", f"{context_length}", ]) atexit.register(server_process.kill) @@ -70,110 +74,10 @@ async def chat_completions(request: Request, chat_request: ChatCompletionRequest response_schema = None messages = chat_request.messages - parser=None - grammar=None - - converter = SchemaConverter(prop_order={}, allow_fetch=False, dotall=False, raw_pattern=False) - - response_rule = converter.visit(response_schema, "response") if response_schema else None - - - delimiter = '<%$[SAMPLE]$%>' - empty_prompt = chat_format.render([], add_generation_prompt=True) - planted_prompt = chat_format.render([{"role": "assistant", "content": delimiter}], add_generation_prompt=False) - assert planted_prompt.startswith(empty_prompt), f"Planted prompt does not start with empty prompt: {planted_prompt} vs {empty_prompt}" - [prefix, suffix] = planted_prompt[len(empty_prompt):].split(delimiter) - if chat_request.tools: - if chat_format.tool_style in (ToolStyle.DEFAULT, ToolStyle.NOUS_RESEARCH_HERMES): - messages = _add_system_prompt(messages, '\n'.join([ - 'Here are the tools available:', - '', - *(tool.model_dump_json() for tool in chat_request.tools), - '', - ])) - - tool_rules = [ - converter.visit( - dict( - type="object", - properties=dict( - name=dict(const=tool.function.name), - arguments=tool.function.parameters, - ), - required=['name', 'arguments'] - ), - f'{tool.function.name}-tool-call' - ) - for tool in chat_request.tools - ] - - # Constrain the output to be a non-tool-call message (constrained to a JSON schema or not) - # OR a tool-call message respecting the schema of any of the tools - converter._add_rule( - "root", - converter._format_literal(prefix) + " (" + - (response_rule or converter.not_literal("")) + " | " + - converter._format_literal("") + " (" + - ' | '.join(tool_rules) + - ") " + converter._format_literal("") + - ") " + converter._format_literal(suffix)) - grammar = converter.format_grammar() - - def parse(s: str): - if ''.startswith(s): - if s.startswith('') and s.endswith('' + suffix): - s = s[len(''):-len('' + suffix)] - return {"role": "assistant", "tool_calls": [json.loads(s)]} - return None - else: - return {"role": "assistant", "content": s} - - parser = parse - - elif chat_format.tool_style == ToolStyle.FUNCTIONARY_V2: - - ts_converter = SchemaToTypeScriptConverter() - - messages = _add_system_prompt(messages, '\n'.join([ - '// Supported function definitions that should be called when necessary.' - 'namespace functions {', - *[ - '// ' + tool.function.description.replace('\n', '\n// ') + '\n' + '' - 'type ' + tool.function.name + ' = (_: ' + ts_converter.visit(tool.function.parameters) + ") => any;\n" - for tool in chat_request.tools - ], - '} // namespace functions', - ])) - - # Only allowing a single tool call at a time for now. 
- # Note that if there were more, they'd be separated by a '<|from|>assistant' literal - converter._add_rule( - "root", - converter._format_literal(prefix) + " (" + - (response_rule or converter.not_literal("<|recipient|>")) + " | " + - (' | '.join( - converter._format_literal(f"<|recipient|>{tool.function.name}\n<|content|>") + " " + - converter.visit(tool.function.parameters, tool.function.name + '-args') - for tool in chat_request.tools - )) + - ") " + - ") " + converter._format_literal(suffix)) - grammar = converter.format_grammar() - else: - raise NotImplementedError(f'Unsupported tool_style: {chat_format.tool_style}') - - elif response_schema: - converter._add_rule('root', response_rule) - grammar = converter.format_grammar() - - def parse(s): - if s.endswith(suffix): - s = s[:-len(suffix)] - return {"role": "assistant", "content": s} - return None - - parser = parse + messages = _add_system_prompt(messages, make_tools_prompt(chat_format, chat_request.tools)) + + (grammar, parser) = make_grammar(chat_format, chat_request.tools, response_schema) if chat_format.strict_user_assistant_alternation: print("TODO: merge system messages into user messages") @@ -182,11 +86,9 @@ def parse(s): # TODO: Test whether the template supports formatting tool_calls prompt = chat_format.render(messages, add_generation_prompt=True) - # print(prompt) - # print(grammar) print(json.dumps(dict( - prompt=prompt, stream=chat_request.stream, + prompt=prompt, grammar=grammar, ), indent=2)) async with httpx.AsyncClient() as client: @@ -195,14 +97,23 @@ def parse(s): json=LlamaCppServerCompletionRequest( prompt=prompt, stream=chat_request.stream, - n_predict=100, + n_predict=300, grammar=grammar, ).model_dump(), headers=headers, timeout=None) - return StreamingResponse(generate_chunks(response), media_type="text/event-stream") if chat_request.stream \ - else JSONResponse(response.json()) + if chat_request.stream: + # TODO: Remove suffix from streamed response using partial parser. + assert not chat_request.tools and not chat_request.response_format, "Streaming not supported yet with tools or response_format" + return StreamingResponse(generate_chunks(response), media_type="text/event-stream") + else: + result = response.json() + print(json.dumps(result, indent=2)) + message = parser(result["content"]) + assert message is not None, f"Failed to parse response: {response.text}" + return JSONResponse(message.model_dump()) + # return JSONResponse(response.json()) async def generate_chunks(response): async for chunk in response.aiter_bytes(): @@ -211,5 +122,4 @@ async def generate_chunks(response): uvicorn.run(app, host=host, port=port) if __name__ == "__main__": - CLI(main) - + typer.run(main) diff --git a/examples/openai/test.sh b/examples/openai/test.sh new file mode 100755 index 0000000000000..5fffc54d43568 --- /dev/null +++ b/examples/openai/test.sh @@ -0,0 +1,79 @@ +#!/bin/bash +set -euo pipefail + +SERVER_PID="" +function cleanup() { + if [ -n "$SERVER_PID" ]; then + echo "# Killing server" + kill $SERVER_PID + wait $SERVER_PID + fi +} +trap cleanup EXIT + +echo "# Starting the server" +python -m examples.openai --model ~/AI/Models/Hermes-2-Pro-Mistral-7B.Q8_0.gguf & +SERVER_PID=$! 
+ +sleep 5 + +echo "# Send a message to the chat API" + +curl http://localhost:8080/v1/chat/completions \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer $OPENAI_API_KEY" \ + -d '{ + "model": "gpt-3.5-turbo", + "tools": [{ + "type": "function", + "function": { + "name": "get_current_weather", + "description": "Get the current weather", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "The city and state, e.g. San Francisco, CA" + }, + "format": { + "type": "string", + "enum": ["celsius", "fahrenheit"], + "description": "The temperature unit to use. Infer this from the users location." + } + }, + "required": ["location", "format"] + } + } + }, { + "type": "function", + "function": { + "name": "get_n_day_weather_forecast", + "description": "Get an N-day weather forecast", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "The city and state, e.g. San Francisco, CA" + }, + "format": { + "type": "string", + "enum": ["celsius", "fahrenheit"], + "description": "The temperature unit to use. Infer this from the users location." + }, + "num_days": { + "type": "integer", + "description": "The number of days to forecast" + } + }, + "required": ["location", "format", "num_days"] + } + } + }], + "messages": [ + {"role": "system", "content": "Do not make assumptions about what values to plug into functions. Ask for clarification if a user request is ambiguous."}, + {"role": "user", "content": "what is the weather going to be like in San Francisco and Glasgow over the next 4 days. Give the temperatyre in celsius for both locations."} + ] + }' + From 8afd4de17bc196dd588866445d49ac48d715396a Mon Sep 17 00:00:00 2001 From: ochafik Date: Wed, 27 Mar 2024 00:12:14 +0000 Subject: [PATCH 06/68] server.py: make tools work w/ mixtral-8x7b-instruct --- examples/openai/README.md | 4 +++ examples/openai/prompting.py | 65 +++++++++++++++++++++++++++++------- examples/openai/server.py | 13 +------- examples/openai/test.sh | 5 +-- 4 files changed, 61 insertions(+), 26 deletions(-) diff --git a/examples/openai/README.md b/examples/openai/README.md index 078b52bbadff3..8ddcf9a0c78ef 100644 --- a/examples/openai/README.md +++ b/examples/openai/README.md @@ -56,6 +56,10 @@ The new examples/openai/server.py: ## TODO +- Support tool result messages + +- Reactor / + - Embedding endpoint w/ distinct server subprocess - Automatic/manual session caching diff --git a/examples/openai/prompting.py b/examples/openai/prompting.py index 71912fed50040..0d7e0de560c1e 100644 --- a/examples/openai/prompting.py +++ b/examples/openai/prompting.py @@ -3,6 +3,7 @@ import json from pathlib import Path import sys +import re from typing import Optional, Tuple, Callable from typeguard import typechecked @@ -15,6 +16,7 @@ def raise_exception(msg: str): raise Exception(msg) +@typechecked class ChatFormat: def __init__(self, template: str, eos_token: str, bos_token: str): env = jinja2.Environment(loader=jinja2.BaseLoader(), trim_blocks=True, lstrip_blocks=True) @@ -32,14 +34,43 @@ def __init__(self, template: str, eos_token: str, bos_token: str): def __str__(self): return f"ChatFormat(template={self.template}, eos_token={self.eos_token}, bos_token={self.bos_token})" + def add_system_prompt(self, messages: list[Message], system_prompt: Message) -> list[Message]: + assert system_prompt.role == "system" + # TODO: add to last system message, or create a new one just before the last user message + system_message = 
next(((i, m) for i, m in enumerate(messages) if m.role == "system"), None) + if system_message is not None: + (i, m) = system_message + return messages[:i] + [Message(role="system", content=m.content + '\n' + system_prompt.content)] + messages[i+1:] + else: + return [Message(role="system", content=system_prompt)] + messages + @staticmethod def from_gguf(metadata: GGUFKeyValues): + tokens = metadata[Keys.Tokenizer.LIST] return ChatFormat( template = metadata[Keys.Tokenizer.CHAT_TEMPLATE], - bos_token = metadata[Keys.Tokenizer.BOS_ID], - eos_token = metadata[Keys.Tokenizer.EOS_ID]) + bos_token = tokens[metadata[Keys.Tokenizer.BOS_ID]], + eos_token = tokens[metadata[Keys.Tokenizer.EOS_ID]]) - def render(self, messages: list[dict], add_generation_prompt: bool, omit_bos: bool = False): + def render(self, messages: list[Message], add_generation_prompt: bool, omit_bos: bool = False): + if self.strict_user_assistant_alternation and any(m.role not in ('user', 'assistant') for m in messages): + new_messages=[] + i = 0 + n = len(messages) + while i < n: + if messages[i].role == 'system': + assert messages[i+1].role == 'user' + new_messages.append(Message( + role="user", + content=f'[SYS]{messages[i].content}[/SYS]\n{messages[i+1].content}')) + i += 2 + else: + new_messages.append(messages[i]) + i += 1 + # print(f'new_messages={json.dumps(new_messages, indent=2)}') + messages = new_messages + print(f'messages={messages}') + return self.template.render( messages=messages, eos_token=self.eos_token, @@ -144,6 +175,8 @@ def _outputs_tool_call_tags(style: ToolsPromptStyle) -> bool: ToolsPromptStyle.TOOLS_HERMES_2_PRO, ) +_tool_call_re = re.compile('(.*?)', re.DOTALL) + @typechecked def make_grammar(chat_format: ChatFormat, tools: list[Tool], response_schema: Optional[dict], indent=2) -> Tuple[Optional[str], Callable[[str], Optional[Message]]]: @@ -152,8 +185,9 @@ def make_grammar(chat_format: ChatFormat, tools: list[Tool], response_schema: Op response_rule = converter.visit(response_schema, "response") if response_schema else None delimiter = '<%$[SAMPLE]$%>' - empty_prompt = chat_format.render([], add_generation_prompt=True).strip() - planted_prompt = chat_format.render([{"role": "assistant", "content": delimiter}], add_generation_prompt=False).strip() + user_msg = Message(role="user", content="Hey") + empty_prompt = chat_format.render([user_msg], add_generation_prompt=True).strip() + planted_prompt = chat_format.render([user_msg, Message(role="assistant", content=delimiter)], add_generation_prompt=False).strip() assert planted_prompt.startswith(empty_prompt), f"Planted prompt does not start with empty prompt: {planted_prompt} vs {empty_prompt}" [prefix, suffix] = planted_prompt[len(empty_prompt):].split(delimiter) @@ -187,14 +221,21 @@ def make_grammar(chat_format: ChatFormat, tools: list[Tool], response_schema: Op @typechecked def parse(s: str) -> Optional[Message]: - ls = s.lstrip() - if ''.startswith(ls) or ls.startswith(''): - if ls.startswith('') and ls.endswith('' + suffix): - tool_call = ls[len(''):-len('' + suffix)] - return Message(role="assistant", content=None, tool_calls=[json.loads(tool_call)]) - return None - else: + # ls = s.lstrip() + parts = _tool_call_re.split(s) + if len(parts) == 1: return Message(role="assistant", content=s) + else: + content = [] + tool_calls = [] + for i, part in enumerate(parts): + if i % 2 == 0: + content.append(part) + else: + tool_calls.append(json.loads(part)) + + content = ''.join(content).strip() + return Message(role="assistant", content=None if content == 
'' else content, tool_calls=tool_calls) return (converter.format_grammar(), parse) diff --git a/examples/openai/server.py b/examples/openai/server.py index 1a639e3a6e283..ca1e3eab57991 100644 --- a/examples/openai/server.py +++ b/examples/openai/server.py @@ -19,17 +19,6 @@ import typer from typeguard import typechecked -@typechecked -def _add_system_prompt(messages: list[Message], system_prompt: Message) -> list[Message]: - assert system_prompt.role == "system" - # TODO: add to last system message, or create a new one just before the last user message - system_message = next(((i, m) for i, m in enumerate(messages) if m.role == "system"), None) - if system_message is not None: - (i, m) = system_message - return messages[:i] + [Message(role="system", content=m.content + '\n' + system_prompt.content)] + messages[i+1:] - else: - return [Message(role="system", content=system_prompt)] + messages - def main( model: Annotated[Optional[Path], typer.Option("--model", "-m")] = "models/7B/ggml-model-f16.gguf", # model: Path = Path("/Users/ochafik/AI/Models/Hermes-2-Pro-Mistral-7B.Q8_0.gguf"), @@ -75,7 +64,7 @@ async def chat_completions(request: Request, chat_request: ChatCompletionRequest messages = chat_request.messages if chat_request.tools: - messages = _add_system_prompt(messages, make_tools_prompt(chat_format, chat_request.tools)) + messages = chat_format.add_system_prompt(messages, make_tools_prompt(chat_format, chat_request.tools)) (grammar, parser) = make_grammar(chat_format, chat_request.tools, response_schema) diff --git a/examples/openai/test.sh b/examples/openai/test.sh index 5fffc54d43568..397682247add7 100755 --- a/examples/openai/test.sh +++ b/examples/openai/test.sh @@ -12,7 +12,8 @@ function cleanup() { trap cleanup EXIT echo "# Starting the server" -python -m examples.openai --model ~/AI/Models/Hermes-2-Pro-Mistral-7B.Q8_0.gguf & +python -m examples.openai --model ~/AI/Models/mixtral-8x7b-instruct-v0.1.Q8_0.gguf & +# python -m examples.openai --model ~/AI/Models/Hermes-2-Pro-Mistral-7B.Q8_0.gguf & SERVER_PID=$! sleep 5 @@ -73,7 +74,7 @@ curl http://localhost:8080/v1/chat/completions \ }], "messages": [ {"role": "system", "content": "Do not make assumptions about what values to plug into functions. Ask for clarification if a user request is ambiguous."}, - {"role": "user", "content": "what is the weather going to be like in San Francisco and Glasgow over the next 4 days. 
Give the temperatyre in celsius for both locations."} + {"role": "user", "content": "what is the weather going to be like in San Francisco and Glasgow over the next 4 days."} ] }' From aa9605c7514531d6c2bbfef43a8e4bf801c925c7 Mon Sep 17 00:00:00 2001 From: ochafik Date: Wed, 27 Mar 2024 01:50:26 +0000 Subject: [PATCH 07/68] server.py: kinda api-compliant output, disabled grammar --- examples/openai/api.py | 29 ++++++++++++- examples/openai/prompting.py | 76 +++++++++++++++++++++++++++------ examples/openai/server.py | 45 ++++++++++++++----- examples/openai/test.sh | 8 ++-- examples/openai/ts_converter.py | 11 +++-- 5 files changed, 136 insertions(+), 33 deletions(-) diff --git a/examples/openai/api.py b/examples/openai/api.py index c44c6bfd1c4c4..0d7ddc1118331 100644 --- a/examples/openai/api.py +++ b/examples/openai/api.py @@ -1,10 +1,15 @@ -from typing import Any, Dict, Optional +from typing import Any, Dict, Literal, Optional, Union from pydantic import BaseModel, Json -class ToolCall(BaseModel): +class FunctionCall(BaseModel): name: str arguments: Dict[str, Any] +class ToolCall(BaseModel): + id: Optional[str] = None + type: Literal["function"] = "function" + function: FunctionCall + class Message(BaseModel): role: str content: Optional[str] @@ -30,3 +35,23 @@ class ChatCompletionRequest(BaseModel): response_format: Optional[ResponseFormat] = None temperature: float = 1.0 stream: bool = False + +class Choice(BaseModel): + index: int + message: Message + logprobs: Optional[Json] = None + finish_reason: Union[Literal["stop"], Literal["tool_calls"]] + +class Usage(BaseModel): + prompt_tokens: int + completion_tokens: int + total_tokens: int + +class ChatCompletionResponse(BaseModel): + id: str + object: Literal["chat.completion"] + created: int + model: str + choices: list[Choice] + usage: Usage + system_fingerprint: str \ No newline at end of file diff --git a/examples/openai/prompting.py b/examples/openai/prompting.py index 0d7e0de560c1e..ea6572d7be1c0 100644 --- a/examples/openai/prompting.py +++ b/examples/openai/prompting.py @@ -2,13 +2,14 @@ import jinja2 import json from pathlib import Path -import sys +import random import re +import sys from typing import Optional, Tuple, Callable from typeguard import typechecked from examples.json_schema_to_grammar import SchemaConverter -from examples.openai.api import Tool, Message +from examples.openai.api import Tool, Message, FunctionCall, ToolCall from examples.openai.gguf_kvs import GGUFKeyValues, Keys from examples.openai.ts_converter import SchemaToTypeScriptConverter @@ -42,7 +43,7 @@ def add_system_prompt(self, messages: list[Message], system_prompt: Message) -> (i, m) = system_message return messages[:i] + [Message(role="system", content=m.content + '\n' + system_prompt.content)] + messages[i+1:] else: - return [Message(role="system", content=system_prompt)] + messages + return [system_prompt] + messages @staticmethod def from_gguf(metadata: GGUFKeyValues): @@ -69,7 +70,7 @@ def render(self, messages: list[Message], add_generation_prompt: bool, omit_bos: i += 1 # print(f'new_messages={json.dumps(new_messages, indent=2)}') messages = new_messages - print(f'messages={messages}') + # print(f'messages={messages}') return self.template.render( messages=messages, @@ -175,10 +176,15 @@ def _outputs_tool_call_tags(style: ToolsPromptStyle) -> bool: ToolsPromptStyle.TOOLS_HERMES_2_PRO, ) -_tool_call_re = re.compile('(.*?)', re.DOTALL) - +_tool_call_re = re.compile( + '(.*?)', re.DOTALL) +_recipient_content_re = 
re.compile(r'(?:(?:<\|(?:stop|from)\|>)+ *assistant\n<\|recipient\|>|^) *([^ <|>\n]+) *\n<\|content\|>(.*?)(?:$|<\|stop\|>\s*$|(?=(?:<\|(?:stop|from)\|>)+ *assistant\n))', re.DOTALL) + +def gen_callid(): + return f'call_{random.randint(0, 1000000)}' + @typechecked -def make_grammar(chat_format: ChatFormat, tools: list[Tool], response_schema: Optional[dict], indent=2) -> Tuple[Optional[str], Callable[[str], Optional[Message]]]: +def make_grammar(chat_format: ChatFormat, tools: list[Tool], response_schema: Optional[dict], indent=2) -> Tuple[Optional[str], Callable[[str], Optional[list[Message]]]]: converter = SchemaConverter(prop_order={}, allow_fetch=False, dotall=False, raw_pattern=False) @@ -191,6 +197,13 @@ def make_grammar(chat_format: ChatFormat, tools: list[Tool], response_schema: Op assert planted_prompt.startswith(empty_prompt), f"Planted prompt does not start with empty prompt: {planted_prompt} vs {empty_prompt}" [prefix, suffix] = planted_prompt[len(empty_prompt):].split(delimiter) + def strip_suffix(s: str) -> str: + if s.endswith(suffix): + return s[:-len(suffix)] + else: + print(f"Expected suffix ({suffix}) not found: {s}") + return s + if tools: if _outputs_tool_call_tags(chat_format.tool_style): tool_rules = [ @@ -221,6 +234,8 @@ def make_grammar(chat_format: ChatFormat, tools: list[Tool], response_schema: Op @typechecked def parse(s: str) -> Optional[Message]: + s = strip_suffix(s) + # ls = s.lstrip() parts = _tool_call_re.split(s) if len(parts) == 1: @@ -232,10 +247,21 @@ def parse(s: str) -> Optional[Message]: if i % 2 == 0: content.append(part) else: - tool_calls.append(json.loads(part)) + tool_calls.append( + ToolCall( + id=gen_callid(), + function=FunctionCall(**json.loads(part)))) content = ''.join(content).strip() return Message(role="assistant", content=None if content == '' else content, tool_calls=tool_calls) + + # if ''.startswith(ls) or ls.startswith(''): + # if ls.startswith('') and ls.endswith('' + suffix): + # tool_call = ls[len(''):-len('' + suffix)] + # return Message(role="assistant", content=None, tool_calls=[json.loads(tool_call)]) + # return None + # else: + # return Message(role="assistant", content=s) return (converter.format_grammar(), parse) @@ -256,7 +282,30 @@ def parse(s: str) -> Optional[Message]: @typechecked def parse(s: str) -> Optional[Message]: - raise NotImplementedError(f'TODO: parse tool_style {chat_format.tool_style}: {s}') + s = strip_suffix(s) + + parts = _recipient_content_re.split(s) + if len(parts) == 1: + return Message(role="assistant", content=s) + else: + text_content = [] + tool_calls: list[ToolCall] = [] + for i in range((len(parts) - 1) // 3): + assert parts[i * 3].strip() == '', f'Unexpected content before tool call: {parts[i * 3]}' + recipient = parts[i * 3 + 1].strip() + content = parts[i * 3 + 2] + if recipient == 'all': + text_content.append(content) + else: + tool_calls.append( + ToolCall( + id=gen_callid(), + function=FunctionCall(name=recipient, arguments=json.loads(content)))) + + assert parts[-1].strip() == '', f'Unexpected content after tool calls: {parts[-1]}' + + content = '\n'.join(text_content).strip() + return Message(role="assistant", content=None if content == '' else content, tool_calls=tool_calls if tool_calls else None) return (converter.format_grammar(), parse) @@ -265,8 +314,8 @@ def parse(s: str) -> Optional[Message]: @typechecked def parse(s: str) -> Optional[Message]: - if response_rule.endswith(suffix): - return Message(role="assistant", content=s[:-len(suffix)]) + s = strip_suffix(s) + return 
Message(role="assistant", content=s) return (converter.format_grammar(), parse) @@ -275,9 +324,8 @@ def parse(s: str) -> Optional[Message]: @typechecked def parse(s: str) -> Optional[Message]: - if s.endswith(suffix): - return Message(role="assistant", content=s[:-len(suffix)]) - return None + s = strip_suffix(s) + return Message(role="assistant", content=s) return (None, parse) diff --git a/examples/openai/server.py b/examples/openai/server.py index ca1e3eab57991..8635da9e50bff 100644 --- a/examples/openai/server.py +++ b/examples/openai/server.py @@ -3,22 +3,27 @@ import json, sys, subprocess, atexit from pathlib import Path +import time sys.path.insert(0, str(Path(__file__).parent.parent.parent)) from examples.openai.llama_cpp_server_api import LlamaCppServerCompletionRequest from examples.openai.gguf_kvs import GGUFKeyValues, Keys -from examples.openai.api import Message, ChatCompletionRequest +from examples.openai.api import ChatCompletionResponse, Choice, Message, ChatCompletionRequest, Usage from examples.openai.prompting import ChatFormat, make_grammar, make_tools_prompt from fastapi import FastAPI, Request from fastapi.responses import JSONResponse import httpx +import random from starlette.responses import StreamingResponse from typing import Annotated, Optional import typer from typeguard import typechecked +def generate_id(prefix): + return f"{prefix}{random.randint(0, 1 << 32)}" + def main( model: Annotated[Optional[Path], typer.Option("--model", "-m")] = "models/7B/ggml-model-f16.gguf", # model: Path = Path("/Users/ochafik/AI/Models/Hermes-2-Pro-Mistral-7B.Q8_0.gguf"), @@ -68,17 +73,13 @@ async def chat_completions(request: Request, chat_request: ChatCompletionRequest (grammar, parser) = make_grammar(chat_format, chat_request.tools, response_schema) - if chat_format.strict_user_assistant_alternation: - print("TODO: merge system messages into user messages") - # new_messages = [] - # TODO: Test whether the template supports formatting tool_calls prompt = chat_format.render(messages, add_generation_prompt=True) print(json.dumps(dict( stream=chat_request.stream, prompt=prompt, - grammar=grammar, + # grammar=grammar, ), indent=2)) async with httpx.AsyncClient() as client: response = await client.post( @@ -87,7 +88,7 @@ async def chat_completions(request: Request, chat_request: ChatCompletionRequest prompt=prompt, stream=chat_request.stream, n_predict=300, - grammar=grammar, + # grammar=grammar, ).model_dump(), headers=headers, timeout=None) @@ -98,11 +99,35 @@ async def chat_completions(request: Request, chat_request: ChatCompletionRequest return StreamingResponse(generate_chunks(response), media_type="text/event-stream") else: result = response.json() + if 'content' not in result: + # print(json.dumps(result, indent=2)) + return JSONResponse(result) + print(json.dumps(result, indent=2)) + # print(json.dumps(result.get('content'), indent=2)) message = parser(result["content"]) - assert message is not None, f"Failed to parse response: {response.text}" - return JSONResponse(message.model_dump()) - # return JSONResponse(response.json()) + assert message is not None, f"Failed to parse response:\n{response.text}\n\n" + + prompt_tokens=result['timings']['prompt_n'] + completion_tokens=result['timings']['predicted_n'] + return JSONResponse(ChatCompletionResponse( + id=generate_id('chatcmpl-'), + object="chat.completion", + created=int(time.time()), + model=chat_request.model, + choices=[Choice( + index=0, + message=message, + + finish_reason="stop" if message.tool_calls is None else 
"tool_calls", + )], + usage=Usage( + prompt_tokens=prompt_tokens, + completion_tokens=completion_tokens, + total_tokens=prompt_tokens + completion_tokens, + ), + system_fingerprint='...' + ).model_dump()) async def generate_chunks(response): async for chunk in response.aiter_bytes(): diff --git a/examples/openai/test.sh b/examples/openai/test.sh index 397682247add7..7dcc93e45326a 100755 --- a/examples/openai/test.sh +++ b/examples/openai/test.sh @@ -12,7 +12,9 @@ function cleanup() { trap cleanup EXIT echo "# Starting the server" -python -m examples.openai --model ~/AI/Models/mixtral-8x7b-instruct-v0.1.Q8_0.gguf & + +python -m examples.openai --model ~/AI/Models/functionary-medium-v2.2.q4_0.gguf & +# python -m examples.openai --model ~/AI/Models/mixtral-8x7b-instruct-v0.1.Q8_0.gguf & # python -m examples.openai --model ~/AI/Models/Hermes-2-Pro-Mistral-7B.Q8_0.gguf & SERVER_PID=$! @@ -73,8 +75,8 @@ curl http://localhost:8080/v1/chat/completions \ } }], "messages": [ - {"role": "system", "content": "Do not make assumptions about what values to plug into functions. Ask for clarification if a user request is ambiguous."}, - {"role": "user", "content": "what is the weather going to be like in San Francisco and Glasgow over the next 4 days."} + {"role": "user", "content": "I live in the UK. what is the weather going to be like in San Francisco and Glasgow over the next 4 days."} ] }' +# {"role": "system", "content": "Do not make assumptions about what values to plug into functions. Ask for clarification if a user request is ambiguous."}, diff --git a/examples/openai/ts_converter.py b/examples/openai/ts_converter.py index d018118cbab8d..c0d99d0a481ea 100644 --- a/examples/openai/ts_converter.py +++ b/examples/openai/ts_converter.py @@ -1,5 +1,5 @@ from typing import Any, List, Set, Tuple, Union -from jsonargparse import CLI +import json class SchemaToTypeScriptConverter: # TODO: comments for arguments! @@ -14,11 +14,14 @@ class SchemaToTypeScriptConverter: # // where to get weather. 
# location: string, # }) => any; - def _build_object_rule(self, properties: List[Tuple[str, Any]], required: Set[str], name: str, additional_properties: Union[bool, Any]): - return "{" + ', '.join( + def _build_object_rule(self, properties: List[Tuple[str, Any]], required: Set[str], additional_properties: Union[bool, Any]): + return "{" + ', '.join([ f'{prop_name}{"" if prop_name in required else "?"}: {self.visit(prop_schema)}' for prop_name, prop_schema in properties - ) + "}" + ] + ( + [f"[key: string]: {self.visit(additional_properties)}"] + if additional_properties is not None else [] + )) + "}" def visit(self, schema: dict): def print_constant(v): From a4062935a51cb7633456ffe3335de576d0880d6a Mon Sep 17 00:00:00 2001 From: ochafik Date: Wed, 27 Mar 2024 23:08:30 +0000 Subject: [PATCH 08/68] server.py: reenable grammar, accommodate mistral's escaped underscores --- examples/openai/prompting.py | 136 +++++++++++++++++++++++++++-------- examples/openai/server.py | 32 ++++----- examples/openai/test.sh | 52 ++++++++++++-- 3 files changed, 166 insertions(+), 54 deletions(-) diff --git a/examples/openai/prompting.py b/examples/openai/prompting.py index ea6572d7be1c0..d8ecd5174c8ba 100644 --- a/examples/openai/prompting.py +++ b/examples/openai/prompting.py @@ -128,10 +128,12 @@ def make_tools_prompt(chat_format: ChatFormat, tools: list[Tool], indent=2) -> M '', '''Use the following json schema for each tool call you will make: {"properties": {"arguments": {"title": "Arguments", "type": "object"}, "name": {"title": "Name", "type": "string"}}, "required": ["arguments", "name"], "title": "FunctionCall", "type": "object"}''', '', - '''For each function call return a json object with function name and arguments within XML tags as follows:''', + # '''For each function call return a json object with function name and arguments within XML tags as follows:''', + '''To call each function, give its name and arguments within XML tags as follows:''', '''''', '''{"arguments": , "name": }''', '''''', + '''This is not hypothetical, you're not asked what you would do. If you need a tool called, just call it.''', ]) ) @@ -201,17 +203,21 @@ def strip_suffix(s: str) -> str: if s.endswith(suffix): return s[:-len(suffix)] else: - print(f"Expected suffix ({suffix}) not found: {s}") + sys.stderr.write(f"Expected suffix ({suffix}) not found: {s}\n") return s if tools: if _outputs_tool_call_tags(chat_format.tool_style): + + escapes_underscores = chat_format.tool_style != ToolsPromptStyle.TOOLS_HERMES_2_PRO + tool_rules = [ converter.visit( dict( type="object", properties=dict( - name=dict(const=tool.function.name), + name=dict(type="string", pattern='^' + tool.function.name.replace('_', f'\\?_') + '$') if escapes_underscores \ + else dict(const=tool.function.name), arguments=tool.function.parameters, ), required=['name', 'arguments'] @@ -221,22 +227,45 @@ def strip_suffix(s: str) -> str: for tool in tools ] - # Constrain the output to be a non-tool-call message (constrained to a JSON schema or not) - # OR a tool-call message respecting the schema of any of the tools + def format_literal(s: str) -> str: + if escapes_underscores: + return ' "\\\\"? "_" '.join((converter._format_literal(part) for part in s.split('_'))) + else: + return converter._format_literal(s) + + tool_call_rule = converter._add_rule( + 'tool_call', + format_literal("") + " (" + + ' | '.join(tool_rules) + + ") " + format_literal("")) + + # Ideally we'd want a negative lookahead of //, but it's just too hard to express in GBNF for now. 
+ # So we just over-constrain the content rule to not contain literals dangerously getting close to + content_rule = converter._add_rule('content', '[^<] | "<" [^t<]? | "')) converter._add_rule( - "root", - converter._format_literal(prefix) + " (" + - (response_rule or converter.not_literal("")) + " | " + - converter._format_literal("") + " (" + - ' | '.join(tool_rules) + - ") " + converter._format_literal("") + - ")") # + converter._format_literal(suffix)) + 'root', + f'{content_rule}* ({tool_call_rule}+ {content_rule}*)?') + + # # Constrain the output to be a non-tool-call message (constrained to a JSON schema or not) + # # OR a tool-call message respecting the schema of any of the tools + # converter._add_rule( + # "root", + # converter._format_literal(prefix) + " (" + + # (response_rule or converter.not_literal("")) + " | " + + # converter._format_literal("") + " (" + + # ' | '.join(tool_rules) + + # ") " + converter._format_literal("") + + # ")") # + converter._format_literal(suffix)) @typechecked def parse(s: str) -> Optional[Message]: s = strip_suffix(s) - # ls = s.lstrip() + if r'' in s: + # Some weird escaping of underscores is happening w/ Mixtral 8x7B Instruct + s = s.replace(r'\_', '_') + parts = _tool_call_re.split(s) if len(parts) == 1: return Message(role="assistant", content=s) @@ -247,13 +276,17 @@ def parse(s: str) -> Optional[Message]: if i % 2 == 0: content.append(part) else: + try: + fc = json.loads(part) + except json.JSONDecodeError: + raise ValueError(f'Failed to parse tool call as JSON: {part}\nFull string: {s}') tool_calls.append( ToolCall( id=gen_callid(), - function=FunctionCall(**json.loads(part)))) + function=FunctionCall(**fc))) - content = ''.join(content).strip() - return Message(role="assistant", content=None if content == '' else content, tool_calls=tool_calls) + content = '(...)'.join(content).strip() + return Message(role="assistant", content=content if content else None, tool_calls=tool_calls) # if ''.startswith(ls) or ls.startswith(''): # if ls.startswith('') and ls.endswith('' + suffix): @@ -268,17 +301,54 @@ def parse(s: str) -> Optional[Message]: elif chat_format.tool_style == ToolsPromptStyle.TYPESCRIPT_FUNCTIONARY_V2: # Only allowing a single tool call at a time for now. 
# Note that if there were more, they'd be separated by a '<|from|>assistant' literal + + tool_rules = [ + converter._add_rule( + tool.function.name + '-call', + converter._format_literal(tool.function.name) + ' ' + converter._format_literal('\n<|content|>\n') + ' ' + + converter.visit(tool.function.parameters, tool.function.name + '-args') + ' ' + + converter._format_literal('\n')) + # converter.visit( + # dict( + # type="object", + # properties=dict( + # name=dict(const=tool.function.name), + # arguments=tool.function.parameters, + # ), + # required=['name', 'arguments'] + # ), + # f'{tool.function.name}-tool-call' + # ) + for i, tool in enumerate(tools) + ] + + not_from_rule = converter._add_rule('not_from', converter.not_literal("<|from|>")) + content_without_start_rule = converter._add_rule('content_without_start', converter._format_literal("all\n<|content|>") + ' ' + not_from_rule + '*') + start_rule = converter._add_rule('start', converter._format_literal('<|from|>assistant\n<|recipient|>')) + content_rule = converter._add_rule('content', start_rule + ' ' + content_without_start_rule) + tool_call_without_start_rule = converter._add_rule( + 'tool_call_without_start', + ' | '.join(tool_rules)) + # + ' ' + + # converter.not_literal("all", dotall=False) + ' ' + converter._format_literal('\n<|content|>\n') + ' ' + not_from_rule + '*') + tool_call_rule = converter._add_rule('tool_call', f'{start_rule} {tool_call_without_start_rule}') + # converter._add_rule('root', f'({content_without_start_rule} ({content_rule})* ({tool_call_rule}+ {content_rule}*)? | {tool_call_without_start_rule} (* {tool_call_rule}{content_rule}*') converter._add_rule( - "root", - converter._format_literal(prefix) + " (" + - (response_rule or converter.not_literal("<|recipient|>")) + " | " + - (' | '.join( - converter._format_literal(f"<|recipient|>{tool.function.name}\n<|content|>") + " " + - converter.visit(tool.function.parameters, tool.function.name + '-args') - for tool in tools - )) + - ") " + - ")") # + converter._format_literal(suffix)) + 'root', + f'{content_without_start_rule} {content_rule}* ({tool_call_rule}+ {content_rule}*)? 
| ' + f'{tool_call_without_start_rule} {tool_call_rule}* {content_rule}*') + + # converter._add_rule( + # "root", + # converter._format_literal(prefix) + " (" + + # (response_rule or converter.not_literal("<|recipient|>")) + " | " + + # (' | '.join( + # converter._format_literal(f"<|recipient|>{tool.function.name}\n<|content|>") + " " + + # converter.visit(tool.function.parameters, tool.function.name + '-args') + # for tool in tools + # )) + + # ") " + + # ")") # + converter._format_literal(suffix)) @typechecked def parse(s: str) -> Optional[Message]: @@ -297,17 +367,25 @@ def parse(s: str) -> Optional[Message]: if recipient == 'all': text_content.append(content) else: + try: + arguments = json.loads(content) + except json.JSONDecodeError: + raise ValueError(f'Failed to parse tool call content as JSON: {content}') tool_calls.append( ToolCall( id=gen_callid(), - function=FunctionCall(name=recipient, arguments=json.loads(content)))) + function=FunctionCall(name=recipient, arguments=arguments))) + - assert parts[-1].strip() == '', f'Unexpected content after tool calls: {parts[-1]}' + assert parts[-1].strip() in ('', '<|stop|>'), f'Unexpected content after tool calls: {parts[-1]}\nFull string: {s}' content = '\n'.join(text_content).strip() - return Message(role="assistant", content=None if content == '' else content, tool_calls=tool_calls if tool_calls else None) + return Message(role="assistant", content=content if content else None, tool_calls=tool_calls if tool_calls else None) return (converter.format_grammar(), parse) + + else: + raise ValueError(f"Unsupported tool call style: {chat_format.tool_style}") elif response_schema: converter._add_rule("root", response_rule + ' ' + converter._format_literal(suffix)) diff --git a/examples/openai/server.py b/examples/openai/server.py index 8635da9e50bff..349ad6c7d2b21 100644 --- a/examples/openai/server.py +++ b/examples/openai/server.py @@ -30,27 +30,28 @@ def main( # model_url: Annotated[Optional[str], typer.Option("--model-url", "-mu")] = None, host: str = "localhost", port: int = 8080, - main_server_endpoint: Optional[str] = None, - main_server_host: str = "localhost", - main_server_port: Optional[int] = 8081, + cpp_server_endpoint: Optional[str] = None, + cpp_server_host: str = "localhost", + cpp_server_port: Optional[int] = 8081, ): import uvicorn metadata = GGUFKeyValues(model) context_length = metadata[Keys.LLM.CONTEXT_LENGTH] chat_format = ChatFormat.from_gguf(metadata) - print(chat_format) + # print(chat_format) - if not main_server_endpoint: + if not cpp_server_endpoint: + sys.stderr.write(f"# Starting C++ server with model {model} on {cpp_server_host}:{cpp_server_port}\n") server_process = subprocess.Popen([ "./server", "-m", model, - "--host", main_server_host, "--port", f'{main_server_port}', + "--host", cpp_server_host, "--port", f'{cpp_server_port}', '-ctk', 'q4_0', '-ctv', 'f16', "-c", f"{2*8192}", # "-c", f"{context_length}", - ]) + ], stdout=sys.stderr) atexit.register(server_process.kill) - main_server_endpoint = f"http://{main_server_host}:{main_server_port}" + cpp_server_endpoint = f"http://{cpp_server_host}:{cpp_server_port}" app = FastAPI() @@ -74,21 +75,17 @@ async def chat_completions(request: Request, chat_request: ChatCompletionRequest (grammar, parser) = make_grammar(chat_format, chat_request.tools, response_schema) # TODO: Test whether the template supports formatting tool_calls - + sys.stderr.write(f'\n{grammar}\n\n') + prompt = chat_format.render(messages, add_generation_prompt=True) - print(json.dumps(dict( - 
stream=chat_request.stream, - prompt=prompt, - # grammar=grammar, - ), indent=2)) async with httpx.AsyncClient() as client: response = await client.post( - f"{main_server_endpoint}/completions", + f"{cpp_server_endpoint}/completions", json=LlamaCppServerCompletionRequest( prompt=prompt, stream=chat_request.stream, n_predict=300, - # grammar=grammar, + grammar=grammar, ).model_dump(), headers=headers, timeout=None) @@ -103,7 +100,7 @@ async def chat_completions(request: Request, chat_request: ChatCompletionRequest # print(json.dumps(result, indent=2)) return JSONResponse(result) - print(json.dumps(result, indent=2)) + sys.stderr.write(json.dumps(result, indent=2) + "\n") # print(json.dumps(result.get('content'), indent=2)) message = parser(result["content"]) assert message is not None, f"Failed to parse response:\n{response.text}\n\n" @@ -118,7 +115,6 @@ async def chat_completions(request: Request, chat_request: ChatCompletionRequest choices=[Choice( index=0, message=message, - finish_reason="stop" if message.tool_calls is None else "tool_calls", )], usage=Usage( diff --git a/examples/openai/test.sh b/examples/openai/test.sh index 7dcc93e45326a..3f5d38cd1d6e0 100755 --- a/examples/openai/test.sh +++ b/examples/openai/test.sh @@ -4,23 +4,60 @@ set -euo pipefail SERVER_PID="" function cleanup() { if [ -n "$SERVER_PID" ]; then - echo "# Killing server" + echo "# Killing server" >&2 kill $SERVER_PID wait $SERVER_PID fi } trap cleanup EXIT -echo "# Starting the server" +echo "# Starting the server" >&2 -python -m examples.openai --model ~/AI/Models/functionary-medium-v2.2.q4_0.gguf & -# python -m examples.openai --model ~/AI/Models/mixtral-8x7b-instruct-v0.1.Q8_0.gguf & -# python -m examples.openai --model ~/AI/Models/Hermes-2-Pro-Mistral-7B.Q8_0.gguf & +args=( + # --cpp_server_endpoint "http://localhost:8081" + + # --model ~/AI/Models/functionary-medium-v2.2.q4_0.gguf + + # --model ~/AI/Models/mixtral-8x7b-instruct-v0.1.Q8_0.gguf + # --model ~/AI/Models/mixtral-8x7b-instruct-v0.1.Q4_K_M.gguf + + # --model ~/AI/Models/Hermes-2-Pro-Mistral-7B.Q8_0.gguf + --model ~/AI/Models/Hermes-2-Pro-Mistral-7B.Q4_K_M.gguf +) +python -m examples.openai "${args[@]}" & SERVER_PID=$! sleep 5 -echo "# Send a message to the chat API" +echo "# Send a message to the chat API" >&2 + +# curl http://localhost:8080/v1/chat/completions \ +# -H "Content-Type: application/json" \ +# -H "Authorization: Bearer $OPENAI_API_KEY" \ +# -d '{ +# "model": "gpt-3.5-turbo", +# "tools": [{ +# "type": "function", +# "function": { +# "name": "get_current_weather", +# "description": "Get the current weather", +# "parameters": { +# "type": "object", +# "properties": { +# "location": { +# "type": "string", +# "description": "The city and state, e.g. San Francisco, CA" +# } +# }, +# "required": ["location"] +# } +# } +# }], +# "messages": [ +# {"role": "user", "content": "I live in the UK. what is the weather going to be like in San Francisco and Glasgow over the next 4 days."} +# ] +# }' | \ +# jq . curl http://localhost:8080/v1/chat/completions \ -H "Content-Type: application/json" \ @@ -77,6 +114,7 @@ curl http://localhost:8080/v1/chat/completions \ "messages": [ {"role": "user", "content": "I live in the UK. what is the weather going to be like in San Francisco and Glasgow over the next 4 days."} ] - }' + }' | \ + jq . # {"role": "system", "content": "Do not make assumptions about what values to plug into functions. 
Ask for clarification if a user request is ambiguous."}, From 63a384deaf2c593c1d27d283be1c5585d4d36b66 Mon Sep 17 00:00:00 2001 From: ochafik Date: Thu, 28 Mar 2024 00:42:12 +0000 Subject: [PATCH 09/68] server.py: raise n_predict --- examples/openai/prompting.py | 4 +++- examples/openai/server.py | 2 +- examples/openai/test.sh | 4 ++-- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/examples/openai/prompting.py b/examples/openai/prompting.py index d8ecd5174c8ba..ab3ce89a8c25e 100644 --- a/examples/openai/prompting.py +++ b/examples/openai/prompting.py @@ -323,7 +323,9 @@ def parse(s: str) -> Optional[Message]: ] not_from_rule = converter._add_rule('not_from', converter.not_literal("<|from|>")) - content_without_start_rule = converter._add_rule('content_without_start', converter._format_literal("all\n<|content|>") + ' ' + not_from_rule + '*') + content_without_start_rule = converter._add_rule( + 'content_without_start', + converter._format_literal("all\n<|content|>") + ' ' + not_from_rule + '*') start_rule = converter._add_rule('start', converter._format_literal('<|from|>assistant\n<|recipient|>')) content_rule = converter._add_rule('content', start_rule + ' ' + content_without_start_rule) tool_call_without_start_rule = converter._add_rule( diff --git a/examples/openai/server.py b/examples/openai/server.py index 349ad6c7d2b21..21c30623d8300 100644 --- a/examples/openai/server.py +++ b/examples/openai/server.py @@ -84,7 +84,7 @@ async def chat_completions(request: Request, chat_request: ChatCompletionRequest json=LlamaCppServerCompletionRequest( prompt=prompt, stream=chat_request.stream, - n_predict=300, + n_predict=1000, grammar=grammar, ).model_dump(), headers=headers, diff --git a/examples/openai/test.sh b/examples/openai/test.sh index 3f5d38cd1d6e0..44a6c44de1bfc 100755 --- a/examples/openai/test.sh +++ b/examples/openai/test.sh @@ -16,13 +16,13 @@ echo "# Starting the server" >&2 args=( # --cpp_server_endpoint "http://localhost:8081" - # --model ~/AI/Models/functionary-medium-v2.2.q4_0.gguf + --model ~/AI/Models/functionary-medium-v2.2.q4_0.gguf # --model ~/AI/Models/mixtral-8x7b-instruct-v0.1.Q8_0.gguf # --model ~/AI/Models/mixtral-8x7b-instruct-v0.1.Q4_K_M.gguf # --model ~/AI/Models/Hermes-2-Pro-Mistral-7B.Q8_0.gguf - --model ~/AI/Models/Hermes-2-Pro-Mistral-7B.Q4_K_M.gguf + # --model ~/AI/Models/Hermes-2-Pro-Mistral-7B.Q4_K_M.gguf ) python -m examples.openai "${args[@]}" & SERVER_PID=$! 
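For reference, the tool-call handling that patches 07-08 introduce in examples/openai/prompting.py boils down to: split the (grammar-constrained) completion on the <tool_call>...</tool_call> tags, JSON-decode whatever sits inside the tags, and keep the rest as plain assistant content. Below is a minimal standalone sketch of that idea; the helper name, regex name and the plain-tuple return value are illustrative only (the actual parse() returns a pydantic Message, strips the chat template's assistant suffix first, and un-escapes Mixtral's "\_" before splitting).

import json
import re

# Illustrative only: mirrors the tag-splitting approach of parse() in
# examples/openai/prompting.py, without the Message/ToolCall pydantic types.
_TOOL_CALL_RE = re.compile(r'<tool_call>(.*?)</tool_call>', re.DOTALL)

def split_tool_calls(completion: str):
    parts = _TOOL_CALL_RE.split(completion)
    if len(parts) == 1:
        # No tags at all: the whole completion is plain assistant content.
        return completion, []
    content_chunks, tool_calls = [], []
    for i, part in enumerate(parts):
        if i % 2 == 0:
            content_chunks.append(part)           # text around/between the tags
        else:
            tool_calls.append(json.loads(part))   # {"name": ..., "arguments": ...}
    content = '\n'.join(content_chunks).strip()
    return (content or None), tool_calls

if __name__ == '__main__':
    sample = ('Let me check that for you.\n'
              '<tool_call>{"name": "get_current_weather", '
              '"arguments": {"location": "Glasgow, UK", "format": "celsius"}}</tool_call>')
    print(split_tool_calls(sample))
    # ('Let me check that for you.',
    #  [{'name': 'get_current_weather',
    #    'arguments': {'location': 'Glasgow, UK', 'format': 'celsius'}}])
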
From 5f3de16116db536fe33d0859a79ff96e4d4f9d7e Mon Sep 17 00:00:00 2001 From: ochafik Date: Thu, 28 Mar 2024 23:57:14 +0000 Subject: [PATCH 10/68] server.py: pass all request options, comments in ts sigs, render tool calls --- examples/openai/api.py | 29 +++++++++++-- examples/openai/prompting.py | 74 +++++++++++++++++++++------------ examples/openai/server.py | 32 ++++++++++---- examples/openai/ts_converter.py | 13 +++++- 4 files changed, 107 insertions(+), 41 deletions(-) diff --git a/examples/openai/api.py b/examples/openai/api.py index 0d7ddc1118331..dd8da09a254c0 100644 --- a/examples/openai/api.py +++ b/examples/openai/api.py @@ -1,5 +1,5 @@ from typing import Any, Dict, Literal, Optional, Union -from pydantic import BaseModel, Json +from pydantic import BaseModel, Json, TypeAdapter class FunctionCall(BaseModel): name: str @@ -31,10 +31,33 @@ class ResponseFormat(BaseModel): class ChatCompletionRequest(BaseModel): model: str tools: Optional[list[Tool]] = None - messages: list[Message] + messages: list[Message] = None + prompt: Optional[str] = None response_format: Optional[ResponseFormat] = None - temperature: float = 1.0 + stream: bool = False + cache_prompt: Optional[bool] = None + n_predict: Optional[int] = None + top_k: Optional[int] = None + top_p: Optional[float] = None + min_p: Optional[float] = None + tfs_z: Optional[float] = None + typical_p: Optional[float] = None + temperature: float = 1.0 + dynatemp_range: Optional[float] = None + dynatemp_exponent: Optional[float] = None + repeat_last_n: Optional[int] = None + repeat_penalty: Optional[float] = None + frequency_penalty: Optional[float] = None + presense_penalty: Optional[float] = None + mirostat: Optional[bool] = None + mirostat_tau: Optional[float] = None + mirostat_eta: Optional[float] = None + penalize_nl: Optional[bool] = None + n_keep: Optional[int] = None + seed: Optional[int] = None + n_probs: Optional[int] = None + min_keep: Optional[int] = None class Choice(BaseModel): index: int diff --git a/examples/openai/prompting.py b/examples/openai/prompting.py index ab3ce89a8c25e..e26ca92297089 100644 --- a/examples/openai/prompting.py +++ b/examples/openai/prompting.py @@ -41,7 +41,7 @@ def add_system_prompt(self, messages: list[Message], system_prompt: Message) -> system_message = next(((i, m) for i, m in enumerate(messages) if m.role == "system"), None) if system_message is not None: (i, m) = system_message - return messages[:i] + [Message(role="system", content=m.content + '\n' + system_prompt.content)] + messages[i+1:] + return messages[:i] + [Message(role="system", content=system_prompt.content + '\n' + m.content)] + messages[i+1:] else: return [system_prompt] + messages @@ -63,8 +63,16 @@ def render(self, messages: list[Message], add_generation_prompt: bool, omit_bos: assert messages[i+1].role == 'user' new_messages.append(Message( role="user", - content=f'[SYS]{messages[i].content}[/SYS]\n{messages[i+1].content}')) + content=f'[SYS]{messages[i].content}[/SYS]\n{messages[i+1].content}' + )) i += 2 + elif messages[i].role == 'assistant' and messages[i].tool_calls and messages[i].content: + tc = '\n'.join(f'{json.dumps(tc.model_dump())}' for tc in messages[i].tool_calls) + new_messages.append(Message( + role="assistant", + content=f'{messages[i].content}\n{tc}' + )) + i += 1 else: new_messages.append(messages[i]) i += 1 @@ -72,13 +80,15 @@ def render(self, messages: list[Message], add_generation_prompt: bool, omit_bos: messages = new_messages # print(f'messages={messages}') - return self.template.render( + result = 
self.template.render( messages=messages, eos_token=self.eos_token, bos_token='' if omit_bos else self.bos_token, raise_exception=raise_exception, add_generation_prompt=add_generation_prompt, ) + sys.stderr.write(f'\n# RENDERED:\n\n{result}\n\n') + return result # While the API will be usable with a generic tools usage like OpenAI, # (see https://cookbook.openai.com/examples/how_to_call_functions_with_chat_models), @@ -120,38 +130,29 @@ def make_tools_prompt(chat_format: ChatFormat, tools: list[Tool], indent=2) -> M return Message( role="system", content='\n'.join([ - '''You are a function calling AI model. You are provided with function signatures within XML tags.''', + # '''You are a function calling AI model. You are provided with function signatures within XML tags.''', '''You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools:''', '''''', - *(json.dumps(tool.model_dump(), indent=indent) for tool in tools), + _tools_typescript_signatures(tools), + # _tools_schema_signatures(tools, indent=indent), '''''', '', - '''Use the following json schema for each tool call you will make: {"properties": {"arguments": {"title": "Arguments", "type": "object"}, "name": {"title": "Name", "type": "string"}}, "required": ["arguments", "name"], "title": "FunctionCall", "type": "object"}''', - '', + # '''Use the following json schema for each tool call you will make: {"properties": {"arguments": {"title": "Arguments", "type": "object"}, "name": {"title": "Name", "type": "string"}}, "required": ["arguments", "name"], "title": "FunctionCall", "type": "object"}''', + # '', # '''For each function call return a json object with function name and arguments within XML tags as follows:''', '''To call each function, give its name and arguments within XML tags as follows:''', '''''', - '''{"arguments": , "name": }''', + '''{"name": , "arguments": }''', '''''', - '''This is not hypothetical, you're not asked what you would do. If you need a tool called, just call it.''', + # '''This is not hypothetical, you're not asked what you would do. If you need a tool called, just call it with ....''', ]) ) elif chat_format.tool_style == ToolsPromptStyle.TYPESCRIPT_FUNCTIONARY_V2: - ts_converter = SchemaToTypeScriptConverter() - return Message( role="system", - content='\n'.join([ - '// Supported function definitions that should be called when necessary.' 
- 'namespace functions {', - *[ - '// ' + tool.function.description.replace('\n', '\n// ') + '\n' + '' - 'type ' + tool.function.name + ' = (_: ' + ts_converter.visit(tool.function.parameters) + ") => any;\n" - for tool in tools - ], - '} // namespace functions', - ]) + content= '// Supported function definitions that should be called when necessary.\n' + + _tools_typescript_signatures(tools) ) elif chat_format.tool_style == ToolsPromptStyle.TOOLS_HERMES_2_PRO: @@ -170,6 +171,20 @@ def make_tools_prompt(chat_format: ChatFormat, tools: list[Tool], indent=2) -> M else: raise ValueError(f"Unsupported tool call style: {chat_format.tool_style}") +def _tools_typescript_signatures(tools: list[Tool]) -> str: + ts_converter = SchemaToTypeScriptConverter() + return 'namespace functions {' + '\n'.join( + '// ' + tool.function.description.replace('\n', '\n// ') + '\n' + '' + 'type ' + tool.function.name + ' = (_: ' + ts_converter.visit(tool.function.parameters) + ") => any;\n" + for tool in tools + ) + '} // namespace functions' + +def _tools_schema_signatures(tools: list[Tool], indent=None) -> str: + return '\n'.join( + json.dumps(tool.model_dump(), indent=indent) + for tool in tools + ) + @typechecked def _outputs_tool_call_tags(style: ToolsPromptStyle) -> bool: return style in ( @@ -199,6 +214,8 @@ def make_grammar(chat_format: ChatFormat, tools: list[Tool], response_schema: Op assert planted_prompt.startswith(empty_prompt), f"Planted prompt does not start with empty prompt: {planted_prompt} vs {empty_prompt}" [prefix, suffix] = planted_prompt[len(empty_prompt):].split(delimiter) + allow_parallel_calls = False + def strip_suffix(s: str) -> str: if s.endswith(suffix): return s[:-len(suffix)] @@ -235,17 +252,19 @@ def format_literal(s: str) -> str: tool_call_rule = converter._add_rule( 'tool_call', - format_literal("") + " (" + + format_literal("") + " space (" + ' | '.join(tool_rules) + - ") " + format_literal("")) + ") space " + format_literal(""))# + ' space') # Ideally we'd want a negative lookahead of //, but it's just too hard to express in GBNF for now. # So we just over-constrain the content rule to not contain literals dangerously getting close to - content_rule = converter._add_rule('content', '[^<] | "<" [^t<]? | "')) converter._add_rule( 'root', - f'{content_rule}* ({tool_call_rule}+ {content_rule}*)?') + # tool_call_rule) + f'{content_rule}* ({tool_call_rule}+ {content_rule}*)?' if allow_parallel_calls \ + else f'{content_rule}* {tool_call_rule}?') # # Constrain the output to be a non-tool-call message (constrained to a JSON schema or not) # # OR a tool-call message respecting the schema of any of the tools @@ -285,7 +304,7 @@ def parse(s: str) -> Optional[Message]: id=gen_callid(), function=FunctionCall(**fc))) - content = '(...)'.join(content).strip() + content = '\n'.join(content).strip() return Message(role="assistant", content=content if content else None, tool_calls=tool_calls) # if ''.startswith(ls) or ls.startswith(''): @@ -338,7 +357,8 @@ def parse(s: str) -> Optional[Message]: converter._add_rule( 'root', f'{content_without_start_rule} {content_rule}* ({tool_call_rule}+ {content_rule}*)? | ' - f'{tool_call_without_start_rule} {tool_call_rule}* {content_rule}*') + f'{tool_call_without_start_rule} {tool_call_rule}* {content_rule}*' if allow_parallel_calls \ + else f'{content_without_start_rule} {tool_call_rule}? 
| {tool_call_without_start_rule}') # converter._add_rule( # "root", diff --git a/examples/openai/server.py b/examples/openai/server.py index 21c30623d8300..ad39106251a9e 100644 --- a/examples/openai/server.py +++ b/examples/openai/server.py @@ -59,8 +59,9 @@ def main( async def chat_completions(request: Request, chat_request: ChatCompletionRequest): headers = { "Content-Type": "application/json", - "Authorization": request.headers.get("Authorization"), } + if (auth := request.headers.get("Authorization")): + headers["Authorization"] = auth if chat_request.response_format is not None: assert chat_request.response_format.type == "json_object", f"Unsupported response format: {chat_request.response_format.type}" @@ -75,18 +76,31 @@ async def chat_completions(request: Request, chat_request: ChatCompletionRequest (grammar, parser) = make_grammar(chat_format, chat_request.tools, response_schema) # TODO: Test whether the template supports formatting tool_calls - sys.stderr.write(f'\n{grammar}\n\n') prompt = chat_format.render(messages, add_generation_prompt=True) + + sys.stderr.write(f'\n# PROMPT:\n\n{prompt}\n\n') + sys.stderr.write(f'\n# GRAMMAR:\n\n{grammar}\n\n') + + data = LlamaCppServerCompletionRequest( + **{ + k: v + for k, v in chat_request.model_dump().items() + if k not in ( + "prompt", + "tools", + "messages", + "response_format", + ) + }, + prompt=prompt, + grammar=grammar, + ).model_dump() + sys.stderr.write(json.dumps(data, indent=2) + "\n") async with httpx.AsyncClient() as client: response = await client.post( f"{cpp_server_endpoint}/completions", - json=LlamaCppServerCompletionRequest( - prompt=prompt, - stream=chat_request.stream, - n_predict=1000, - grammar=grammar, - ).model_dump(), + json=data, headers=headers, timeout=None) @@ -96,11 +110,11 @@ async def chat_completions(request: Request, chat_request: ChatCompletionRequest return StreamingResponse(generate_chunks(response), media_type="text/event-stream") else: result = response.json() + sys.stderr.write("# RESULT:\n\n" + json.dumps(result, indent=2) + "\n\n") if 'content' not in result: # print(json.dumps(result, indent=2)) return JSONResponse(result) - sys.stderr.write(json.dumps(result, indent=2) + "\n") # print(json.dumps(result.get('content'), indent=2)) message = parser(result["content"]) assert message is not None, f"Failed to parse response:\n{response.text}\n\n" diff --git a/examples/openai/ts_converter.py b/examples/openai/ts_converter.py index c0d99d0a481ea..e29e83507fef5 100644 --- a/examples/openai/ts_converter.py +++ b/examples/openai/ts_converter.py @@ -14,12 +14,21 @@ class SchemaToTypeScriptConverter: # // where to get weather. 
# location: string, # }) => any; + def _desc_comment(self, schema: dict): + desc = schema.get("description", "").replace("\n", "\n// ") if 'description' in schema else None + return f'// {desc}\n' if desc else '' + def _build_object_rule(self, properties: List[Tuple[str, Any]], required: Set[str], additional_properties: Union[bool, Any]): + if additional_properties == True: + additional_properties = {} + elif additional_properties == False: + additional_properties = None + return "{" + ', '.join([ - f'{prop_name}{"" if prop_name in required else "?"}: {self.visit(prop_schema)}' + f'{self._desc_comment(prop_schema)}{prop_name}{"" if prop_name in required else "?"}: {self.visit(prop_schema)}' for prop_name, prop_schema in properties ] + ( - [f"[key: string]: {self.visit(additional_properties)}"] + [f"{self._desc_comment(additional_properties) if additional_properties else ''}[key: string]: {self.visit(additional_properties)}"] if additional_properties is not None else [] )) + "}" From 59b411406fb519536450b7da286b325babeeb787 Mon Sep 17 00:00:00 2001 From: ochafik Date: Fri, 29 Mar 2024 02:47:33 +0000 Subject: [PATCH 11/68] server.py: refactor chat handlers --- examples/openai/api.py | 2 +- examples/openai/prompting.py | 768 +++++++++++++++++++++-------------- examples/openai/server.py | 30 +- 3 files changed, 483 insertions(+), 317 deletions(-) diff --git a/examples/openai/api.py b/examples/openai/api.py index dd8da09a254c0..7c4a446b8d2c0 100644 --- a/examples/openai/api.py +++ b/examples/openai/api.py @@ -18,7 +18,7 @@ class Message(BaseModel): class ToolFunction(BaseModel): name: str description: str - parameters: Any + parameters: dict[str, Any] class Tool(BaseModel): type: str diff --git a/examples/openai/prompting.py b/examples/openai/prompting.py index e26ca92297089..60dab69bafa71 100644 --- a/examples/openai/prompting.py +++ b/examples/openai/prompting.py @@ -1,11 +1,14 @@ +from abc import ABC, abstractmethod from enum import Enum +from functools import wraps import jinja2 import json from pathlib import Path import random import re import sys -from typing import Optional, Tuple, Callable +from typing import Any, Dict, Literal, Optional, Tuple, Callable, Union +from pydantic import BaseModel from typeguard import typechecked from examples.json_schema_to_grammar import SchemaConverter @@ -18,22 +21,52 @@ def raise_exception(msg: str): raise Exception(msg) @typechecked -class ChatFormat: +class ChatTemplate(BaseModel): + template: str + + @property + def tool_style(self) -> 'ToolsPromptStyle': + return self._tool_style + def __init__(self, template: str, eos_token: str, bos_token: str): + super().__init__(template=template + ) env = jinja2.Environment(loader=jinja2.BaseLoader(), trim_blocks=True, lstrip_blocks=True) - self.template = env.from_string(template) - self.eos_token = eos_token - self.bos_token = bos_token + self._template = env.from_string(template) + self._eos_token = eos_token + self._bos_token = bos_token - self.strict_user_assistant_alternation = "{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception" in template + self._strict_user_assistant_alternation = "{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception" in template if "<|recipient|>' + tool_call['function']['name']" in template: - self.tool_style = ToolsPromptStyle.TYPESCRIPT_FUNCTIONARY_V2 + self._tool_style = ToolsPromptStyle.TYPESCRIPT_FUNCTIONARY_V2 + else: + self._tool_style = ToolsPromptStyle.TOOLS_BESPOKE + # self._tool_style = 
ToolsPromptStyle.TOOLS_LONG + + # TODO: Test whether the template supports formatting tool_calls + + delimiter = '<%$[SAMPLE]$%>' + user_msg = Message(role="user", content="Hey") + empty_prompt = self.render([user_msg], add_generation_prompt=True).strip() + planted_prompt = self.render([user_msg, Message(role="assistant", content=delimiter)], add_generation_prompt=False).strip() + assert planted_prompt.startswith(empty_prompt), f"Planted prompt does not start with empty prompt: {planted_prompt} vs {empty_prompt}" + [prefix, suffix] = planted_prompt[len(empty_prompt):].split(delimiter) + + sys.stderr.write(f"\n# prefix={prefix}\n# suffix={suffix}\n\n") + + self._prefix = prefix + self._suffix = suffix + + def strip_suffix(self, s: str) -> str: + if s.endswith(self._suffix): + return s[:-len(self._suffix)] else: - self.tool_style = ToolsPromptStyle.TOOLS_LONG + sys.stderr.write(f"Expected suffix ({self._suffix}) not found: {s}\n") + return s def __str__(self): - return f"ChatFormat(template={self.template}, eos_token={self.eos_token}, bos_token={self.bos_token})" + return f"ChatTemplate(template={self.template}, eos_token={self._eos_token}, bos_token={self._bos_token})" def add_system_prompt(self, messages: list[Message], system_prompt: Message) -> list[Message]: assert system_prompt.role == "system" @@ -48,13 +81,13 @@ def add_system_prompt(self, messages: list[Message], system_prompt: Message) -> @staticmethod def from_gguf(metadata: GGUFKeyValues): tokens = metadata[Keys.Tokenizer.LIST] - return ChatFormat( + return ChatTemplate( template = metadata[Keys.Tokenizer.CHAT_TEMPLATE], bos_token = tokens[metadata[Keys.Tokenizer.BOS_ID]], eos_token = tokens[metadata[Keys.Tokenizer.EOS_ID]]) def render(self, messages: list[Message], add_generation_prompt: bool, omit_bos: bool = False): - if self.strict_user_assistant_alternation and any(m.role not in ('user', 'assistant') for m in messages): + if self._strict_user_assistant_alternation and any(m.role not in ('user', 'assistant') for m in messages): new_messages=[] i = 0 n = len(messages) @@ -80,10 +113,10 @@ def render(self, messages: list[Message], add_generation_prompt: bool, omit_bos: messages = new_messages # print(f'messages={messages}') - result = self.template.render( + result = self._template.render( messages=messages, - eos_token=self.eos_token, - bos_token='' if omit_bos else self.bos_token, + eos_token=self._eos_token, + bos_token='' if omit_bos else self._bos_token, raise_exception=raise_exception, add_generation_prompt=add_generation_prompt, ) @@ -95,67 +128,176 @@ def render(self, messages: list[Message], add_generation_prompt: bool, omit_bos: # each model may need specific prompting (and/or constrained output, # especially for models not fine-tuned for tool usage / function calling). class ToolsPromptStyle(Enum): - # Short prompt w/ schemas + # Short prompt w/ schemas, ... output TOOLS_SHORT = 1 - # Longer prompt w/ schemas + # Longer prompt w/ schemas, ... output TOOLS_LONG = 2 + # Bespoke constrained output format that favours thought and reasoning + # while allowing unambiguous parsing of parallel tool calling. + TOOLS_BESPOKE = 3 + # Large prompt for https://huggingface.co/NousResearch/Hermes-2-Pro-Mistral-7B + # ... output # Requires: # - git clone https://github.com/NousResearch/Hermes-Function-Calling examples/openai/hermes_function_calling # - Set large context length as their prompts are super long - TOOLS_HERMES_2_PRO = 3 + TOOLS_HERMES_2_PRO = 4 + + # Seems to want to escape underscores in tool names and in the ... 
tags + TOOLS_MISTRAL = 5 # Short prompt w/ TypeScript definitions for https://github.com/MeetKai/functionary # https://github.com/MeetKai/functionary/blob/main/functionary/prompt_template/prompt_template_v2.py # Note: see this prior attempt to support Functionary: https://github.com/ggerganov/llama.cpp/pull/5695 - TYPESCRIPT_FUNCTIONARY_V2 = 4 + TYPESCRIPT_FUNCTIONARY_V2 = 6 + +class ChatHandlerArgs(BaseModel): + chat_template: ChatTemplate + response_schema: Optional[dict] = None + tools: Optional[list[Tool]] = None + +class ChatHandler(ABC): + def __init__(self, args: ChatHandlerArgs): + self.args = args + self.output_format_prompt: Optional[Message] = None + self.grammar: Optional[str] = None + + @abstractmethod + def parse(self, s: str) -> Optional[Message]: + raise NotImplementedError() + +class NoToolsChatHandler(ChatHandler): + def __init__(self, args: ChatHandlerArgs): + super().__init__(args) + assert not args.tools + + if args.response_schema: + self.output_format_prompt = Message( + role="system", + content=_please_respond_with_schema(args.response_schema) + ) + converter = SchemaConverter(prop_order={}, allow_fetch=False, dotall=False, raw_pattern=False) + self.grammar = converter.visit(args.response_schema, '') + else: + self.output_format_prompt = None + self.grammar = None + + @typechecked + def parse(self, s: str) -> Optional[Message]: + return Message(role="assistant", content=s) + +class ToolCallTagsChatHandler(ChatHandler): + def __init__(self, args: ChatHandlerArgs, escapes_underscores: bool, allow_parallel_calls: bool): + super().__init__(args) + + converter = SchemaConverter(prop_order={}, allow_fetch=False, dotall=False, raw_pattern=False) + tool_rules = [ + converter.visit( + dict( + type="object", + properties=dict( + name=dict(type="string", pattern='^' + tool.function.name.replace('_', f'\\?_') + '$') if escapes_underscores \ + else dict(const=tool.function.name), + arguments=tool.function.parameters, + ), + required=['name', 'arguments'] + ), + f'{tool.function.name}-tool-call' + ) + for tool in self.args.tools + ] + + def format_literal(s: str) -> str: + if escapes_underscores: + return ' "\\\\"? "_" '.join((converter._format_literal(part) for part in s.split('_'))) + else: + return converter._format_literal(s) + + tool_call_rule = converter._add_rule( + 'tool_call', + format_literal("") + " space (" + + ' | '.join(tool_rules) + + ") space " + format_literal(""))# + ' space') + + # Ideally we'd want a negative lookahead of //, but it's just too hard to express in GBNF for now. + # So we just over-constrain the content rule to not contain literals dangerously getting close to + content_rule = converter._add_rule('content', '[^<] | "<" [^t<] | "')) + converter._add_rule( + 'root', + # tool_call_rule) + f'{content_rule}* ({tool_call_rule}+ {content_rule}*)?' 
if allow_parallel_calls \ + else f'{content_rule}* {tool_call_rule}?') + self.grammar = converter.format_grammar() + + # # Constrain the output to be a non-tool-call message (constrained to a JSON schema or not) + # # OR a tool-call message respecting the schema of any of the tools + # converter._add_rule( + # "root", + # converter._format_literal(prefix) + " (" + + # (response_rule or converter.not_literal("")) + " | " + + # converter._format_literal("") + " (" + + # ' | '.join(tool_rules) + + # ") " + converter._format_literal("") + + # ")") # + converter._format_literal(suffix)) + + @typechecked + def parse(self, s: str) -> Optional[Message]: + s = self.args.chat_template.strip_suffix(s) -@typechecked -def make_tools_prompt(chat_format: ChatFormat, tools: list[Tool], indent=2) -> Message: + if r'' in s: + # Some weird escaping of underscores is happening w/ Mixtral 8x7B Instruct + s = s.replace(r'\_', '_') - if chat_format.tool_style == ToolsPromptStyle.TOOLS_SHORT: - return Message( - role="system", - content='\n'.join([ - 'Here are the tools available:', - '', - *(json.dumps(tool.model_dump(), indent=indent) for tool in tools), - '', - ]) - ) - - elif chat_format.tool_style == ToolsPromptStyle.TOOLS_LONG: - return Message( - role="system", - content='\n'.join([ - # '''You are a function calling AI model. You are provided with function signatures within XML tags.''', - '''You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools:''', - '''''', - _tools_typescript_signatures(tools), - # _tools_schema_signatures(tools, indent=indent), - '''''', - '', - # '''Use the following json schema for each tool call you will make: {"properties": {"arguments": {"title": "Arguments", "type": "object"}, "name": {"title": "Name", "type": "string"}}, "required": ["arguments", "name"], "title": "FunctionCall", "type": "object"}''', - # '', - # '''For each function call return a json object with function name and arguments within XML tags as follows:''', - '''To call each function, give its name and arguments within XML tags as follows:''', - '''''', - '''{"name": , "arguments": }''', - '''''', - # '''This is not hypothetical, you're not asked what you would do. 
If you need a tool called, just call it with ....''', - ]) - ) - - elif chat_format.tool_style == ToolsPromptStyle.TYPESCRIPT_FUNCTIONARY_V2: - return Message( + parts = _tool_call_re.split(s) + if len(parts) == 1: + return Message(role="assistant", content=s) + else: + content = [] + tool_calls = [] + for i, part in enumerate(parts): + if i % 2 == 0: + content.append(part) + else: + try: + fc = json.loads(part) + except json.JSONDecodeError: + raise ValueError(f'Failed to parse tool call as JSON: {part}\nFull string: {s}') + tool_calls.append( + ToolCall( + id=gen_callid(), + function=FunctionCall(**fc))) + + content = '\n'.join(content).strip() + return Message(role="assistant", content=content if content else None, tool_calls=tool_calls) + + # if ''.startswith(ls) or ls.startswith(''): + # if ls.startswith('') and ls.endswith('' + suffix): + # tool_call = ls[len(''):-len('' + suffix)] + # return Message(role="assistant", content=None, tool_calls=[json.loads(tool_call)]) + # return None + # else: + # return Message(role="assistant", content=s) + +class TemplatedToolsChatHandler(ToolCallTagsChatHandler): + def __init__(self, args: ChatHandlerArgs, template: str, escapes_underscores=False, allow_parallel_calls=True): + super().__init__(args, escapes_underscores=escapes_underscores, allow_parallel_calls=allow_parallel_calls) + assert '{tools}' in template, 'Template must contain "{tools}"' + + self.output_format_prompt = Message( role="system", - content= '// Supported function definitions that should be called when necessary.\n' + - _tools_typescript_signatures(tools) + content=template.replace( + '{tools}', + '\n'.join(json.dumps(tool.model_dump(), indent=2) for tool in self.args.tools), + ) ) - - elif chat_format.tool_style == ToolsPromptStyle.TOOLS_HERMES_2_PRO: + +class Hermes2ProToolsChatHandler(ToolCallTagsChatHandler): + def __init__(self, args: ChatHandlerArgs): + super().__init__(args, escapes_underscores=False, allow_parallel_calls=False) + # Hackily import https://github.com/NousResearch/Hermes-Function-Calling path = str(Path(__file__).parent / "hermes_function_calling") if path not in sys.path: sys.path.insert(0, path) @@ -166,16 +308,276 @@ def make_tools_prompt(chat_format: ChatFormat, tools: list[Tool], indent=2) -> M prompt = PromptManager().generate_prompt(user_prompt=[], tools=[json.dumps(tool) for tool in tools]) assert len(prompt) == 1 and prompt[0]["role"] == "system" - return Message(**prompt[0]) + self.output_format_prompt = Message(**prompt[0]) + +class FunctionaryToolsChatHandler(ChatHandler): + def __init__(self, args: ChatHandlerArgs, allow_parallel_calls: bool): + super().__init__(args) + + # Only allowing a single tool call at a time for now. 
+ # Note that if there were more, they'd be separated by a '<|from|>assistant' literal + + self.output_format_prompt = Message( + role="system", + content= '// Supported function definitions that should be called when necessary.\n' + + _tools_typescript_signatures(args.tools) + ) - else: - raise ValueError(f"Unsupported tool call style: {chat_format.tool_style}") + converter = SchemaConverter(prop_order={}, allow_fetch=False, dotall=False, raw_pattern=False) + tool_rules = [ + converter._add_rule( + tool.function.name + '-call', + converter._format_literal(tool.function.name) + ' ' + converter._format_literal('\n<|content|>\n') + ' ' + + converter.visit(tool.function.parameters, tool.function.name + '-args') + ' ' + + converter._format_literal('\n')) + # converter.visit( + # dict( + # type="object", + # properties=dict( + # name=dict(const=tool.function.name), + # arguments=tool.function.parameters, + # ), + # required=['name', 'arguments'] + # ), + # f'{tool.function.name}-tool-call' + # ) + for i, tool in enumerate(self.args.tools) + ] + + not_from_rule = converter._add_rule('not_from', converter.not_literal("<|from|>")) + content_without_start_rule = converter._add_rule( + 'content_without_start', + converter._format_literal("all\n<|content|>") + ' ' + not_from_rule + '*') + start_rule = converter._add_rule('start', converter._format_literal('<|from|>assistant\n<|recipient|>')) + content_rule = converter._add_rule('content', start_rule + ' ' + content_without_start_rule) + tool_call_without_start_rule = converter._add_rule( + 'tool_call_without_start', + ' | '.join(tool_rules)) + # + ' ' + + # converter.not_literal("all", dotall=False) + ' ' + converter._format_literal('\n<|content|>\n') + ' ' + not_from_rule + '*') + tool_call_rule = converter._add_rule('tool_call', f'{start_rule} {tool_call_without_start_rule}') + # converter._add_rule('root', f'({content_without_start_rule} ({content_rule})* ({tool_call_rule}+ {content_rule}*)? | {tool_call_without_start_rule} (* {tool_call_rule}{content_rule}*') + converter._add_rule( + 'root', + f'{content_without_start_rule} {content_rule}* ({tool_call_rule}+ {content_rule}*)? | ' + f'{tool_call_without_start_rule} {tool_call_rule}* {content_rule}*' if allow_parallel_calls \ + else f'{content_without_start_rule} {tool_call_rule}? 
| {tool_call_without_start_rule}') + + self.grammar = converter.format_grammar() + # converter._add_rule( + # "root", + # converter._format_literal(prefix) + " (" + + # (response_rule or converter.not_literal("<|recipient|>")) + " | " + + # (' | '.join( + # converter._format_literal(f"<|recipient|>{tool.function.name}\n<|content|>") + " " + + # converter.visit(tool.function.parameters, tool.function.name + '-args') + # for tool in tools + # )) + + # ") " + + # ")") # + converter._format_literal(suffix)) + @typechecked + def parse(self, s: str) -> Optional[Message]: + s = self.args.chat_template.strip_suffix(s) + + parts = _recipient_content_re.split(s) + if len(parts) == 1: + return Message(role="assistant", content=s) + else: + text_content = [] + tool_calls: list[ToolCall] = [] + for i in range((len(parts) - 1) // 3): + assert parts[i * 3].strip() == '', f'Unexpected content before tool call: {parts[i * 3]}' + recipient = parts[i * 3 + 1].strip() + content = parts[i * 3 + 2] + if recipient == 'all': + text_content.append(content) + else: + try: + arguments = json.loads(content) + except json.JSONDecodeError: + raise ValueError(f'Failed to parse tool call content as JSON: {content}') + tool_calls.append( + ToolCall( + id=gen_callid(), + function=FunctionCall(name=recipient, arguments=arguments))) + + + assert parts[-1].strip() in ('', '<|stop|>'), f'Unexpected content after tool calls: {parts[-1]}\nFull string: {s}' + + content = '\n'.join(text_content).strip() + return Message(role="assistant", content=content if content else None, tool_calls=tool_calls if tool_calls else None) + +def _make_bespoke_schema(response_schema, tool_call_schema): + return { + "type": "object", + "properties": { + # "original_goal": {"title": "Original Goal", "type": "string"}, + "thought": { + # "title": "Thought about how the next step brings us closer to achieving the original goal", + "type": "string" + }, + "next_step": { + "title": "Next Step: either a result or one or more tool calls to achieve the original goal", + "oneOf": [ + { + "title": "Tool Calls", + "properties": { + # "type": { + # "const": "tool_calls" + # }, + "tool_calls": { + "type": "array", + "items": tool_call_schema + } + }, + "required": ["tool_calls"] + }, + { + "title": "Result (achieving original goal)", + "properties": { + "result": response_schema, + }, + "required": ["result"] + }, + ] + }, + }, + "required": ["original_goal", "thought", "next_step"] + } + +class BespokeToolsChatHandler(ChatHandler): + def __init__(self, args: ChatHandlerArgs): + super().__init__(args) + + # args.response_schema = args.response_schema or {} + converter = SchemaConverter(prop_order={}, allow_fetch=False, dotall=False, raw_pattern=False) + + response_schema = args.response_schema or {"type": "string"} + converter.visit( + _make_bespoke_schema( + response_schema, + { + "oneOf": [ + { + "type": "object", + "properties": { + "name": {"const": tool.function.name}, + "arguments": tool.function.parameters, + }, + "required": ["name", "arguments"] + } + for tool in self.args.tools + ] + } + ), + '', + ) + self.grammar = converter.format_grammar() + + self.output_format_prompt = Message( + role="system", + content='\n'.join([ + 'You are a function calling AI model.', + 'Here are the tools available:', + _tools_schema_signatures(self.args.tools, indent=2), + _please_respond_with_schema( + _make_bespoke_schema( + response_schema, + { + "properties": { + "name": { + "title": "Name of the tool to call", + "type": "string" + }, + "arguments": { + "title": 
"Arguments to pass to the tool", + "type": "object" + } + }, + "required": ["name", "arguments"] + } + ) + ), + ]) + ) + + @typechecked + def parse(self, s: str) -> Optional[Message]: + s = self.args.chat_template.strip_suffix(s) + try: + data = json.loads(s) + except json.JSONDecodeError: + raise ValueError(f'Failed to parse data as JSON: {s}') + + next_step = data['next_step'] + if 'result' in next_step: + return Message(role="assistant", content=json.dumps(next_step['result'])) + elif 'tool_calls' in next_step: + return Message( + role="assistant", + content=data["thought"], + tool_calls=[ + ToolCall(id=gen_callid(), function=FunctionCall(**tc)) + for tc in next_step['tool_calls'] + ] + ) + else: + raise ValueError(f'Unexpected data: {data}') + +_SHORT_TEMPLATE='\n'.join([ + 'Here are the tools available:', + '', + '{tools}', + '', +]) + +_LONG_TEMPLATE='\n'.join([ + # '''You are a function calling AI model. You are provided with function signatures within XML tags.''', + 'You may call one or more functions to assist with the user query. Don\'t make assumptions about what values to plug into functions. Here are the available tools:', + '', + '{tools}', + '', + '', + # 'Use the following json schema for each tool call you will make: {"properties": {"arguments": {"title": "Arguments", "type": "object"}, "name": {"title": "Name", "type": "string"}}, "required": ["arguments", "name"], "title": "FunctionCall", "type": "object"}', + # '', + # 'For each function call return a json object with function name and arguments within XML tags as follows:', + 'To call each function, give its name and arguments within XML tags as follows:', + '', + '{"name": , "arguments": }', + '', + # 'This is not hypothetical, you're not asked what you would do. If you need a tool called, just call it with ....''', +]) + +def get_chat_handler(args: ChatHandlerArgs, allow_parallel_calls=False) -> ChatHandler: + if not args.tools: + return NoToolsChatHandler(args) + elif args.chat_template.tool_style == ToolsPromptStyle.TYPESCRIPT_FUNCTIONARY_V2: + return FunctionaryToolsChatHandler(args) + elif args.chat_template.tool_style == ToolsPromptStyle.TOOLS_SHORT: + return TemplatedToolsChatHandler(args, _SHORT_TEMPLATE, allow_parallel_calls=allow_parallel_calls) + elif args.chat_template.tool_style == ToolsPromptStyle.TOOLS_LONG: + return TemplatedToolsChatHandler(args, _LONG_TEMPLATE, allow_parallel_calls=allow_parallel_calls) + elif args.chat_template.tool_style == ToolsPromptStyle.TOOLS_MISTRAL: + return TemplatedToolsChatHandler(args, _LONG_TEMPLATE, escapes_underscores=True, allow_parallel_calls=allow_parallel_calls) + elif args.chat_template.tool_style == ToolsPromptStyle.TOOLS_BESPOKE: + return BespokeToolsChatHandler(args) + elif args.chat_template.tool_style == ToolsPromptStyle.TOOLS_HERMES_2_PRO: + return Hermes2ProToolsChatHandler(args) + else: + raise ValueError(f"Unsupported tool call style: {args.chat_template.tool_style}") + +_ts_converter = SchemaToTypeScriptConverter() + +def _please_respond_with_schema(schema: dict) -> str: + # sig = json.dumps(schema, indent=2) + sig = _ts_converter.visit(schema) + return f'Please respond in JSON format with the following schema: {sig}' + def _tools_typescript_signatures(tools: list[Tool]) -> str: - ts_converter = SchemaToTypeScriptConverter() return 'namespace functions {' + '\n'.join( '// ' + tool.function.description.replace('\n', '\n// ') + '\n' + '' - 'type ' + tool.function.name + ' = (_: ' + ts_converter.visit(tool.function.parameters) + ") => any;\n" + 'type ' + 
tool.function.name + ' = (_: ' + _ts_converter.visit(tool.function.parameters) + ") => any;\n" for tool in tools ) + '} // namespace functions' @@ -185,247 +587,9 @@ def _tools_schema_signatures(tools: list[Tool], indent=None) -> str: for tool in tools ) -@typechecked -def _outputs_tool_call_tags(style: ToolsPromptStyle) -> bool: - return style in ( - ToolsPromptStyle.TOOLS_SHORT, - ToolsPromptStyle.TOOLS_LONG, - ToolsPromptStyle.TOOLS_HERMES_2_PRO, - ) - _tool_call_re = re.compile( '(.*?)', re.DOTALL) _recipient_content_re = re.compile(r'(?:(?:<\|(?:stop|from)\|>)+ *assistant\n<\|recipient\|>|^) *([^ <|>\n]+) *\n<\|content\|>(.*?)(?:$|<\|stop\|>\s*$|(?=(?:<\|(?:stop|from)\|>)+ *assistant\n))', re.DOTALL) def gen_callid(): return f'call_{random.randint(0, 1000000)}' - -@typechecked -def make_grammar(chat_format: ChatFormat, tools: list[Tool], response_schema: Optional[dict], indent=2) -> Tuple[Optional[str], Callable[[str], Optional[list[Message]]]]: - - converter = SchemaConverter(prop_order={}, allow_fetch=False, dotall=False, raw_pattern=False) - - response_rule = converter.visit(response_schema, "response") if response_schema else None - - delimiter = '<%$[SAMPLE]$%>' - user_msg = Message(role="user", content="Hey") - empty_prompt = chat_format.render([user_msg], add_generation_prompt=True).strip() - planted_prompt = chat_format.render([user_msg, Message(role="assistant", content=delimiter)], add_generation_prompt=False).strip() - assert planted_prompt.startswith(empty_prompt), f"Planted prompt does not start with empty prompt: {planted_prompt} vs {empty_prompt}" - [prefix, suffix] = planted_prompt[len(empty_prompt):].split(delimiter) - - allow_parallel_calls = False - - def strip_suffix(s: str) -> str: - if s.endswith(suffix): - return s[:-len(suffix)] - else: - sys.stderr.write(f"Expected suffix ({suffix}) not found: {s}\n") - return s - - if tools: - if _outputs_tool_call_tags(chat_format.tool_style): - - escapes_underscores = chat_format.tool_style != ToolsPromptStyle.TOOLS_HERMES_2_PRO - - tool_rules = [ - converter.visit( - dict( - type="object", - properties=dict( - name=dict(type="string", pattern='^' + tool.function.name.replace('_', f'\\?_') + '$') if escapes_underscores \ - else dict(const=tool.function.name), - arguments=tool.function.parameters, - ), - required=['name', 'arguments'] - ), - f'{tool.function.name}-tool-call' - ) - for tool in tools - ] - - def format_literal(s: str) -> str: - if escapes_underscores: - return ' "\\\\"? "_" '.join((converter._format_literal(part) for part in s.split('_'))) - else: - return converter._format_literal(s) - - tool_call_rule = converter._add_rule( - 'tool_call', - format_literal("") + " space (" + - ' | '.join(tool_rules) + - ") space " + format_literal(""))# + ' space') - - # Ideally we'd want a negative lookahead of //, but it's just too hard to express in GBNF for now. - # So we just over-constrain the content rule to not contain literals dangerously getting close to - content_rule = converter._add_rule('content', '[^<] | "<" [^t<] | "')) - converter._add_rule( - 'root', - # tool_call_rule) - f'{content_rule}* ({tool_call_rule}+ {content_rule}*)?' 
if allow_parallel_calls \ - else f'{content_rule}* {tool_call_rule}?') - - # # Constrain the output to be a non-tool-call message (constrained to a JSON schema or not) - # # OR a tool-call message respecting the schema of any of the tools - # converter._add_rule( - # "root", - # converter._format_literal(prefix) + " (" + - # (response_rule or converter.not_literal("")) + " | " + - # converter._format_literal("") + " (" + - # ' | '.join(tool_rules) + - # ") " + converter._format_literal("") + - # ")") # + converter._format_literal(suffix)) - - @typechecked - def parse(s: str) -> Optional[Message]: - s = strip_suffix(s) - - if r'' in s: - # Some weird escaping of underscores is happening w/ Mixtral 8x7B Instruct - s = s.replace(r'\_', '_') - - parts = _tool_call_re.split(s) - if len(parts) == 1: - return Message(role="assistant", content=s) - else: - content = [] - tool_calls = [] - for i, part in enumerate(parts): - if i % 2 == 0: - content.append(part) - else: - try: - fc = json.loads(part) - except json.JSONDecodeError: - raise ValueError(f'Failed to parse tool call as JSON: {part}\nFull string: {s}') - tool_calls.append( - ToolCall( - id=gen_callid(), - function=FunctionCall(**fc))) - - content = '\n'.join(content).strip() - return Message(role="assistant", content=content if content else None, tool_calls=tool_calls) - - # if ''.startswith(ls) or ls.startswith(''): - # if ls.startswith('') and ls.endswith('' + suffix): - # tool_call = ls[len(''):-len('' + suffix)] - # return Message(role="assistant", content=None, tool_calls=[json.loads(tool_call)]) - # return None - # else: - # return Message(role="assistant", content=s) - - return (converter.format_grammar(), parse) - - elif chat_format.tool_style == ToolsPromptStyle.TYPESCRIPT_FUNCTIONARY_V2: - # Only allowing a single tool call at a time for now. - # Note that if there were more, they'd be separated by a '<|from|>assistant' literal - - tool_rules = [ - converter._add_rule( - tool.function.name + '-call', - converter._format_literal(tool.function.name) + ' ' + converter._format_literal('\n<|content|>\n') + ' ' + - converter.visit(tool.function.parameters, tool.function.name + '-args') + ' ' + - converter._format_literal('\n')) - # converter.visit( - # dict( - # type="object", - # properties=dict( - # name=dict(const=tool.function.name), - # arguments=tool.function.parameters, - # ), - # required=['name', 'arguments'] - # ), - # f'{tool.function.name}-tool-call' - # ) - for i, tool in enumerate(tools) - ] - - not_from_rule = converter._add_rule('not_from', converter.not_literal("<|from|>")) - content_without_start_rule = converter._add_rule( - 'content_without_start', - converter._format_literal("all\n<|content|>") + ' ' + not_from_rule + '*') - start_rule = converter._add_rule('start', converter._format_literal('<|from|>assistant\n<|recipient|>')) - content_rule = converter._add_rule('content', start_rule + ' ' + content_without_start_rule) - tool_call_without_start_rule = converter._add_rule( - 'tool_call_without_start', - ' | '.join(tool_rules)) - # + ' ' + - # converter.not_literal("all", dotall=False) + ' ' + converter._format_literal('\n<|content|>\n') + ' ' + not_from_rule + '*') - tool_call_rule = converter._add_rule('tool_call', f'{start_rule} {tool_call_without_start_rule}') - # converter._add_rule('root', f'({content_without_start_rule} ({content_rule})* ({tool_call_rule}+ {content_rule}*)? 
| {tool_call_without_start_rule} (* {tool_call_rule}{content_rule}*') - converter._add_rule( - 'root', - f'{content_without_start_rule} {content_rule}* ({tool_call_rule}+ {content_rule}*)? | ' - f'{tool_call_without_start_rule} {tool_call_rule}* {content_rule}*' if allow_parallel_calls \ - else f'{content_without_start_rule} {tool_call_rule}? | {tool_call_without_start_rule}') - - # converter._add_rule( - # "root", - # converter._format_literal(prefix) + " (" + - # (response_rule or converter.not_literal("<|recipient|>")) + " | " + - # (' | '.join( - # converter._format_literal(f"<|recipient|>{tool.function.name}\n<|content|>") + " " + - # converter.visit(tool.function.parameters, tool.function.name + '-args') - # for tool in tools - # )) + - # ") " + - # ")") # + converter._format_literal(suffix)) - - @typechecked - def parse(s: str) -> Optional[Message]: - s = strip_suffix(s) - - parts = _recipient_content_re.split(s) - if len(parts) == 1: - return Message(role="assistant", content=s) - else: - text_content = [] - tool_calls: list[ToolCall] = [] - for i in range((len(parts) - 1) // 3): - assert parts[i * 3].strip() == '', f'Unexpected content before tool call: {parts[i * 3]}' - recipient = parts[i * 3 + 1].strip() - content = parts[i * 3 + 2] - if recipient == 'all': - text_content.append(content) - else: - try: - arguments = json.loads(content) - except json.JSONDecodeError: - raise ValueError(f'Failed to parse tool call content as JSON: {content}') - tool_calls.append( - ToolCall( - id=gen_callid(), - function=FunctionCall(name=recipient, arguments=arguments))) - - - assert parts[-1].strip() in ('', '<|stop|>'), f'Unexpected content after tool calls: {parts[-1]}\nFull string: {s}' - - content = '\n'.join(text_content).strip() - return Message(role="assistant", content=content if content else None, tool_calls=tool_calls if tool_calls else None) - - return (converter.format_grammar(), parse) - - else: - raise ValueError(f"Unsupported tool call style: {chat_format.tool_style}") - - elif response_schema: - converter._add_rule("root", response_rule + ' ' + converter._format_literal(suffix)) - - @typechecked - def parse(s: str) -> Optional[Message]: - s = strip_suffix(s) - return Message(role="assistant", content=s) - - return (converter.format_grammar(), parse) - - else: - converter._add_rule("root", converter._format_literal(prefix) + ' ' + converter._format_literal(suffix)) - - @typechecked - def parse(s: str) -> Optional[Message]: - s = strip_suffix(s) - return Message(role="assistant", content=s) - - return (None, parse) - diff --git a/examples/openai/server.py b/examples/openai/server.py index ad39106251a9e..fbd2f22da46f1 100644 --- a/examples/openai/server.py +++ b/examples/openai/server.py @@ -5,12 +5,14 @@ from pathlib import Path import time +from pydantic import TypeAdapter + sys.path.insert(0, str(Path(__file__).parent.parent.parent)) from examples.openai.llama_cpp_server_api import LlamaCppServerCompletionRequest from examples.openai.gguf_kvs import GGUFKeyValues, Keys from examples.openai.api import ChatCompletionResponse, Choice, Message, ChatCompletionRequest, Usage -from examples.openai.prompting import ChatFormat, make_grammar, make_tools_prompt +from examples.openai.prompting import ChatHandlerArgs, ChatTemplate, get_chat_handler, ChatHandler from fastapi import FastAPI, Request from fastapi.responses import JSONResponse @@ -38,8 +40,8 @@ def main( metadata = GGUFKeyValues(model) context_length = metadata[Keys.LLM.CONTEXT_LENGTH] - chat_format = 
ChatFormat.from_gguf(metadata) - # print(chat_format) + chat_template = ChatTemplate.from_gguf(metadata) + # print(chat_template) if not cpp_server_endpoint: sys.stderr.write(f"# Starting C++ server with model {model} on {cpp_server_host}:{cpp_server_port}\n") @@ -69,18 +71,17 @@ async def chat_completions(request: Request, chat_request: ChatCompletionRequest else: response_schema = None + chat_handler = get_chat_handler(ChatHandlerArgs(chat_template=chat_template, response_schema=response_schema, tools=chat_request.tools)) + messages = chat_request.messages - if chat_request.tools: - messages = chat_format.add_system_prompt(messages, make_tools_prompt(chat_format, chat_request.tools)) - - (grammar, parser) = make_grammar(chat_format, chat_request.tools, response_schema) + if chat_handler.output_format_prompt: + messages = chat_template.add_system_prompt(messages, chat_handler.output_format_prompt) - # TODO: Test whether the template supports formatting tool_calls - - prompt = chat_format.render(messages, add_generation_prompt=True) + prompt = chat_template.render(messages, add_generation_prompt=True) + sys.stderr.write(f'\n# MESSAGES:\n\n{TypeAdapter(list[Message]).dump_json(messages)}\n\n') sys.stderr.write(f'\n# PROMPT:\n\n{prompt}\n\n') - sys.stderr.write(f'\n# GRAMMAR:\n\n{grammar}\n\n') + sys.stderr.write(f'\n# GRAMMAR:\n\n{chat_handler.grammar}\n\n') data = LlamaCppServerCompletionRequest( **{ @@ -94,9 +95,10 @@ async def chat_completions(request: Request, chat_request: ChatCompletionRequest ) }, prompt=prompt, - grammar=grammar, + grammar=chat_handler.grammar, ).model_dump() - sys.stderr.write(json.dumps(data, indent=2) + "\n") + # sys.stderr.write(json.dumps(data, indent=2) + "\n") + async with httpx.AsyncClient() as client: response = await client.post( f"{cpp_server_endpoint}/completions", @@ -116,7 +118,7 @@ async def chat_completions(request: Request, chat_request: ChatCompletionRequest return JSONResponse(result) # print(json.dumps(result.get('content'), indent=2)) - message = parser(result["content"]) + message = chat_handler.parse(result["content"]) assert message is not None, f"Failed to parse response:\n{response.text}\n\n" prompt_tokens=result['timings']['prompt_n'] From 253b68d9a7072342c15b775bb971177526712066 Mon Sep 17 00:00:00 2001 From: ochafik Date: Fri, 29 Mar 2024 03:24:29 +0000 Subject: [PATCH 12/68] server.py: crude reactor --- examples/openai/api.py | 4 + examples/openai/prompting.py | 31 +++- examples/openai/reactor.py | 344 +++++++++++++++++++++++++++++++++++ examples/openai/test.sh | 33 +--- 4 files changed, 373 insertions(+), 39 deletions(-) create mode 100644 examples/openai/reactor.py diff --git a/examples/openai/api.py b/examples/openai/api.py index 7c4a446b8d2c0..98d710d9cd596 100644 --- a/examples/openai/api.py +++ b/examples/openai/api.py @@ -10,8 +10,12 @@ class ToolCall(BaseModel): type: Literal["function"] = "function" function: FunctionCall +ToolCallsTypeAdapter = TypeAdapter(list[ToolCall]) + class Message(BaseModel): role: str + name: Optional[str] = None + tool_call_id: Optional[str] = None content: Optional[str] tool_calls: Optional[list[ToolCall]] = None diff --git a/examples/openai/prompting.py b/examples/openai/prompting.py index 60dab69bafa71..8657861a1fc32 100644 --- a/examples/openai/prompting.py +++ b/examples/openai/prompting.py @@ -41,8 +41,10 @@ def __init__(self, template: str, eos_token: str, bos_token: str): if "<|recipient|>' + tool_call['function']['name']" in template: self._tool_style = 
ToolsPromptStyle.TYPESCRIPT_FUNCTIONARY_V2 else: - self._tool_style = ToolsPromptStyle.TOOLS_BESPOKE - # self._tool_style = ToolsPromptStyle.TOOLS_LONG + # self._tool_style = ToolsPromptStyle.TOOLS_BESPOKE + + self._tool_style = ToolsPromptStyle.TOOLS_LONG + # self._tool_style = ToolsPromptStyle.TOOLS_MISTRAL # TODO: Test whether the template supports formatting tool_calls @@ -87,6 +89,8 @@ def from_gguf(metadata: GGUFKeyValues): eos_token = tokens[metadata[Keys.Tokenizer.EOS_ID]]) def render(self, messages: list[Message], add_generation_prompt: bool, omit_bos: bool = False): + sys.stderr.write(f'# strict_user_assistant_alternation={self._strict_user_assistant_alternation}\n') + sys.stderr.write(f'# messages=' + "\n".join(json.dumps(m.model_dump(), indent=2) for m in messages) + '\n') if self._strict_user_assistant_alternation and any(m.role not in ('user', 'assistant') for m in messages): new_messages=[] i = 0 @@ -106,6 +110,12 @@ def render(self, messages: list[Message], add_generation_prompt: bool, omit_bos: content=f'{messages[i].content}\n{tc}' )) i += 1 + elif messages[i].role == 'tool': + new_messages.append(Message( + role="user", + content=f'TOOL(name={messages[i].name}, id={messages[i].tool_call_id}): {messages[i].content}', + )) + i += 1 else: new_messages.append(messages[i]) i += 1 @@ -408,12 +418,13 @@ def parse(self, s: str) -> Optional[Message]: content = '\n'.join(text_content).strip() return Message(role="assistant", content=content if content else None, tool_calls=tool_calls if tool_calls else None) -def _make_bespoke_schema(response_schema, tool_call_schema): +def _make_bespoke_schema(response_schema, tool_call_schema, allow_parallel_calls=False): return { "type": "object", "properties": { - # "original_goal": {"title": "Original Goal", "type": "string"}, - "thought": { + "original_goal": {"title": "Original Goal", "type": "string"}, + "thought_about_next_step_only": { + "title": "Thought about next step", # "title": "Thought about how the next step brings us closer to achieving the original goal", "type": "string" }, @@ -421,14 +432,14 @@ def _make_bespoke_schema(response_schema, tool_call_schema): "title": "Next Step: either a result or one or more tool calls to achieve the original goal", "oneOf": [ { - "title": "Tool Calls", + # "title": "Tool Calls", "properties": { # "type": { # "const": "tool_calls" # }, "tool_calls": { - "type": "array", - "items": tool_call_schema + "prefixItems": tool_call_schema if allow_parallel_calls \ + else [tool_call_schema], } }, "required": ["tool_calls"] @@ -443,7 +454,7 @@ def _make_bespoke_schema(response_schema, tool_call_schema): ] }, }, - "required": ["original_goal", "thought", "next_step"] + "required": ["original_goal", "thought_about_next_step_only", "next_step"] } class BespokeToolsChatHandler(ChatHandler): @@ -516,7 +527,7 @@ def parse(self, s: str) -> Optional[Message]: elif 'tool_calls' in next_step: return Message( role="assistant", - content=data["thought"], + content=data["thought_about_next_step_only"], tool_calls=[ ToolCall(id=gen_callid(), function=FunctionCall(**tc)) for tc in next_step['tool_calls'] diff --git a/examples/openai/reactor.py b/examples/openai/reactor.py new file mode 100644 index 0000000000000..7aae066ebe15a --- /dev/null +++ b/examples/openai/reactor.py @@ -0,0 +1,344 @@ +# Usage: +#! ./server -m some-model.gguf & +#! pip install pydantic +#! 
python examples/json-schema-pydantic-example.py +# +# TODO: +# - https://github.com/NousResearch/Hermes-Function-Calling +# +# <|im_start|>system +# You are a function calling AI model. You are provided with function signatures within XML tags +# You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools: +# {'type': 'function', 'function': {'name': 'get_stock_fundamentals', +# 'description': 'get_stock_fundamentals(symbol: str) -> dict - Get fundamental data for a given stock symbol using yfinance API.\n\n Args:\n symbol (str): The stock symbol.\n\n Returns:\n dict: A dictionary containing fundamental data.', 'parameters': {'type': 'object', 'properties': {'symbol': {'type': 'string'}}, 'required': ['symbol']}}} +# Use the following pydantic model json schema for each tool call you will make: {'title': 'FunctionCall', 'type': 'object', 'properties': {'arguments': {'title': 'Arguments', 'type': 'object'}, 'name': {'title': 'Name', 'type': 'string'}}, 'required': ['arguments', 'name']} For each function call return a json object with function name and arguments within XML tags as follows: +# +# {'arguments': , 'name': } +# <|im_end|> + +from dataclasses import dataclass +import subprocess +import sys +from pydantic import BaseModel, TypeAdapter +from annotated_types import MinLen +from typing import Annotated, Callable, List, Union, Literal, Optional, Type, get_args, get_origin +import json, requests + +from examples.openai.api import ToolCallsTypeAdapter + +def type_to_str(t): + origin = get_origin(t) + if origin is None: + return t.__name__ + args = get_args(t) + return origin.__name__ + ( + f'[{", ".join(type_to_str(a) for a in args)}]' if args else '' + ) + +def build_union_type_adapter(*types): + src = '\n'.join([ + 'from pydantic import TypeAdapter', + 'from typing import Union', + f'_out = TypeAdapter(Union[{", ".join(type_to_str(t) for t in types)}])', + ]) + globs = { + **globals(), + **{t.__name__: t for t in types}, + } + exec(src, globs) + return globs['_out'] + +class Thought(BaseModel): + thought: str + + +def build_tool_call_adapter2(final_output_type, *tools): + lines = [ + 'from pydantic import BaseModel, TypeAdapter', + 'from typing import Literal, Union', + ] + globs = { + **globals(), + **locals(), + final_output_type.__name__: final_output_type, + } + tool_calls = [] + for fn in tools: + # TODO: escape fn.__doc__ and fn.__doc__ to avoid comment or metadata injection! 
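+        # What the generated source defines for each tool function `fn` (descriptive note):
+        #   <Name>ToolArgs  - a Pydantic model of fn's annotated parameters,
+        #   <Name>ToolCall  - name: Literal["<fn name>"] plus arguments: <Name>ToolArgs,
+        #   <Name>Tool      - id/type/function wrapper whose __call__ invokes fn itself.
+        # The TypeAdapter exec'd at the end ("response_adapter") validates a Union of all <Name>Tool models.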
+ fn_name = fn.__name__ + fn_doc = fn.__doc__.replace('"""', "'''") if fn.__doc__ else None + name = fn_name.replace('_', ' ').title().replace(' ', '') + lines += [ + f'class {name}ToolArgs(BaseModel):', + *(f' {k}: {type_to_str(v)}' for k, v in fn.__annotations__.items() if k != 'return'), + f'class {name}ToolCall(BaseModel):', + *([f' """{fn_doc}"""'] if fn_doc else []), + f' name: Literal["{fn_name}"]', + f' arguments: {name}ToolArgs', + f'class {name}Tool(BaseModel):', + # *([f' """{fn_doc}"""'] if fn_doc else []), + f' id: str', + f' type: Literal["function"]', + f' function: {name}ToolCall', + f' def __call__(self) -> {type_to_str(fn.__annotations__.get("return"))}:', + f' return {fn_name}(**self.function.arguments.dict())', + ] + tool_calls.append(f'{name}Tool') + + lines += [ + # 'class FinalResult(BaseModel):', + # f' result: {type_to_str(final_output_type)}', + # 'class Response(BaseModel):', + # f' """A response that starts with a thought about whether we need tools or not, the plan about tool usage (maybe a sequence of tool calls), and then either a final result (of type {final_output_type.__name__}) or a first tool call"""', + # f' original_goal: str', + # f' thought_process: str', + # # f' thought: str', + # f' next_step: Union[FinalResult, {", ".join(tool_calls)}]', + # f'response_adapter = TypeAdapter(Response)' + f'response_adapter = TypeAdapter(Union[{", ".join(tool_calls)}])', + ] + + exec('\n'.join(lines), globs) + return globs['response_adapter'] + +def create_completion2(*, response_model=None, max_tool_iterations=None, tools=[], endpoint="http://localhost:8080/v1/chat/completions", messages, **kwargs): + ''' + Creates a chat completion using an OpenAI-compatible endpoint w/ JSON schema support + (llama.cpp server, llama-cpp-python, Anyscale / Together...) + + The response_model param takes a type (+ supports Pydantic) and behaves just as w/ Instructor (see below) + ''' + if response_model: + type_adapter = TypeAdapter(response_model) + schema = type_adapter.json_schema() + # messages = [{ + # "role": "system", + # "content": f"Respond in JSON format with the following schema: {json.dumps(schema, indent=2)}" + # }] + messages + # print("Completion: ", json.dumps(messages, indent=2)) + # print("SCHEMA: " + json.dumps(schema, indent=2)) + response_format={"type": "json_object", "schema": schema } + + tool_call_adapter = build_tool_call_adapter2(response_model, *tools) + tool_adapters = [(fn, TypeAdapter(fn)) for fn in tools] + tools_schemas = [{ + "type": "function", + "function": { + "name": fn.__name__, + "description": fn.__doc__, + "parameters": ta.json_schema() + } + } for (fn, ta) in tool_adapters] + + # messages = [{ + # "role": "system", + # "content": '\n'.join([ + # # "You are a function calling AI model. You are provided with function signatures within XML tags.", + # # "You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. 
Here are the available tools:", + # # f'{json.dumps(tools_schemas)}', + # 'Before calling each tool, you think clearly and briefly about why and how you are using the tool.', + # f"Respond in JSON format with the following schema: {json.dumps(schema, indent=2)}" if schema else "", + # ]) + # }] + messages + + i = 0 + while (max_tool_iterations is None or i < max_tool_iterations): + body=dict( + messages=messages, + response_format=response_format, + tools=tools_schemas, + **kwargs + ) + # sys.stderr.write(f'# REQUEST: {json.dumps(body, indent=2)}\n') + response = requests.post( + endpoint, + headers={"Content-Type": "application/json"}, + json=body, + ) + if response.status_code != 200: + raise Exception(f"Request failed ({response.status_code}): {response.text}") + + # sys.stderr.write(f"\n# RESPONSE:\n\n<<<{response.text}>>>\n\n") + data = response.json() + if 'error' in data: + raise Exception(data['error']['message']) + + # sys.stderr.write(f"\n# RESPONSE DATA:\n\n{json.dumps(data, indent=2)}\n\n") + # print(json.dumps(data, indent=2)) + choice = data["choices"][0] + + content = choice["message"].get("content") + if choice.get("finish_reason") == "tool_calls": + # sys.stderr.write(f'\n# TOOL CALLS:\n{json.dumps(choice["message"]["tool_calls"], indent=2)}\n\n') + # tool_calls =ToolCallsTypeAdapter.validate_json(json.dumps(choice["tool_calls"])) + messages.append(choice["message"]) + for tool_call in choice["message"]["tool_calls"]: + # id = tool_call.get("id") + # if id: + # del tool_call["id"] + + if content: + print(f'💭 {content}') + + tc = tool_call_adapter.validate_json(json.dumps(tool_call)) + + pretty_call = f'{tc.function.name}({", ".join(f"{k}={v}" for k, v in tc.function.arguments.model_dump().items())})' + sys.stdout.write(f'⚙️ {pretty_call}') + result = tc() + sys.stdout.write(f" -> {result}\n") + messages.append({ + "tool_call_id": tc.id, + "role": "tool", + "name": tc.function.name, + # "content": f'{result}', + "content": f'{pretty_call} = {result}', + }) + else: + assert content + # print(content) + # print(json.dumps(json.loads(content), indent=2)) + result = type_adapter.validate_json(content) if type_adapter else content + # if isinstance(result, Thought): + # print(f'💭 {result.thought}') + # messages.append({ + # "role": "assistant", + # "content": json.dumps(result.model_dump(), indent=2), + # }) + # else: + return result + + i += 1 + + if max_tool_iterations is not None: + raise Exception(f"Failed to get a valid response after {max_tool_iterations} tool calls") + +if __name__ == '__main__': + + class QAPair(BaseModel): + question: str + concise_answer: str + justification: str + + class PyramidalSummary(BaseModel): + title: str + summary: str + question_answers: Annotated[List[QAPair], MinLen(2)] + sub_sections: Optional[Annotated[List['PyramidalSummary'], MinLen(2)]] + + # print("# Summary\n", create_completion( + # model="...", + # response_model=PyramidalSummary, + # messages=[{ + # "role": "user", + # "content": f""" + # You are a highly efficient corporate document summarizer. + # Create a pyramidal summary of an imaginary internal document about our company processes + # (starting high-level, going down to each sub sections). + # Keep questions short, and answers even shorter (trivia / quizz style). + # """ + # }])) + + import math + + def eval_python_expression(expr: str) -> float: + """ + Evaluate a Python expression reliably. + This can be used to compute complex nested mathematical expressions, or any python, really. 
+ """ + print("# Evaluating expression: ", expr) + return "0.0" + + def add(a: float, b: float) -> float: + """ + Add a and b reliably. + Don't use this tool to compute the square of a number (use multiply or pow instead) + """ + return a + b + + # def say(something: str) -> str: + # """ + # Just says something. Used to say each thought out loud + # """ + # return subprocess.check_call(["say", something]) + + def multiply(a: float, b: float) -> float: + """Multiply a with b reliably""" + return a * b + + def divide(a: float, b: float) -> float: + """Divide a by b reliably""" + return a / b + + def pow(value: float, power: float) -> float: + """ + Raise a value to a power (exponent) reliably. + The square of x is pow(x, 2), its cube is pow(x, 3), etc. + """ + return math.pow(value, power) + + result = create_completion2( + model="...", + response_model=str, + tools=[add, multiply, divide, pow], #, say],#, eval_python_expression], + # tools=[eval_python_expression], + temperature=0.0, + # repetition_penalty=1.0, + n_predict=1000, + top_k=1, + top_p=0.0, + # logit_bias={ + # i: 10.0 + # for i in range(1, 259) + # }, + messages=[{ + # "role": "system", + # "content": f""" + # You are a reliable assistant. You think step by step and think before using tools + # """ + # }, { + "role": "user", + # "content": f""" + # What is 10 squared? + # """ + "content": f""" + What is the sum of 2535 squared and 32222000403 then multiplied by one and a half. What's a third of the result? + + Keep your goal in mind at every step. + """ + # Think step by step, start expressing the problem as an arithmetic expression + }]) + + # result = create_completion( + # model="...", + # response_model=float, + # tools=[add, multiply, divide, pow], #, say],#, eval_python_expression], + # temperature=0.0, + # # logit_bias={ + # # i: 10.0 + # # for i in range(1, 259) + # # }, + # messages=[{ + # "role": "user", + # # "content": f""" + # # What is 10 squared? + # # """ + # "content": f""" + # What is the sum of 2535 squared and 32222000403 then multiplied by one and a half. What's a third of the result? + # """ + # # Think step by step, start expressing the problem as an arithmetic expression + # }]) + + # 💭 First, I need to square the number 2535. For this, I will use the 'pow' tool. + # ⚙️ pow(args={'value': 2535.0, 'power': 2.0})-> 6426225.0 + # 💭 Now that I have the square of 2535, I need to add it to 32222000403.0 and store the result. + # ⚙️ add(args={'a': 6426225.0, 'b': 32222000403.0})-> 32228426628.0 + # 💭 Now that I have the sum of 2535 squared and 32222000403, I need to multiply it by 1.5. + # ⚙️ pow(args={'value': 32228426628.0, 'power': 1.5})-> 5785736571757004.0 + # 💭 Now that I have the result of the sum multiplied by 1.5, I need to divide it by 3 to get a third of the result. + # ⚙️ divide(args={'a': 5785736571757004.0, 'b': 3.0})-> 1928578857252334.8 + # 💭 I have now calculated a third of the result, which is 1928578857252334.8. I can now share this as the final answer. 
+ # Result: 1928578857252334.8 + + expected_result = (2535 ** 2 + 32222000403) * 1.5 / 3.0 + print("➡️", result) + assert math.fabs(result - expected_result) < 0.0001, f"Expected {expected_result}, got {result}" diff --git a/examples/openai/test.sh b/examples/openai/test.sh index 44a6c44de1bfc..4dca39adecc3f 100755 --- a/examples/openai/test.sh +++ b/examples/openai/test.sh @@ -16,9 +16,9 @@ echo "# Starting the server" >&2 args=( # --cpp_server_endpoint "http://localhost:8081" - --model ~/AI/Models/functionary-medium-v2.2.q4_0.gguf + # --model ~/AI/Models/functionary-medium-v2.2.q4_0.gguf - # --model ~/AI/Models/mixtral-8x7b-instruct-v0.1.Q8_0.gguf + --model ~/AI/Models/mixtral-8x7b-instruct-v0.1.Q8_0.gguf # --model ~/AI/Models/mixtral-8x7b-instruct-v0.1.Q4_K_M.gguf # --model ~/AI/Models/Hermes-2-Pro-Mistral-7B.Q8_0.gguf @@ -31,33 +31,8 @@ sleep 5 echo "# Send a message to the chat API" >&2 -# curl http://localhost:8080/v1/chat/completions \ -# -H "Content-Type: application/json" \ -# -H "Authorization: Bearer $OPENAI_API_KEY" \ -# -d '{ -# "model": "gpt-3.5-turbo", -# "tools": [{ -# "type": "function", -# "function": { -# "name": "get_current_weather", -# "description": "Get the current weather", -# "parameters": { -# "type": "object", -# "properties": { -# "location": { -# "type": "string", -# "description": "The city and state, e.g. San Francisco, CA" -# } -# }, -# "required": ["location"] -# } -# } -# }], -# "messages": [ -# {"role": "user", "content": "I live in the UK. what is the weather going to be like in San Francisco and Glasgow over the next 4 days."} -# ] -# }' | \ -# jq . +python -m examples.openai.reactor +exit curl http://localhost:8080/v1/chat/completions \ -H "Content-Type: application/json" \ From e874565a1354288b7cd8eb2529ca46ddd604c582 Mon Sep 17 00:00:00 2001 From: ochafik Date: Fri, 29 Mar 2024 16:17:59 +0000 Subject: [PATCH 13/68] agent: split code from openai example --- examples/agent/README.md | 175 +++++++++ examples/agent/__main__.py | 6 + examples/agent/agent.py | 243 +++++++++++++ .../fastify-requirements.txt | 0 examples/{openai => agent}/fastify.py | 27 +- .../{openai => agent}/run_sandboxed_tools.sh | 22 +- examples/agent/tools/example_math_tools.py | 23 ++ examples/agent/tools/example_python_tools.py | 8 + examples/agent/tools/example_summaries.py | 16 + examples/agent/tools/example_weather_tools.py | 36 ++ examples/agent/tools/std_tools.py | 78 ++++ examples/agent/utils.py | 41 +++ examples/openai/README.md | 196 +++++++--- examples/openai/__main__.py | 5 +- examples/openai/api.py | 30 +- examples/openai/prompting.py | 129 ++++--- examples/openai/reactor.py | 344 ------------------ examples/openai/server.py | 65 ++-- 18 files changed, 923 insertions(+), 521 deletions(-) create mode 100644 examples/agent/README.md create mode 100644 examples/agent/__main__.py create mode 100644 examples/agent/agent.py rename examples/{openai => agent}/fastify-requirements.txt (100%) rename examples/{openai => agent}/fastify.py (56%) rename examples/{openai => agent}/run_sandboxed_tools.sh (82%) create mode 100644 examples/agent/tools/example_math_tools.py create mode 100644 examples/agent/tools/example_python_tools.py create mode 100644 examples/agent/tools/example_summaries.py create mode 100644 examples/agent/tools/example_weather_tools.py create mode 100644 examples/agent/tools/std_tools.py create mode 100644 examples/agent/utils.py delete mode 100644 examples/openai/reactor.py diff --git a/examples/agent/README.md b/examples/agent/README.md new file mode 100644 index 
0000000000000..c774bfb31035e --- /dev/null +++ b/examples/agent/README.md @@ -0,0 +1,175 @@ +# examples.agent: Interactive agent that can use Python tools! + +Have any LLM use local (sandboxed) tools, with a simple CLI. + +```bash +python -m examples.agent \ + --model ~/AI/Models/mixtral-8x7b-instruct-v0.1.Q4_K_M.gguf \ + --tools examples/agent/tools/example_math_tools.py \ + --goal "What is the sum of 2535 squared and 32222000403 then multiplied by one and a half. What's a third of the result?" +``` + + +
+Show output + +```bash +💭 First, I will calculate the square of 2535, then add it to 32222000403. After that, I will multiply the result by 1.5 and finally, I will divide the result by 3. +⚙️ pow(value=2535, power=2) -> 6426225.0 +💭 Now that I have calculated the square of 2535, I will calculate the sum of 6426225 and 32222000403. +⚙️ add(a=6426225, b=32222000403) -> 32228426628 +💭 Now that I have calculated the sum, I will multiply it by 1.5. +⚙️ multiply(a=32228426628, b=1.5) -> 48342639942.0 +💭 Now that I have calculated the product, I will divide it by 3. +⚙️ divide(a=48342639942.0, b=3) -> 16114213314.0 +➡️ "\nThe result of the calculation is 16114213314.0." +``` + +
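+The tools file passed to `--tools` is an ordinary Python module: the agent derives each tool's
+JSON schema from the function's type hints and uses its docstring as the description
+(see `_get_params_schema` in `agent.py`). The actual `example_math_tools.py` is not reproduced
+in this patch excerpt; a minimal tool in that style might look like this (illustrative sketch only):
+
+```py
+def multiply(a: float, b: float) -> float:
+    """Multiply a with b reliably"""
+    return a * b
+```
+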
+ +```bash +python -m examples.agent \ + --tools examples/agent/tools/example_weather_tools.py \ + --goal "What is the weather going to be like in San Francisco and Glasgow over the next 4 days." +``` + +
+Show output + +```bash +``` + +
+ + +```bash +python -m examples.agent \ + --model ~/AI/Models/mixtral-8x7b-instruct-v0.1.Q4_K_M.gguf \ + --std_tools \ + --goal "Wait 10sec then say Hi out loud" +``` + +
+Show output + +```bash +``` + +
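+Under the hood, each tool function is turned into an OpenAI-style tool definition before the
+request is sent to the OpenAI-compatible server. The snippet below is a simplified sketch of
+what `agent.py` does with Pydantic's `TypeAdapter` (see `_get_params_schema` and
+`completion_with_tool_usage` for the full logic):
+
+```py
+import json
+from pydantic import TypeAdapter
+
+def add(a: float, b: float) -> float:
+    """Add a and b reliably"""
+    return a + b
+
+# The parameters schema comes straight from the function's type hints;
+# the name and docstring fill in the rest of the tool definition.
+tool = {
+    "type": "function",
+    "function": {
+        "name": add.__name__,
+        "description": add.__doc__,
+        "parameters": TypeAdapter(add).json_schema(),
+    },
+}
+print(json.dumps(tool, indent=2))
+```
+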
+ +## Prerequisites + +Note: To get conda, just install Miniforge (it's OSS): https://github.com/conda-forge/miniforge + +```bash +conda create -n agent python=3.11 +conda activate agent +pip install -r examples/agent/requirements.txt +pip install -r examples/openai/requirements.txt +``` + +## Components + +This example relies on the new [OpenAI compatibility server](../openai). + +``` + agent.py → examples.openai → server.cpp + → safe_tools.py + → ( run_sandboxed_tools.sh : Docker → fastify.py ) → unsafe_tools.py → code interpreter, etc... +``` + +The agent can use tools written in Python, or (soon) exposed under OpenAPI endpoints. Only has standard Python deps (e.g. no langchain) + +- Can call into any OpenAI endpoint that supports tool calling, spawns a local one if `--endpoint` isn't specified +(can pass all llama.cpp params) + +- [Standard tools](./tools/std.py) include "safe" TTS, wait for/until helpers, and *requesting user input*. + +- Tools are often "unsafe" (e.g. [Python execution functions](./tools/unsafe_python_tools.py)), +so we provide a script to run them in a Docker-sandboxed environment, exposed as an OpenAPI server: + + ```bash + examples/openai/run_sandboxed_tools.sh \ + examples/agent/tools/unsafe_python_tools.py 6666 & + + python -m examples.openai.reactor \ + --model ~/AI/Models/mixtral-8x7b-instruct-v0.1.Q4_K_M.gguf \ + --tools http://localhost:6666 \ + --goal "Whats cos(123) / 23 * 12.6 ?" + ``` + + - [fastify.py](./fastify.py) turns a python module into an OpenAPI endpoint using FastAPI + + - [run_sandboxed_tools.sh](./run_sandboxed_tools.sh) builds and runs a Docker environment with fastify inside it, and exposes its port locally + +- Beyond just "tools", output format can be constrained using JSON schemas or Pydantic types + + ```bash + python -m examples.agent \ + --model ~/AI/Models/mixtral-8x7b-instruct-v0.1.Q4_K_M.gguf \ + --tools examples/agent/tools/example_summaries.py \ + --format PyramidalSummary \ + --goal "Create a pyramidal summary of Mankind's recent advancements" + ``` + +## Launch parts separately + +If you'd like to debug each binary separately (rather than have an agent spawing an OAI compat proxy spawning a C++ server), you can run these commands: + +```bash +# C++ server +make -j server +./server --model mixtral.gguf --port 8081 + +# OpenAI compatibility layer +python -m examples.openai \ + --port 8080 + --endpoint http://localhost:8081 \ + --template_hf_model_id_fallback mistralai/Mixtral-8x7B-Instruct-v0.1 + +# Or have the OpenAI compatibility layer spawn the C++ server under the hood: +# python -m examples.openai --model mixtral.gguf + +# Agent itself: +python -m examples.agent --endpoint http://localhost:8080 \ +``` + +## Use existing tools (WIP) + +```bash +git clone https://github.com/NousResearch/Hermes-Function-Calling examples/openai/hermes_function_calling +``` + +Then edit `examples/agents/hermes_function_calling/utils.py`: + +```py +log_folder = os.environ.get('LOG_FOLDER', os.path.join(script_dir, "inference_logs")) +``` + +Then run tools in a sandbox: + +```bash +REQUIREMENTS_FILE=<( cat examples/agents/hermes_function_calling/requirements.txt | grep -vE "bitsandbytes|flash-attn" ) \ + examples/agents/run_sandboxed_tools.sh \ + examples/agents/hermes_function_calling/functions.py \ + -e LOG_FOLDER=/data/inference_logs +``` + +## TODO + +- Add model URL / HF loading support + +- Add Embedding endpoint + storage / retrieval tools (Faiss? 
ScaNN?), or spontaneous RAG + +- Auto discover tools exposed by an OpenAPI endpoint + +- Add a Python notebook tool example + +- Update `run_sandboxed_tools.sh` to support dev mode (`uvicorn fastify:app --reload`) + +- Follow-ups (depending on the vibe) + + - Remove OAI support from server + + - Remove non-Python json schema to grammar converters + diff --git a/examples/agent/__main__.py b/examples/agent/__main__.py new file mode 100644 index 0000000000000..299acbd43259e --- /dev/null +++ b/examples/agent/__main__.py @@ -0,0 +1,6 @@ +import typer + +from examples.agent.agent import main + +if __name__ == "__main__": + typer.run(main) diff --git a/examples/agent/agent.py b/examples/agent/agent.py new file mode 100644 index 0000000000000..b4feed43518d3 --- /dev/null +++ b/examples/agent/agent.py @@ -0,0 +1,243 @@ +import atexit +from pathlib import Path +import subprocess +import sys +from time import sleep +import typer +from pydantic import Json, TypeAdapter +from typing import Annotated, Callable, List, Union, Optional, Type +import json, requests + +from examples.json_schema_to_grammar import SchemaConverter +from examples.agent.tools.std_tools import StandardTools +from examples.openai.api import ChatCompletionRequest, ChatCompletionResponse, Message, Tool, ToolFunction +from examples.agent.utils import collect_functions, load_module + +def _get_params_schema(fn: Callable, verbose): + converter = SchemaConverter(prop_order={}, allow_fetch=False, dotall=False, raw_pattern=False) + schema = TypeAdapter(fn).json_schema() + # Do NOT call converter.resolve_refs(schema) here. Let the server resolve local refs. + if verbose: + sys.stderr.write(f'# PARAMS SCHEMA: {json.dumps(schema, indent=2)}\n') + return schema + +def completion_with_tool_usage( + *, + response_model: Optional[Union[Json, Type]]=None, + max_tool_iterations: Optional[int]=None, + tools: List[Callable], + endpoint: str, + messages: List[Message], + auth: Optional[str], + verbose: bool, + **kwargs): + ''' + Creates a chat completion using an OpenAI-compatible endpoint w/ JSON schema support + (llama.cpp server, llama-cpp-python, Anyscale / Together...) 
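+
+    The function loops, executing any tool calls requested by the model (and appending their
+    results to the conversation) until the model returns a final answer or max_tool_iterations is reached.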
+ + The response_model param takes a type (+ supports Pydantic) and behaves just as w/ Instructor (see below) + ''' + response_format = None + type_adapter = None + if response_model: + if isinstance(response_model, dict): + schema = response_model + else: + type_adapter = TypeAdapter(response_model) + schema = type_adapter.json_schema() + response_format={"type": "json_object", "schema": schema } + + tool_map = {fn.__name__: fn for fn in tools} + tools_schemas = [ + Tool( + type="function", + function=ToolFunction( + name=fn.__name__, + description=fn.__doc__, + parameters=_get_params_schema(fn, verbose=verbose) + ) + ) + for fn in tools + ] + + i = 0 + while (max_tool_iterations is None or i < max_tool_iterations): + request = ChatCompletionRequest( + messages=messages, + response_format=response_format, + tools=tools_schemas, + **kwargs, + ) + if verbose: + sys.stderr.write(f'# REQUEST: {request.model_dump_json(indent=2)}\n') + headers = { + "Content-Type": "application/json", + } + if auth: + headers["Authorization"] = auth + response = requests.post( + endpoint, + headers=headers, + json=request.model_dump(), + ) + if response.status_code != 200: + raise Exception(f"Request failed ({response.status_code}): {response.text}") + + response = ChatCompletionResponse(**response.json()) + if verbose: + sys.stderr.write(f'# RESPONSE: {response.model_dump_json(indent=2)}\n') + if response.error: + raise Exception(f'Inference failed: {response.error.message}') + + assert len(response.choices) == 1 + choice = response.choices[0] + + content = choice.message.content + if choice.finish_reason == "tool_calls": + messages.append(choice.message) + for tool_call in choice.message.tool_calls: + if content: + print(f'💭 {content}') + + pretty_call = f'{tool_call.function.name}({", ".join(f"{k}={v}" for k, v in tool_call.function.arguments.items())})' + sys.stdout.write(f'⚙️ {pretty_call}') + tool_result = tool_map[tool_call.function.name](**tool_call.function.arguments) + sys.stdout.write(f" -> {tool_result}\n") + messages.append(Message( + tool_call_id=tool_call.id, + role="tool", + name=tool_call.function.name, + # content=f'{tool_result}', + content=f'{pretty_call} = {tool_result}', + )) + else: + assert content + result = type_adapter.validate_json(content) if type_adapter else content + return result + + i += 1 + + if max_tool_iterations is not None: + raise Exception(f"Failed to get a valid response after {max_tool_iterations} tool calls") + + +def main( + goal: Annotated[str, typer.Option()], + tools: Optional[List[str]] = None, + format: Annotated[str, typer.Option(help="The output format: either a Python type (e.g. 'float' or a Pydantic model defined in one of the tool files), or a JSON schema, e.g. 
'{\"format\": \"date\"}'")] = None, + max_iterations: Optional[int] = 10, + std_tools: Optional[bool] = False, + auth: Optional[str] = None, + verbose: bool = False, + + model: Annotated[Optional[Path], typer.Option("--model", "-m")] = "models/7B/ggml-model-f16.gguf", + endpoint: Optional[str] = None, + context_length: Optional[int] = None, + # endpoint: str = 'http://localhost:8080/v1/chat/completions', + + n_predict: Optional[int] = 1000, + top_k: Optional[int] = None, + top_p: Optional[float] = None, + min_p: Optional[float] = None, + tfs_z: Optional[float] = None, + typical_p: Optional[float] = None, + temperature: Optional[float] = 0, + dynatemp_range: Optional[float] = None, + dynatemp_exponent: Optional[float] = None, + repeat_last_n: Optional[int] = None, + repeat_penalty: Optional[float] = None, + frequency_penalty: Optional[float] = None, + presense_penalty: Optional[float] = None, + mirostat: Optional[bool] = None, + mirostat_tau: Optional[float] = None, + mirostat_eta: Optional[float] = None, + penalize_nl: Optional[bool] = None, + n_keep: Optional[int] = None, + seed: Optional[int] = None, + n_probs: Optional[int] = None, + min_keep: Optional[int] = None, +): + if not endpoint: + server_port = 8080 + server_host = 'localhost' + endpoint: str = f'http://{server_host}:{server_port}/v1/chat/completions' + if verbose: + sys.stderr.write(f"# Starting C++ server with model {model} on {endpoint}\n") + cmd = [ + "python", "-m", "examples.openai.server", + "--model", model, + *(['--verbose'] if verbose else []), + *([f'--context_length={context_length}'] if context_length else []), + ] + print(cmd) + server_process = subprocess.Popen(cmd, stdout=sys.stderr) + atexit.register(server_process.kill) + sleep(5) + + tool_functions = [] + types = {} + for f in tools: + module = load_module(f) + tool_functions.extend(collect_functions(module)) + types.update({ + k: v + for k, v in module.__dict__.items() + if isinstance(v, type) + }) + + if std_tools: + tool_functions.extend(collect_functions(StandardTools)) + + response_model = None#str + if format: + if format in types: + response_model = types[format] + elif format == 'json': + response_model = {} + else: + try: + response_model = json.loads(format) + except: + response_model = eval(format) + + + result = completion_with_tool_usage( + model="...", + endpoint=endpoint, + response_model=response_model, + max_tool_iterations=max_tool_iterations, + tools=tool_functions, + auth=auth, + verbose=verbose, + + n_predict=n_predict, + top_k=top_k, + top_p=top_p, + min_p=min_p, + tfs_z=tfs_z, + typical_p=typical_p, + temperature=temperature, + dynatemp_range=dynatemp_range, + dynatemp_exponent=dynatemp_exponent, + repeat_last_n=repeat_last_n, + repeat_penalty=repeat_penalty, + frequency_penalty=frequency_penalty, + presense_penalty=presense_penalty, + mirostat=mirostat, + mirostat_tau=mirostat_tau, + mirostat_eta=mirostat_eta, + penalize_nl=penalize_nl, + n_keep=n_keep, + seed=seed, + n_probs=n_probs, + min_keep=min_keep, + messages=[{ + "role": "user", + "content": goal, + }] + ) + print(result if response_model else f'➡️ {result}') + +if __name__ == '__main__': + typer.run(main) + diff --git a/examples/openai/fastify-requirements.txt b/examples/agent/fastify-requirements.txt similarity index 100% rename from examples/openai/fastify-requirements.txt rename to examples/agent/fastify-requirements.txt diff --git a/examples/openai/fastify.py b/examples/agent/fastify.py similarity index 56% rename from examples/openai/fastify.py rename to 
examples/agent/fastify.py index 8846a3823cbc7..18186e83067e2 100644 --- a/examples/openai/fastify.py +++ b/examples/agent/fastify.py @@ -3,21 +3,11 @@ This is useful in combination w/ the examples/agent/run_sandboxed_tools.sh ''' -import os, sys, typing, importlib.util -from anyio import Path import fastapi, uvicorn import typer +from typing import Type, List -def load_source_as_module(source): - i = 0 - while (module_name := f'mod_{i}') in sys.modules: - i += 1 - - spec = importlib.util.spec_from_file_location(module_name, source) - module = importlib.util.module_from_spec(spec) - sys.modules[module_name] = module - spec.loader.exec_module(module) - return module +from examples.agent.utils import load_module def bind_functions(app, module): for k in dir(module): @@ -26,7 +16,7 @@ def bind_functions(app, module): if k == k.capitalize(): continue v = getattr(module, k) - if not callable(v) or isinstance(v, typing.Type): + if not callable(v) or isinstance(v, Type): continue if not hasattr(v, '__annotations__'): continue @@ -41,18 +31,11 @@ def bind_functions(app, module): except Exception as e: print(f'WARNING: Failed to bind /{k}\n\t{e}') -def main(files: typing.List[str], host: str = '0.0.0.0', port: int = 8000): +def main(files: List[str], host: str = '0.0.0.0', port: int = 8000): app = fastapi.FastAPI() for f in files: - if f.endswith('.py'): - sys.path.insert(0, str(Path(f).parent)) - - module = load_source_as_module(f) - else: - module = importlib.import_module(f) - - bind_functions(app, module) + bind_functions(app, load_module(f)) uvicorn.run(app, host=host, port=port) diff --git a/examples/openai/run_sandboxed_tools.sh b/examples/agent/run_sandboxed_tools.sh similarity index 82% rename from examples/openai/run_sandboxed_tools.sh rename to examples/agent/run_sandboxed_tools.sh index 88e61f568ad14..40f873d2ff34a 100755 --- a/examples/openai/run_sandboxed_tools.sh +++ b/examples/agent/run_sandboxed_tools.sh @@ -35,23 +35,16 @@ echo "INFO: using DATA_DIR: $DATA_DIR" cp \ "$SCRIPT_DIR/fastify-requirements.txt" \ "$SCRIPT_DIR/fastify.py" \ + "$SCRIPT_DIR/utils.py" \ "$BUILD_DIR" mkdir -p "$DATA_DIR" -PORT=${PORT:-8088} - -# BASE_IMAGE=pytorch/pytorch:latest -# BASE_IMAGE=python:3.10-slim -BASE_IMAGE=python:3.11-slim -# torch -# FROM nvidia/cuda:12.1.1-runtime-ubuntu20.04 -# RUN apt-get update && \ -# apt-get install -y python3-pip python3-dev && \ -# rm -rf /var/lib/apt/lists/* +readonly PORT=${PORT:-8088} +readonly LLAMA_IMAGE_NAME=llama.cpp/tools-base echo " - FROM $BASE_IMAGE + FROM ${BASE_IMAGE:-python:3.11-slim} RUN apt-get update RUN apt-get install -y gcc python3-dev git cmake RUN pip install --upgrade pip @@ -63,12 +56,11 @@ echo " RUN pip install -r /root/fastify-requirements.txt COPY script-requirements.txt /root RUN pip install -r /root/script-requirements.txt - COPY fastify.py /root + COPY fastify.py utils.py /root WORKDIR /data - # ENTRYPOINT uvicorn fastify:app --reload ENTRYPOINT PYTHONPATH=/src python /root/fastify.py --port=$PORT '/src/$( basename "$script" )' -" | docker build "$BUILD_DIR" -f - -t llama.cpp/tools-base +" | docker build "$BUILD_DIR" -f - -t "$LLAMA_IMAGE_NAME" echo "#" echo "# Binding $script to http://localhost:$PORT/" @@ -79,4 +71,4 @@ docker run \ --mount "type=bind,source=$( realpath "$script_folder" ),target=/src,readonly" \ --mount "type=bind,source=$( realpath "$DATA_DIR" ),target=/data" \ -p "$PORT:$PORT" \ - -it llama.cpp/tools-base \ No newline at end of file + -it "$LLAMA_IMAGE_NAME" \ No newline at end of file diff --git 
a/examples/agent/tools/example_math_tools.py b/examples/agent/tools/example_math_tools.py new file mode 100644 index 0000000000000..4361328bc1c0f --- /dev/null +++ b/examples/agent/tools/example_math_tools.py @@ -0,0 +1,23 @@ +import math + +def add(a: float, b: float) -> float: + """ + Add a and b reliably. + Don't use this tool to compute the square of a number (use multiply or pow instead) + """ + return a + b + +def multiply(a: float, b: float) -> float: + """Multiply a with b reliably""" + return a * b + +def divide(a: float, b: float) -> float: + """Divide a by b reliably""" + return a / b + +def pow(value: float, power: float) -> float: + """ + Raise a value to a power (exponent) reliably. + The square of x is pow(x, 2), its cube is pow(x, 3), etc. + """ + return math.pow(value, power) diff --git a/examples/agent/tools/example_python_tools.py b/examples/agent/tools/example_python_tools.py new file mode 100644 index 0000000000000..2b2d60e51f888 --- /dev/null +++ b/examples/agent/tools/example_python_tools.py @@ -0,0 +1,8 @@ +import math + +def eval_python_expression(expr: str) -> float: + """ + Evaluate a Python expression reliably. + This can be used to compute complex nested mathematical expressions, or any python, really. + """ + return eval(expr) diff --git a/examples/agent/tools/example_summaries.py b/examples/agent/tools/example_summaries.py new file mode 100644 index 0000000000000..a1df1121b7713 --- /dev/null +++ b/examples/agent/tools/example_summaries.py @@ -0,0 +1,16 @@ + +from typing import Annotated, List, Optional +from annotated_types import MinLen +from pydantic import BaseModel + + +class QAPair(BaseModel): + question: str + concise_answer: str + justification: str + +class PyramidalSummary(BaseModel): + title: str + summary: str + question_answers: Annotated[List[QAPair], MinLen(2)] + sub_sections: Optional[Annotated[List['PyramidalSummary'], MinLen(2)]] diff --git a/examples/agent/tools/example_weather_tools.py b/examples/agent/tools/example_weather_tools.py new file mode 100644 index 0000000000000..0436ac1ab3d2f --- /dev/null +++ b/examples/agent/tools/example_weather_tools.py @@ -0,0 +1,36 @@ + +import random +from typing import Literal + + +def _weather(w: str, temp, f): + return f'{w}, {temp}C' if format == 'celsius' \ + else f'{w}, {(temp * 9/5) + 32}F' + +def get_current_weather(location: str, format: Literal["celsius", "fahrenheit"]) -> str: + ''' + Get the current weather + + Args: + location: The city and state, e.g. San Francisco, CA + format: The temperature unit to use. Infer this from the users location. + ''' + return _weather('Sunny', 31, format) + +def get_n_day_weather_forecast(location: str, format: Literal["celsius", "fahrenheit"], num_days: int) -> str: + ''' + Get an N-day weather forecast + + Args: + location: The city and state, e.g. San Francisco, CA + format: The temperature unit to use. Infer this from the users location. 
+        num_days: The number of days to forecast
+    '''
+    random.seed(123)
+    return '\n'.join([
+        f'{num_days}-day forecast for {location}:',
+        *(
+            f'- in {i} day{"s" if i > 1 else ""}: {_weather("Sunny" if i % 2 == 0 else "Cloudy", random.randrange(15, 35), format)}'
+            for i in range(1, num_days)
+        )
+    ])
diff --git a/examples/agent/tools/std_tools.py b/examples/agent/tools/std_tools.py
new file mode 100644
index 0000000000000..39ce40eca2985
--- /dev/null
+++ b/examples/agent/tools/std_tools.py
@@ -0,0 +1,78 @@
+import atexit
+from datetime import date
+import datetime
+import subprocess
+import sys
+from time import sleep
+import time
+import typer
+from pydantic import BaseModel, Json, TypeAdapter
+from annotated_types import MinLen
+from typing import Annotated, Callable, List, Union, Literal, Optional, Type, get_args, get_origin
+import json, requests
+
+class Duration(BaseModel):
+    seconds: Optional[int] = None
+    minutes: Optional[int] = None
+    hours: Optional[int] = None
+    days: Optional[int] = None
+    months: Optional[int] = None
+    years: Optional[int] = None
+
+    @property
+    def get_total_seconds(self) -> int:
+        return sum([
+            self.seconds or 0,
+            (self.minutes or 0)*60,
+            (self.hours or 0)*3600,
+            (self.days or 0)*86400,
+            (self.months or 0)*2592000,
+            (self.years or 0)*31536000,
+        ])
+
+class WaitForDuration(BaseModel):
+    duration: Duration
+
+class WaitForDate(BaseModel):
+    until: date
+
+    def __call__(self):
+        # Get the current date
+        current_date = datetime.date.today()
+
+        if self.until < current_date:
+            raise ValueError("Target date cannot be in the past.")
+
+        time_diff = datetime.datetime.combine(self.until, datetime.time.min) - datetime.datetime.combine(current_date, datetime.time.min)
+
+        days, seconds = time_diff.days, time_diff.seconds
+
+        sys.stderr.write(f"Waiting for {days} days and {seconds} seconds until {self.until}...\n")
+        time.sleep(days * 86400 + seconds)
+        sys.stderr.write(f"Reached the target date: {self.until}\n")
+
+
+class StandardTools:
+
+    @staticmethod
+    def ask_user(question: str) -> str:
+        '''
+        Ask the user a question and return the answer.
+        This allows getting additional information, requesting disambiguation, etc.
+        '''
+        return typer.prompt(question)
+
+    @staticmethod
+    def wait(_for: Union[WaitForDuration, WaitForDate]) -> None:
+        '''
+        Wait for a certain amount of time before continuing.
+        This can be used to wait for a specific duration or until a specific date.
+        '''
+        return _for()
+
+    @staticmethod
+    def say_out_loud(something: str) -> str:
+        """
+        Just says something. 
Used to say each thought out loud + """ + return subprocess.check_call(["say", something]) diff --git a/examples/agent/utils.py b/examples/agent/utils.py new file mode 100644 index 0000000000000..4eff7f6ad72a1 --- /dev/null +++ b/examples/agent/utils.py @@ -0,0 +1,41 @@ +from pathlib import Path +import sys +import importlib.util +from typing import Type + +def load_source_as_module(source): + i = 0 + while (module_name := f'mod_{i}') in sys.modules: + i += 1 + + spec = importlib.util.spec_from_file_location(module_name, source) + module = importlib.util.module_from_spec(spec) + sys.modules[module_name] = module + spec.loader.exec_module(module) + return module + +def load_module(f: str): + if f.endswith('.py'): + sys.path.insert(0, str(Path(f).parent)) + + return load_source_as_module(f) + else: + return importlib.import_module(f) + +def collect_functions(module): + for k in dir(module): + if k.startswith('_'): + continue + if k == k.capitalize(): + continue + v = getattr(module, k) + if not callable(v) or isinstance(v, Type): + continue + if not hasattr(v, '__annotations__'): + continue + + vt = type(v) + if vt.__module__ == 'langchain_core.tools' and vt.__name__.endswith('Tool') and hasattr(v, 'func') and callable(v.func): + v = v.func + + yield v diff --git a/examples/openai/README.md b/examples/openai/README.md index 8ddcf9a0c78ef..1efdd3d6ffd9c 100644 --- a/examples/openai/README.md +++ b/examples/openai/README.md @@ -1,87 +1,189 @@ -# examples.openai: OpenAI API-compatible server + agent / tools examples +# examples.agent: Interactive agent that can use Python tools! -A simple Python server that sits above the C++ [../server](examples/server) and offers improved OAI compatibility. - -## Usage - -Run a simple test: +New Python OpenAI API compatibility server, which calls into the C++ server under the hood: ```bash -# Spawns a Python server (which spawns a C++ Server) then hits it w/ a tool-calling request -examples/openai/test.sh +python -m examples.openai.server --model model.gguf ``` -To simply run the Python server (+ C++ server under the hood): - -```bash -python -m examples.openai -``` +## Prerequisites -## Tools usage (WIP) +Note: To get conda, just install Miniforge (it's OSS): https://github.com/conda-forge/miniforge ```bash -git clone https://github.com/NousResearch/Hermes-Function-Calling examples/openai/hermes_function_calling +conda create -n agent python=3.11 +conda activate agent +pip install -r examples/openai/requirements.txt ``` -Then edit `examples/agents/hermes_function_calling/utils.py`: +## Features -```py -log_folder = os.environ.get('LOG_FOLDER', os.path.join(script_dir, "inference_logs")) -``` +The new [examples/openai/server.py](./server.py): -Then run tools in a sandbox: +- Supports grammar-constrained tool calling for **all** models (incl. 
Mixtral 7x8B) -```bash -REQUIREMENTS_FILE=<( cat examples/agents/hermes_function_calling/requirements.txt | grep -vE "bitsandbytes|flash-attn" ) \ - examples/agents/run_sandboxed_tools.sh \ - examples/agents/hermes_function_calling/functions.py \ - -e LOG_FOLDER=/data/inference_logs -``` + - Optimised support for Functionary & Nous Hermes, easy to extend to other tool-calling schemes -TODO: reactor that reads OpenAPI definitions and does the tool calling + - Generic support w/ JSON schema that guides the model towards tool usage (at the cost of extra tokens): -## Features + ```ts + { + // original_thought: string, + thought_about_next_step_only: string, + next_step: {tool_calls: {name: string, arguments: any}} | {result: T} + } + // Where T is the output JSON schema, or 'any' + ``` + + - Option to publicise schemas to models as TypeScript signatures (as for Functionary) or JSON schema. -The new examples/openai/server.py: + - Supports models that require user/assistant alternance (like Mixtral Instruct) by merging system messages into user messages. -- Uses llama.cpp C++ server as a backend (spawns it or connects to existing) +- Spawns the C++ [llama.cpp server](../server) under the hood (unless passed `--endpoint`), but only uses its non-chat endpoint -- Uses actual jinja2 chat templates read from the models + (depending on the prompting strategy, we weave the tool & output schema along with the chat template into the raw model grammar constraints) -- Supports grammar-constrained output for both JSON response format and tool calls +- Uses the actual Jinja2 templates stored in the GGUF models -- Tool calling “works” w/ all models (even non-specialized ones like Mixtral 7x8B) +- Will eventually also spawn `whisper.cpp` and another server subprocess for the embeddings endpoint - - Optimised support for Functionary & Nous Hermes, easy to extend to other tool-calling fine-tunes +Rationale: the C++ server lacks some OpenAI compatibility features (and can't realistically keep up with prompt templates w/o bringing in too many dependencies), this new layer could allow focusing the C++ server on serving efficiency and delegate OAI compliance to a layer easier to maintain. -## TODO +## Test -- Support tool result messages +If you want to see tools in action, look at the [agent example](../agent). Otherwise: -- Reactor / +Start the server in Terminal 1: -- Embedding endpoint w/ distinct server subprocess +```bash +python -m examples.openai --model ~/AI/Models/mixtral-8x7b-instruct-v0.1.Q4_K_M.gguf +``` + +Query it in Terminal 2 (or use it from any framework that makes use of tools: note tool calls are guaranteed to comply to the schema, so retries are likely not necessary!): -- Automatic/manual session caching +```bash +curl http://localhost:8080/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "gpt-3.5-turbo", + "tools": [{ + "type": "function", + "function": { + "name": "get_current_weather", + "description": "Get the current weather", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "The city and state, e.g. San Francisco, CA" + }, + "format": { + "type": "string", + "enum": ["celsius", "fahrenheit"], + "description": "The temperature unit to use. Infer this from the users location." 
+ } + }, + "required": ["location", "format"] + } + } + }, { + "type": "function", + "function": { + "name": "get_n_day_weather_forecast", + "description": "Get an N-day weather forecast", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "The city and state, e.g. San Francisco, CA" + }, + "format": { + "type": "string", + "enum": ["celsius", "fahrenheit"], + "description": "The temperature unit to use. Infer this from the users location." + }, + "num_days": { + "type": "integer", + "description": "The number of days to forecast" + } + }, + "required": ["location", "format", "num_days"] + } + } + }], + "messages": [ + {"role": "system", "content": "Do not make assumptions about what values to plug into functions. Ask for clarification if a user request is ambiguous."}, + {"role": "user", "content": "what is the weather going to be like in San Francisco and Glasgow over the next 4 days"} + ] + }' +``` - - Spawns the main C++ CLI under the hood +
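+
+The same request can also be made from Python (or any OpenAI-compatible client library). Here's a rough sketch using the official `openai` package (an assumption of this example, not a dependency of the server; only the `get_current_weather` tool from the curl example above is reproduced, for brevity):
+
+```py
+from openai import OpenAI
+
+# Point the client at the local compatibility layer; the API key just needs to be non-empty.
+client = OpenAI(base_url='http://localhost:8080/v1', api_key='sk-no-key-required')
+
+# Same JSON schema as the first tool in the curl example above.
+tools = [{
+    'type': 'function',
+    'function': {
+        'name': 'get_current_weather',
+        'description': 'Get the current weather',
+        'parameters': {
+            'type': 'object',
+            'properties': {
+                'location': {'type': 'string', 'description': 'The city and state, e.g. San Francisco, CA'},
+                'format': {'type': 'string', 'enum': ['celsius', 'fahrenheit']},
+            },
+            'required': ['location', 'format'],
+        },
+    },
+}]
+
+response = client.chat.completions.create(
+    model='gpt-3.5-turbo',
+    tools=tools,
+    messages=[{'role': 'user', 'content': 'what is the weather like in Glasgow right now?'}],
+)
+print(response.choices[0].message.tool_calls)
+```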
+Show output + +```json +{ + "id": "chatcmpl-3095057176", + "object": "chat.completion", + "created": 1711726921, + "model": "gpt-3.5-turbo", + "choices": [ + { + "index": 0, + "message": { + "role": "assistant", + "name": null, + "tool_call_id": null, + "content": "In order to provide the required information, I need to call the get_n_day_weather_forecast function twice, once for San Francisco and once for Glasgow.", + "tool_calls": [ + { + "id": "call_970977", + "type": "function", + "function": { + "name": "get_n_day_weather_forecast", + "arguments": { + "location": "San Francisco, CA", + "format": "celsius", + "num_days": 4 + } + } + } + ] + }, + "logprobs": null, + "finish_reason": "tool_calls" + } + ], + "usage": { + "prompt_tokens": 546, + "completion_tokens": 118, + "total_tokens": 664 + }, + "system_fingerprint": "...", + "error": null +} +``` - - Support precaching long prompts from CLI +
- - Instant incremental inference in long threads +## TODO -- Improve examples/agent: +- Embedding endpoint w/ distinct server subprocess - - Interactive agent CLI that auto-discovers tools from OpenAPI endpoints +- Evaluate options for session caching - - Script that wraps any Python source as a container-sandboxed OpenAPI endpoint (allowing running ~unsafe code w/ tools) + - Pass session id & store / read from file? + + - Support parent session ids for trees of thought? - - Basic memory / RAG / python interpreter tools + - Support precaching long prompts from CLI / read session files? - Follow-ups - Remove OAI support from server - - Remove non-Python json schema to grammar converters + - Remove non-Python json-schema-to-grammar versions - Reach out to frameworks to advertise new option. diff --git a/examples/openai/__main__.py b/examples/openai/__main__.py index 5204826b2dc21..601eee3c4c6a6 100644 --- a/examples/openai/__main__.py +++ b/examples/openai/__main__.py @@ -1,8 +1,7 @@ - -from jsonargparse import CLI +import typer from examples.openai.server import main if __name__ == "__main__": - CLI(main) + typer.run(main) diff --git a/examples/openai/api.py b/examples/openai/api.py index 98d710d9cd596..b95eb17fae7a8 100644 --- a/examples/openai/api.py +++ b/examples/openai/api.py @@ -1,3 +1,4 @@ +from abc import ABC from typing import Any, Dict, Literal, Optional, Union from pydantic import BaseModel, Json, TypeAdapter @@ -10,8 +11,6 @@ class ToolCall(BaseModel): type: Literal["function"] = "function" function: FunctionCall -ToolCallsTypeAdapter = TypeAdapter(list[ToolCall]) - class Message(BaseModel): role: str name: Optional[str] = None @@ -32,15 +31,7 @@ class ResponseFormat(BaseModel): type: str json_schema: Optional[Any] = None -class ChatCompletionRequest(BaseModel): - model: str - tools: Optional[list[Tool]] = None - messages: list[Message] = None - prompt: Optional[str] = None - response_format: Optional[ResponseFormat] = None - - stream: bool = False - cache_prompt: Optional[bool] = None +class LlamaCppParams(BaseModel): n_predict: Optional[int] = None top_k: Optional[int] = None top_p: Optional[float] = None @@ -63,6 +54,16 @@ class ChatCompletionRequest(BaseModel): n_probs: Optional[int] = None min_keep: Optional[int] = None +class ChatCompletionRequest(LlamaCppParams): + model: str + tools: Optional[list[Tool]] = None + messages: list[Message] = None + prompt: Optional[str] = None + response_format: Optional[ResponseFormat] = None + + stream: bool = False + cache_prompt: Optional[bool] = None + class Choice(BaseModel): index: int message: Message @@ -74,6 +75,10 @@ class Usage(BaseModel): completion_tokens: int total_tokens: int +class CompletionError(BaseModel): + message: str + # code: int + class ChatCompletionResponse(BaseModel): id: str object: Literal["chat.completion"] @@ -81,4 +86,5 @@ class ChatCompletionResponse(BaseModel): model: str choices: list[Choice] usage: Usage - system_fingerprint: str \ No newline at end of file + system_fingerprint: str + error: Optional[CompletionError] = None \ No newline at end of file diff --git a/examples/openai/prompting.py b/examples/openai/prompting.py index 8657861a1fc32..a6d71e36fe0c4 100644 --- a/examples/openai/prompting.py +++ b/examples/openai/prompting.py @@ -9,18 +9,46 @@ import sys from typing import Any, Dict, Literal, Optional, Tuple, Callable, Union from pydantic import BaseModel -from typeguard import typechecked +# from typeguard import typechecked from examples.json_schema_to_grammar import SchemaConverter from 
examples.openai.api import Tool, Message, FunctionCall, ToolCall from examples.openai.gguf_kvs import GGUFKeyValues, Keys from examples.openai.ts_converter import SchemaToTypeScriptConverter -@typechecked +# While the API will be usable with a generic tools usage like OpenAI, +# (see https://cookbook.openai.com/examples/how_to_call_functions_with_chat_models), +# each model may need specific prompting (and/or constrained output, +# especially for models not fine-tuned for tool usage / function calling). +class ToolsPromptStyle(Enum): + # Short prompt w/ schemas, ... output + TOOLS_SHORT = 1 + + # Longer prompt w/ schemas, ... output + TOOLS_LONG = 2 + + # Bespoke constrained output format that favours thought and reasoning + # while allowing unambiguous parsing of parallel tool calling. + TOOLS_BESPOKE = 3 + + # Large prompt for https://huggingface.co/NousResearch/Hermes-2-Pro-Mistral-7B + # ... output + # Requires: + # - git clone https://github.com/NousResearch/Hermes-Function-Calling examples/openai/hermes_function_calling + # - Set large context length as their prompts are super long + TOOLS_HERMES_2_PRO = 4 + + # Seems to want to escape underscores in tool names and in the ... tags + TOOLS_MISTRAL = 5 + + # Short prompt w/ TypeScript definitions for https://github.com/MeetKai/functionary + # https://github.com/MeetKai/functionary/blob/main/functionary/prompt_template/prompt_template_v2.py + # Note: see this prior attempt to support Functionary: https://github.com/ggerganov/llama.cpp/pull/5695 + TYPESCRIPT_FUNCTIONARY_V2 = 6 + def raise_exception(msg: str): raise Exception(msg) -@typechecked class ChatTemplate(BaseModel): template: str @@ -41,9 +69,9 @@ def __init__(self, template: str, eos_token: str, bos_token: str): if "<|recipient|>' + tool_call['function']['name']" in template: self._tool_style = ToolsPromptStyle.TYPESCRIPT_FUNCTIONARY_V2 else: - # self._tool_style = ToolsPromptStyle.TOOLS_BESPOKE - - self._tool_style = ToolsPromptStyle.TOOLS_LONG + self._tool_style = ToolsPromptStyle.TOOLS_BESPOKE + # self._tool_style = ToolsPromptStyle.TOOLS_LONG + # self._tool_style = ToolsPromptStyle.TOOLS_HERMES_2_PRO # self._tool_style = ToolsPromptStyle.TOOLS_MISTRAL # TODO: Test whether the template supports formatting tool_calls @@ -55,7 +83,7 @@ def __init__(self, template: str, eos_token: str, bos_token: str): assert planted_prompt.startswith(empty_prompt), f"Planted prompt does not start with empty prompt: {planted_prompt} vs {empty_prompt}" [prefix, suffix] = planted_prompt[len(empty_prompt):].split(delimiter) - sys.stderr.write(f"\n# prefix={prefix}\n# suffix={suffix}\n\n") + # sys.stderr.write(f"\n# prefix={prefix}\n# suffix={suffix}\n\n") self._prefix = prefix self._suffix = suffix @@ -82,15 +110,27 @@ def add_system_prompt(self, messages: list[Message], system_prompt: Message) -> @staticmethod def from_gguf(metadata: GGUFKeyValues): + if Keys.Tokenizer.CHAT_TEMPLATE not in metadata: + raise NotImplementedError(f'Only supporting models with {Keys.Tokenizer.CHAT_TEMPLATE} entry in their GGUF key-values (TODO: add default template, maybe pick llama2\'s?)') + tokens = metadata[Keys.Tokenizer.LIST] return ChatTemplate( template = metadata[Keys.Tokenizer.CHAT_TEMPLATE], bos_token = tokens[metadata[Keys.Tokenizer.BOS_ID]], eos_token = tokens[metadata[Keys.Tokenizer.EOS_ID]]) + @staticmethod + def from_huggingface(model_id: str): + from transformers import LlamaTokenizer + tokenizer = LlamaTokenizer.from_pretrained(model_id) + return ChatTemplate( + template = tokenizer.chat_template or 
tokenizer.default_chat_template, + bos_token = tokenizer.bos_token, + eos_token = tokenizer.eos_token) + def render(self, messages: list[Message], add_generation_prompt: bool, omit_bos: bool = False): - sys.stderr.write(f'# strict_user_assistant_alternation={self._strict_user_assistant_alternation}\n') - sys.stderr.write(f'# messages=' + "\n".join(json.dumps(m.model_dump(), indent=2) for m in messages) + '\n') + # sys.stderr.write(f'# strict_user_assistant_alternation={self._strict_user_assistant_alternation}\n') + # sys.stderr.write(f'# messages=' + "\n".join(json.dumps(m.model_dump(), indent=2) for m in messages) + '\n') if self._strict_user_assistant_alternation and any(m.role not in ('user', 'assistant') for m in messages): new_messages=[] i = 0 @@ -113,7 +153,7 @@ def render(self, messages: list[Message], add_generation_prompt: bool, omit_bos: elif messages[i].role == 'tool': new_messages.append(Message( role="user", - content=f'TOOL(name={messages[i].name}, id={messages[i].tool_call_id}): {messages[i].content}', + content=f'TOOL RESULT(name={messages[i].name}, id={messages[i].tool_call_id}): {messages[i].content}', )) i += 1 else: @@ -130,39 +170,9 @@ def render(self, messages: list[Message], add_generation_prompt: bool, omit_bos: raise_exception=raise_exception, add_generation_prompt=add_generation_prompt, ) - sys.stderr.write(f'\n# RENDERED:\n\n{result}\n\n') + # sys.stderr.write(f'\n# RENDERED:\n\n{result}\n\n') return result -# While the API will be usable with a generic tools usage like OpenAI, -# (see https://cookbook.openai.com/examples/how_to_call_functions_with_chat_models), -# each model may need specific prompting (and/or constrained output, -# especially for models not fine-tuned for tool usage / function calling). -class ToolsPromptStyle(Enum): - # Short prompt w/ schemas, ... output - TOOLS_SHORT = 1 - - # Longer prompt w/ schemas, ... output - TOOLS_LONG = 2 - - # Bespoke constrained output format that favours thought and reasoning - # while allowing unambiguous parsing of parallel tool calling. - TOOLS_BESPOKE = 3 - - # Large prompt for https://huggingface.co/NousResearch/Hermes-2-Pro-Mistral-7B - # ... output - # Requires: - # - git clone https://github.com/NousResearch/Hermes-Function-Calling examples/openai/hermes_function_calling - # - Set large context length as their prompts are super long - TOOLS_HERMES_2_PRO = 4 - - # Seems to want to escape underscores in tool names and in the ... 
tags - TOOLS_MISTRAL = 5 - - # Short prompt w/ TypeScript definitions for https://github.com/MeetKai/functionary - # https://github.com/MeetKai/functionary/blob/main/functionary/prompt_template/prompt_template_v2.py - # Note: see this prior attempt to support Functionary: https://github.com/ggerganov/llama.cpp/pull/5695 - TYPESCRIPT_FUNCTIONARY_V2 = 6 - class ChatHandlerArgs(BaseModel): chat_template: ChatTemplate response_schema: Optional[dict] = None @@ -189,12 +199,14 @@ def __init__(self, args: ChatHandlerArgs): content=_please_respond_with_schema(args.response_schema) ) converter = SchemaConverter(prop_order={}, allow_fetch=False, dotall=False, raw_pattern=False) - self.grammar = converter.visit(args.response_schema, '') + schema = converter.resolve_refs(args.response_schema, 'response') + converter.visit(schema, '') + self.grammar = converter.format_grammar() else: self.output_format_prompt = None self.grammar = None - @typechecked + # @typechecked def parse(self, s: str) -> Optional[Message]: return Message(role="assistant", content=s) @@ -203,21 +215,24 @@ def __init__(self, args: ChatHandlerArgs, escapes_underscores: bool, allow_paral super().__init__(args) converter = SchemaConverter(prop_order={}, allow_fetch=False, dotall=False, raw_pattern=False) - tool_rules = [ - converter.visit( + tool_rules = [] + for tool in self.args.tools: + + parameters_schema = tool.function.parameters + parameters_schema = converter.resolve_refs(parameters_schema, tool.function.name) + + tool_rules.append(converter.visit( dict( type="object", properties=dict( name=dict(type="string", pattern='^' + tool.function.name.replace('_', f'\\?_') + '$') if escapes_underscores \ else dict(const=tool.function.name), - arguments=tool.function.parameters, + arguments=parameters_schema, ), required=['name', 'arguments'] ), f'{tool.function.name}-tool-call' - ) - for tool in self.args.tools - ] + )) def format_literal(s: str) -> str: if escapes_underscores: @@ -253,7 +268,7 @@ def format_literal(s: str) -> str: # ") " + converter._format_literal("
") + # ")") # + converter._format_literal(suffix)) - @typechecked + # @typechecked def parse(self, s: str) -> Optional[Message]: s = self.args.chat_template.strip_suffix(s) @@ -386,7 +401,7 @@ def __init__(self, args: ChatHandlerArgs, allow_parallel_calls: bool): # ") " + # ")") # + converter._format_literal(suffix)) - @typechecked + # @typechecked def parse(self, s: str) -> Optional[Message]: s = self.args.chat_template.strip_suffix(s) @@ -422,7 +437,7 @@ def _make_bespoke_schema(response_schema, tool_call_schema, allow_parallel_calls return { "type": "object", "properties": { - "original_goal": {"title": "Original Goal", "type": "string"}, + # "original_goal": {"title": "Original Goal", "type": "string"}, "thought_about_next_step_only": { "title": "Thought about next step", # "title": "Thought about how the next step brings us closer to achieving the original goal", @@ -455,6 +470,7 @@ def _make_bespoke_schema(response_schema, tool_call_schema, allow_parallel_calls }, }, "required": ["original_goal", "thought_about_next_step_only", "next_step"] + # "required": ["next_step"] } class BespokeToolsChatHandler(ChatHandler): @@ -513,7 +529,7 @@ def __init__(self, args: ChatHandlerArgs): ]) ) - @typechecked + # @typechecked def parse(self, s: str) -> Optional[Message]: s = self.args.chat_template.strip_suffix(s) try: @@ -527,7 +543,7 @@ def parse(self, s: str) -> Optional[Message]: elif 'tool_calls' in next_step: return Message( role="assistant", - content=data["thought_about_next_step_only"], + content=data["thought_about_next_step_only"] if "thought_about_next_step_only" in data else None, tool_calls=[ ToolCall(id=gen_callid(), function=FunctionCall(**tc)) for tc in next_step['tool_calls'] @@ -545,7 +561,8 @@ def parse(self, s: str) -> Optional[Message]: _LONG_TEMPLATE='\n'.join([ # '''You are a function calling AI model. You are provided with function signatures within XML tags.''', - 'You may call one or more functions to assist with the user query. Don\'t make assumptions about what values to plug into functions. Here are the available tools:', + # 'You may call one or more functions to assist with the user query. Don\'t make assumptions about what values to plug into functions. Here are the available tools:', + 'Call one or more functions to assist with the user query, every time this is possible. Don\'t make assumptions about what values to plug into functions. Here are the available tools:', '', '{tools}', '', @@ -564,7 +581,7 @@ def get_chat_handler(args: ChatHandlerArgs, allow_parallel_calls=False) -> ChatH if not args.tools: return NoToolsChatHandler(args) elif args.chat_template.tool_style == ToolsPromptStyle.TYPESCRIPT_FUNCTIONARY_V2: - return FunctionaryToolsChatHandler(args) + return FunctionaryToolsChatHandler(args, allow_parallel_calls=False) elif args.chat_template.tool_style == ToolsPromptStyle.TOOLS_SHORT: return TemplatedToolsChatHandler(args, _SHORT_TEMPLATE, allow_parallel_calls=allow_parallel_calls) elif args.chat_template.tool_style == ToolsPromptStyle.TOOLS_LONG: diff --git a/examples/openai/reactor.py b/examples/openai/reactor.py deleted file mode 100644 index 7aae066ebe15a..0000000000000 --- a/examples/openai/reactor.py +++ /dev/null @@ -1,344 +0,0 @@ -# Usage: -#! ./server -m some-model.gguf & -#! pip install pydantic -#! python examples/json-schema-pydantic-example.py -# -# TODO: -# - https://github.com/NousResearch/Hermes-Function-Calling -# -# <|im_start|>system -# You are a function calling AI model. 
You are provided with function signatures within XML tags -# You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools: -# {'type': 'function', 'function': {'name': 'get_stock_fundamentals', -# 'description': 'get_stock_fundamentals(symbol: str) -> dict - Get fundamental data for a given stock symbol using yfinance API.\n\n Args:\n symbol (str): The stock symbol.\n\n Returns:\n dict: A dictionary containing fundamental data.', 'parameters': {'type': 'object', 'properties': {'symbol': {'type': 'string'}}, 'required': ['symbol']}}} -# Use the following pydantic model json schema for each tool call you will make: {'title': 'FunctionCall', 'type': 'object', 'properties': {'arguments': {'title': 'Arguments', 'type': 'object'}, 'name': {'title': 'Name', 'type': 'string'}}, 'required': ['arguments', 'name']} For each function call return a json object with function name and arguments within XML tags as follows: -# -# {'arguments': , 'name': } -# <|im_end|> - -from dataclasses import dataclass -import subprocess -import sys -from pydantic import BaseModel, TypeAdapter -from annotated_types import MinLen -from typing import Annotated, Callable, List, Union, Literal, Optional, Type, get_args, get_origin -import json, requests - -from examples.openai.api import ToolCallsTypeAdapter - -def type_to_str(t): - origin = get_origin(t) - if origin is None: - return t.__name__ - args = get_args(t) - return origin.__name__ + ( - f'[{", ".join(type_to_str(a) for a in args)}]' if args else '' - ) - -def build_union_type_adapter(*types): - src = '\n'.join([ - 'from pydantic import TypeAdapter', - 'from typing import Union', - f'_out = TypeAdapter(Union[{", ".join(type_to_str(t) for t in types)}])', - ]) - globs = { - **globals(), - **{t.__name__: t for t in types}, - } - exec(src, globs) - return globs['_out'] - -class Thought(BaseModel): - thought: str - - -def build_tool_call_adapter2(final_output_type, *tools): - lines = [ - 'from pydantic import BaseModel, TypeAdapter', - 'from typing import Literal, Union', - ] - globs = { - **globals(), - **locals(), - final_output_type.__name__: final_output_type, - } - tool_calls = [] - for fn in tools: - # TODO: escape fn.__doc__ and fn.__doc__ to avoid comment or metadata injection! 
- fn_name = fn.__name__ - fn_doc = fn.__doc__.replace('"""', "'''") if fn.__doc__ else None - name = fn_name.replace('_', ' ').title().replace(' ', '') - lines += [ - f'class {name}ToolArgs(BaseModel):', - *(f' {k}: {type_to_str(v)}' for k, v in fn.__annotations__.items() if k != 'return'), - f'class {name}ToolCall(BaseModel):', - *([f' """{fn_doc}"""'] if fn_doc else []), - f' name: Literal["{fn_name}"]', - f' arguments: {name}ToolArgs', - f'class {name}Tool(BaseModel):', - # *([f' """{fn_doc}"""'] if fn_doc else []), - f' id: str', - f' type: Literal["function"]', - f' function: {name}ToolCall', - f' def __call__(self) -> {type_to_str(fn.__annotations__.get("return"))}:', - f' return {fn_name}(**self.function.arguments.dict())', - ] - tool_calls.append(f'{name}Tool') - - lines += [ - # 'class FinalResult(BaseModel):', - # f' result: {type_to_str(final_output_type)}', - # 'class Response(BaseModel):', - # f' """A response that starts with a thought about whether we need tools or not, the plan about tool usage (maybe a sequence of tool calls), and then either a final result (of type {final_output_type.__name__}) or a first tool call"""', - # f' original_goal: str', - # f' thought_process: str', - # # f' thought: str', - # f' next_step: Union[FinalResult, {", ".join(tool_calls)}]', - # f'response_adapter = TypeAdapter(Response)' - f'response_adapter = TypeAdapter(Union[{", ".join(tool_calls)}])', - ] - - exec('\n'.join(lines), globs) - return globs['response_adapter'] - -def create_completion2(*, response_model=None, max_tool_iterations=None, tools=[], endpoint="http://localhost:8080/v1/chat/completions", messages, **kwargs): - ''' - Creates a chat completion using an OpenAI-compatible endpoint w/ JSON schema support - (llama.cpp server, llama-cpp-python, Anyscale / Together...) - - The response_model param takes a type (+ supports Pydantic) and behaves just as w/ Instructor (see below) - ''' - if response_model: - type_adapter = TypeAdapter(response_model) - schema = type_adapter.json_schema() - # messages = [{ - # "role": "system", - # "content": f"Respond in JSON format with the following schema: {json.dumps(schema, indent=2)}" - # }] + messages - # print("Completion: ", json.dumps(messages, indent=2)) - # print("SCHEMA: " + json.dumps(schema, indent=2)) - response_format={"type": "json_object", "schema": schema } - - tool_call_adapter = build_tool_call_adapter2(response_model, *tools) - tool_adapters = [(fn, TypeAdapter(fn)) for fn in tools] - tools_schemas = [{ - "type": "function", - "function": { - "name": fn.__name__, - "description": fn.__doc__, - "parameters": ta.json_schema() - } - } for (fn, ta) in tool_adapters] - - # messages = [{ - # "role": "system", - # "content": '\n'.join([ - # # "You are a function calling AI model. You are provided with function signatures within XML tags.", - # # "You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. 
Here are the available tools:", - # # f'{json.dumps(tools_schemas)}', - # 'Before calling each tool, you think clearly and briefly about why and how you are using the tool.', - # f"Respond in JSON format with the following schema: {json.dumps(schema, indent=2)}" if schema else "", - # ]) - # }] + messages - - i = 0 - while (max_tool_iterations is None or i < max_tool_iterations): - body=dict( - messages=messages, - response_format=response_format, - tools=tools_schemas, - **kwargs - ) - # sys.stderr.write(f'# REQUEST: {json.dumps(body, indent=2)}\n') - response = requests.post( - endpoint, - headers={"Content-Type": "application/json"}, - json=body, - ) - if response.status_code != 200: - raise Exception(f"Request failed ({response.status_code}): {response.text}") - - # sys.stderr.write(f"\n# RESPONSE:\n\n<<<{response.text}>>>\n\n") - data = response.json() - if 'error' in data: - raise Exception(data['error']['message']) - - # sys.stderr.write(f"\n# RESPONSE DATA:\n\n{json.dumps(data, indent=2)}\n\n") - # print(json.dumps(data, indent=2)) - choice = data["choices"][0] - - content = choice["message"].get("content") - if choice.get("finish_reason") == "tool_calls": - # sys.stderr.write(f'\n# TOOL CALLS:\n{json.dumps(choice["message"]["tool_calls"], indent=2)}\n\n') - # tool_calls =ToolCallsTypeAdapter.validate_json(json.dumps(choice["tool_calls"])) - messages.append(choice["message"]) - for tool_call in choice["message"]["tool_calls"]: - # id = tool_call.get("id") - # if id: - # del tool_call["id"] - - if content: - print(f'💭 {content}') - - tc = tool_call_adapter.validate_json(json.dumps(tool_call)) - - pretty_call = f'{tc.function.name}({", ".join(f"{k}={v}" for k, v in tc.function.arguments.model_dump().items())})' - sys.stdout.write(f'⚙️ {pretty_call}') - result = tc() - sys.stdout.write(f" -> {result}\n") - messages.append({ - "tool_call_id": tc.id, - "role": "tool", - "name": tc.function.name, - # "content": f'{result}', - "content": f'{pretty_call} = {result}', - }) - else: - assert content - # print(content) - # print(json.dumps(json.loads(content), indent=2)) - result = type_adapter.validate_json(content) if type_adapter else content - # if isinstance(result, Thought): - # print(f'💭 {result.thought}') - # messages.append({ - # "role": "assistant", - # "content": json.dumps(result.model_dump(), indent=2), - # }) - # else: - return result - - i += 1 - - if max_tool_iterations is not None: - raise Exception(f"Failed to get a valid response after {max_tool_iterations} tool calls") - -if __name__ == '__main__': - - class QAPair(BaseModel): - question: str - concise_answer: str - justification: str - - class PyramidalSummary(BaseModel): - title: str - summary: str - question_answers: Annotated[List[QAPair], MinLen(2)] - sub_sections: Optional[Annotated[List['PyramidalSummary'], MinLen(2)]] - - # print("# Summary\n", create_completion( - # model="...", - # response_model=PyramidalSummary, - # messages=[{ - # "role": "user", - # "content": f""" - # You are a highly efficient corporate document summarizer. - # Create a pyramidal summary of an imaginary internal document about our company processes - # (starting high-level, going down to each sub sections). - # Keep questions short, and answers even shorter (trivia / quizz style). - # """ - # }])) - - import math - - def eval_python_expression(expr: str) -> float: - """ - Evaluate a Python expression reliably. - This can be used to compute complex nested mathematical expressions, or any python, really. 
- """ - print("# Evaluating expression: ", expr) - return "0.0" - - def add(a: float, b: float) -> float: - """ - Add a and b reliably. - Don't use this tool to compute the square of a number (use multiply or pow instead) - """ - return a + b - - # def say(something: str) -> str: - # """ - # Just says something. Used to say each thought out loud - # """ - # return subprocess.check_call(["say", something]) - - def multiply(a: float, b: float) -> float: - """Multiply a with b reliably""" - return a * b - - def divide(a: float, b: float) -> float: - """Divide a by b reliably""" - return a / b - - def pow(value: float, power: float) -> float: - """ - Raise a value to a power (exponent) reliably. - The square of x is pow(x, 2), its cube is pow(x, 3), etc. - """ - return math.pow(value, power) - - result = create_completion2( - model="...", - response_model=str, - tools=[add, multiply, divide, pow], #, say],#, eval_python_expression], - # tools=[eval_python_expression], - temperature=0.0, - # repetition_penalty=1.0, - n_predict=1000, - top_k=1, - top_p=0.0, - # logit_bias={ - # i: 10.0 - # for i in range(1, 259) - # }, - messages=[{ - # "role": "system", - # "content": f""" - # You are a reliable assistant. You think step by step and think before using tools - # """ - # }, { - "role": "user", - # "content": f""" - # What is 10 squared? - # """ - "content": f""" - What is the sum of 2535 squared and 32222000403 then multiplied by one and a half. What's a third of the result? - - Keep your goal in mind at every step. - """ - # Think step by step, start expressing the problem as an arithmetic expression - }]) - - # result = create_completion( - # model="...", - # response_model=float, - # tools=[add, multiply, divide, pow], #, say],#, eval_python_expression], - # temperature=0.0, - # # logit_bias={ - # # i: 10.0 - # # for i in range(1, 259) - # # }, - # messages=[{ - # "role": "user", - # # "content": f""" - # # What is 10 squared? - # # """ - # "content": f""" - # What is the sum of 2535 squared and 32222000403 then multiplied by one and a half. What's a third of the result? - # """ - # # Think step by step, start expressing the problem as an arithmetic expression - # }]) - - # 💭 First, I need to square the number 2535. For this, I will use the 'pow' tool. - # ⚙️ pow(args={'value': 2535.0, 'power': 2.0})-> 6426225.0 - # 💭 Now that I have the square of 2535, I need to add it to 32222000403.0 and store the result. - # ⚙️ add(args={'a': 6426225.0, 'b': 32222000403.0})-> 32228426628.0 - # 💭 Now that I have the sum of 2535 squared and 32222000403, I need to multiply it by 1.5. - # ⚙️ pow(args={'value': 32228426628.0, 'power': 1.5})-> 5785736571757004.0 - # 💭 Now that I have the result of the sum multiplied by 1.5, I need to divide it by 3 to get a third of the result. - # ⚙️ divide(args={'a': 5785736571757004.0, 'b': 3.0})-> 1928578857252334.8 - # 💭 I have now calculated a third of the result, which is 1928578857252334.8. I can now share this as the final answer. 
- # Result: 1928578857252334.8 - - expected_result = (2535 ** 2 + 32222000403) * 1.5 / 3.0 - print("➡️", result) - assert math.fabs(result - expected_result) < 0.0001, f"Expected {expected_result}, got {result}" diff --git a/examples/openai/server.py b/examples/openai/server.py index fbd2f22da46f1..d2a3aea2d1fab 100644 --- a/examples/openai/server.py +++ b/examples/openai/server.py @@ -21,39 +21,56 @@ from starlette.responses import StreamingResponse from typing import Annotated, Optional import typer -from typeguard import typechecked def generate_id(prefix): return f"{prefix}{random.randint(0, 1 << 32)}" def main( model: Annotated[Optional[Path], typer.Option("--model", "-m")] = "models/7B/ggml-model-f16.gguf", - # model: Path = Path("/Users/ochafik/AI/Models/Hermes-2-Pro-Mistral-7B.Q8_0.gguf"), + template_hf_model_id_fallback: Annotated[Optional[str], typer.Option(help="If the GGUF model does not contain a chat template, get it from this HuggingFace tokenizer")] = 'meta-llama/Llama-2-7b-chat-hf', # model_url: Annotated[Optional[str], typer.Option("--model-url", "-mu")] = None, host: str = "localhost", port: int = 8080, - cpp_server_endpoint: Optional[str] = None, - cpp_server_host: str = "localhost", - cpp_server_port: Optional[int] = 8081, + auth: Optional[str] = None, + verbose: bool = False, + context_length: Optional[int] = None, + endpoint: Optional[str] = None, + server_host: str = "localhost", + server_port: Optional[int] = 8081, ): import uvicorn - metadata = GGUFKeyValues(model) - context_length = metadata[Keys.LLM.CONTEXT_LENGTH] - chat_template = ChatTemplate.from_gguf(metadata) - # print(chat_template) + if endpoint: + sys.stderr.write(f"# WARNING: Unsure which model we're talking to, fetching its chat template from HuggingFace tokenizer of {template_hf_model_id_fallback}\n") + chat_template = ChatTemplate.from_huggingface(template_hf_model_id_fallback) + + else: + metadata = GGUFKeyValues(model) + + if not context_length: + context_length = metadata[Keys.LLM.CONTEXT_LENGTH] + + if Keys.Tokenizer.CHAT_TEMPLATE in metadata: + chat_template = ChatTemplate.from_gguf(metadata) + else: + sys.stderr.write(f"# WARNING: Model does not contain a chat template, fetching it from HuggingFace tokenizer of {template_hf_model_id_fallback}\n") + chat_template = ChatTemplate.from_huggingface(template_hf_model_id_fallback) - if not cpp_server_endpoint: - sys.stderr.write(f"# Starting C++ server with model {model} on {cpp_server_host}:{cpp_server_port}\n") + if verbose: + sys.stderr.write(f"# CHAT TEMPLATE:\n\n{chat_template}\n\n") + + if verbose: + sys.stderr.write(f"# Starting C++ server with model {model} on {server_host}:{server_port}\n") server_process = subprocess.Popen([ "./server", "-m", model, - "--host", cpp_server_host, "--port", f'{cpp_server_port}', + "--host", server_host, "--port", f'{server_port}', + # TODO: pass these from JSON / BaseSettings? 
'-ctk', 'q4_0', '-ctv', 'f16', - "-c", f"{2*8192}", - # "-c", f"{context_length}", + "-c", f"{context_length}", + *([] if verbose else ["--log-disable"]), ], stdout=sys.stderr) atexit.register(server_process.kill) - cpp_server_endpoint = f"http://{cpp_server_host}:{cpp_server_port}" + endpoint = f"http://{server_host}:{server_port}/completions" app = FastAPI() @@ -62,8 +79,8 @@ async def chat_completions(request: Request, chat_request: ChatCompletionRequest headers = { "Content-Type": "application/json", } - if (auth := request.headers.get("Authorization")): - headers["Authorization"] = auth + if (auth_value := request.headers.get("Authorization", auth)): + headers["Authorization"] = auth_value if chat_request.response_format is not None: assert chat_request.response_format.type == "json_object", f"Unsupported response format: {chat_request.response_format.type}" @@ -79,9 +96,12 @@ async def chat_completions(request: Request, chat_request: ChatCompletionRequest prompt = chat_template.render(messages, add_generation_prompt=True) - sys.stderr.write(f'\n# MESSAGES:\n\n{TypeAdapter(list[Message]).dump_json(messages)}\n\n') - sys.stderr.write(f'\n# PROMPT:\n\n{prompt}\n\n') - sys.stderr.write(f'\n# GRAMMAR:\n\n{chat_handler.grammar}\n\n') + + if verbose: + sys.stderr.write(f'\n# REQUEST:\n\n{chat_request.model_dump_json(indent=2)}\n\n') + # sys.stderr.write(f'\n# MESSAGES:\n\n{TypeAdapter(list[Message]).dump_json(messages)}\n\n') + sys.stderr.write(f'\n# PROMPT:\n\n{prompt}\n\n') + sys.stderr.write(f'\n# GRAMMAR:\n\n{chat_handler.grammar}\n\n') data = LlamaCppServerCompletionRequest( **{ @@ -101,7 +121,7 @@ async def chat_completions(request: Request, chat_request: ChatCompletionRequest async with httpx.AsyncClient() as client: response = await client.post( - f"{cpp_server_endpoint}/completions", + f"{endpoint}", json=data, headers=headers, timeout=None) @@ -112,7 +132,8 @@ async def chat_completions(request: Request, chat_request: ChatCompletionRequest return StreamingResponse(generate_chunks(response), media_type="text/event-stream") else: result = response.json() - sys.stderr.write("# RESULT:\n\n" + json.dumps(result, indent=2) + "\n\n") + if verbose: + sys.stderr.write("# RESULT:\n\n" + json.dumps(result, indent=2) + "\n\n") if 'content' not in result: # print(json.dumps(result, indent=2)) return JSONResponse(result) From b63f91ade43fc74f14fdd1f5300c2f5bf48b4e88 Mon Sep 17 00:00:00 2001 From: ochafik Date: Fri, 29 Mar 2024 16:19:05 +0000 Subject: [PATCH 14/68] Update agent.py --- examples/agent/agent.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/examples/agent/agent.py b/examples/agent/agent.py index b4feed43518d3..bbe8223e21b36 100644 --- a/examples/agent/agent.py +++ b/examples/agent/agent.py @@ -24,7 +24,7 @@ def _get_params_schema(fn: Callable, verbose): def completion_with_tool_usage( *, response_model: Optional[Union[Json, Type]]=None, - max_tool_iterations: Optional[int]=None, + max_iterations: Optional[int]=None, tools: List[Callable], endpoint: str, messages: List[Message], @@ -61,7 +61,7 @@ def completion_with_tool_usage( ] i = 0 - while (max_tool_iterations is None or i < max_tool_iterations): + while (max_iterations is None or i < max_iterations): request = ChatCompletionRequest( messages=messages, response_format=response_format, @@ -117,8 +117,8 @@ def completion_with_tool_usage( i += 1 - if max_tool_iterations is not None: - raise Exception(f"Failed to get a valid response after {max_tool_iterations} tool calls") + if max_iterations is not 
None: + raise Exception(f"Failed to get a valid response after {max_iterations} tool calls") def main( @@ -205,7 +205,7 @@ def main( model="...", endpoint=endpoint, response_model=response_model, - max_tool_iterations=max_tool_iterations, + max_iterations=max_iterations, tools=tool_functions, auth=auth, verbose=verbose, From c340e8cd3ba2859f32a5f1ffda8126f1193af29f Mon Sep 17 00:00:00 2001 From: ochafik Date: Fri, 29 Mar 2024 16:24:59 +0000 Subject: [PATCH 15/68] Update example_weather_tools.py --- examples/agent/tools/example_weather_tools.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/agent/tools/example_weather_tools.py b/examples/agent/tools/example_weather_tools.py index 0436ac1ab3d2f..0154d966ded63 100644 --- a/examples/agent/tools/example_weather_tools.py +++ b/examples/agent/tools/example_weather_tools.py @@ -3,7 +3,7 @@ from typing import Literal -def _weather(w: str, temp, f): +def _weather(w: str, temp, format): return f'{w}, {temp}C' if format == 'celsius' \ else f'{w}, {(temp * 9/5) + 32}F' From ce2fb0155f2952e736fe36bbdd943bd4cc6746bd Mon Sep 17 00:00:00 2001 From: ochafik Date: Fri, 29 Mar 2024 16:40:23 +0000 Subject: [PATCH 16/68] agent: add --allow_parallel_calls --- examples/agent/README.md | 2 ++ examples/agent/agent.py | 12 ++++++++++-- examples/openai/prompting.py | 16 +++++++++------- examples/openai/server.py | 11 ++++++++--- 4 files changed, 29 insertions(+), 12 deletions(-) diff --git a/examples/agent/README.md b/examples/agent/README.md index c774bfb31035e..3ae0fc7d10ba6 100644 --- a/examples/agent/README.md +++ b/examples/agent/README.md @@ -157,6 +157,8 @@ REQUIREMENTS_FILE=<( cat examples/agents/hermes_function_calling/requirements.tx ## TODO +- Wait for spawned servers to be heathly + - Add model URL / HF loading support - Add Embedding endpoint + storage / retrieval tools (Faiss? 
ScaNN?), or spontaneous RAG diff --git a/examples/agent/agent.py b/examples/agent/agent.py index bbe8223e21b36..78c1f82526c1d 100644 --- a/examples/agent/agent.py +++ b/examples/agent/agent.py @@ -128,6 +128,7 @@ def main( max_iterations: Optional[int] = 10, std_tools: Optional[bool] = False, auth: Optional[str] = None, + allow_parallel_calls: Optional[bool] = False, verbose: bool = False, model: Annotated[Optional[Path], typer.Option("--model", "-m")] = "models/7B/ggml-model-f16.gguf", @@ -135,6 +136,8 @@ def main( context_length: Optional[int] = None, # endpoint: str = 'http://localhost:8080/v1/chat/completions', + greedy: Optional[bool] = True, + n_predict: Optional[int] = 1000, top_k: Optional[int] = None, top_p: Optional[float] = None, @@ -157,6 +160,10 @@ def main( n_probs: Optional[int] = None, min_keep: Optional[int] = None, ): + if greedy: + top_k = 1 + top_p = 0.0 + if not endpoint: server_port = 8080 server_host = 'localhost' @@ -167,9 +174,10 @@ def main( "python", "-m", "examples.openai.server", "--model", model, *(['--verbose'] if verbose else []), - *([f'--context_length={context_length}'] if context_length else []), + *(['--allow-parallel-calls'] if allow_parallel_calls else []), + *(['--context-length={context_length}'] if context_length else []), + *([]) ] - print(cmd) server_process = subprocess.Popen(cmd, stdout=sys.stderr) atexit.register(server_process.kill) sleep(5) diff --git a/examples/openai/prompting.py b/examples/openai/prompting.py index a6d71e36fe0c4..3edea651da86c 100644 --- a/examples/openai/prompting.py +++ b/examples/openai/prompting.py @@ -320,8 +320,8 @@ def __init__(self, args: ChatHandlerArgs, template: str, escapes_underscores=Fal ) class Hermes2ProToolsChatHandler(ToolCallTagsChatHandler): - def __init__(self, args: ChatHandlerArgs): - super().__init__(args, escapes_underscores=False, allow_parallel_calls=False) + def __init__(self, args: ChatHandlerArgs, allow_parallel_calls: bool): + super().__init__(args, escapes_underscores=False, allow_parallel_calls=allow_parallel_calls) # Hackily import https://github.com/NousResearch/Hermes-Function-Calling path = str(Path(__file__).parent / "hermes_function_calling") @@ -433,7 +433,7 @@ def parse(self, s: str) -> Optional[Message]: content = '\n'.join(text_content).strip() return Message(role="assistant", content=content if content else None, tool_calls=tool_calls if tool_calls else None) -def _make_bespoke_schema(response_schema, tool_call_schema, allow_parallel_calls=False): +def _make_bespoke_schema(response_schema, tool_call_schema, allow_parallel_calls): return { "type": "object", "properties": { @@ -474,7 +474,7 @@ def _make_bespoke_schema(response_schema, tool_call_schema, allow_parallel_calls } class BespokeToolsChatHandler(ChatHandler): - def __init__(self, args: ChatHandlerArgs): + def __init__(self, args: ChatHandlerArgs, allow_parallel_calls: bool): super().__init__(args) # args.response_schema = args.response_schema or {} @@ -496,7 +496,8 @@ def __init__(self, args: ChatHandlerArgs): } for tool in self.args.tools ] - } + }, + allow_parallel_calls=allow_parallel_calls, ), '', ) @@ -523,7 +524,8 @@ def __init__(self, args: ChatHandlerArgs): } }, "required": ["name", "arguments"] - } + }, + allow_parallel_calls=allow_parallel_calls, ) ), ]) @@ -589,7 +591,7 @@ def get_chat_handler(args: ChatHandlerArgs, allow_parallel_calls=False) -> ChatH elif args.chat_template.tool_style == ToolsPromptStyle.TOOLS_MISTRAL: return TemplatedToolsChatHandler(args, _LONG_TEMPLATE, escapes_underscores=True, 
allow_parallel_calls=allow_parallel_calls) elif args.chat_template.tool_style == ToolsPromptStyle.TOOLS_BESPOKE: - return BespokeToolsChatHandler(args) + return BespokeToolsChatHandler(args, allow_parallel_calls=allow_parallel_calls) elif args.chat_template.tool_style == ToolsPromptStyle.TOOLS_HERMES_2_PRO: return Hermes2ProToolsChatHandler(args) else: diff --git a/examples/openai/server.py b/examples/openai/server.py index d2a3aea2d1fab..96279632459e5 100644 --- a/examples/openai/server.py +++ b/examples/openai/server.py @@ -31,6 +31,7 @@ def main( # model_url: Annotated[Optional[str], typer.Option("--model-url", "-mu")] = None, host: str = "localhost", port: int = 8080, + allow_parallel_calls: Optional[bool] = False, auth: Optional[str] = None, verbose: bool = False, context_length: Optional[int] = None, @@ -61,14 +62,15 @@ def main( if verbose: sys.stderr.write(f"# Starting C++ server with model {model} on {server_host}:{server_port}\n") - server_process = subprocess.Popen([ + cmd = [ "./server", "-m", model, "--host", server_host, "--port", f'{server_port}', # TODO: pass these from JSON / BaseSettings? '-ctk', 'q4_0', '-ctv', 'f16', "-c", f"{context_length}", *([] if verbose else ["--log-disable"]), - ], stdout=sys.stderr) + ] + server_process = subprocess.Popen(cmd, stdout=sys.stderr) atexit.register(server_process.kill) endpoint = f"http://{server_host}:{server_port}/completions" @@ -88,7 +90,10 @@ async def chat_completions(request: Request, chat_request: ChatCompletionRequest else: response_schema = None - chat_handler = get_chat_handler(ChatHandlerArgs(chat_template=chat_template, response_schema=response_schema, tools=chat_request.tools)) + chat_handler = get_chat_handler( + ChatHandlerArgs(chat_template=chat_template, response_schema=response_schema, tools=chat_request.tools), + allow_parallel_calls=allow_parallel_calls + ) messages = chat_request.messages if chat_handler.output_format_prompt: From ea34bd3e5c42f7a71cb2530e52ae8a2736cef4b8 Mon Sep 17 00:00:00 2001 From: ochafik Date: Fri, 29 Mar 2024 17:00:53 +0000 Subject: [PATCH 17/68] agent/openai:nits --- examples/agent/README.md | 2 +- examples/agent/agent.py | 12 +-- examples/agent/fastify.py | 2 +- examples/agent/run_sandboxed_tools.sh | 2 +- examples/agent/tools/std_tools.py | 14 +-- examples/openai/README.md | 6 +- examples/openai/llama_cpp_server_api.py | 26 +---- examples/openai/prompting.py | 133 +++++++----------------- examples/openai/server.py | 18 ++-- examples/openai/ts_converter.py | 2 +- 10 files changed, 72 insertions(+), 145 deletions(-) diff --git a/examples/agent/README.md b/examples/agent/README.md index 3ae0fc7d10ba6..045d237194325 100644 --- a/examples/agent/README.md +++ b/examples/agent/README.md @@ -76,7 +76,7 @@ This example relies on the new [OpenAI compatibility server](../openai). agent.py → examples.openai → server.cpp → safe_tools.py → ( run_sandboxed_tools.sh : Docker → fastify.py ) → unsafe_tools.py → code interpreter, etc... -``` +``` The agent can use tools written in Python, or (soon) exposed under OpenAPI endpoints. Only has standard Python deps (e.g. 
no langchain) diff --git a/examples/agent/agent.py b/examples/agent/agent.py index 78c1f82526c1d..ed428af6801ad 100644 --- a/examples/agent/agent.py +++ b/examples/agent/agent.py @@ -128,7 +128,7 @@ def main( max_iterations: Optional[int] = 10, std_tools: Optional[bool] = False, auth: Optional[str] = None, - allow_parallel_calls: Optional[bool] = False, + parallel_calls: Optional[bool] = True, verbose: bool = False, model: Annotated[Optional[Path], typer.Option("--model", "-m")] = "models/7B/ggml-model-f16.gguf", @@ -174,14 +174,14 @@ def main( "python", "-m", "examples.openai.server", "--model", model, *(['--verbose'] if verbose else []), - *(['--allow-parallel-calls'] if allow_parallel_calls else []), + *(['--parallel-calls'] if parallel_calls else []), *(['--context-length={context_length}'] if context_length else []), *([]) ] server_process = subprocess.Popen(cmd, stdout=sys.stderr) atexit.register(server_process.kill) sleep(5) - + tool_functions = [] types = {} for f in tools: @@ -195,7 +195,7 @@ def main( if std_tools: tool_functions.extend(collect_functions(StandardTools)) - + response_model = None#str if format: if format in types: @@ -207,8 +207,8 @@ def main( response_model = json.loads(format) except: response_model = eval(format) - - + + result = completion_with_tool_usage( model="...", endpoint=endpoint, diff --git a/examples/agent/fastify.py b/examples/agent/fastify.py index 18186e83067e2..ccffe9d84a4b9 100644 --- a/examples/agent/fastify.py +++ b/examples/agent/fastify.py @@ -41,4 +41,4 @@ def main(files: List[str], host: str = '0.0.0.0', port: int = 8000): if __name__ == '__main__': typer.run(main) - + diff --git a/examples/agent/run_sandboxed_tools.sh b/examples/agent/run_sandboxed_tools.sh index 40f873d2ff34a..8eddb5d92ef8a 100755 --- a/examples/agent/run_sandboxed_tools.sh +++ b/examples/agent/run_sandboxed_tools.sh @@ -11,7 +11,7 @@ script="$( realpath "$1" )" script_folder="$(dirname "$script")" shift 1 -function cleanup { +function cleanup { rm -rf "$BUILD_DIR" echo "Deleted $BUILD_DIR" } diff --git a/examples/agent/tools/std_tools.py b/examples/agent/tools/std_tools.py index 39ce40eca2985..9093e8dc2cf4c 100644 --- a/examples/agent/tools/std_tools.py +++ b/examples/agent/tools/std_tools.py @@ -1,15 +1,11 @@ -import atexit from datetime import date import datetime +from pydantic import BaseModel import subprocess import sys -from time import sleep import time import typer -from pydantic import BaseModel, Json, TypeAdapter -from annotated_types import MinLen -from typing import Annotated, Callable, List, Union, Literal, Optional, Type, get_args, get_origin -import json, requests +from typing import Union, Optional class Duration(BaseModel): seconds: Optional[int] = None @@ -50,7 +46,7 @@ def __call__(self): sys.stderr.write(f"Waiting for {days} days and {seconds} seconds until {d}...\n") time.sleep(days * 86400 + seconds) sys.stderr.write(f"Reached the target date: {self.until}\n") - + class StandardTools: @@ -61,7 +57,7 @@ def ask_user(question: str) -> str: This allows getting additional information, requesting disambiguation, etc. ''' return typer.prompt(question) - + @staticmethod def wait(_for: Union[WaitForDuration, WaitForDate]) -> None: ''' @@ -69,7 +65,7 @@ def wait(_for: Union[WaitForDuration, WaitForDate]) -> None: This can be used to wait for a specific duration or until a specific date. 
''' return _for() - + @staticmethod def say_out_loud(something: str) -> str: """ diff --git a/examples/openai/README.md b/examples/openai/README.md index 1efdd3d6ffd9c..e9a8658c26659 100644 --- a/examples/openai/README.md +++ b/examples/openai/README.md @@ -34,7 +34,7 @@ The new [examples/openai/server.py](./server.py): } // Where T is the output JSON schema, or 'any' ``` - + - Option to publicise schemas to models as TypeScript signatures (as for Functionary) or JSON schema. - Supports models that require user/assistant alternance (like Mixtral Instruct) by merging system messages into user messages. @@ -175,7 +175,7 @@ curl http://localhost:8080/v1/chat/completions \ - Evaluate options for session caching - Pass session id & store / read from file? - + - Support parent session ids for trees of thought? - Support precaching long prompts from CLI / read session files? @@ -186,4 +186,4 @@ curl http://localhost:8080/v1/chat/completions \ - Remove non-Python json-schema-to-grammar versions - - Reach out to frameworks to advertise new option. + - Reach out to frameworks to advertise new option. diff --git a/examples/openai/llama_cpp_server_api.py b/examples/openai/llama_cpp_server_api.py index 93690072826d2..d7cd08c4446d0 100644 --- a/examples/openai/llama_cpp_server_api.py +++ b/examples/openai/llama_cpp_server_api.py @@ -1,28 +1,12 @@ from typing import Optional -from pydantic import BaseModel, Json +from pydantic import Json -class LlamaCppServerCompletionRequest(BaseModel): +from examples.openai.api import LlamaCppParams + +class LlamaCppServerCompletionRequest(LlamaCppParams): prompt: str stream: Optional[bool] = None cache_prompt: Optional[bool] = None - n_predict: Optional[int] = None - top_k: Optional[int] = None - top_p: Optional[float] = None - min_p: Optional[float] = None - tfs_z: Optional[float] = None - typical_p: Optional[float] = None - temperature: Optional[float] = None - dynatemp_range: Optional[float] = None - dynatemp_exponent: Optional[float] = None - repeat_last_n: Optional[int] = None - repeat_penalty: Optional[float] = None - frequency_penalty: Optional[float] = None - presence_penalty: Optional[float] = None - mirostat: Optional[bool] = None - mirostat_tau: Optional[float] = None - mirostat_eta: Optional[float] = None - penalize_nl: Optional[bool] = None - n_keep: Optional[int] = None - seed: Optional[int] = None + grammar: Optional[str] = None json_schema: Optional[Json] = None \ No newline at end of file diff --git a/examples/openai/prompting.py b/examples/openai/prompting.py index 3edea651da86c..6a8fe46c7efee 100644 --- a/examples/openai/prompting.py +++ b/examples/openai/prompting.py @@ -1,15 +1,13 @@ from abc import ABC, abstractmethod from enum import Enum -from functools import wraps import jinja2 import json from pathlib import Path import random import re import sys -from typing import Any, Dict, Literal, Optional, Tuple, Callable, Union +from typing import Optional from pydantic import BaseModel -# from typeguard import typechecked from examples.json_schema_to_grammar import SchemaConverter from examples.openai.api import Tool, Message, FunctionCall, ToolCall @@ -55,7 +53,7 @@ class ChatTemplate(BaseModel): @property def tool_style(self) -> 'ToolsPromptStyle': return self._tool_style - + def __init__(self, template: str, eos_token: str, bos_token: str): super().__init__(template=template ) @@ -75,7 +73,7 @@ def __init__(self, template: str, eos_token: str, bos_token: str): # self._tool_style = ToolsPromptStyle.TOOLS_MISTRAL # TODO: Test whether the template 
supports formatting tool_calls - + delimiter = '<%$[SAMPLE]$%>' user_msg = Message(role="user", content="Hey") empty_prompt = self.render([user_msg], add_generation_prompt=True).strip() @@ -112,7 +110,7 @@ def add_system_prompt(self, messages: list[Message], system_prompt: Message) -> def from_gguf(metadata: GGUFKeyValues): if Keys.Tokenizer.CHAT_TEMPLATE not in metadata: raise NotImplementedError(f'Only supporting models with {Keys.Tokenizer.CHAT_TEMPLATE} entry in their GGUF key-values (TODO: add default template, maybe pick llama2\'s?)') - + tokens = metadata[Keys.Tokenizer.LIST] return ChatTemplate( template = metadata[Keys.Tokenizer.CHAT_TEMPLATE], @@ -129,8 +127,6 @@ def from_huggingface(model_id: str): eos_token = tokenizer.eos_token) def render(self, messages: list[Message], add_generation_prompt: bool, omit_bos: bool = False): - # sys.stderr.write(f'# strict_user_assistant_alternation={self._strict_user_assistant_alternation}\n') - # sys.stderr.write(f'# messages=' + "\n".join(json.dumps(m.model_dump(), indent=2) for m in messages) + '\n') if self._strict_user_assistant_alternation and any(m.role not in ('user', 'assistant') for m in messages): new_messages=[] i = 0 @@ -161,8 +157,7 @@ def render(self, messages: list[Message], add_generation_prompt: bool, omit_bos: i += 1 # print(f'new_messages={json.dumps(new_messages, indent=2)}') messages = new_messages - # print(f'messages={messages}') - + result = self._template.render( messages=messages, eos_token=self._eos_token, @@ -170,7 +165,6 @@ def render(self, messages: list[Message], add_generation_prompt: bool, omit_bos: raise_exception=raise_exception, add_generation_prompt=add_generation_prompt, ) - # sys.stderr.write(f'\n# RENDERED:\n\n{result}\n\n') return result class ChatHandlerArgs(BaseModel): @@ -192,7 +186,7 @@ class NoToolsChatHandler(ChatHandler): def __init__(self, args: ChatHandlerArgs): super().__init__(args) assert not args.tools - + if args.response_schema: self.output_format_prompt = Message( role="system", @@ -206,21 +200,20 @@ def __init__(self, args: ChatHandlerArgs): self.output_format_prompt = None self.grammar = None - # @typechecked def parse(self, s: str) -> Optional[Message]: return Message(role="assistant", content=s) class ToolCallTagsChatHandler(ChatHandler): - def __init__(self, args: ChatHandlerArgs, escapes_underscores: bool, allow_parallel_calls: bool): + def __init__(self, args: ChatHandlerArgs, escapes_underscores: bool, parallel_calls: bool): super().__init__(args) converter = SchemaConverter(prop_order={}, allow_fetch=False, dotall=False, raw_pattern=False) tool_rules = [] for tool in self.args.tools: - + parameters_schema = tool.function.parameters parameters_schema = converter.resolve_refs(parameters_schema, tool.function.name) - + tool_rules.append(converter.visit( dict( type="object", @@ -245,7 +238,7 @@ def format_literal(s: str) -> str: format_literal("") + " space (" + ' | '.join(tool_rules) + ") space " + format_literal(""))# + ' space') - + # Ideally we'd want a negative lookahead of //, but it's just too hard to express in GBNF for now. # So we just over-constrain the content rule to not contain literals dangerously getting close to content_rule = converter._add_rule('content', '[^<] | "<" [^t<] | " str: converter._add_rule( 'root', # tool_call_rule) - f'{content_rule}* ({tool_call_rule}+ {content_rule}*)?' if allow_parallel_calls \ + f'{content_rule}* ({tool_call_rule}+ {content_rule}*)?' 
if parallel_calls \ else f'{content_rule}* {tool_call_rule}?') self.grammar = converter.format_grammar() - - # # Constrain the output to be a non-tool-call message (constrained to a JSON schema or not) - # # OR a tool-call message respecting the schema of any of the tools - # converter._add_rule( - # "root", - # converter._format_literal(prefix) + " (" + - # (response_rule or converter.not_literal("")) + " | " + - # converter._format_literal("") + " (" + - # ' | '.join(tool_rules) + - # ") " + converter._format_literal("") + - # ")") # + converter._format_literal(suffix)) - - # @typechecked + def parse(self, s: str) -> Optional[Message]: s = self.args.chat_template.strip_suffix(s) @@ -294,21 +275,14 @@ def parse(self, s: str) -> Optional[Message]: ToolCall( id=gen_callid(), function=FunctionCall(**fc))) - + content = '\n'.join(content).strip() return Message(role="assistant", content=content if content else None, tool_calls=tool_calls) - - # if ''.startswith(ls) or ls.startswith(''): - # if ls.startswith('') and ls.endswith('' + suffix): - # tool_call = ls[len(''):-len('' + suffix)] - # return Message(role="assistant", content=None, tool_calls=[json.loads(tool_call)]) - # return None - # else: - # return Message(role="assistant", content=s) + class TemplatedToolsChatHandler(ToolCallTagsChatHandler): - def __init__(self, args: ChatHandlerArgs, template: str, escapes_underscores=False, allow_parallel_calls=True): - super().__init__(args, escapes_underscores=escapes_underscores, allow_parallel_calls=allow_parallel_calls) + def __init__(self, args: ChatHandlerArgs, template: str, parallel_calls: bool, escapes_underscores: bool = False): + super().__init__(args, escapes_underscores=escapes_underscores, parallel_calls=parallel_calls) assert '{tools}' in template, 'Template must contain "{tools}"' self.output_format_prompt = Message( @@ -320,8 +294,8 @@ def __init__(self, args: ChatHandlerArgs, template: str, escapes_underscores=Fal ) class Hermes2ProToolsChatHandler(ToolCallTagsChatHandler): - def __init__(self, args: ChatHandlerArgs, allow_parallel_calls: bool): - super().__init__(args, escapes_underscores=False, allow_parallel_calls=allow_parallel_calls) + def __init__(self, args: ChatHandlerArgs, parallel_calls: bool): + super().__init__(args, escapes_underscores=False, parallel_calls=parallel_calls) # Hackily import https://github.com/NousResearch/Hermes-Function-Calling path = str(Path(__file__).parent / "hermes_function_calling") @@ -330,15 +304,15 @@ def __init__(self, args: ChatHandlerArgs, allow_parallel_calls: bool): from examples.openai.hermes_function_calling.prompter import PromptManager except ImportError: raise ImportError(f"Please `git clone https://github.com/NousResearch/Hermes-Function-Calling {path}`") - - prompt = PromptManager().generate_prompt(user_prompt=[], tools=[json.dumps(tool) for tool in tools]) + + prompt = PromptManager().generate_prompt(user_prompt=[], tools=[json.dumps(tool) for tool in args.tools]) assert len(prompt) == 1 and prompt[0]["role"] == "system" self.output_format_prompt = Message(**prompt[0]) class FunctionaryToolsChatHandler(ChatHandler): - def __init__(self, args: ChatHandlerArgs, allow_parallel_calls: bool): + def __init__(self, args: ChatHandlerArgs, parallel_calls: bool): super().__init__(args) - + # Only allowing a single tool call at a time for now. 
# Note that if there were more, they'd be separated by a '<|from|>assistant' literal @@ -347,7 +321,7 @@ def __init__(self, args: ChatHandlerArgs, allow_parallel_calls: bool): content= '// Supported function definitions that should be called when necessary.\n' + _tools_typescript_signatures(args.tools) ) - + converter = SchemaConverter(prop_order={}, allow_fetch=False, dotall=False, raw_pattern=False) tool_rules = [ converter._add_rule( @@ -355,17 +329,6 @@ def __init__(self, args: ChatHandlerArgs, allow_parallel_calls: bool): converter._format_literal(tool.function.name) + ' ' + converter._format_literal('\n<|content|>\n') + ' ' + converter.visit(tool.function.parameters, tool.function.name + '-args') + ' ' + converter._format_literal('\n')) - # converter.visit( - # dict( - # type="object", - # properties=dict( - # name=dict(const=tool.function.name), - # arguments=tool.function.parameters, - # ), - # required=['name', 'arguments'] - # ), - # f'{tool.function.name}-tool-call' - # ) for i, tool in enumerate(self.args.tools) ] @@ -378,33 +341,18 @@ def __init__(self, args: ChatHandlerArgs, allow_parallel_calls: bool): tool_call_without_start_rule = converter._add_rule( 'tool_call_without_start', ' | '.join(tool_rules)) - # + ' ' + - # converter.not_literal("all", dotall=False) + ' ' + converter._format_literal('\n<|content|>\n') + ' ' + not_from_rule + '*') tool_call_rule = converter._add_rule('tool_call', f'{start_rule} {tool_call_without_start_rule}') - # converter._add_rule('root', f'({content_without_start_rule} ({content_rule})* ({tool_call_rule}+ {content_rule}*)? | {tool_call_without_start_rule} (* {tool_call_rule}{content_rule}*') converter._add_rule( 'root', f'{content_without_start_rule} {content_rule}* ({tool_call_rule}+ {content_rule}*)? | ' - f'{tool_call_without_start_rule} {tool_call_rule}* {content_rule}*' if allow_parallel_calls \ + f'{tool_call_without_start_rule} {tool_call_rule}* {content_rule}*' if parallel_calls \ else f'{content_without_start_rule} {tool_call_rule}? 
| {tool_call_without_start_rule}') self.grammar = converter.format_grammar() - # converter._add_rule( - # "root", - # converter._format_literal(prefix) + " (" + - # (response_rule or converter.not_literal("<|recipient|>")) + " | " + - # (' | '.join( - # converter._format_literal(f"<|recipient|>{tool.function.name}\n<|content|>") + " " + - # converter.visit(tool.function.parameters, tool.function.name + '-args') - # for tool in tools - # )) + - # ") " + - # ")") # + converter._format_literal(suffix)) - - # @typechecked + def parse(self, s: str) -> Optional[Message]: s = self.args.chat_template.strip_suffix(s) - + parts = _recipient_content_re.split(s) if len(parts) == 1: return Message(role="assistant", content=s) @@ -426,14 +374,14 @@ def parse(self, s: str) -> Optional[Message]: ToolCall( id=gen_callid(), function=FunctionCall(name=recipient, arguments=arguments))) - - + + assert parts[-1].strip() in ('', '<|stop|>'), f'Unexpected content after tool calls: {parts[-1]}\nFull string: {s}' content = '\n'.join(text_content).strip() return Message(role="assistant", content=content if content else None, tool_calls=tool_calls if tool_calls else None) -def _make_bespoke_schema(response_schema, tool_call_schema, allow_parallel_calls): +def _make_bespoke_schema(response_schema, tool_call_schema, parallel_calls): return { "type": "object", "properties": { @@ -453,7 +401,7 @@ def _make_bespoke_schema(response_schema, tool_call_schema, allow_parallel_calls # "const": "tool_calls" # }, "tool_calls": { - "prefixItems": tool_call_schema if allow_parallel_calls \ + "prefixItems": tool_call_schema if parallel_calls \ else [tool_call_schema], } }, @@ -474,9 +422,9 @@ def _make_bespoke_schema(response_schema, tool_call_schema, allow_parallel_calls } class BespokeToolsChatHandler(ChatHandler): - def __init__(self, args: ChatHandlerArgs, allow_parallel_calls: bool): + def __init__(self, args: ChatHandlerArgs, parallel_calls: bool): super().__init__(args) - + # args.response_schema = args.response_schema or {} converter = SchemaConverter(prop_order={}, allow_fetch=False, dotall=False, raw_pattern=False) @@ -497,7 +445,7 @@ def __init__(self, args: ChatHandlerArgs, allow_parallel_calls: bool): for tool in self.args.tools ] }, - allow_parallel_calls=allow_parallel_calls, + parallel_calls=parallel_calls, ), '', ) @@ -525,13 +473,12 @@ def __init__(self, args: ChatHandlerArgs, allow_parallel_calls: bool): }, "required": ["name", "arguments"] }, - allow_parallel_calls=allow_parallel_calls, + parallel_calls=parallel_calls, ) ), ]) ) - # @typechecked def parse(self, s: str) -> Optional[Message]: s = self.args.chat_template.strip_suffix(s) try: @@ -579,19 +526,19 @@ def parse(self, s: str) -> Optional[Message]: # 'This is not hypothetical, you're not asked what you would do. 
If you need a tool called, just call it with ....''', ]) -def get_chat_handler(args: ChatHandlerArgs, allow_parallel_calls=False) -> ChatHandler: +def get_chat_handler(args: ChatHandlerArgs, parallel_calls: bool) -> ChatHandler: if not args.tools: return NoToolsChatHandler(args) elif args.chat_template.tool_style == ToolsPromptStyle.TYPESCRIPT_FUNCTIONARY_V2: - return FunctionaryToolsChatHandler(args, allow_parallel_calls=False) + return FunctionaryToolsChatHandler(args, parallel_calls=parallel_calls) elif args.chat_template.tool_style == ToolsPromptStyle.TOOLS_SHORT: - return TemplatedToolsChatHandler(args, _SHORT_TEMPLATE, allow_parallel_calls=allow_parallel_calls) + return TemplatedToolsChatHandler(args, _SHORT_TEMPLATE, parallel_calls=parallel_calls) elif args.chat_template.tool_style == ToolsPromptStyle.TOOLS_LONG: - return TemplatedToolsChatHandler(args, _LONG_TEMPLATE, allow_parallel_calls=allow_parallel_calls) + return TemplatedToolsChatHandler(args, _LONG_TEMPLATE, parallel_calls=parallel_calls) elif args.chat_template.tool_style == ToolsPromptStyle.TOOLS_MISTRAL: - return TemplatedToolsChatHandler(args, _LONG_TEMPLATE, escapes_underscores=True, allow_parallel_calls=allow_parallel_calls) + return TemplatedToolsChatHandler(args, _LONG_TEMPLATE, parallel_calls=parallel_calls, escapes_underscores=True) elif args.chat_template.tool_style == ToolsPromptStyle.TOOLS_BESPOKE: - return BespokeToolsChatHandler(args, allow_parallel_calls=allow_parallel_calls) + return BespokeToolsChatHandler(args, parallel_calls=parallel_calls) elif args.chat_template.tool_style == ToolsPromptStyle.TOOLS_HERMES_2_PRO: return Hermes2ProToolsChatHandler(args) else: diff --git a/examples/openai/server.py b/examples/openai/server.py index 96279632459e5..ccf8fddce8306 100644 --- a/examples/openai/server.py +++ b/examples/openai/server.py @@ -31,7 +31,7 @@ def main( # model_url: Annotated[Optional[str], typer.Option("--model-url", "-mu")] = None, host: str = "localhost", port: int = 8080, - allow_parallel_calls: Optional[bool] = False, + parallel_calls: Optional[bool] = True, auth: Optional[str] = None, verbose: bool = False, context_length: Optional[int] = None, @@ -44,13 +44,13 @@ def main( if endpoint: sys.stderr.write(f"# WARNING: Unsure which model we're talking to, fetching its chat template from HuggingFace tokenizer of {template_hf_model_id_fallback}\n") chat_template = ChatTemplate.from_huggingface(template_hf_model_id_fallback) - + else: metadata = GGUFKeyValues(model) if not context_length: context_length = metadata[Keys.LLM.CONTEXT_LENGTH] - + if Keys.Tokenizer.CHAT_TEMPLATE in metadata: chat_template = ChatTemplate.from_gguf(metadata) else: @@ -92,22 +92,22 @@ async def chat_completions(request: Request, chat_request: ChatCompletionRequest chat_handler = get_chat_handler( ChatHandlerArgs(chat_template=chat_template, response_schema=response_schema, tools=chat_request.tools), - allow_parallel_calls=allow_parallel_calls + parallel_calls=parallel_calls ) - + messages = chat_request.messages if chat_handler.output_format_prompt: messages = chat_template.add_system_prompt(messages, chat_handler.output_format_prompt) prompt = chat_template.render(messages, add_generation_prompt=True) - - + + if verbose: sys.stderr.write(f'\n# REQUEST:\n\n{chat_request.model_dump_json(indent=2)}\n\n') # sys.stderr.write(f'\n# MESSAGES:\n\n{TypeAdapter(list[Message]).dump_json(messages)}\n\n') sys.stderr.write(f'\n# PROMPT:\n\n{prompt}\n\n') sys.stderr.write(f'\n# GRAMMAR:\n\n{chat_handler.grammar}\n\n') - + data = 
LlamaCppServerCompletionRequest( **{ k: v @@ -130,7 +130,7 @@ async def chat_completions(request: Request, chat_request: ChatCompletionRequest json=data, headers=headers, timeout=None) - + if chat_request.stream: # TODO: Remove suffix from streamed response using partial parser. assert not chat_request.tools and not chat_request.response_format, "Streaming not supported yet with tools or response_format" diff --git a/examples/openai/ts_converter.py b/examples/openai/ts_converter.py index e29e83507fef5..108c1482e19c7 100644 --- a/examples/openai/ts_converter.py +++ b/examples/openai/ts_converter.py @@ -31,7 +31,7 @@ def _build_object_rule(self, properties: List[Tuple[str, Any]], required: Set[st [f"{self._desc_comment(additional_properties) if additional_properties else ''}[key: string]: {self.visit(additional_properties)}"] if additional_properties is not None else [] )) + "}" - + def visit(self, schema: dict): def print_constant(v): return json.dumps(v) From 80c793047b14ed226b7163ff45720cddac484159 Mon Sep 17 00:00:00 2001 From: ochafik Date: Fri, 29 Mar 2024 17:01:20 +0000 Subject: [PATCH 18/68] openai: fix message merging for mixtral (parallel calls) --- examples/openai/prompting.py | 57 +++++++++++++++++++++--------------- 1 file changed, 33 insertions(+), 24 deletions(-) diff --git a/examples/openai/prompting.py b/examples/openai/prompting.py index 6a8fe46c7efee..c431a7987a6a7 100644 --- a/examples/openai/prompting.py +++ b/examples/openai/prompting.py @@ -131,31 +131,40 @@ def render(self, messages: list[Message], add_generation_prompt: bool, omit_bos: new_messages=[] i = 0 n = len(messages) - while i < n: - if messages[i].role == 'system': - assert messages[i+1].role == 'user' - new_messages.append(Message( - role="user", - content=f'[SYS]{messages[i].content}[/SYS]\n{messages[i+1].content}' - )) - i += 2 - elif messages[i].role == 'assistant' and messages[i].tool_calls and messages[i].content: - tc = '\n'.join(f'{json.dumps(tc.model_dump())}' for tc in messages[i].tool_calls) - new_messages.append(Message( - role="assistant", - content=f'{messages[i].content}\n{tc}' - )) - i += 1 - elif messages[i].role == 'tool': - new_messages.append(Message( - role="user", - content=f'TOOL RESULT(name={messages[i].name}, id={messages[i].tool_call_id}): {messages[i].content}', - )) - i += 1 + current_role = 'user' + current_content = [] + + def flush(): + nonlocal current_content + nonlocal current_role + new_messages.append(Message( + role=current_role, + content='\n'.join(current_content) + )) + current_content = [] + + for i, message in enumerate(messages): + if message.role == current_role: + current_content.append(message.content) + elif message.role in ('user', 'assistant'): + flush() + current_role = 'assistant' if current_role == 'user' else 'user' + current_content.append(message.content) else: - new_messages.append(messages[i]) - i += 1 - # print(f'new_messages={json.dumps(new_messages, indent=2)}') + if current_role == 'assistant': + flush() + current_role = 'user' + if message.role == 'system': + current_content.append(f'[SYS]{messages[i].content}[/SYS]') + elif message.role == 'tool': + current_content.append(f'[TOOL RESULT(name={messages[i].name}, id={messages[i].tool_call_id}]{messages[i].content}[/TOOL RESULT]') + else: + sys.stderr.write(f'Unexpected message role: {message.role}\n') + current_content.append(f'[ROLE={messages[i].role}]{messages[i].content}[/ROLE]') + + if current_content: + flush() + messages = new_messages result = self._template.render( From 
9ab493f67e58ac19887424aa5f3f4569ddcfb5a5 Mon Sep 17 00:00:00 2001 From: ochafik Date: Fri, 29 Mar 2024 17:11:03 +0000 Subject: [PATCH 19/68] Update prompting.py --- examples/openai/prompting.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/examples/openai/prompting.py b/examples/openai/prompting.py index c431a7987a6a7..173810139302f 100644 --- a/examples/openai/prompting.py +++ b/examples/openai/prompting.py @@ -162,6 +162,10 @@ def flush(): sys.stderr.write(f'Unexpected message role: {message.role}\n') current_content.append(f'[ROLE={messages[i].role}]{messages[i].content}[/ROLE]') + current_content.extend( + f'{json.dumps(tc.model_dump())}' + for tc in (message.tool_calls or []) + ) if current_content: flush() From e0c8af4ba06cbbaef7fa7c3facb9345dfd3a438b Mon Sep 17 00:00:00 2001 From: ochafik Date: Fri, 29 Mar 2024 18:09:31 +0000 Subject: [PATCH 20/68] agent: --style --- examples/agent/README.md | 18 ++++++++++- examples/agent/agent.py | 9 ++++-- examples/openai/prompting.py | 62 ++++++++++++++++++++---------------- examples/openai/server.py | 6 ++-- 4 files changed, 62 insertions(+), 33 deletions(-) diff --git a/examples/agent/README.md b/examples/agent/README.md index 045d237194325..9ca8a99fd7e8e 100644 --- a/examples/agent/README.md +++ b/examples/agent/README.md @@ -8,7 +8,6 @@ python -m examples.agent \ --tools examples/agent/tools/example_math_tools.py \ --goal "What is the sum of 2535 squared and 32222000403 then multiplied by one and a half. What's a third of the result?" ``` -
Show output @@ -37,6 +36,23 @@ python -m examples.agent \ Show output ```bash +💭 I will first get the current weather in San Francisco, then get the 4-day weather forecast for both San Francisco and Glasgow. +⚙️ get_current_weather(location=San Francisco, format=fahrenheit) -> ... +💭 I will first get the current weather in San Francisco, then get the 4-day weather forecast for both San Francisco and Glasgow. +⚙️ get_n_day_weather_forecast(location=San Francisco, format=fahrenheit, num_days=4) -> ... +💭 I will first get the current weather in San Francisco, then get the 4-day weather forecast for both San Francisco and Glasgow. +⚙️ get_n_day_weather_forecast(location=Glasgow, format=celsius, num_days=4) -> ... +The current weather in San Francisco is sunny and 87.8F. Here is the 4-day weather forecast: + +For San Francisco: +- In 1 day: Cloudy, 60.8F +- In 2 days: Sunny, 73.4F +- In 3 days: Cloudy, 62.6F + +For Glasgow: +- In 1 day: Cloudy, 16C +- In 2 days: Sunny, 23C +- In 3 days: Cloudy, 17C ```
diff --git a/examples/agent/agent.py b/examples/agent/agent.py index ed428af6801ad..96355d225ee98 100644 --- a/examples/agent/agent.py +++ b/examples/agent/agent.py @@ -12,6 +12,7 @@ from examples.agent.tools.std_tools import StandardTools from examples.openai.api import ChatCompletionRequest, ChatCompletionResponse, Message, Tool, ToolFunction from examples.agent.utils import collect_functions, load_module +from examples.openai.prompting import ToolsPromptStyle def _get_params_schema(fn: Callable, verbose): converter = SchemaConverter(prop_order={}, allow_fetch=False, dotall=False, raw_pattern=False) @@ -130,6 +131,7 @@ def main( auth: Optional[str] = None, parallel_calls: Optional[bool] = True, verbose: bool = False, + style: Optional[ToolsPromptStyle] = None, model: Annotated[Optional[Path], typer.Option("--model", "-m")] = "models/7B/ggml-model-f16.gguf", endpoint: Optional[str] = None, @@ -175,8 +177,8 @@ def main( "--model", model, *(['--verbose'] if verbose else []), *(['--parallel-calls'] if parallel_calls else []), - *(['--context-length={context_length}'] if context_length else []), - *([]) + *([f'--context-length={context_length}'] if context_length else []), + *([f'--style={style.value}'] if style else []), ] server_process = subprocess.Popen(cmd, stdout=sys.stderr) atexit.register(server_process.kill) @@ -196,7 +198,7 @@ def main( if std_tools: tool_functions.extend(collect_functions(StandardTools)) - response_model = None#str + response_model = str if format: if format in types: response_model = types[format] @@ -245,6 +247,7 @@ def main( }] ) print(result if response_model else f'➡️ {result}') + # exit(0) if __name__ == '__main__': typer.run(main) diff --git a/examples/openai/prompting.py b/examples/openai/prompting.py index 173810139302f..0c3f2fcb264db 100644 --- a/examples/openai/prompting.py +++ b/examples/openai/prompting.py @@ -14,45 +14,45 @@ from examples.openai.gguf_kvs import GGUFKeyValues, Keys from examples.openai.ts_converter import SchemaToTypeScriptConverter +_THOUGHT_KEY = "thought" +# _THOUGHT_KEY = "thought_about_next_step_only" + # While the API will be usable with a generic tools usage like OpenAI, # (see https://cookbook.openai.com/examples/how_to_call_functions_with_chat_models), # each model may need specific prompting (and/or constrained output, # especially for models not fine-tuned for tool usage / function calling). -class ToolsPromptStyle(Enum): +class ToolsPromptStyle(str, Enum): # Short prompt w/ schemas, ... output - TOOLS_SHORT = 1 + TOOLS_SHORT = "short" # Longer prompt w/ schemas, ... output - TOOLS_LONG = 2 + TOOLS_LONG = "long" # Bespoke constrained output format that favours thought and reasoning # while allowing unambiguous parsing of parallel tool calling. - TOOLS_BESPOKE = 3 + TOOLS_CONSTRAINED = "thoughtful_steps" # Large prompt for https://huggingface.co/NousResearch/Hermes-2-Pro-Mistral-7B # ... output # Requires: # - git clone https://github.com/NousResearch/Hermes-Function-Calling examples/openai/hermes_function_calling # - Set large context length as their prompts are super long - TOOLS_HERMES_2_PRO = 4 + TOOLS_HERMES_2_PRO = "tools_hermes_2_pro" # Seems to want to escape underscores in tool names and in the ... 
tags - TOOLS_MISTRAL = 5 + TOOLS_MIXTRAL = "mixtral" # Short prompt w/ TypeScript definitions for https://github.com/MeetKai/functionary # https://github.com/MeetKai/functionary/blob/main/functionary/prompt_template/prompt_template_v2.py # Note: see this prior attempt to support Functionary: https://github.com/ggerganov/llama.cpp/pull/5695 - TYPESCRIPT_FUNCTIONARY_V2 = 6 + TYPESCRIPT_FUNCTIONARY_V2 = "functionary_v2" def raise_exception(msg: str): raise Exception(msg) class ChatTemplate(BaseModel): template: str - - @property - def tool_style(self) -> 'ToolsPromptStyle': - return self._tool_style + inferred_tool_style: Optional['ToolsPromptStyle'] = None def __init__(self, template: str, eos_token: str, bos_token: str): super().__init__(template=template @@ -65,12 +65,12 @@ def __init__(self, template: str, eos_token: str, bos_token: str): self._strict_user_assistant_alternation = "{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception" in template if "<|recipient|>' + tool_call['function']['name']" in template: - self._tool_style = ToolsPromptStyle.TYPESCRIPT_FUNCTIONARY_V2 + self.inferred_tool_style = ToolsPromptStyle.TYPESCRIPT_FUNCTIONARY_V2 else: - self._tool_style = ToolsPromptStyle.TOOLS_BESPOKE - # self._tool_style = ToolsPromptStyle.TOOLS_LONG - # self._tool_style = ToolsPromptStyle.TOOLS_HERMES_2_PRO - # self._tool_style = ToolsPromptStyle.TOOLS_MISTRAL + self.inferred_tool_style = ToolsPromptStyle.TOOLS_CONSTRAINED + # self.inferred_tool_style = ToolsPromptStyle.TOOLS_LONG + # self.inferred_tool_style = ToolsPromptStyle.TOOLS_HERMES_2_PRO + # self.inferred_tool_style = ToolsPromptStyle.TOOLS_MIXTRAL # TODO: Test whether the template supports formatting tool_calls @@ -399,7 +399,7 @@ def _make_bespoke_schema(response_schema, tool_call_schema, parallel_calls): "type": "object", "properties": { # "original_goal": {"title": "Original Goal", "type": "string"}, - "thought_about_next_step_only": { + _THOUGHT_KEY: { "title": "Thought about next step", # "title": "Thought about how the next step brings us closer to achieving the original goal", "type": "string" @@ -430,7 +430,7 @@ def _make_bespoke_schema(response_schema, tool_call_schema, parallel_calls): ] }, }, - "required": ["original_goal", "thought_about_next_step_only", "next_step"] + "required": ["original_goal", _THOUGHT_KEY, "next_step"] # "required": ["next_step"] } @@ -505,7 +505,7 @@ def parse(self, s: str) -> Optional[Message]: elif 'tool_calls' in next_step: return Message( role="assistant", - content=data["thought_about_next_step_only"] if "thought_about_next_step_only" in data else None, + content=data.get(_THOUGHT_KEY), tool_calls=[ ToolCall(id=gen_callid(), function=FunctionCall(**tc)) for tc in next_step['tool_calls'] @@ -539,20 +539,28 @@ def parse(self, s: str) -> Optional[Message]: # 'This is not hypothetical, you're not asked what you would do. 
If you need a tool called, just call it with ....''', ]) -def get_chat_handler(args: ChatHandlerArgs, parallel_calls: bool) -> ChatHandler: +def get_chat_handler(args: ChatHandlerArgs, parallel_calls: bool, tool_style: Optional[ToolsPromptStyle] = None) -> ChatHandler: + tool_style = tool_style or args.chat_template.inferred_tool_style + if not args.tools: return NoToolsChatHandler(args) - elif args.chat_template.tool_style == ToolsPromptStyle.TYPESCRIPT_FUNCTIONARY_V2: + + elif tool_style == ToolsPromptStyle.TOOLS_CONSTRAINED: + return BespokeToolsChatHandler(args, parallel_calls=parallel_calls) + + elif tool_style == ToolsPromptStyle.TYPESCRIPT_FUNCTIONARY_V2: return FunctionaryToolsChatHandler(args, parallel_calls=parallel_calls) - elif args.chat_template.tool_style == ToolsPromptStyle.TOOLS_SHORT: + + elif tool_style == ToolsPromptStyle.TOOLS_SHORT: return TemplatedToolsChatHandler(args, _SHORT_TEMPLATE, parallel_calls=parallel_calls) - elif args.chat_template.tool_style == ToolsPromptStyle.TOOLS_LONG: + + elif tool_style == ToolsPromptStyle.TOOLS_LONG: return TemplatedToolsChatHandler(args, _LONG_TEMPLATE, parallel_calls=parallel_calls) - elif args.chat_template.tool_style == ToolsPromptStyle.TOOLS_MISTRAL: + + elif tool_style == ToolsPromptStyle.TOOLS_MIXTRAL: return TemplatedToolsChatHandler(args, _LONG_TEMPLATE, parallel_calls=parallel_calls, escapes_underscores=True) - elif args.chat_template.tool_style == ToolsPromptStyle.TOOLS_BESPOKE: - return BespokeToolsChatHandler(args, parallel_calls=parallel_calls) - elif args.chat_template.tool_style == ToolsPromptStyle.TOOLS_HERMES_2_PRO: + + elif tool_style == ToolsPromptStyle.TOOLS_HERMES_2_PRO: return Hermes2ProToolsChatHandler(args) else: raise ValueError(f"Unsupported tool call style: {args.chat_template.tool_style}") diff --git a/examples/openai/server.py b/examples/openai/server.py index ccf8fddce8306..a8abe8c8a7499 100644 --- a/examples/openai/server.py +++ b/examples/openai/server.py @@ -12,7 +12,7 @@ from examples.openai.llama_cpp_server_api import LlamaCppServerCompletionRequest from examples.openai.gguf_kvs import GGUFKeyValues, Keys from examples.openai.api import ChatCompletionResponse, Choice, Message, ChatCompletionRequest, Usage -from examples.openai.prompting import ChatHandlerArgs, ChatTemplate, get_chat_handler, ChatHandler +from examples.openai.prompting import ChatHandlerArgs, ChatTemplate, ToolsPromptStyle, get_chat_handler, ChatHandler from fastapi import FastAPI, Request from fastapi.responses import JSONResponse @@ -32,6 +32,7 @@ def main( host: str = "localhost", port: int = 8080, parallel_calls: Optional[bool] = True, + style: Optional[ToolsPromptStyle] = None, auth: Optional[str] = None, verbose: bool = False, context_length: Optional[int] = None, @@ -92,7 +93,8 @@ async def chat_completions(request: Request, chat_request: ChatCompletionRequest chat_handler = get_chat_handler( ChatHandlerArgs(chat_template=chat_template, response_schema=response_schema, tools=chat_request.tools), - parallel_calls=parallel_calls + parallel_calls=parallel_calls, + tool_style=style, ) messages = chat_request.messages From b4e292ec0101a877da531771436fead775e6e9af Mon Sep 17 00:00:00 2001 From: ochafik Date: Fri, 29 Mar 2024 18:19:28 +0000 Subject: [PATCH 21/68] Create requirements.txt --- examples/agent/requirements.txt | 8 ++++++++ 1 file changed, 8 insertions(+) create mode 100644 examples/agent/requirements.txt diff --git a/examples/agent/requirements.txt b/examples/agent/requirements.txt new file mode 100644 index 
0000000000000..01aab7cae824d --- /dev/null +++ b/examples/agent/requirements.txt @@ -0,0 +1,8 @@ +fastapi[all] +gguf +jinja2 +pydantic +requests +sse-starlette +uvicorn[all] +typer[all] \ No newline at end of file From d1d86027c46b19ea241e833af0c4f46bb9eed77e Mon Sep 17 00:00:00 2001 From: ochafik Date: Fri, 29 Mar 2024 19:22:15 +0000 Subject: [PATCH 22/68] agent: disable parallel by default --- examples/agent/agent.py | 6 +++--- examples/openai/api.py | 2 +- examples/openai/server.py | 44 ++++++++++++++++++++++++++++++++++++--- 3 files changed, 45 insertions(+), 7 deletions(-) diff --git a/examples/agent/agent.py b/examples/agent/agent.py index 96355d225ee98..1652c44790225 100644 --- a/examples/agent/agent.py +++ b/examples/agent/agent.py @@ -108,8 +108,8 @@ def completion_with_tool_usage( tool_call_id=tool_call.id, role="tool", name=tool_call.function.name, - # content=f'{tool_result}', - content=f'{pretty_call} = {tool_result}', + content=f'{tool_result}', + # content=f'{pretty_call} = {tool_result}', )) else: assert content @@ -129,7 +129,7 @@ def main( max_iterations: Optional[int] = 10, std_tools: Optional[bool] = False, auth: Optional[str] = None, - parallel_calls: Optional[bool] = True, + parallel_calls: Optional[bool] = False, verbose: bool = False, style: Optional[ToolsPromptStyle] = None, diff --git a/examples/openai/api.py b/examples/openai/api.py index b95eb17fae7a8..7780d8bc4e848 100644 --- a/examples/openai/api.py +++ b/examples/openai/api.py @@ -4,7 +4,7 @@ class FunctionCall(BaseModel): name: str - arguments: Dict[str, Any] + arguments: Union[Dict[str, Any], str] class ToolCall(BaseModel): id: Optional[str] = None diff --git a/examples/openai/server.py b/examples/openai/server.py index a8abe8c8a7499..6d19f12f1677b 100644 --- a/examples/openai/server.py +++ b/examples/openai/server.py @@ -31,7 +31,7 @@ def main( # model_url: Annotated[Optional[str], typer.Option("--model-url", "-mu")] = None, host: str = "localhost", port: int = 8080, - parallel_calls: Optional[bool] = True, + parallel_calls: Optional[bool] = False, style: Optional[ToolsPromptStyle] = None, auth: Optional[str] = None, verbose: bool = False, @@ -75,6 +75,44 @@ def main( atexit.register(server_process.kill) endpoint = f"http://{server_host}:{server_port}/completions" + + # print(chat_template.render([ + # Message(**{ + # "role": "user", + # "name": None, + # "tool_call_id": None, + # "content": "What is the sum of 2535 squared and 32222000403 then multiplied by one and a half. 
What's a third of the result?", + # "tool_calls": None + # }), + # Message(**{ + # "role": "assistant", + # # "name": None, + # "tool_call_id": None, + # "content": "?", + # "tool_calls": [ + # { + # # "id": "call_531873", + # "type": "function", + # "function": { + # "name": "add", + # "arguments": { + # "a": 2535, + # "b": 32222000403 + # } + # } + # } + # ] + # }), + # Message(**{ + # "role": "tool", + # "name": "add", + # "tool_call_id": "call_531873", + # "content": "32222002938", + # "tool_calls": None + # }) + # ], add_generation_prompt=True)) + # exit(0) + app = FastAPI() @app.post("/v1/chat/completions") @@ -95,6 +133,7 @@ async def chat_completions(request: Request, chat_request: ChatCompletionRequest ChatHandlerArgs(chat_template=chat_template, response_schema=response_schema, tools=chat_request.tools), parallel_calls=parallel_calls, tool_style=style, + verbose=verbose, ) messages = chat_request.messages @@ -102,8 +141,7 @@ async def chat_completions(request: Request, chat_request: ChatCompletionRequest messages = chat_template.add_system_prompt(messages, chat_handler.output_format_prompt) prompt = chat_template.render(messages, add_generation_prompt=True) - - + if verbose: sys.stderr.write(f'\n# REQUEST:\n\n{chat_request.model_dump_json(indent=2)}\n\n') # sys.stderr.write(f'\n# MESSAGES:\n\n{TypeAdapter(list[Message]).dump_json(messages)}\n\n') From eb9a5524eb85c4d72e0919d30ef2e0c45c05d7a4 Mon Sep 17 00:00:00 2001 From: ochafik Date: Fri, 29 Mar 2024 19:22:46 +0000 Subject: [PATCH 23/68] agent: nits --- examples/openai/prompting.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/examples/openai/prompting.py b/examples/openai/prompting.py index 0c3f2fcb264db..876f0b27fae82 100644 --- a/examples/openai/prompting.py +++ b/examples/openai/prompting.py @@ -14,8 +14,8 @@ from examples.openai.gguf_kvs import GGUFKeyValues, Keys from examples.openai.ts_converter import SchemaToTypeScriptConverter -_THOUGHT_KEY = "thought" -# _THOUGHT_KEY = "thought_about_next_step_only" +# _THOUGHT_KEY = "thought" +_THOUGHT_KEY = "thought_about_next_step_only" # While the API will be usable with a generic tools usage like OpenAI, # (see https://cookbook.openai.com/examples/how_to_call_functions_with_chat_models), @@ -30,7 +30,7 @@ class ToolsPromptStyle(str, Enum): # Bespoke constrained output format that favours thought and reasoning # while allowing unambiguous parsing of parallel tool calling. - TOOLS_CONSTRAINED = "thoughtful_steps" + TOOLS_THOUGHTFUL_STEPS = "thoughtful_steps" # Large prompt for https://huggingface.co/NousResearch/Hermes-2-Pro-Mistral-7B # ... output @@ -67,7 +67,7 @@ def __init__(self, template: str, eos_token: str, bos_token: str): if "<|recipient|>' + tool_call['function']['name']" in template: self.inferred_tool_style = ToolsPromptStyle.TYPESCRIPT_FUNCTIONARY_V2 else: - self.inferred_tool_style = ToolsPromptStyle.TOOLS_CONSTRAINED + self.inferred_tool_style = ToolsPromptStyle.TOOLS_THOUGHTFUL_STEPS # self.inferred_tool_style = ToolsPromptStyle.TOOLS_LONG # self.inferred_tool_style = ToolsPromptStyle.TOOLS_HERMES_2_PRO # self.inferred_tool_style = ToolsPromptStyle.TOOLS_MIXTRAL @@ -539,13 +539,16 @@ def parse(self, s: str) -> Optional[Message]: # 'This is not hypothetical, you're not asked what you would do. 
If you need a tool called, just call it with ....''', ]) -def get_chat_handler(args: ChatHandlerArgs, parallel_calls: bool, tool_style: Optional[ToolsPromptStyle] = None) -> ChatHandler: - tool_style = tool_style or args.chat_template.inferred_tool_style +def get_chat_handler(args: ChatHandlerArgs, parallel_calls: bool, tool_style: Optional[ToolsPromptStyle] = None, verbose=False) -> ChatHandler: + tool_style = tool_style if tool_style is not None else args.chat_template.inferred_tool_style + + if verbose: + sys.stderr.write(f"# Using tool style: {tool_style}\n") if not args.tools: return NoToolsChatHandler(args) - elif tool_style == ToolsPromptStyle.TOOLS_CONSTRAINED: + elif tool_style == ToolsPromptStyle.TOOLS_THOUGHTFUL_STEPS: return BespokeToolsChatHandler(args, parallel_calls=parallel_calls) elif tool_style == ToolsPromptStyle.TYPESCRIPT_FUNCTIONARY_V2: From 3da30ed89e9774755bccf6883e006146973d6cfb Mon Sep 17 00:00:00 2001 From: ochafik Date: Fri, 29 Mar 2024 19:22:59 +0000 Subject: [PATCH 24/68] agent: fix functionary tool_calls templating --- examples/openai/prompting.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/examples/openai/prompting.py b/examples/openai/prompting.py index 876f0b27fae82..1ec2a11e322bf 100644 --- a/examples/openai/prompting.py +++ b/examples/openai/prompting.py @@ -171,6 +171,27 @@ def flush(): messages = new_messages + # JSON! + messages = [m.model_dump() for m in messages] + + if self.inferred_tool_style == ToolsPromptStyle.TYPESCRIPT_FUNCTIONARY_V2: + messages = [ + { + **m, + "tool_calls": [ + { + **tc, + "function": { + "name": tc["function"]["name"], + "arguments": json.dumps(tc["function"]["arguments"]), + } + } + for tc in m["tool_calls"] + ] if m.get("tool_calls") else None + } + for m in messages + ] + result = self._template.render( messages=messages, eos_token=self._eos_token, From ff6563a7bbc919e843e70c7c9563e0d9f3d7303c Mon Sep 17 00:00:00 2001 From: ochafik Date: Fri, 29 Mar 2024 19:23:09 +0000 Subject: [PATCH 25/68] Delete test.sh --- examples/openai/test.sh | 95 ----------------------------------------- 1 file changed, 95 deletions(-) delete mode 100755 examples/openai/test.sh diff --git a/examples/openai/test.sh b/examples/openai/test.sh deleted file mode 100755 index 4dca39adecc3f..0000000000000 --- a/examples/openai/test.sh +++ /dev/null @@ -1,95 +0,0 @@ -#!/bin/bash -set -euo pipefail - -SERVER_PID="" -function cleanup() { - if [ -n "$SERVER_PID" ]; then - echo "# Killing server" >&2 - kill $SERVER_PID - wait $SERVER_PID - fi -} -trap cleanup EXIT - -echo "# Starting the server" >&2 - -args=( - # --cpp_server_endpoint "http://localhost:8081" - - # --model ~/AI/Models/functionary-medium-v2.2.q4_0.gguf - - --model ~/AI/Models/mixtral-8x7b-instruct-v0.1.Q8_0.gguf - # --model ~/AI/Models/mixtral-8x7b-instruct-v0.1.Q4_K_M.gguf - - # --model ~/AI/Models/Hermes-2-Pro-Mistral-7B.Q8_0.gguf - # --model ~/AI/Models/Hermes-2-Pro-Mistral-7B.Q4_K_M.gguf -) -python -m examples.openai "${args[@]}" & -SERVER_PID=$! 
- -sleep 5 - -echo "# Send a message to the chat API" >&2 - -python -m examples.openai.reactor -exit - -curl http://localhost:8080/v1/chat/completions \ - -H "Content-Type: application/json" \ - -H "Authorization: Bearer $OPENAI_API_KEY" \ - -d '{ - "model": "gpt-3.5-turbo", - "tools": [{ - "type": "function", - "function": { - "name": "get_current_weather", - "description": "Get the current weather", - "parameters": { - "type": "object", - "properties": { - "location": { - "type": "string", - "description": "The city and state, e.g. San Francisco, CA" - }, - "format": { - "type": "string", - "enum": ["celsius", "fahrenheit"], - "description": "The temperature unit to use. Infer this from the users location." - } - }, - "required": ["location", "format"] - } - } - }, { - "type": "function", - "function": { - "name": "get_n_day_weather_forecast", - "description": "Get an N-day weather forecast", - "parameters": { - "type": "object", - "properties": { - "location": { - "type": "string", - "description": "The city and state, e.g. San Francisco, CA" - }, - "format": { - "type": "string", - "enum": ["celsius", "fahrenheit"], - "description": "The temperature unit to use. Infer this from the users location." - }, - "num_days": { - "type": "integer", - "description": "The number of days to forecast" - } - }, - "required": ["location", "format", "num_days"] - } - } - }], - "messages": [ - {"role": "user", "content": "I live in the UK. what is the weather going to be like in San Francisco and Glasgow over the next 4 days."} - ] - }' | \ - jq . - -# {"role": "system", "content": "Do not make assumptions about what values to plug into functions. Ask for clarification if a user request is ambiguous."}, From dd11bb6937c3d2a4c16e769f106f228087e52766 Mon Sep 17 00:00:00 2001 From: ochafik Date: Fri, 29 Mar 2024 19:41:11 +0000 Subject: [PATCH 26/68] agent: format still broken --- examples/agent/README.md | 9 ++++++--- examples/agent/agent.py | 2 +- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/examples/agent/README.md b/examples/agent/README.md index 9ca8a99fd7e8e..8e9dec4695254 100644 --- a/examples/agent/README.md +++ b/examples/agent/README.md @@ -6,7 +6,8 @@ Have any LLM use local (sandboxed) tools, with a simple CLI. python -m examples.agent \ --model ~/AI/Models/mixtral-8x7b-instruct-v0.1.Q4_K_M.gguf \ --tools examples/agent/tools/example_math_tools.py \ - --goal "What is the sum of 2535 squared and 32222000403 then multiplied by one and a half. What's a third of the result?" + --goal "What is the sum of 2535 squared and 32222000403 then multiplied by one and a half. What's a third of the result?" \ + --greedy ```
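
The `--greedy` flag used in these examples pins the sampler to deterministic settings before the request is built; in `examples/agent/agent.py` it boils down to roughly the following (the `apply_greedy` wrapper is hypothetical, but the two assignments mirror the agent.py change earlier in this series):

```python
# Sketch of the --greedy handling in examples/agent/agent.py: greedy decoding
# is approximated by keeping only the single most likely token.
from typing import Optional, Tuple

def apply_greedy(greedy: bool, top_k: Optional[int], top_p: Optional[float]) -> Tuple[Optional[int], Optional[float]]:
    if greedy:
        top_k = 1    # consider only the top candidate token
        top_p = 0.0  # effectively disables nucleus sampling
    return top_k, top_p
```

This helps keep the README transcripts reproducible across runs.
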
@@ -29,7 +30,8 @@ python -m examples.agent \ ```bash python -m examples.agent \ --tools examples/agent/tools/example_weather_tools.py \ - --goal "What is the weather going to be like in San Francisco and Glasgow over the next 4 days." + --goal "What is the weather going to be like in San Francisco and Glasgow over the next 4 days." \ + --greedy ```
@@ -62,7 +64,8 @@ For Glasgow: python -m examples.agent \ --model ~/AI/Models/mixtral-8x7b-instruct-v0.1.Q4_K_M.gguf \ --std_tools \ - --goal "Wait 10sec then say Hi out loud" + --goal "Wait 10sec then say Hi out loud" \ + --greedy ```
diff --git a/examples/agent/agent.py b/examples/agent/agent.py index 1652c44790225..e7373eb285970 100644 --- a/examples/agent/agent.py +++ b/examples/agent/agent.py @@ -198,7 +198,7 @@ def main( if std_tools: tool_functions.extend(collect_functions(StandardTools)) - response_model = str + response_model = None #str if format: if format in types: response_model = types[format] From 22b980ffc32336a95ff43cf905950ab32f966dbd Mon Sep 17 00:00:00 2001 From: ochafik Date: Fri, 29 Mar 2024 20:16:55 +0000 Subject: [PATCH 27/68] agent: update readme --- .../tools/{example_python_tools.py => unsafe_python_tools.py} | 0 examples/openai/README.md | 4 ++-- 2 files changed, 2 insertions(+), 2 deletions(-) rename examples/agent/tools/{example_python_tools.py => unsafe_python_tools.py} (100%) diff --git a/examples/agent/tools/example_python_tools.py b/examples/agent/tools/unsafe_python_tools.py similarity index 100% rename from examples/agent/tools/example_python_tools.py rename to examples/agent/tools/unsafe_python_tools.py diff --git a/examples/openai/README.md b/examples/openai/README.md index e9a8658c26659..c8bc0130d4edc 100644 --- a/examples/openai/README.md +++ b/examples/openai/README.md @@ -1,6 +1,6 @@ -# examples.agent: Interactive agent that can use Python tools! +# examples.openai: OpenAI-compatibility layer on top of server.cpp -New Python OpenAI API compatibility server, which calls into the C++ server under the hood: +New Python OpenAI API compatibility server, which calls into / spawns the C++ server under the hood: ```bash python -m examples.openai.server --model model.gguf From 61f35e07a569db95f083da38b0e79ebeea96e679 Mon Sep 17 00:00:00 2001 From: ochafik Date: Fri, 29 Mar 2024 23:04:23 +0000 Subject: [PATCH 28/68] agent: prepare to test various templates --- examples/agent/test_chat_handlers.md | 1436 ++++++++++++++++++++++++++ examples/agent/test_chat_handlers.py | 199 ++++ examples/openai/prompting.py | 27 +- examples/openai/ts_converter.py | 4 +- 4 files changed, 1647 insertions(+), 19 deletions(-) create mode 100644 examples/agent/test_chat_handlers.md create mode 100644 examples/agent/test_chat_handlers.py diff --git a/examples/agent/test_chat_handlers.md b/examples/agent/test_chat_handlers.md new file mode 100644 index 0000000000000..a93927fca4b38 --- /dev/null +++ b/examples/agent/test_chat_handlers.md @@ -0,0 +1,1436 @@ + +Messages: + +```js +[ + { + "role": "user", + "name": null, + "tool_call_id": null, + "content": "What is the sum of 2535 squared and 32222000403 then multiplied by one and a half. 
What's a third of the result?", + "tool_calls": null + }, + { + "role": "assistant", + "name": null, + "tool_call_id": null, + "content": "?", + "tool_calls": [ + { + "id": "call_531873", + "type": "function", + "function": { + "name": "add", + "arguments": { + "a": 2535, + "b": 32222000403 + } + } + } + ] + }, + { + "role": "tool", + "name": "add", + "tool_call_id": "call_531873", + "content": "32222002938", + "tool_calls": null + } +] +``` + + +# mistral_instruct_v0_1 + + +Template: + +```js +{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %} +``` + + +Prompt: + +```js +[INST] What is the sum of 2535 squared and 32222000403 then multiplied by one and a half. What's a third of the result? [/INST]? +{"id": "call_531873", "type": "function", "function": {"name": "add", "arguments": {"a": 2535, "b": 32222000403}}}[INST] [TOOL RESULT(name=add, id=call_531873]32222002938[/TOOL RESULT] [/INST] +``` + + +## ToolsPromptStyle.TOOLS_THOUGHTFUL_STEPS + + +### with tools + + +Prompt: + +```json +You are a function calling AI model. +Here are the tools available: +{ + "type": "function", + "function": { + "name": "add", + "description": "Adds two numbers", + "parameters": { + "properties": { + "a": { + "type": "integer" + }, + "b": { + "type": "integer" + } + }, + "required": [ + "a", + "b" + ] + } + } +} +{ + "type": "function", + "function": { + "name": "say", + "description": "Says something out loud (TTS)", + "parameters": { + "properties": { + "text": { + "description": "The text to say out loud", + "type": "string" + } + }, + "required": [ + "text" + ] + } + } +} +Please respond in JSON format with the following schema: { + "type": "object", + "properties": { + "thought_about_next_step_only": { + "title": "Thought about next step", + "type": "string" + }, + "next_step": { + "title": "Next Step: either a result or one or more tool calls to achieve the original goal", + "oneOf": [ + { + "properties": { + "tool_calls": { + "prefixItems": { + "properties": { + "name": { + "title": "Name of the tool to call", + "type": "string" + }, + "arguments": { + "title": "Arguments to pass to the tool", + "type": "object" + } + }, + "required": [ + "name", + "arguments" + ] + } + } + }, + "required": [ + "tool_calls" + ] + }, + { + "title": "Result (achieving original goal)", + "properties": { + "result": { + "type": "integer" + } + }, + "required": [ + "result" + ] + } + ] + } + }, + "required": [ + "original_goal", + "thought_about_next_step_only", + "next_step" + ] +} +``` + + +Grammar: + +```js +decimal-part ::= [0-9] [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? +integer ::= ("-"? integral-part) space +integral-part ::= [0-9] | [1-9] [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? +next-step ::= next-step-0 | next-step-1 +next-step-0 ::= "{" space next-step-0-tool-calls-kv "}" space +next-step-0-tool-calls ::= "[" space ( next-step-0-tool-calls-item ( "," space next-step-0-tool-calls-item )* )? 
"]" space +next-step-0-tool-calls-item ::= next-step-0-tool-calls-item-0 | next-step-0-tool-calls-item-1 +next-step-0-tool-calls-item-0 ::= "{" space next-step-0-tool-calls-item-0-name-kv "," space next-step-0-tool-calls-item-0-arguments-kv "}" space +next-step-0-tool-calls-item-0-arguments ::= "{" space next-step-0-tool-calls-item-0-arguments-a-kv "," space next-step-0-tool-calls-item-0-arguments-b-kv "}" space +next-step-0-tool-calls-item-0-arguments-a-kv ::= "\"a\"" space ":" space integer +next-step-0-tool-calls-item-0-arguments-b-kv ::= "\"b\"" space ":" space integer +next-step-0-tool-calls-item-0-arguments-kv ::= "\"arguments\"" space ":" space next-step-0-tool-calls-item-0-arguments +next-step-0-tool-calls-item-0-name ::= "\"add\"" +next-step-0-tool-calls-item-0-name-kv ::= "\"name\"" space ":" space next-step-0-tool-calls-item-0-name +next-step-0-tool-calls-item-1 ::= "{" space next-step-0-tool-calls-item-1-name-kv "," space next-step-0-tool-calls-item-1-arguments-kv "}" space +next-step-0-tool-calls-item-1-arguments ::= "{" space next-step-0-tool-calls-item-1-arguments-text-kv "}" space +next-step-0-tool-calls-item-1-arguments-kv ::= "\"arguments\"" space ":" space next-step-0-tool-calls-item-1-arguments +next-step-0-tool-calls-item-1-arguments-text-kv ::= "\"text\"" space ":" space string +next-step-0-tool-calls-item-1-name ::= "\"say\"" +next-step-0-tool-calls-item-1-name-kv ::= "\"name\"" space ":" space next-step-0-tool-calls-item-1-name +next-step-0-tool-calls-kv ::= "\"tool_calls\"" space ":" space next-step-0-tool-calls +next-step-1 ::= "{" space next-step-1-result-kv "}" space +next-step-1-result-kv ::= "\"result\"" space ":" space integer +next-step-kv ::= "\"next_step\"" space ":" space next-step +root ::= "{" space thought-about-next-step-only-kv "," space next-step-kv "}" space +space ::= " "? +string ::= "\"" ( + [^"\\] | + "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]) + )* "\"" space +thought-about-next-step-only-kv ::= "\"thought_about_next_step_only\"" space ":" space string +``` + + +### without tools + + +Prompt: + +```json +Please respond in JSON format with the following schema: { + "type": "integer" +} +``` + + +Grammar: + +```js +decimal-part ::= [0-9] [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? +integral-part ::= [0-9] | [1-9] [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? +root ::= ("-"? integral-part) space +space ::= " "? +``` + + +## ToolsPromptStyle.TOOLS_MIXTRAL + + +### with tools + + +Prompt: + +```json +Call one or more functions to assist with the user query, every time this is possible. Don't make assumptions about what values to plug into functions. 
Here are the available tools: + +{ + "type": "function", + "function": { + "name": "add", + "description": "Adds two numbers", + "parameters": { + "properties": { + "a": { + "type": "integer" + }, + "b": { + "type": "integer" + } + }, + "required": [ + "a", + "b" + ] + } + } +} +{ + "type": "function", + "function": { + "name": "say", + "description": "Says something out loud (TTS)", + "parameters": { + "properties": { + "text": { + "description": "The text to say out loud", + "type": "string" + } + }, + "required": [ + "text" + ] + } + } +} + + +To call each function, give its name and arguments within XML tags as follows: + +{"name": , "arguments": } + +``` + + +Grammar: + +```js +add-tool-call ::= "{" space add-tool-call-name-kv "," space add-tool-call-arguments-kv "}" space +add-tool-call-arguments ::= "{" space add-tool-call-arguments-a-kv "," space add-tool-call-arguments-b-kv "}" space +add-tool-call-arguments-a-kv ::= "\"a\"" space ":" space integer +add-tool-call-arguments-b-kv ::= "\"b\"" space ":" space integer +add-tool-call-arguments-kv ::= "\"arguments\"" space ":" space add-tool-call-arguments +add-tool-call-name ::= "\"" "add" "\"" space +add-tool-call-name-kv ::= "\"name\"" space ":" space add-tool-call-name +content ::= [^<] | "<" [^t<] | "" space (add-tool-call | say-tool-call) space "" +``` + + +### without tools + + +Prompt: + +```json +Please respond in JSON format with the following schema: { + "type": "integer" +} +``` + + +Grammar: + +```js +decimal-part ::= [0-9] [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? +integral-part ::= [0-9] | [1-9] [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? +root ::= ("-"? integral-part) space +space ::= " "? +``` + + +# functionary_v2_2 + + +Template: + +```js +{#v2.2#} +{% for message in messages %} +{% if message['role'] == 'user' or message['role'] == 'system' %} +{{ '<|from|>' + message['role'] + ' +<|recipient|>all +<|content|>' + message['content'] + ' +' }}{% elif message['role'] == 'tool' %} +{{ '<|from|>' + message['name'] + ' +<|recipient|>all +<|content|>' + message['content'] + ' +' }}{% else %} +{% set contain_content='no'%} +{% if message['content'] is not none %} +{{ '<|from|>assistant +<|recipient|>all +<|content|>' + message['content'] }}{% set contain_content='yes'%} +{% endif %} +{% if 'tool_calls' in message and message['tool_calls'] is not none %} +{% for tool_call in message['tool_calls'] %} +{% set prompt='<|from|>assistant +<|recipient|>' + tool_call['function']['name'] + ' +<|content|>' + tool_call['function']['arguments'] %} +{% if loop.index == 1 and contain_content == "no" %} +{{ prompt }}{% else %} +{{ ' +' + prompt}}{% endif %} +{% endfor %} +{% endif %} +{{ '<|stop|> +' }}{% endif %} +{% endfor %} +{% if add_generation_prompt %}{{ '<|from|>assistant +<|recipient|>' }}{% endif %} +``` + + +Prompt: + +```js +<|from|>user +<|recipient|>all +<|content|>What is the sum of 2535 squared and 32222000403 then multiplied by one and a half. What's a third of the result? +<|from|>assistant +<|recipient|>all +<|content|>? +<|from|>assistant +<|recipient|>add +<|content|>{"a": 2535, "b": 32222000403}<|stop|> +<|from|>add +<|recipient|>all +<|content|>32222002938 +<|from|>assistant +<|recipient|> +``` + + +## ToolsPromptStyle.TYPESCRIPT_FUNCTIONARY_V2 + + +### with tools + + +Prompt: + +```json +// Supported function definitions that should be called when necessary. 
+namespace functions { +// Adds two numbers +type add = (_: { +a: number, +b: number +}) => any; + +// Says something out loud (TTS) +type say = (_: { +// The text to say out loud +text: string +}) => any; +} // namespace functions +``` + + +Grammar: + +```js +add-args ::= "{" space add-args-a-kv "," space add-args-b-kv "}" space +add-args-a-kv ::= "\"a\"" space ":" space integer +add-args-b-kv ::= "\"b\"" space ":" space integer +add-call ::= "add" "\n<|content|>\n" add-args "\n" +content ::= start content-without-start +content-without-start ::= "all\n<|content|>" not-from* +decimal-part ::= [0-9] [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? +integer ::= ("-"? integral-part) space +integral-part ::= [0-9] | [1-9] [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? +not-from ::= ([^<] | "<" ([^|] | "|" ([^f] | "f" ([^r] | "r" ([^o] | "o" ([^m] | "m" ([^|] | "|" ([^>])?)?)?)?)?)?)?) +root ::= content-without-start content* (tool-call+ content*)? | tool-call-without-start tool-call* content* +say-args ::= "{" space say-args-text-kv "}" space +say-args-text-kv ::= "\"text\"" space ":" space string +say-call ::= "say" "\n<|content|>\n" say-args "\n" +space ::= " "? +start ::= "<|from|>assistant\n<|recipient|>" +string ::= "\"" ( + [^"\\] | + "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]) + )* "\"" space +tool-call ::= start tool-call-without-start +tool-call-without-start ::= add-call | say-call +``` + + +### without tools + + +Prompt: + +```json +Please respond in JSON format with the following schema: { + "type": "integer" +} +``` + + +Grammar: + +```js +decimal-part ::= [0-9] [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? +integral-part ::= [0-9] | [1-9] [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? +root ::= ("-"? integral-part) space +space ::= " "? +``` + + +# hermes_2_pro_mistral + + +Template: + +```js +{% for message in messages %}{{'<|im_start|>' + message['role'] + ' +' + message['content'] + '<|im_end|>' + ' +'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant +' }}{% endif %} +``` + + +Prompt: + +```js +<|im_start|>user +What is the sum of 2535 squared and 32222000403 then multiplied by one and a half. 
What's a third of the result?<|im_end|> +<|im_start|>assistant +?<|im_end|> +<|im_start|>tool +32222002938<|im_end|> +<|im_start|>assistant + +``` + + +## ToolsPromptStyle.TOOLS_SHORT + + +### with tools + + +Prompt: + +```json +Here are the tools available: + +{ + "type": "function", + "function": { + "name": "add", + "description": "Adds two numbers", + "parameters": { + "properties": { + "a": { + "type": "integer" + }, + "b": { + "type": "integer" + } + }, + "required": [ + "a", + "b" + ] + } + } +} +{ + "type": "function", + "function": { + "name": "say", + "description": "Says something out loud (TTS)", + "parameters": { + "properties": { + "text": { + "description": "The text to say out loud", + "type": "string" + } + }, + "required": [ + "text" + ] + } + } +} + +``` + + +Grammar: + +```js +add-tool-call ::= "{" space add-tool-call-name-kv "," space add-tool-call-arguments-kv "}" space +add-tool-call-arguments ::= "{" space add-tool-call-arguments-a-kv "," space add-tool-call-arguments-b-kv "}" space +add-tool-call-arguments-a-kv ::= "\"a\"" space ":" space integer +add-tool-call-arguments-b-kv ::= "\"b\"" space ":" space integer +add-tool-call-arguments-kv ::= "\"arguments\"" space ":" space add-tool-call-arguments +add-tool-call-name ::= "\"add\"" +add-tool-call-name-kv ::= "\"name\"" space ":" space add-tool-call-name +content ::= [^<] | "<" [^t<] | "" +``` + + +### without tools + + +Prompt: + +```json +Please respond in JSON format with the following schema: { + "type": "integer" +} +``` + + +Grammar: + +```js +decimal-part ::= [0-9] [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? +integral-part ::= [0-9] | [1-9] [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? +root ::= ("-"? integral-part) space +space ::= " "? +``` + + +## ToolsPromptStyle.TOOLS_LONG + + +### with tools + + +Prompt: + +```json +Call one or more functions to assist with the user query, every time this is possible. Don't make assumptions about what values to plug into functions. Here are the available tools: + +{ + "type": "function", + "function": { + "name": "add", + "description": "Adds two numbers", + "parameters": { + "properties": { + "a": { + "type": "integer" + }, + "b": { + "type": "integer" + } + }, + "required": [ + "a", + "b" + ] + } + } +} +{ + "type": "function", + "function": { + "name": "say", + "description": "Says something out loud (TTS)", + "parameters": { + "properties": { + "text": { + "description": "The text to say out loud", + "type": "string" + } + }, + "required": [ + "text" + ] + } + } +} + + +To call each function, give its name and arguments within XML tags as follows: + +{"name": , "arguments": } + +``` + + +Grammar: + +```js +add-tool-call ::= "{" space add-tool-call-name-kv "," space add-tool-call-arguments-kv "}" space +add-tool-call-arguments ::= "{" space add-tool-call-arguments-a-kv "," space add-tool-call-arguments-b-kv "}" space +add-tool-call-arguments-a-kv ::= "\"a\"" space ":" space integer +add-tool-call-arguments-b-kv ::= "\"b\"" space ":" space integer +add-tool-call-arguments-kv ::= "\"arguments\"" space ":" space add-tool-call-arguments +add-tool-call-name ::= "\"add\"" +add-tool-call-name-kv ::= "\"name\"" space ":" space add-tool-call-name +content ::= [^<] | "<" [^t<] | "" +``` + + +### without tools + + +Prompt: + +```json +Please respond in JSON format with the following schema: { + "type": "integer" +} +``` + + +Grammar: + +```js +decimal-part ::= [0-9] [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? 
[0-9]? +integral-part ::= [0-9] | [1-9] [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? +root ::= ("-"? integral-part) space +space ::= " "? +``` + + +## ToolsPromptStyle.TOOLS_THOUGHTFUL_STEPS + + +### with tools + + +Prompt: + +```json +You are a function calling AI model. +Here are the tools available: +{ + "type": "function", + "function": { + "name": "add", + "description": "Adds two numbers", + "parameters": { + "properties": { + "a": { + "type": "integer" + }, + "b": { + "type": "integer" + } + }, + "required": [ + "a", + "b" + ] + } + } +} +{ + "type": "function", + "function": { + "name": "say", + "description": "Says something out loud (TTS)", + "parameters": { + "properties": { + "text": { + "description": "The text to say out loud", + "type": "string" + } + }, + "required": [ + "text" + ] + } + } +} +Please respond in JSON format with the following schema: { + "type": "object", + "properties": { + "thought_about_next_step_only": { + "title": "Thought about next step", + "type": "string" + }, + "next_step": { + "title": "Next Step: either a result or one or more tool calls to achieve the original goal", + "oneOf": [ + { + "properties": { + "tool_calls": { + "prefixItems": { + "properties": { + "name": { + "title": "Name of the tool to call", + "type": "string" + }, + "arguments": { + "title": "Arguments to pass to the tool", + "type": "object" + } + }, + "required": [ + "name", + "arguments" + ] + } + } + }, + "required": [ + "tool_calls" + ] + }, + { + "title": "Result (achieving original goal)", + "properties": { + "result": { + "type": "integer" + } + }, + "required": [ + "result" + ] + } + ] + } + }, + "required": [ + "original_goal", + "thought_about_next_step_only", + "next_step" + ] +} +``` + + +Grammar: + +```js +decimal-part ::= [0-9] [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? +integer ::= ("-"? integral-part) space +integral-part ::= [0-9] | [1-9] [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? +next-step ::= next-step-0 | next-step-1 +next-step-0 ::= "{" space next-step-0-tool-calls-kv "}" space +next-step-0-tool-calls ::= "[" space ( next-step-0-tool-calls-item ( "," space next-step-0-tool-calls-item )* )? 
"]" space +next-step-0-tool-calls-item ::= next-step-0-tool-calls-item-0 | next-step-0-tool-calls-item-1 +next-step-0-tool-calls-item-0 ::= "{" space next-step-0-tool-calls-item-0-name-kv "," space next-step-0-tool-calls-item-0-arguments-kv "}" space +next-step-0-tool-calls-item-0-arguments ::= "{" space next-step-0-tool-calls-item-0-arguments-a-kv "," space next-step-0-tool-calls-item-0-arguments-b-kv "}" space +next-step-0-tool-calls-item-0-arguments-a-kv ::= "\"a\"" space ":" space integer +next-step-0-tool-calls-item-0-arguments-b-kv ::= "\"b\"" space ":" space integer +next-step-0-tool-calls-item-0-arguments-kv ::= "\"arguments\"" space ":" space next-step-0-tool-calls-item-0-arguments +next-step-0-tool-calls-item-0-name ::= "\"add\"" +next-step-0-tool-calls-item-0-name-kv ::= "\"name\"" space ":" space next-step-0-tool-calls-item-0-name +next-step-0-tool-calls-item-1 ::= "{" space next-step-0-tool-calls-item-1-name-kv "," space next-step-0-tool-calls-item-1-arguments-kv "}" space +next-step-0-tool-calls-item-1-arguments ::= "{" space next-step-0-tool-calls-item-1-arguments-text-kv "}" space +next-step-0-tool-calls-item-1-arguments-kv ::= "\"arguments\"" space ":" space next-step-0-tool-calls-item-1-arguments +next-step-0-tool-calls-item-1-arguments-text-kv ::= "\"text\"" space ":" space string +next-step-0-tool-calls-item-1-name ::= "\"say\"" +next-step-0-tool-calls-item-1-name-kv ::= "\"name\"" space ":" space next-step-0-tool-calls-item-1-name +next-step-0-tool-calls-kv ::= "\"tool_calls\"" space ":" space next-step-0-tool-calls +next-step-1 ::= "{" space next-step-1-result-kv "}" space +next-step-1-result-kv ::= "\"result\"" space ":" space integer +next-step-kv ::= "\"next_step\"" space ":" space next-step +root ::= "{" space thought-about-next-step-only-kv "," space next-step-kv "}" space +space ::= " "? +string ::= "\"" ( + [^"\\] | + "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]) + )* "\"" space +thought-about-next-step-only-kv ::= "\"thought_about_next_step_only\"" space ":" space string +``` + + +### without tools + + +Prompt: + +```json +Please respond in JSON format with the following schema: { + "type": "integer" +} +``` + + +Grammar: + +```js +decimal-part ::= [0-9] [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? +integral-part ::= [0-9] | [1-9] [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? +root ::= ("-"? integral-part) space +space ::= " "? +``` + + +## ToolsPromptStyle.TOOLS_HERMES_2_PRO + + +### with tools + + +Prompt: + +```json +You are a function calling AI agent with self-recursion. You can call only one function at a time and analyse data you get from function response. You are provided with function signatures within XML tags. The current date is: 2024-03-29. You may use agentic frameworks for reasoning and planning to help with user query. Please call a function and wait for function results to be provided to you in the next iteration. Don't make assumptions about what values to plug into function arguments. Once you have called a function, results will be fed back to you within XML tags. Don't make assumptions about tool results if XML tags are not present since function hasn't been executed yet. Analyze the data once you get the results and call another function. At each iteration please continue adding the your analysis to previous summary. Your final response should directly answer the user query with an anlysis or summary of the results of function calls. 
Here are the available tools: ['{"type":"function","function":{"name":"add","description":"Adds two numbers","parameters":{"properties":{"a":{"type":"integer"},"b":{"type":"integer"}},"required":["a","b"]}}}', '{"type":"function","function":{"name":"say","description":"Says something out loud (TTS)","parameters":{"properties":{"text":{"description":"The text to say out loud","type":"string"}},"required":["text"]}}}'] If the provided function signatures doesn't have the function you must call, you may write executable python code in markdown syntax and call code_interpreter() function as follows: {"arguments": {"code_markdown": , "name": "code_interpreter"}} Make sure that the json object above with code markdown block is parseable with json.loads() and the XML block with XML ElementTree. Use the following pydantic model json schema for each tool call you will make: {'properties': {'arguments': {'title': 'Arguments', 'type': 'object'}, 'name': {'title': 'Name', 'type': 'string'}}, 'required': ['arguments', 'name'], 'title': 'FunctionCall', 'type': 'object'} At the very first turn you don't have so you shouldn't not make up the results. +Please keep a running summary with analysis of previous function results and summaries from previous iterations. +Do not stop calling functions until the task has been accomplished or you've reached max iteration of 10. +Calling multiple functions at once can overload the system and increase cost so call one function at a time please. +If you plan to continue with analysis, always call another function. +For each function call return a valid json object (using doulbe quotes) with function name and arguments within XML tags as follows: + +{"arguments": , "name": } + + +``` + + +Grammar: + +```js +add-tool-call ::= "{" space add-tool-call-name-kv "," space add-tool-call-arguments-kv "}" space +add-tool-call-arguments ::= "{" space add-tool-call-arguments-a-kv "," space add-tool-call-arguments-b-kv "}" space +add-tool-call-arguments-a-kv ::= "\"a\"" space ":" space integer +add-tool-call-arguments-b-kv ::= "\"b\"" space ":" space integer +add-tool-call-arguments-kv ::= "\"arguments\"" space ":" space add-tool-call-arguments +add-tool-call-name ::= "\"add\"" +add-tool-call-name-kv ::= "\"name\"" space ":" space add-tool-call-name +content ::= [^<] | "<" [^t<] | "" +``` + + +### without tools + + +Prompt: + +```json +Please respond in JSON format with the following schema: { + "type": "integer" +} +``` + + +Grammar: + +```js +decimal-part ::= [0-9] [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? +integral-part ::= [0-9] | [1-9] [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? +root ::= ("-"? integral-part) space +space ::= " "? 
+``` + + +# llama2 + + +Template: + +```js +{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\n' + system_message + '\n<>\n\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %} +``` + + +Prompt: + +```js +[INST] What is the sum of 2535 squared and 32222000403 then multiplied by one and a half. What's a third of the result? [/INST] ? +{"id": "call_531873", "type": "function", "function": {"name": "add", "arguments": {"a": 2535, "b": 32222000403}}} [INST] [TOOL RESULT(name=add, id=call_531873]32222002938[/TOOL RESULT] [/INST] +``` + + +## ToolsPromptStyle.TOOLS_SHORT + + +### with tools + + +Prompt: + +```json +Here are the tools available: + +{ + "type": "function", + "function": { + "name": "add", + "description": "Adds two numbers", + "parameters": { + "properties": { + "a": { + "type": "integer" + }, + "b": { + "type": "integer" + } + }, + "required": [ + "a", + "b" + ] + } + } +} +{ + "type": "function", + "function": { + "name": "say", + "description": "Says something out loud (TTS)", + "parameters": { + "properties": { + "text": { + "description": "The text to say out loud", + "type": "string" + } + }, + "required": [ + "text" + ] + } + } +} + +``` + + +Grammar: + +```js +add-tool-call ::= "{" space add-tool-call-name-kv "," space add-tool-call-arguments-kv "}" space +add-tool-call-arguments ::= "{" space add-tool-call-arguments-a-kv "," space add-tool-call-arguments-b-kv "}" space +add-tool-call-arguments-a-kv ::= "\"a\"" space ":" space integer +add-tool-call-arguments-b-kv ::= "\"b\"" space ":" space integer +add-tool-call-arguments-kv ::= "\"arguments\"" space ":" space add-tool-call-arguments +add-tool-call-name ::= "\"add\"" +add-tool-call-name-kv ::= "\"name\"" space ":" space add-tool-call-name +content ::= [^<] | "<" [^t<] | "" +``` + + +### without tools + + +Prompt: + +```json +Please respond in JSON format with the following schema: { + "type": "integer" +} +``` + + +Grammar: + +```js +decimal-part ::= [0-9] [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? +integral-part ::= [0-9] | [1-9] [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? +root ::= ("-"? integral-part) space +space ::= " "? +``` + + +## ToolsPromptStyle.TOOLS_LONG + + +### with tools + + +Prompt: + +```json +Call one or more functions to assist with the user query, every time this is possible. Don't make assumptions about what values to plug into functions. 
Here are the available tools: + +{ + "type": "function", + "function": { + "name": "add", + "description": "Adds two numbers", + "parameters": { + "properties": { + "a": { + "type": "integer" + }, + "b": { + "type": "integer" + } + }, + "required": [ + "a", + "b" + ] + } + } +} +{ + "type": "function", + "function": { + "name": "say", + "description": "Says something out loud (TTS)", + "parameters": { + "properties": { + "text": { + "description": "The text to say out loud", + "type": "string" + } + }, + "required": [ + "text" + ] + } + } +} + + +To call each function, give its name and arguments within XML tags as follows: + +{"name": , "arguments": } + +``` + + +Grammar: + +```js +add-tool-call ::= "{" space add-tool-call-name-kv "," space add-tool-call-arguments-kv "}" space +add-tool-call-arguments ::= "{" space add-tool-call-arguments-a-kv "," space add-tool-call-arguments-b-kv "}" space +add-tool-call-arguments-a-kv ::= "\"a\"" space ":" space integer +add-tool-call-arguments-b-kv ::= "\"b\"" space ":" space integer +add-tool-call-arguments-kv ::= "\"arguments\"" space ":" space add-tool-call-arguments +add-tool-call-name ::= "\"add\"" +add-tool-call-name-kv ::= "\"name\"" space ":" space add-tool-call-name +content ::= [^<] | "<" [^t<] | "" +``` + + +### without tools + + +Prompt: + +```json +Please respond in JSON format with the following schema: { + "type": "integer" +} +``` + + +Grammar: + +```js +decimal-part ::= [0-9] [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? +integral-part ::= [0-9] | [1-9] [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? +root ::= ("-"? integral-part) space +space ::= " "? +``` + + +## ToolsPromptStyle.TOOLS_THOUGHTFUL_STEPS + + +### with tools + + +Prompt: + +```json +You are a function calling AI model. +Here are the tools available: +{ + "type": "function", + "function": { + "name": "add", + "description": "Adds two numbers", + "parameters": { + "properties": { + "a": { + "type": "integer" + }, + "b": { + "type": "integer" + } + }, + "required": [ + "a", + "b" + ] + } + } +} +{ + "type": "function", + "function": { + "name": "say", + "description": "Says something out loud (TTS)", + "parameters": { + "properties": { + "text": { + "description": "The text to say out loud", + "type": "string" + } + }, + "required": [ + "text" + ] + } + } +} +Please respond in JSON format with the following schema: { + "type": "object", + "properties": { + "thought_about_next_step_only": { + "title": "Thought about next step", + "type": "string" + }, + "next_step": { + "title": "Next Step: either a result or one or more tool calls to achieve the original goal", + "oneOf": [ + { + "properties": { + "tool_calls": { + "prefixItems": { + "properties": { + "name": { + "title": "Name of the tool to call", + "type": "string" + }, + "arguments": { + "title": "Arguments to pass to the tool", + "type": "object" + } + }, + "required": [ + "name", + "arguments" + ] + } + } + }, + "required": [ + "tool_calls" + ] + }, + { + "title": "Result (achieving original goal)", + "properties": { + "result": { + "type": "integer" + } + }, + "required": [ + "result" + ] + } + ] + } + }, + "required": [ + "original_goal", + "thought_about_next_step_only", + "next_step" + ] +} +``` + + +Grammar: + +```js +decimal-part ::= [0-9] [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? +integer ::= ("-"? integral-part) space +integral-part ::= [0-9] | [1-9] [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? 
+next-step ::= next-step-0 | next-step-1 +next-step-0 ::= "{" space next-step-0-tool-calls-kv "}" space +next-step-0-tool-calls ::= "[" space ( next-step-0-tool-calls-item ( "," space next-step-0-tool-calls-item )* )? "]" space +next-step-0-tool-calls-item ::= next-step-0-tool-calls-item-0 | next-step-0-tool-calls-item-1 +next-step-0-tool-calls-item-0 ::= "{" space next-step-0-tool-calls-item-0-name-kv "," space next-step-0-tool-calls-item-0-arguments-kv "}" space +next-step-0-tool-calls-item-0-arguments ::= "{" space next-step-0-tool-calls-item-0-arguments-a-kv "," space next-step-0-tool-calls-item-0-arguments-b-kv "}" space +next-step-0-tool-calls-item-0-arguments-a-kv ::= "\"a\"" space ":" space integer +next-step-0-tool-calls-item-0-arguments-b-kv ::= "\"b\"" space ":" space integer +next-step-0-tool-calls-item-0-arguments-kv ::= "\"arguments\"" space ":" space next-step-0-tool-calls-item-0-arguments +next-step-0-tool-calls-item-0-name ::= "\"add\"" +next-step-0-tool-calls-item-0-name-kv ::= "\"name\"" space ":" space next-step-0-tool-calls-item-0-name +next-step-0-tool-calls-item-1 ::= "{" space next-step-0-tool-calls-item-1-name-kv "," space next-step-0-tool-calls-item-1-arguments-kv "}" space +next-step-0-tool-calls-item-1-arguments ::= "{" space next-step-0-tool-calls-item-1-arguments-text-kv "}" space +next-step-0-tool-calls-item-1-arguments-kv ::= "\"arguments\"" space ":" space next-step-0-tool-calls-item-1-arguments +next-step-0-tool-calls-item-1-arguments-text-kv ::= "\"text\"" space ":" space string +next-step-0-tool-calls-item-1-name ::= "\"say\"" +next-step-0-tool-calls-item-1-name-kv ::= "\"name\"" space ":" space next-step-0-tool-calls-item-1-name +next-step-0-tool-calls-kv ::= "\"tool_calls\"" space ":" space next-step-0-tool-calls +next-step-1 ::= "{" space next-step-1-result-kv "}" space +next-step-1-result-kv ::= "\"result\"" space ":" space integer +next-step-kv ::= "\"next_step\"" space ":" space next-step +root ::= "{" space thought-about-next-step-only-kv "," space next-step-kv "}" space +space ::= " "? +string ::= "\"" ( + [^"\\] | + "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]) + )* "\"" space +thought-about-next-step-only-kv ::= "\"thought_about_next_step_only\"" space ":" space string +``` + + +### without tools + + +Prompt: + +```json +Please respond in JSON format with the following schema: { + "type": "integer" +} +``` + + +Grammar: + +```js +decimal-part ::= [0-9] [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? +integral-part ::= [0-9] | [1-9] [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? +root ::= ("-"? integral-part) space +space ::= " "? +``` + + +## ToolsPromptStyle.TOOLS_HERMES_2_PRO + + +### with tools + + +Prompt: + +```json +You are a function calling AI agent with self-recursion. You can call only one function at a time and analyse data you get from function response. You are provided with function signatures within XML tags. The current date is: 2024-03-29. You may use agentic frameworks for reasoning and planning to help with user query. Please call a function and wait for function results to be provided to you in the next iteration. Don't make assumptions about what values to plug into function arguments. Once you have called a function, results will be fed back to you within XML tags. Don't make assumptions about tool results if XML tags are not present since function hasn't been executed yet. Analyze the data once you get the results and call another function. 
At each iteration please continue adding the your analysis to previous summary. Your final response should directly answer the user query with an anlysis or summary of the results of function calls. Here are the available tools: ['{"type":"function","function":{"name":"add","description":"Adds two numbers","parameters":{"properties":{"a":{"type":"integer"},"b":{"type":"integer"}},"required":["a","b"]}}}', '{"type":"function","function":{"name":"say","description":"Says something out loud (TTS)","parameters":{"properties":{"text":{"description":"The text to say out loud","type":"string"}},"required":["text"]}}}'] If the provided function signatures doesn't have the function you must call, you may write executable python code in markdown syntax and call code_interpreter() function as follows: {"arguments": {"code_markdown": , "name": "code_interpreter"}} Make sure that the json object above with code markdown block is parseable with json.loads() and the XML block with XML ElementTree. Use the following pydantic model json schema for each tool call you will make: {'properties': {'arguments': {'title': 'Arguments', 'type': 'object'}, 'name': {'title': 'Name', 'type': 'string'}}, 'required': ['arguments', 'name'], 'title': 'FunctionCall', 'type': 'object'} At the very first turn you don't have so you shouldn't not make up the results. +Please keep a running summary with analysis of previous function results and summaries from previous iterations. +Do not stop calling functions until the task has been accomplished or you've reached max iteration of 10. +Calling multiple functions at once can overload the system and increase cost so call one function at a time please. +If you plan to continue with analysis, always call another function. +For each function call return a valid json object (using doulbe quotes) with function name and arguments within XML tags as follows: + +{"arguments": , "name": } + + +``` + + +Grammar: + +```js +add-tool-call ::= "{" space add-tool-call-name-kv "," space add-tool-call-arguments-kv "}" space +add-tool-call-arguments ::= "{" space add-tool-call-arguments-a-kv "," space add-tool-call-arguments-b-kv "}" space +add-tool-call-arguments-a-kv ::= "\"a\"" space ":" space integer +add-tool-call-arguments-b-kv ::= "\"b\"" space ":" space integer +add-tool-call-arguments-kv ::= "\"arguments\"" space ":" space add-tool-call-arguments +add-tool-call-name ::= "\"add\"" +add-tool-call-name-kv ::= "\"name\"" space ":" space add-tool-call-name +content ::= [^<] | "<" [^t<] | "" +``` + + +### without tools + + +Prompt: + +```json +Please respond in JSON format with the following schema: { + "type": "integer" +} +``` + + +Grammar: + +```js +decimal-part ::= [0-9] [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? +integral-part ::= [0-9] | [1-9] [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? +root ::= ("-"? integral-part) space +space ::= " "? 
+``` + diff --git a/examples/agent/test_chat_handlers.py b/examples/agent/test_chat_handlers.py new file mode 100644 index 0000000000000..f1b212f0e6b61 --- /dev/null +++ b/examples/agent/test_chat_handlers.py @@ -0,0 +1,199 @@ +# +# +# python -m examples.agent.test_chat_handlers | tee examples/agent/test_chat_handlers.md + +import json +from pathlib import Path +import typer +from typing import Annotated + +from examples.openai.api import ChatCompletionRequest, ChatCompletionResponse, Message, Tool, ToolFunction +from examples.openai.gguf_kvs import GGUFKeyValues, Keys +from examples.openai.prompting import ChatHandlerArgs, ChatTemplate, ToolsPromptStyle, get_chat_handler + + + +TEST_MESSAGES = [ + Message(**{ + "role": "user", + "name": None, + "tool_call_id": None, + "content": "What is the sum of 2535 squared and 32222000403 then multiplied by one and a half. What's a third of the result?", + "tool_calls": None + }), + Message(**{ + "role": "assistant", + "name": None, + "tool_call_id": None, + "content": "?", + "tool_calls": [ + { + "id": "call_531873", + "type": "function", + "function": { + "name": "add", + "arguments": { + "a": 2535, + "b": 32222000403 + } + } + } + ] + }), + Message(**{ + "role": "tool", + "name": "add", + "tool_call_id": "call_531873", + "content": "32222002938", + "tool_calls": None + }) +] + +TEST_TOOLS = [ + Tool( + type="function", + function=ToolFunction( + name="add", + description="Adds two numbers", + parameters={ + "properties": { + "a": {"type": "integer"}, + "b": {"type": "integer"}, + }, + "required": ["a", "b"] + } + ) + ), + Tool( + type="function", + function=ToolFunction( + name="say", + description="Says something out loud (TTS)", + parameters={ + "properties": { + "text": { + "description": "The text to say out loud", + "type": "string" + }, + }, + "required": ["text"] + } + ) + ) +] + +TEST_OUTPUT_SCHEMA = {"type": "integer"} + +if __name__ == "__main__": + + # chat_templates = { + # 'mistral_instruct_v0_1': ChatTemplate.from_huggingface("mistralai/Mixtral-8x7B-Instruct-v0.1"), + # 'functionary_v2_2': ChatTemplate.from_huggingface("meetkai/functionary-small-v2.2"), + # 'hermes_2_pro_mistral': ChatTemplate.from_huggingface("NousResearch/Hermes-2-Pro-Mistral-7B"), + # 'llama2': ChatTemplate.from_huggingface("meta-llama/Llama-2-7b-chat-hf"), + # } + # print(json.dumps({k: v.model_dump() for k, v in chat_templates.items()}, indent=2)) + # exit(0) + + chat_templates = { + "mistral_instruct_v0_1": { + "template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}", + "eos_token": "", + "bos_token": "" + }, + "functionary_v2_2": { + "template": "{#v2.2#}\n{% for message in messages %}\n{% if message['role'] == 'user' or message['role'] == 'system' %}\n{{ '<|from|>' + message['role'] + '\n<|recipient|>all\n<|content|>' + message['content'] + '\n' }}{% elif message['role'] == 'tool' %}\n{{ '<|from|>' + message['name'] + '\n<|recipient|>all\n<|content|>' + message['content'] + '\n' }}{% else %}\n{% set contain_content='no'%}\n{% if message['content'] is not none %}\n{{ '<|from|>assistant\n<|recipient|>all\n<|content|>' + message['content'] 
}}{% set contain_content='yes'%}\n{% endif %}\n{% if 'tool_calls' in message and message['tool_calls'] is not none %}\n{% for tool_call in message['tool_calls'] %}\n{% set prompt='<|from|>assistant\n<|recipient|>' + tool_call['function']['name'] + '\n<|content|>' + tool_call['function']['arguments'] %}\n{% if loop.index == 1 and contain_content == \"no\" %}\n{{ prompt }}{% else %}\n{{ '\n' + prompt}}{% endif %}\n{% endfor %}\n{% endif %}\n{{ '<|stop|>\n' }}{% endif %}\n{% endfor %}\n{% if add_generation_prompt %}{{ '<|from|>assistant\n<|recipient|>' }}{% endif %}", + "eos_token": "", + "bos_token": "" + }, + "hermes_2_pro_mistral": { + "template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}", + "eos_token": "<|im_end|>", + "bos_token": "" + }, + "llama2": { + "template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}", + "eos_token": "", + "bos_token": "" + }, + } + chat_templates = {k: ChatTemplate(**v) for k, v in chat_templates.items()} + + print(f'\nMessages:\n\n```js\n{json.dumps([m.model_dump() for m in TEST_MESSAGES], indent=2)}\n```\n') + + for model_name, chat_template in chat_templates.items(): + print(f"\n# {model_name}\n") + print(f'\nTemplate:\n\n```js\n{chat_template.template}\n```\n') + + print(f'\nPrompt:\n\n```js\n{chat_template.render(TEST_MESSAGES, add_generation_prompt=True)}\n```\n') + + argss = { + "with tools": ChatHandlerArgs( + chat_template=chat_template, #ChatTemplate.from_gguf(GGUFKeyValues(model)), + response_schema=TEST_OUTPUT_SCHEMA, + tools=TEST_TOOLS, + ), + "without tools": ChatHandlerArgs( + chat_template=chat_template, #ChatTemplate.from_gguf(GGUFKeyValues(model)), + response_schema=TEST_OUTPUT_SCHEMA, + tools=[], + ), + } + + for style in ToolsPromptStyle: + if (style == ToolsPromptStyle.TYPESCRIPT_FUNCTIONARY_V2) != (model_name.startswith("functionary")): + continue + + if style == ToolsPromptStyle.TOOLS_MIXTRAL and model_name != "mistral_instruct_v0_1": + continue + + if model_name == "mistral_instruct_v0_1" and style not in (ToolsPromptStyle.TOOLS_THOUGHTFUL_STEPS, ToolsPromptStyle.TOOLS_MIXTRAL): + continue + + print(f'\n## {style}\n') + + for tn, args in argss.items(): + ch = get_chat_handler(args, parallel_calls=True, tool_style=style) + + print(f'\n### {tn}\n') + + print(f'\nPrompt:\n\n```json\n{ch.output_format_prompt.content}\n```\n') + + print(f'\nGrammar:\n\n```js\n{ch.grammar}\n```\n') + + + # test_templates([ + # Message(**{ + # "role": "user", + # "name": None, + # "tool_call_id": None, + # "content": "What is the sum of 2535 squared and 32222000403 then multiplied by one and a half. 
What's a third of the result?", + # "tool_calls": None + # }), + # Message(**{ + # "role": "assistant", + # # "name": None, + # "tool_call_id": None, + # "content": "?", + # "tool_calls": [ + # { + # # "id": "call_531873", + # "type": "function", + # "function": { + # "name": "add", + # "arguments": { + # "a": 2535, + # "b": 32222000403 + # } + # } + # } + # ] + # }), + # Message(**{ + # "role": "tool", + # "name": "add", + # "tool_call_id": "call_531873", + # "content": "32222002938", + # "tool_calls": None + # }) + # ]) diff --git a/examples/openai/prompting.py b/examples/openai/prompting.py index 1ec2a11e322bf..d2a0cd1292844 100644 --- a/examples/openai/prompting.py +++ b/examples/openai/prompting.py @@ -53,14 +53,13 @@ def raise_exception(msg: str): class ChatTemplate(BaseModel): template: str inferred_tool_style: Optional['ToolsPromptStyle'] = None + eos_token: str + bos_token: str def __init__(self, template: str, eos_token: str, bos_token: str): - super().__init__(template=template - ) + super().__init__(template=template, eos_token=eos_token, bos_token=bos_token) env = jinja2.Environment(loader=jinja2.BaseLoader(), trim_blocks=True, lstrip_blocks=True) self._template = env.from_string(template) - self._eos_token = eos_token - self._bos_token = bos_token self._strict_user_assistant_alternation = "{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception" in template @@ -93,9 +92,6 @@ def strip_suffix(self, s: str) -> str: sys.stderr.write(f"Expected suffix ({self._suffix}) not found: {s}\n") return s - def __str__(self): - return f"ChatTemplate(template={self.template}, eos_token={self._eos_token}, bos_token={self._bos_token})" - def add_system_prompt(self, messages: list[Message], system_prompt: Message) -> list[Message]: assert system_prompt.role == "system" # TODO: add to last system message, or create a new one just before the last user message @@ -194,8 +190,8 @@ def flush(): result = self._template.render( messages=messages, - eos_token=self._eos_token, - bos_token='' if omit_bos else self._bos_token, + eos_token=self.eos_token, + bos_token='' if omit_bos else self.bos_token, raise_exception=raise_exception, add_generation_prompt=add_generation_prompt, ) @@ -339,7 +335,7 @@ def __init__(self, args: ChatHandlerArgs, parallel_calls: bool): except ImportError: raise ImportError(f"Please `git clone https://github.com/NousResearch/Hermes-Function-Calling {path}`") - prompt = PromptManager().generate_prompt(user_prompt=[], tools=[json.dumps(tool) for tool in args.tools]) + prompt = PromptManager().generate_prompt(user_prompt=[], tools=[tool.model_dump_json() for tool in args.tools]) assert len(prompt) == 1 and prompt[0]["role"] == "system" self.output_format_prompt = Message(**prompt[0]) @@ -347,9 +343,6 @@ class FunctionaryToolsChatHandler(ChatHandler): def __init__(self, args: ChatHandlerArgs, parallel_calls: bool): super().__init__(args) - # Only allowing a single tool call at a time for now. 
- # Note that if there were more, they'd be separated by a '<|from|>assistant' literal - self.output_format_prompt = Message( role="system", content= '// Supported function definitions that should be called when necessary.\n' + @@ -585,19 +578,19 @@ def get_chat_handler(args: ChatHandlerArgs, parallel_calls: bool, tool_style: Op return TemplatedToolsChatHandler(args, _LONG_TEMPLATE, parallel_calls=parallel_calls, escapes_underscores=True) elif tool_style == ToolsPromptStyle.TOOLS_HERMES_2_PRO: - return Hermes2ProToolsChatHandler(args) + return Hermes2ProToolsChatHandler(args, parallel_calls=parallel_calls) else: raise ValueError(f"Unsupported tool call style: {args.chat_template.tool_style}") _ts_converter = SchemaToTypeScriptConverter() def _please_respond_with_schema(schema: dict) -> str: - # sig = json.dumps(schema, indent=2) - sig = _ts_converter.visit(schema) + sig = json.dumps(schema, indent=2) + # sig = _ts_converter.visit(schema) return f'Please respond in JSON format with the following schema: {sig}' def _tools_typescript_signatures(tools: list[Tool]) -> str: - return 'namespace functions {' + '\n'.join( + return 'namespace functions {\n' + '\n'.join( '// ' + tool.function.description.replace('\n', '\n// ') + '\n' + '' 'type ' + tool.function.name + ' = (_: ' + _ts_converter.visit(tool.function.parameters) + ") => any;\n" for tool in tools diff --git a/examples/openai/ts_converter.py b/examples/openai/ts_converter.py index 108c1482e19c7..7ba5c439f86d9 100644 --- a/examples/openai/ts_converter.py +++ b/examples/openai/ts_converter.py @@ -24,13 +24,13 @@ def _build_object_rule(self, properties: List[Tuple[str, Any]], required: Set[st elif additional_properties == False: additional_properties = None - return "{" + ', '.join([ + return "{\n" + ',\n'.join([ f'{self._desc_comment(prop_schema)}{prop_name}{"" if prop_name in required else "?"}: {self.visit(prop_schema)}' for prop_name, prop_schema in properties ] + ( [f"{self._desc_comment(additional_properties) if additional_properties else ''}[key: string]: {self.visit(additional_properties)}"] if additional_properties is not None else [] - )) + "}" + )) + "\n}" def visit(self, schema: dict): def print_constant(v): From d8a53eadf231f7c9e716b4b24f805ca97a55106d Mon Sep 17 00:00:00 2001 From: ochafik Date: Sat, 30 Mar 2024 01:00:07 +0000 Subject: [PATCH 29/68] openai: test features of templates at runtime, to make sure no bits of intel are lost --- examples/agent/test_chat_handlers.py | 199 ------- examples/openai/prompting.py | 165 ++++-- .../{agent => openai}/test_chat_handlers.md | 491 +++++++++++------- examples/openai/test_chat_handlers.py | 251 +++++++++ 4 files changed, 688 insertions(+), 418 deletions(-) delete mode 100644 examples/agent/test_chat_handlers.py rename examples/{agent => openai}/test_chat_handlers.md (60%) create mode 100644 examples/openai/test_chat_handlers.py diff --git a/examples/agent/test_chat_handlers.py b/examples/agent/test_chat_handlers.py deleted file mode 100644 index f1b212f0e6b61..0000000000000 --- a/examples/agent/test_chat_handlers.py +++ /dev/null @@ -1,199 +0,0 @@ -# -# -# python -m examples.agent.test_chat_handlers | tee examples/agent/test_chat_handlers.md - -import json -from pathlib import Path -import typer -from typing import Annotated - -from examples.openai.api import ChatCompletionRequest, ChatCompletionResponse, Message, Tool, ToolFunction -from examples.openai.gguf_kvs import GGUFKeyValues, Keys -from examples.openai.prompting import ChatHandlerArgs, ChatTemplate, ToolsPromptStyle, 
get_chat_handler - - - -TEST_MESSAGES = [ - Message(**{ - "role": "user", - "name": None, - "tool_call_id": None, - "content": "What is the sum of 2535 squared and 32222000403 then multiplied by one and a half. What's a third of the result?", - "tool_calls": None - }), - Message(**{ - "role": "assistant", - "name": None, - "tool_call_id": None, - "content": "?", - "tool_calls": [ - { - "id": "call_531873", - "type": "function", - "function": { - "name": "add", - "arguments": { - "a": 2535, - "b": 32222000403 - } - } - } - ] - }), - Message(**{ - "role": "tool", - "name": "add", - "tool_call_id": "call_531873", - "content": "32222002938", - "tool_calls": None - }) -] - -TEST_TOOLS = [ - Tool( - type="function", - function=ToolFunction( - name="add", - description="Adds two numbers", - parameters={ - "properties": { - "a": {"type": "integer"}, - "b": {"type": "integer"}, - }, - "required": ["a", "b"] - } - ) - ), - Tool( - type="function", - function=ToolFunction( - name="say", - description="Says something out loud (TTS)", - parameters={ - "properties": { - "text": { - "description": "The text to say out loud", - "type": "string" - }, - }, - "required": ["text"] - } - ) - ) -] - -TEST_OUTPUT_SCHEMA = {"type": "integer"} - -if __name__ == "__main__": - - # chat_templates = { - # 'mistral_instruct_v0_1': ChatTemplate.from_huggingface("mistralai/Mixtral-8x7B-Instruct-v0.1"), - # 'functionary_v2_2': ChatTemplate.from_huggingface("meetkai/functionary-small-v2.2"), - # 'hermes_2_pro_mistral': ChatTemplate.from_huggingface("NousResearch/Hermes-2-Pro-Mistral-7B"), - # 'llama2': ChatTemplate.from_huggingface("meta-llama/Llama-2-7b-chat-hf"), - # } - # print(json.dumps({k: v.model_dump() for k, v in chat_templates.items()}, indent=2)) - # exit(0) - - chat_templates = { - "mistral_instruct_v0_1": { - "template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}", - "eos_token": "", - "bos_token": "" - }, - "functionary_v2_2": { - "template": "{#v2.2#}\n{% for message in messages %}\n{% if message['role'] == 'user' or message['role'] == 'system' %}\n{{ '<|from|>' + message['role'] + '\n<|recipient|>all\n<|content|>' + message['content'] + '\n' }}{% elif message['role'] == 'tool' %}\n{{ '<|from|>' + message['name'] + '\n<|recipient|>all\n<|content|>' + message['content'] + '\n' }}{% else %}\n{% set contain_content='no'%}\n{% if message['content'] is not none %}\n{{ '<|from|>assistant\n<|recipient|>all\n<|content|>' + message['content'] }}{% set contain_content='yes'%}\n{% endif %}\n{% if 'tool_calls' in message and message['tool_calls'] is not none %}\n{% for tool_call in message['tool_calls'] %}\n{% set prompt='<|from|>assistant\n<|recipient|>' + tool_call['function']['name'] + '\n<|content|>' + tool_call['function']['arguments'] %}\n{% if loop.index == 1 and contain_content == \"no\" %}\n{{ prompt }}{% else %}\n{{ '\n' + prompt}}{% endif %}\n{% endfor %}\n{% endif %}\n{{ '<|stop|>\n' }}{% endif %}\n{% endfor %}\n{% if add_generation_prompt %}{{ '<|from|>assistant\n<|recipient|>' }}{% endif %}", - "eos_token": "", - "bos_token": "" - }, - "hermes_2_pro_mistral": { - "template": "{% for 
message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}", - "eos_token": "<|im_end|>", - "bos_token": "" - }, - "llama2": { - "template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}", - "eos_token": "", - "bos_token": "" - }, - } - chat_templates = {k: ChatTemplate(**v) for k, v in chat_templates.items()} - - print(f'\nMessages:\n\n```js\n{json.dumps([m.model_dump() for m in TEST_MESSAGES], indent=2)}\n```\n') - - for model_name, chat_template in chat_templates.items(): - print(f"\n# {model_name}\n") - print(f'\nTemplate:\n\n```js\n{chat_template.template}\n```\n') - - print(f'\nPrompt:\n\n```js\n{chat_template.render(TEST_MESSAGES, add_generation_prompt=True)}\n```\n') - - argss = { - "with tools": ChatHandlerArgs( - chat_template=chat_template, #ChatTemplate.from_gguf(GGUFKeyValues(model)), - response_schema=TEST_OUTPUT_SCHEMA, - tools=TEST_TOOLS, - ), - "without tools": ChatHandlerArgs( - chat_template=chat_template, #ChatTemplate.from_gguf(GGUFKeyValues(model)), - response_schema=TEST_OUTPUT_SCHEMA, - tools=[], - ), - } - - for style in ToolsPromptStyle: - if (style == ToolsPromptStyle.TYPESCRIPT_FUNCTIONARY_V2) != (model_name.startswith("functionary")): - continue - - if style == ToolsPromptStyle.TOOLS_MIXTRAL and model_name != "mistral_instruct_v0_1": - continue - - if model_name == "mistral_instruct_v0_1" and style not in (ToolsPromptStyle.TOOLS_THOUGHTFUL_STEPS, ToolsPromptStyle.TOOLS_MIXTRAL): - continue - - print(f'\n## {style}\n') - - for tn, args in argss.items(): - ch = get_chat_handler(args, parallel_calls=True, tool_style=style) - - print(f'\n### {tn}\n') - - print(f'\nPrompt:\n\n```json\n{ch.output_format_prompt.content}\n```\n') - - print(f'\nGrammar:\n\n```js\n{ch.grammar}\n```\n') - - - # test_templates([ - # Message(**{ - # "role": "user", - # "name": None, - # "tool_call_id": None, - # "content": "What is the sum of 2535 squared and 32222000403 then multiplied by one and a half. 
What's a third of the result?", - # "tool_calls": None - # }), - # Message(**{ - # "role": "assistant", - # # "name": None, - # "tool_call_id": None, - # "content": "?", - # "tool_calls": [ - # { - # # "id": "call_531873", - # "type": "function", - # "function": { - # "name": "add", - # "arguments": { - # "a": 2535, - # "b": 32222000403 - # } - # } - # } - # ] - # }), - # Message(**{ - # "role": "tool", - # "name": "add", - # "tool_call_id": "call_531873", - # "content": "32222002938", - # "tool_calls": None - # }) - # ]) diff --git a/examples/openai/prompting.py b/examples/openai/prompting.py index d2a0cd1292844..ab7aaa25ff55c 100644 --- a/examples/openai/prompting.py +++ b/examples/openai/prompting.py @@ -6,8 +6,8 @@ import random import re import sys -from typing import Optional -from pydantic import BaseModel +from typing import Annotated, Optional +from pydantic import BaseModel, Field from examples.json_schema_to_grammar import SchemaConverter from examples.openai.api import Tool, Message, FunctionCall, ToolCall @@ -52,16 +52,31 @@ def raise_exception(msg: str): class ChatTemplate(BaseModel): template: str - inferred_tool_style: Optional['ToolsPromptStyle'] = None eos_token: str bos_token: str + + inferred_tool_style: Annotated[Optional['ToolsPromptStyle'], Field(exclude=True)] = None + expects_stringified_function_arguments: Annotated[Optional[bool], Field(exclude=True)] = None + expects_strict_user_assistant_alternance: Annotated[Optional[bool], Field(exclude=True)] = None + formats_tool_call: Annotated[Optional[bool], Field(exclude=True)] = None + formats_tool_call_content: Annotated[Optional[bool], Field(exclude=True)] = None + formats_tool_result: Optional[bool] = None + formats_tool_name: Optional[bool] = None + + @property + def potentially_supports_parallel_calls(self) -> bool: + return self.formats_tool_result and self.formats_tool_name def __init__(self, template: str, eos_token: str, bos_token: str): super().__init__(template=template, eos_token=eos_token, bos_token=bos_token) env = jinja2.Environment(loader=jinja2.BaseLoader(), trim_blocks=True, lstrip_blocks=True) self._template = env.from_string(template) + print(template) - self._strict_user_assistant_alternation = "{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception" in template + # self.expects_strict_user_assistant_alternance = "{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception" in template + + self.probe_template_capabilities() + self.extract_prefix_suffix_from_template() if "<|recipient|>' + tool_call['function']['name']" in template: self.inferred_tool_style = ToolsPromptStyle.TYPESCRIPT_FUNCTIONARY_V2 @@ -71,7 +86,50 @@ def __init__(self, template: str, eos_token: str, bos_token: str): # self.inferred_tool_style = ToolsPromptStyle.TOOLS_HERMES_2_PRO # self.inferred_tool_style = ToolsPromptStyle.TOOLS_MIXTRAL - # TODO: Test whether the template supports formatting tool_calls + def probe_template_capabilities(self): + + def test(messages: list[Message]): + return self._template.render(messages=messages, eos_token=self.eos_token, bos_token=self.bos_token, raise_exception=raise_exception, add_generation_prompt=True) + + def succeeds(messages: list[Message], strings_to_find = ()): + try: + result = test(messages) + print(result) + for s in strings_to_find: + if s not in result: + return False + return True + except Exception as e: + print(e) + return False + + # if self.inferred_tool_style == ToolsPromptStyle.TYPESCRIPT_FUNCTIONARY_V2: + user_msg = 
Message(role="user", content="Hey") + assistant_msg = Message(role="assistant", content="I, Robot") + + self.expects_strict_user_assistant_alternance = not succeeds([assistant_msg, user_msg]) and succeeds([user_msg, assistant_msg]) + + thought = "Precious thought" + fn_name = "callMeMaybe" + toolcall = ToolCall(id="call_531873", type="function", function=FunctionCall(name=fn_name, arguments={"lol": 123})) + toolcall_msg = Message(role="assistant", content=None, tool_calls=[toolcall]) + tool_result = "Tool result" + tool_name = "additioner" + tool_msg = Message(role="tool", name=tool_name, content=tool_result) + stringified_toolcall_msg = Message(role="assistant", content=None, tool_calls=[ToolCall(function=FunctionCall(name=fn_name, arguments=json.dumps({"lol": 123})))]) + toolcall_content_msg = Message(role="assistant", content=thought, tool_calls=toolcall_msg.tool_calls) + + self.formats_tool_call = succeeds([user_msg, toolcall_msg], (fn_name,)) + if self.formats_tool_call: + self.formats_tool_call_content = succeeds([user_msg, toolcall_content_msg], (thought,)) + self.expects_stringified_function_arguments = \ + not succeeds([user_msg, toolcall_content_msg]) and succeeds([user_msg, stringified_toolcall_msg], (fn_name,)) + + self.formats_tool_result = succeeds([user_msg, assistant_msg, tool_msg], (tool_result,)) + self.formats_tool_name = succeeds([user_msg, assistant_msg, tool_msg], (tool_name,)) + # assert self.formats_tools or self.expects_strict_user_assistant_alternance + + def extract_prefix_suffix_from_template(self): delimiter = '<%$[SAMPLE]$%>' user_msg = Message(role="user", content="Hey") @@ -85,6 +143,7 @@ def __init__(self, template: str, eos_token: str, bos_token: str): self._prefix = prefix self._suffix = suffix + def strip_suffix(self, s: str) -> str: if s.endswith(self._suffix): return s[:-len(self._suffix)] @@ -123,54 +182,88 @@ def from_huggingface(model_id: str): eos_token = tokenizer.eos_token) def render(self, messages: list[Message], add_generation_prompt: bool, omit_bos: bool = False): - if self._strict_user_assistant_alternation and any(m.role not in ('user', 'assistant') for m in messages): + def normalize(m: Message): + if m.tool_calls: + if not self.formats_tool_call or not self.formats_tool_call_content: + return Message( + role=m.role, + content='\n'.join([ + *([m.content] if m.content else ()), + *([ + f'{json.dumps(tc.model_dump())}' + for tc in m.tool_calls + ]) + ]) + ) + elif self.expects_stringified_function_arguments: + return Message( + role=m.role, + content=m.content, + name=m.name, + tool_call_id=m.tool_call_id, + tool_calls=[ + ToolCall( + id=tc.id, + type=tc.type, + function=FunctionCall( + name=tc.function.name, + arguments=json.dumps(tc.function.arguments) + ) + ) + for tc in m.tool_calls + ], + ) + else: + return m + elif self.expects_strict_user_assistant_alternance and m.role not in ('user', 'assistant'): + if m.role == "system": + return Message(role="user", content=f'[SYS]{m.content}[/SYS]') + elif m.role == "tool": + return Message(role="user", content=f'[TOOL(name={m.name}, id={m.tool_call_id})]{m.content}[/TOOL]') + else: + sys.stderr.write(f'Unexpected message role: {message.role}\n') + return Message(role="user", content=f'[{m.role.upper()}]{m.content}[/{m.role.upper()}]') + else: + return m + + messages=[normalize(m) for m in messages] + + if self.expects_strict_user_assistant_alternance: new_messages=[] - i = 0 - n = len(messages) current_role = 'user' current_content = [] def flush(): nonlocal current_content nonlocal current_role 
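+                # flush(): merge everything buffered so far into one message for the
+                # current role, then reset the buffer (called before switching roles
+                # and once at the end of the loop below).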
- new_messages.append(Message( - role=current_role, - content='\n'.join(current_content) - )) - current_content = [] + + if self.expects_strict_user_assistant_alternance or current_content: + new_messages.append(Message( + role=current_role, + content='\n'.join(current_content) + )) + current_content = [] for i, message in enumerate(messages): + assert message.role in ('user', 'assistant') + if message.role == current_role: - current_content.append(message.content) - elif message.role in ('user', 'assistant'): + if message.content: + current_content.append(message.content) + else: flush() current_role = 'assistant' if current_role == 'user' else 'user' - current_content.append(message.content) - else: - if current_role == 'assistant': - flush() - current_role = 'user' - if message.role == 'system': - current_content.append(f'[SYS]{messages[i].content}[/SYS]') - elif message.role == 'tool': - current_content.append(f'[TOOL RESULT(name={messages[i].name}, id={messages[i].tool_call_id}]{messages[i].content}[/TOOL RESULT]') - else: - sys.stderr.write(f'Unexpected message role: {message.role}\n') - current_content.append(f'[ROLE={messages[i].role}]{messages[i].content}[/ROLE]') - - current_content.extend( - f'{json.dumps(tc.model_dump())}' - for tc in (message.tool_calls or []) - ) + if message.content: + current_content.append(message.content) if current_content: flush() - messages = new_messages # JSON! messages = [m.model_dump() for m in messages] - if self.inferred_tool_style == ToolsPromptStyle.TYPESCRIPT_FUNCTIONARY_V2: + # if self.inferred_tool_style == ToolsPromptStyle.TYPESCRIPT_FUNCTIONARY_V2: + if self.expects_stringified_function_arguments: messages = [ { **m, @@ -556,6 +649,10 @@ def parse(self, s: str) -> Optional[Message]: def get_chat_handler(args: ChatHandlerArgs, parallel_calls: bool, tool_style: Optional[ToolsPromptStyle] = None, verbose=False) -> ChatHandler: tool_style = tool_style if tool_style is not None else args.chat_template.inferred_tool_style + if parallel_calls and not args.chat_template.potentially_supports_parallel_calls: + sys.stderr.write(f"# WARNING: Disabled parallel_calls as model does not seem to support it (will fall back to sequential calls)\n") + parallel_calls = False + if verbose: sys.stderr.write(f"# Using tool style: {tool_style}\n") diff --git a/examples/agent/test_chat_handlers.md b/examples/openai/test_chat_handlers.md similarity index 60% rename from examples/agent/test_chat_handlers.md rename to examples/openai/test_chat_handlers.md index a93927fca4b38..3448d601249eb 100644 --- a/examples/agent/test_chat_handlers.md +++ b/examples/openai/test_chat_handlers.md @@ -1,3 +1,111 @@ +{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %} +Conversation roles must alternate user/assistant/user/assistant/... +[INST] Hey [/INST]I, Robot +unsupported operand type(s) for +: 'NoneType' and 'str' +Conversation roles must alternate user/assistant/user/assistant/... +Conversation roles must alternate user/assistant/user/assistant/... 
+{#v2.2#} +{% for message in messages %} +{% if message['role'] == 'user' or message['role'] == 'system' %} +{{ '<|from|>' + message['role'] + ' +<|recipient|>all +<|content|>' + message['content'] + ' +' }}{% elif message['role'] == 'tool' %} +{{ '<|from|>' + message['name'] + ' +<|recipient|>all +<|content|>' + message['content'] + ' +' }}{% else %} +{% set contain_content='no'%} +{% if message['content'] is not none %} +{{ '<|from|>assistant +<|recipient|>all +<|content|>' + message['content'] }}{% set contain_content='yes'%} +{% endif %} +{% if 'tool_calls' in message and message['tool_calls'] is not none %} +{% for tool_call in message['tool_calls'] %} +{% set prompt='<|from|>assistant +<|recipient|>' + tool_call['function']['name'] + ' +<|content|>' + tool_call['function']['arguments'] %} +{% if loop.index == 1 and contain_content == "no" %} +{{ prompt }}{% else %} +{{ ' +' + prompt}}{% endif %} +{% endfor %} +{% endif %} +{{ '<|stop|> +' }}{% endif %} +{% endfor %} +{% if add_generation_prompt %}{{ '<|from|>assistant +<|recipient|>' }}{% endif %} +<|from|>assistant +<|recipient|>all +<|content|>I, Robot<|stop|> +<|from|>user +<|recipient|>all +<|content|>Hey +<|from|>assistant +<|recipient|> +<|from|>user +<|recipient|>all +<|content|>Hey +<|stop|> +<|from|>assistant +<|recipient|> +<|from|>user +<|recipient|>all +<|content|>Hey +<|from|>assistant +<|recipient|>all +<|content|>I, Robot<|stop|> +<|from|>additioner +<|recipient|>all +<|content|>Tool result +<|from|>assistant +<|recipient|> +<|from|>user +<|recipient|>all +<|content|>Hey +<|from|>assistant +<|recipient|>all +<|content|>I, Robot<|stop|> +<|from|>additioner +<|recipient|>all +<|content|>Tool result +<|from|>assistant +<|recipient|> +{% for message in messages %}{{'<|im_start|>' + message['role'] + ' +' + message['content'] + '<|im_end|>' + ' +'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant +' }}{% endif %} +<|im_start|>assistant +I, Robot<|im_end|> +<|im_start|>user +Hey<|im_end|> +<|im_start|>assistant + +can only concatenate str (not "NoneType") to str +<|im_start|>user +Hey<|im_end|> +<|im_start|>assistant +I, Robot<|im_end|> +<|im_start|>tool +Tool result<|im_end|> +<|im_start|>assistant + +<|im_start|>user +Hey<|im_end|> +<|im_start|>assistant +I, Robot<|im_end|> +<|im_start|>tool +Tool result<|im_end|> +<|im_start|>assistant + +{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\n' + system_message + '\n<>\n\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %} +Conversation roles must alternate user/assistant/user/assistant/... +[INST] Hey [/INST] I, Robot +'None' has no attribute 'strip' +Conversation roles must alternate user/assistant/user/assistant/... +Conversation roles must alternate user/assistant/user/assistant/... 
Messages: @@ -7,20 +115,20 @@ Messages: "role": "user", "name": null, "tool_call_id": null, - "content": "What is the sum of 2535 squared and 32222000403 then multiplied by one and a half. What's a third of the result?", + "content": "Add two numbers for the purpose of this test.", "tool_calls": null }, { "role": "assistant", "name": null, "tool_call_id": null, - "content": "?", + "content": null, "tool_calls": [ { "id": "call_531873", "type": "function", "function": { - "name": "add", + "name": "superSecretTool", "arguments": { "a": 2535, "b": 32222000403 @@ -31,10 +139,17 @@ Messages: }, { "role": "tool", - "name": "add", + "name": "superSecretTool", "tool_call_id": "call_531873", "content": "32222002938", "tool_calls": null + }, + { + "role": "assistant", + "name": null, + "tool_call_id": null, + "content": "The sum of 2535 and 32222000403 is 42.", + "tool_calls": null } ] ``` @@ -53,8 +168,7 @@ Template: Prompt: ```js -[INST] What is the sum of 2535 squared and 32222000403 then multiplied by one and a half. What's a third of the result? [/INST]? -{"id": "call_531873", "type": "function", "function": {"name": "add", "arguments": {"a": 2535, "b": 32222000403}}}[INST] [TOOL RESULT(name=add, id=call_531873]32222002938[/TOOL RESULT] [/INST] +[INST] Add two numbers for the purpose of this test. [/INST]{"id": "call_531873", "type": "function", "function": {"name": "superSecretTool", "arguments": {"a": 2535, "b": 32222000403}}}[INST] [TOOL(name=superSecretTool, id=call_531873)]32222002938[/TOOL] [/INST]The sum of 2535 and 32222000403 is 42. ``` @@ -72,7 +186,7 @@ Here are the tools available: { "type": "function", "function": { - "name": "add", + "name": "superSecretTool", "description": "Adds two numbers", "parameters": { "properties": { @@ -121,22 +235,24 @@ Please respond in JSON format with the following schema: { { "properties": { "tool_calls": { - "prefixItems": { - "properties": { - "name": { - "title": "Name of the tool to call", - "type": "string" + "prefixItems": [ + { + "properties": { + "name": { + "title": "Name of the tool to call", + "type": "string" + }, + "arguments": { + "title": "Arguments to pass to the tool", + "type": "object" + } }, - "arguments": { - "title": "Arguments to pass to the tool", - "type": "object" - } - }, - "required": [ - "name", - "arguments" - ] - } + "required": [ + "name", + "arguments" + ] + } + ] } }, "required": [ @@ -174,22 +290,22 @@ integer ::= ("-"? integral-part) space integral-part ::= [0-9] | [1-9] [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? next-step ::= next-step-0 | next-step-1 next-step-0 ::= "{" space next-step-0-tool-calls-kv "}" space -next-step-0-tool-calls ::= "[" space ( next-step-0-tool-calls-item ( "," space next-step-0-tool-calls-item )* )? 
"]" space -next-step-0-tool-calls-item ::= next-step-0-tool-calls-item-0 | next-step-0-tool-calls-item-1 -next-step-0-tool-calls-item-0 ::= "{" space next-step-0-tool-calls-item-0-name-kv "," space next-step-0-tool-calls-item-0-arguments-kv "}" space -next-step-0-tool-calls-item-0-arguments ::= "{" space next-step-0-tool-calls-item-0-arguments-a-kv "," space next-step-0-tool-calls-item-0-arguments-b-kv "}" space -next-step-0-tool-calls-item-0-arguments-a-kv ::= "\"a\"" space ":" space integer -next-step-0-tool-calls-item-0-arguments-b-kv ::= "\"b\"" space ":" space integer -next-step-0-tool-calls-item-0-arguments-kv ::= "\"arguments\"" space ":" space next-step-0-tool-calls-item-0-arguments -next-step-0-tool-calls-item-0-name ::= "\"add\"" -next-step-0-tool-calls-item-0-name-kv ::= "\"name\"" space ":" space next-step-0-tool-calls-item-0-name -next-step-0-tool-calls-item-1 ::= "{" space next-step-0-tool-calls-item-1-name-kv "," space next-step-0-tool-calls-item-1-arguments-kv "}" space -next-step-0-tool-calls-item-1-arguments ::= "{" space next-step-0-tool-calls-item-1-arguments-text-kv "}" space -next-step-0-tool-calls-item-1-arguments-kv ::= "\"arguments\"" space ":" space next-step-0-tool-calls-item-1-arguments -next-step-0-tool-calls-item-1-arguments-text-kv ::= "\"text\"" space ":" space string -next-step-0-tool-calls-item-1-name ::= "\"say\"" -next-step-0-tool-calls-item-1-name-kv ::= "\"name\"" space ":" space next-step-0-tool-calls-item-1-name +next-step-0-tool-calls ::= "[" space next-step-0-tool-calls-tuple-0 "]" space next-step-0-tool-calls-kv ::= "\"tool_calls\"" space ":" space next-step-0-tool-calls +next-step-0-tool-calls-tuple-0 ::= next-step-0-tool-calls-tuple-0-0 | next-step-0-tool-calls-tuple-0-1 +next-step-0-tool-calls-tuple-0-0 ::= "{" space next-step-0-tool-calls-tuple-0-0-name-kv "," space next-step-0-tool-calls-tuple-0-0-arguments-kv "}" space +next-step-0-tool-calls-tuple-0-0-arguments ::= "{" space next-step-0-tool-calls-tuple-0-0-arguments-a-kv "," space next-step-0-tool-calls-tuple-0-0-arguments-b-kv "}" space +next-step-0-tool-calls-tuple-0-0-arguments-a-kv ::= "\"a\"" space ":" space integer +next-step-0-tool-calls-tuple-0-0-arguments-b-kv ::= "\"b\"" space ":" space integer +next-step-0-tool-calls-tuple-0-0-arguments-kv ::= "\"arguments\"" space ":" space next-step-0-tool-calls-tuple-0-0-arguments +next-step-0-tool-calls-tuple-0-0-name ::= "\"superSecretTool\"" +next-step-0-tool-calls-tuple-0-0-name-kv ::= "\"name\"" space ":" space next-step-0-tool-calls-tuple-0-0-name +next-step-0-tool-calls-tuple-0-1 ::= "{" space next-step-0-tool-calls-tuple-0-1-name-kv "," space next-step-0-tool-calls-tuple-0-1-arguments-kv "}" space +next-step-0-tool-calls-tuple-0-1-arguments ::= "{" space next-step-0-tool-calls-tuple-0-1-arguments-text-kv "}" space +next-step-0-tool-calls-tuple-0-1-arguments-kv ::= "\"arguments\"" space ":" space next-step-0-tool-calls-tuple-0-1-arguments +next-step-0-tool-calls-tuple-0-1-arguments-text-kv ::= "\"text\"" space ":" space string +next-step-0-tool-calls-tuple-0-1-name ::= "\"say\"" +next-step-0-tool-calls-tuple-0-1-name-kv ::= "\"name\"" space ":" space next-step-0-tool-calls-tuple-0-1-name next-step-1 ::= "{" space next-step-1-result-kv "}" space next-step-1-result-kv ::= "\"result\"" space ":" space integer next-step-kv ::= "\"next_step\"" space ":" space next-step @@ -239,7 +355,7 @@ Call one or more functions to assist with the user query, every time this is pos { "type": "function", "function": { - "name": "add", + "name": 
"superSecretTool", "description": "Adds two numbers", "parameters": { "properties": { @@ -287,18 +403,11 @@ To call each function, give its name and arguments within " space (add-tool-call | say-tool-call) space "" +superSecretTool-tool-call ::= "{" space superSecretTool-tool-call-name-kv "," space superSecretTool-tool-call-arguments-kv "}" space +superSecretTool-tool-call-arguments ::= "{" space superSecretTool-tool-call-arguments-a-kv "," space superSecretTool-tool-call-arguments-b-kv "}" space +superSecretTool-tool-call-arguments-a-kv ::= "\"a\"" space ":" space integer +superSecretTool-tool-call-arguments-b-kv ::= "\"b\"" space ":" space integer +superSecretTool-tool-call-arguments-kv ::= "\"arguments\"" space ":" space superSecretTool-tool-call-arguments +superSecretTool-tool-call-name ::= "\"" "superSecretTool" "\"" space +superSecretTool-tool-call-name-kv ::= "\"name\"" space ":" space superSecretTool-tool-call-name +tool-call ::= "" space (superSecretTool-tool-call | say-tool-call) space "" ``` @@ -383,17 +499,17 @@ Prompt: ```js <|from|>user <|recipient|>all -<|content|>What is the sum of 2535 squared and 32222000403 then multiplied by one and a half. What's a third of the result? +<|content|>Add two numbers for the purpose of this test. <|from|>assistant <|recipient|>all -<|content|>? -<|from|>assistant -<|recipient|>add -<|content|>{"a": 2535, "b": 32222000403}<|stop|> -<|from|>add +<|content|>{"id": "call_531873", "type": "function", "function": {"name": "superSecretTool", "arguments": {"a": 2535, "b": 32222000403}}}<|stop|> +<|from|>superSecretTool <|recipient|>all <|content|>32222002938 <|from|>assistant +<|recipient|>all +<|content|>The sum of 2535 and 32222000403 is 42.<|stop|> +<|from|>assistant <|recipient|> ``` @@ -410,7 +526,7 @@ Prompt: // Supported function definitions that should be called when necessary. namespace functions { // Adds two numbers -type add = (_: { +type superSecretTool = (_: { a: number, b: number }) => any; @@ -427,10 +543,6 @@ text: string Grammar: ```js -add-args ::= "{" space add-args-a-kv "," space add-args-b-kv "}" space -add-args-a-kv ::= "\"a\"" space ":" space integer -add-args-b-kv ::= "\"b\"" space ":" space integer -add-call ::= "add" "\n<|content|>\n" add-args "\n" content ::= start content-without-start content-without-start ::= "all\n<|content|>" not-from* decimal-part ::= [0-9] [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? @@ -447,8 +559,12 @@ string ::= "\"" ( [^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]) )* "\"" space +superSecretTool-args ::= "{" space superSecretTool-args-a-kv "," space superSecretTool-args-b-kv "}" space +superSecretTool-args-a-kv ::= "\"a\"" space ":" space integer +superSecretTool-args-b-kv ::= "\"b\"" space ":" space integer +superSecretTool-call ::= "superSecretTool" "\n<|content|>\n" superSecretTool-args "\n" tool-call ::= start tool-call-without-start -tool-call-without-start ::= add-call | say-call +tool-call-without-start ::= superSecretTool-call | say-call ``` @@ -491,12 +607,14 @@ Prompt: ```js <|im_start|>user -What is the sum of 2535 squared and 32222000403 then multiplied by one and a half. 
What's a third of the result?<|im_end|> +Add two numbers for the purpose of this test.<|im_end|> <|im_start|>assistant -?<|im_end|> +{"id": "call_531873", "type": "function", "function": {"name": "superSecretTool", "arguments": {"a": 2535, "b": 32222000403}}}<|im_end|> <|im_start|>tool 32222002938<|im_end|> <|im_start|>assistant +The sum of 2535 and 32222000403 is 42.<|im_end|> +<|im_start|>assistant ``` @@ -515,7 +633,7 @@ Here are the tools available: { "type": "function", "function": { - "name": "add", + "name": "superSecretTool", "description": "Adds two numbers", "parameters": { "properties": { @@ -558,18 +676,11 @@ Here are the tools available: Grammar: ```js -add-tool-call ::= "{" space add-tool-call-name-kv "," space add-tool-call-arguments-kv "}" space -add-tool-call-arguments ::= "{" space add-tool-call-arguments-a-kv "," space add-tool-call-arguments-b-kv "}" space -add-tool-call-arguments-a-kv ::= "\"a\"" space ":" space integer -add-tool-call-arguments-b-kv ::= "\"b\"" space ":" space integer -add-tool-call-arguments-kv ::= "\"arguments\"" space ":" space add-tool-call-arguments -add-tool-call-name ::= "\"add\"" -add-tool-call-name-kv ::= "\"name\"" space ":" space add-tool-call-name content ::= [^<] | "<" [^t<] | "" +superSecretTool-tool-call ::= "{" space superSecretTool-tool-call-name-kv "," space superSecretTool-tool-call-arguments-kv "}" space +superSecretTool-tool-call-arguments ::= "{" space superSecretTool-tool-call-arguments-a-kv "," space superSecretTool-tool-call-arguments-b-kv "}" space +superSecretTool-tool-call-arguments-a-kv ::= "\"a\"" space ":" space integer +superSecretTool-tool-call-arguments-b-kv ::= "\"b\"" space ":" space integer +superSecretTool-tool-call-arguments-kv ::= "\"arguments\"" space ":" space superSecretTool-tool-call-arguments +superSecretTool-tool-call-name ::= "\"superSecretTool\"" +superSecretTool-tool-call-name-kv ::= "\"name\"" space ":" space superSecretTool-tool-call-name +tool-call ::= "" space (superSecretTool-tool-call | say-tool-call) space "" ``` @@ -621,7 +739,7 @@ Call one or more functions to assist with the user query, every time this is pos { "type": "function", "function": { - "name": "add", + "name": "superSecretTool", "description": "Adds two numbers", "parameters": { "properties": { @@ -669,18 +787,11 @@ To call each function, give its name and arguments within " +superSecretTool-tool-call ::= "{" space superSecretTool-tool-call-name-kv "," space superSecretTool-tool-call-arguments-kv "}" space +superSecretTool-tool-call-arguments ::= "{" space superSecretTool-tool-call-arguments-a-kv "," space superSecretTool-tool-call-arguments-b-kv "}" space +superSecretTool-tool-call-arguments-a-kv ::= "\"a\"" space ":" space integer +superSecretTool-tool-call-arguments-b-kv ::= "\"b\"" space ":" space integer +superSecretTool-tool-call-arguments-kv ::= "\"arguments\"" space ":" space superSecretTool-tool-call-arguments +superSecretTool-tool-call-name ::= "\"superSecretTool\"" +superSecretTool-tool-call-name-kv ::= "\"name\"" space ":" space superSecretTool-tool-call-name +tool-call ::= "" space (superSecretTool-tool-call | say-tool-call) space "" ``` @@ -732,7 +850,7 @@ Here are the tools available: { "type": "function", "function": { - "name": "add", + "name": "superSecretTool", "description": "Adds two numbers", "parameters": { "properties": { @@ -781,22 +899,24 @@ Please respond in JSON format with the following schema: { { "properties": { "tool_calls": { - "prefixItems": { - "properties": { - "name": { - "title": "Name of the 
tool to call", - "type": "string" + "prefixItems": [ + { + "properties": { + "name": { + "title": "Name of the tool to call", + "type": "string" + }, + "arguments": { + "title": "Arguments to pass to the tool", + "type": "object" + } }, - "arguments": { - "title": "Arguments to pass to the tool", - "type": "object" - } - }, - "required": [ - "name", - "arguments" - ] - } + "required": [ + "name", + "arguments" + ] + } + ] } }, "required": [ @@ -834,22 +954,22 @@ integer ::= ("-"? integral-part) space integral-part ::= [0-9] | [1-9] [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? next-step ::= next-step-0 | next-step-1 next-step-0 ::= "{" space next-step-0-tool-calls-kv "}" space -next-step-0-tool-calls ::= "[" space ( next-step-0-tool-calls-item ( "," space next-step-0-tool-calls-item )* )? "]" space -next-step-0-tool-calls-item ::= next-step-0-tool-calls-item-0 | next-step-0-tool-calls-item-1 -next-step-0-tool-calls-item-0 ::= "{" space next-step-0-tool-calls-item-0-name-kv "," space next-step-0-tool-calls-item-0-arguments-kv "}" space -next-step-0-tool-calls-item-0-arguments ::= "{" space next-step-0-tool-calls-item-0-arguments-a-kv "," space next-step-0-tool-calls-item-0-arguments-b-kv "}" space -next-step-0-tool-calls-item-0-arguments-a-kv ::= "\"a\"" space ":" space integer -next-step-0-tool-calls-item-0-arguments-b-kv ::= "\"b\"" space ":" space integer -next-step-0-tool-calls-item-0-arguments-kv ::= "\"arguments\"" space ":" space next-step-0-tool-calls-item-0-arguments -next-step-0-tool-calls-item-0-name ::= "\"add\"" -next-step-0-tool-calls-item-0-name-kv ::= "\"name\"" space ":" space next-step-0-tool-calls-item-0-name -next-step-0-tool-calls-item-1 ::= "{" space next-step-0-tool-calls-item-1-name-kv "," space next-step-0-tool-calls-item-1-arguments-kv "}" space -next-step-0-tool-calls-item-1-arguments ::= "{" space next-step-0-tool-calls-item-1-arguments-text-kv "}" space -next-step-0-tool-calls-item-1-arguments-kv ::= "\"arguments\"" space ":" space next-step-0-tool-calls-item-1-arguments -next-step-0-tool-calls-item-1-arguments-text-kv ::= "\"text\"" space ":" space string -next-step-0-tool-calls-item-1-name ::= "\"say\"" -next-step-0-tool-calls-item-1-name-kv ::= "\"name\"" space ":" space next-step-0-tool-calls-item-1-name +next-step-0-tool-calls ::= "[" space next-step-0-tool-calls-tuple-0 "]" space next-step-0-tool-calls-kv ::= "\"tool_calls\"" space ":" space next-step-0-tool-calls +next-step-0-tool-calls-tuple-0 ::= next-step-0-tool-calls-tuple-0-0 | next-step-0-tool-calls-tuple-0-1 +next-step-0-tool-calls-tuple-0-0 ::= "{" space next-step-0-tool-calls-tuple-0-0-name-kv "," space next-step-0-tool-calls-tuple-0-0-arguments-kv "}" space +next-step-0-tool-calls-tuple-0-0-arguments ::= "{" space next-step-0-tool-calls-tuple-0-0-arguments-a-kv "," space next-step-0-tool-calls-tuple-0-0-arguments-b-kv "}" space +next-step-0-tool-calls-tuple-0-0-arguments-a-kv ::= "\"a\"" space ":" space integer +next-step-0-tool-calls-tuple-0-0-arguments-b-kv ::= "\"b\"" space ":" space integer +next-step-0-tool-calls-tuple-0-0-arguments-kv ::= "\"arguments\"" space ":" space next-step-0-tool-calls-tuple-0-0-arguments +next-step-0-tool-calls-tuple-0-0-name ::= "\"superSecretTool\"" +next-step-0-tool-calls-tuple-0-0-name-kv ::= "\"name\"" space ":" space next-step-0-tool-calls-tuple-0-0-name +next-step-0-tool-calls-tuple-0-1 ::= "{" space next-step-0-tool-calls-tuple-0-1-name-kv "," space next-step-0-tool-calls-tuple-0-1-arguments-kv "}" space 
+next-step-0-tool-calls-tuple-0-1-arguments ::= "{" space next-step-0-tool-calls-tuple-0-1-arguments-text-kv "}" space +next-step-0-tool-calls-tuple-0-1-arguments-kv ::= "\"arguments\"" space ":" space next-step-0-tool-calls-tuple-0-1-arguments +next-step-0-tool-calls-tuple-0-1-arguments-text-kv ::= "\"text\"" space ":" space string +next-step-0-tool-calls-tuple-0-1-name ::= "\"say\"" +next-step-0-tool-calls-tuple-0-1-name-kv ::= "\"name\"" space ":" space next-step-0-tool-calls-tuple-0-1-name next-step-1 ::= "{" space next-step-1-result-kv "}" space next-step-1-result-kv ::= "\"result\"" space ":" space integer next-step-kv ::= "\"next_step\"" space ":" space next-step @@ -894,7 +1014,7 @@ space ::= " "? Prompt: ```json -You are a function calling AI agent with self-recursion. You can call only one function at a time and analyse data you get from function response. You are provided with function signatures within XML tags. The current date is: 2024-03-29. You may use agentic frameworks for reasoning and planning to help with user query. Please call a function and wait for function results to be provided to you in the next iteration. Don't make assumptions about what values to plug into function arguments. Once you have called a function, results will be fed back to you within XML tags. Don't make assumptions about tool results if XML tags are not present since function hasn't been executed yet. Analyze the data once you get the results and call another function. At each iteration please continue adding the your analysis to previous summary. Your final response should directly answer the user query with an anlysis or summary of the results of function calls. Here are the available tools: ['{"type":"function","function":{"name":"add","description":"Adds two numbers","parameters":{"properties":{"a":{"type":"integer"},"b":{"type":"integer"}},"required":["a","b"]}}}', '{"type":"function","function":{"name":"say","description":"Says something out loud (TTS)","parameters":{"properties":{"text":{"description":"The text to say out loud","type":"string"}},"required":["text"]}}}'] If the provided function signatures doesn't have the function you must call, you may write executable python code in markdown syntax and call code_interpreter() function as follows: {"arguments": {"code_markdown": , "name": "code_interpreter"}} Make sure that the json object above with code markdown block is parseable with json.loads() and the XML block with XML ElementTree. Use the following pydantic model json schema for each tool call you will make: {'properties': {'arguments': {'title': 'Arguments', 'type': 'object'}, 'name': {'title': 'Name', 'type': 'string'}}, 'required': ['arguments', 'name'], 'title': 'FunctionCall', 'type': 'object'} At the very first turn you don't have so you shouldn't not make up the results. +You are a function calling AI agent with self-recursion. You can call only one function at a time and analyse data you get from function response. You are provided with function signatures within XML tags. The current date is: 2024-03-30. You may use agentic frameworks for reasoning and planning to help with user query. Please call a function and wait for function results to be provided to you in the next iteration. Don't make assumptions about what values to plug into function arguments. Once you have called a function, results will be fed back to you within XML tags. Don't make assumptions about tool results if XML tags are not present since function hasn't been executed yet. 
Analyze the data once you get the results and call another function. At each iteration please continue adding the your analysis to previous summary. Your final response should directly answer the user query with an anlysis or summary of the results of function calls. Here are the available tools: ['{"type":"function","function":{"name":"superSecretTool","description":"Adds two numbers","parameters":{"properties":{"a":{"type":"integer"},"b":{"type":"integer"}},"required":["a","b"]}}}', '{"type":"function","function":{"name":"say","description":"Says something out loud (TTS)","parameters":{"properties":{"text":{"description":"The text to say out loud","type":"string"}},"required":["text"]}}}'] If the provided function signatures doesn't have the function you must call, you may write executable python code in markdown syntax and call code_interpreter() function as follows: {"arguments": {"code_markdown": , "name": "code_interpreter"}} Make sure that the json object above with code markdown block is parseable with json.loads() and the XML block with XML ElementTree. Use the following pydantic model json schema for each tool call you will make: {'properties': {'arguments': {'title': 'Arguments', 'type': 'object'}, 'name': {'title': 'Name', 'type': 'string'}}, 'required': ['arguments', 'name'], 'title': 'FunctionCall', 'type': 'object'} At the very first turn you don't have so you shouldn't not make up the results. Please keep a running summary with analysis of previous function results and summaries from previous iterations. Do not stop calling functions until the task has been accomplished or you've reached max iteration of 10. Calling multiple functions at once can overload the system and increase cost so call one function at a time please. @@ -910,18 +1030,11 @@ For each function call return a valid json object (using doulbe quotes) with fun Grammar: ```js -add-tool-call ::= "{" space add-tool-call-name-kv "," space add-tool-call-arguments-kv "}" space -add-tool-call-arguments ::= "{" space add-tool-call-arguments-a-kv "," space add-tool-call-arguments-b-kv "}" space -add-tool-call-arguments-a-kv ::= "\"a\"" space ":" space integer -add-tool-call-arguments-b-kv ::= "\"b\"" space ":" space integer -add-tool-call-arguments-kv ::= "\"arguments\"" space ":" space add-tool-call-arguments -add-tool-call-name ::= "\"add\"" -add-tool-call-name-kv ::= "\"name\"" space ":" space add-tool-call-name content ::= [^<] | "<" [^t<] | "" +superSecretTool-tool-call ::= "{" space superSecretTool-tool-call-name-kv "," space superSecretTool-tool-call-arguments-kv "}" space +superSecretTool-tool-call-arguments ::= "{" space superSecretTool-tool-call-arguments-a-kv "," space superSecretTool-tool-call-arguments-b-kv "}" space +superSecretTool-tool-call-arguments-a-kv ::= "\"a\"" space ":" space integer +superSecretTool-tool-call-arguments-b-kv ::= "\"b\"" space ":" space integer +superSecretTool-tool-call-arguments-kv ::= "\"arguments\"" space ":" space superSecretTool-tool-call-arguments +superSecretTool-tool-call-name ::= "\"superSecretTool\"" +superSecretTool-tool-call-name-kv ::= "\"name\"" space ":" space superSecretTool-tool-call-name +tool-call ::= "" space (superSecretTool-tool-call | say-tool-call) space "" ``` @@ -972,8 +1092,7 @@ Template: Prompt: ```js -[INST] What is the sum of 2535 squared and 32222000403 then multiplied by one and a half. What's a third of the result? [/INST] ? 
-{"id": "call_531873", "type": "function", "function": {"name": "add", "arguments": {"a": 2535, "b": 32222000403}}} [INST] [TOOL RESULT(name=add, id=call_531873]32222002938[/TOOL RESULT] [/INST] +[INST] Add two numbers for the purpose of this test. [/INST] {"id": "call_531873", "type": "function", "function": {"name": "superSecretTool", "arguments": {"a": 2535, "b": 32222000403}}} [INST] [TOOL(name=superSecretTool, id=call_531873)]32222002938[/TOOL] [/INST] The sum of 2535 and 32222000403 is 42. ``` @@ -991,7 +1110,7 @@ Here are the tools available: { "type": "function", "function": { - "name": "add", + "name": "superSecretTool", "description": "Adds two numbers", "parameters": { "properties": { @@ -1034,18 +1153,11 @@ Here are the tools available: Grammar: ```js -add-tool-call ::= "{" space add-tool-call-name-kv "," space add-tool-call-arguments-kv "}" space -add-tool-call-arguments ::= "{" space add-tool-call-arguments-a-kv "," space add-tool-call-arguments-b-kv "}" space -add-tool-call-arguments-a-kv ::= "\"a\"" space ":" space integer -add-tool-call-arguments-b-kv ::= "\"b\"" space ":" space integer -add-tool-call-arguments-kv ::= "\"arguments\"" space ":" space add-tool-call-arguments -add-tool-call-name ::= "\"add\"" -add-tool-call-name-kv ::= "\"name\"" space ":" space add-tool-call-name content ::= [^<] | "<" [^t<] | "" +superSecretTool-tool-call ::= "{" space superSecretTool-tool-call-name-kv "," space superSecretTool-tool-call-arguments-kv "}" space +superSecretTool-tool-call-arguments ::= "{" space superSecretTool-tool-call-arguments-a-kv "," space superSecretTool-tool-call-arguments-b-kv "}" space +superSecretTool-tool-call-arguments-a-kv ::= "\"a\"" space ":" space integer +superSecretTool-tool-call-arguments-b-kv ::= "\"b\"" space ":" space integer +superSecretTool-tool-call-arguments-kv ::= "\"arguments\"" space ":" space superSecretTool-tool-call-arguments +superSecretTool-tool-call-name ::= "\"superSecretTool\"" +superSecretTool-tool-call-name-kv ::= "\"name\"" space ":" space superSecretTool-tool-call-name +tool-call ::= "" space (superSecretTool-tool-call | say-tool-call) space "" ``` @@ -1097,7 +1216,7 @@ Call one or more functions to assist with the user query, every time this is pos { "type": "function", "function": { - "name": "add", + "name": "superSecretTool", "description": "Adds two numbers", "parameters": { "properties": { @@ -1145,18 +1264,11 @@ To call each function, give its name and arguments within " +superSecretTool-tool-call ::= "{" space superSecretTool-tool-call-name-kv "," space superSecretTool-tool-call-arguments-kv "}" space +superSecretTool-tool-call-arguments ::= "{" space superSecretTool-tool-call-arguments-a-kv "," space superSecretTool-tool-call-arguments-b-kv "}" space +superSecretTool-tool-call-arguments-a-kv ::= "\"a\"" space ":" space integer +superSecretTool-tool-call-arguments-b-kv ::= "\"b\"" space ":" space integer +superSecretTool-tool-call-arguments-kv ::= "\"arguments\"" space ":" space superSecretTool-tool-call-arguments +superSecretTool-tool-call-name ::= "\"superSecretTool\"" +superSecretTool-tool-call-name-kv ::= "\"name\"" space ":" space superSecretTool-tool-call-name +tool-call ::= "" space (superSecretTool-tool-call | say-tool-call) space "" ``` @@ -1208,7 +1327,7 @@ Here are the tools available: { "type": "function", "function": { - "name": "add", + "name": "superSecretTool", "description": "Adds two numbers", "parameters": { "properties": { @@ -1257,22 +1376,24 @@ Please respond in JSON format with the following schema: { { 
"properties": { "tool_calls": { - "prefixItems": { - "properties": { - "name": { - "title": "Name of the tool to call", - "type": "string" + "prefixItems": [ + { + "properties": { + "name": { + "title": "Name of the tool to call", + "type": "string" + }, + "arguments": { + "title": "Arguments to pass to the tool", + "type": "object" + } }, - "arguments": { - "title": "Arguments to pass to the tool", - "type": "object" - } - }, - "required": [ - "name", - "arguments" - ] - } + "required": [ + "name", + "arguments" + ] + } + ] } }, "required": [ @@ -1310,22 +1431,22 @@ integer ::= ("-"? integral-part) space integral-part ::= [0-9] | [1-9] [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? next-step ::= next-step-0 | next-step-1 next-step-0 ::= "{" space next-step-0-tool-calls-kv "}" space -next-step-0-tool-calls ::= "[" space ( next-step-0-tool-calls-item ( "," space next-step-0-tool-calls-item )* )? "]" space -next-step-0-tool-calls-item ::= next-step-0-tool-calls-item-0 | next-step-0-tool-calls-item-1 -next-step-0-tool-calls-item-0 ::= "{" space next-step-0-tool-calls-item-0-name-kv "," space next-step-0-tool-calls-item-0-arguments-kv "}" space -next-step-0-tool-calls-item-0-arguments ::= "{" space next-step-0-tool-calls-item-0-arguments-a-kv "," space next-step-0-tool-calls-item-0-arguments-b-kv "}" space -next-step-0-tool-calls-item-0-arguments-a-kv ::= "\"a\"" space ":" space integer -next-step-0-tool-calls-item-0-arguments-b-kv ::= "\"b\"" space ":" space integer -next-step-0-tool-calls-item-0-arguments-kv ::= "\"arguments\"" space ":" space next-step-0-tool-calls-item-0-arguments -next-step-0-tool-calls-item-0-name ::= "\"add\"" -next-step-0-tool-calls-item-0-name-kv ::= "\"name\"" space ":" space next-step-0-tool-calls-item-0-name -next-step-0-tool-calls-item-1 ::= "{" space next-step-0-tool-calls-item-1-name-kv "," space next-step-0-tool-calls-item-1-arguments-kv "}" space -next-step-0-tool-calls-item-1-arguments ::= "{" space next-step-0-tool-calls-item-1-arguments-text-kv "}" space -next-step-0-tool-calls-item-1-arguments-kv ::= "\"arguments\"" space ":" space next-step-0-tool-calls-item-1-arguments -next-step-0-tool-calls-item-1-arguments-text-kv ::= "\"text\"" space ":" space string -next-step-0-tool-calls-item-1-name ::= "\"say\"" -next-step-0-tool-calls-item-1-name-kv ::= "\"name\"" space ":" space next-step-0-tool-calls-item-1-name +next-step-0-tool-calls ::= "[" space next-step-0-tool-calls-tuple-0 "]" space next-step-0-tool-calls-kv ::= "\"tool_calls\"" space ":" space next-step-0-tool-calls +next-step-0-tool-calls-tuple-0 ::= next-step-0-tool-calls-tuple-0-0 | next-step-0-tool-calls-tuple-0-1 +next-step-0-tool-calls-tuple-0-0 ::= "{" space next-step-0-tool-calls-tuple-0-0-name-kv "," space next-step-0-tool-calls-tuple-0-0-arguments-kv "}" space +next-step-0-tool-calls-tuple-0-0-arguments ::= "{" space next-step-0-tool-calls-tuple-0-0-arguments-a-kv "," space next-step-0-tool-calls-tuple-0-0-arguments-b-kv "}" space +next-step-0-tool-calls-tuple-0-0-arguments-a-kv ::= "\"a\"" space ":" space integer +next-step-0-tool-calls-tuple-0-0-arguments-b-kv ::= "\"b\"" space ":" space integer +next-step-0-tool-calls-tuple-0-0-arguments-kv ::= "\"arguments\"" space ":" space next-step-0-tool-calls-tuple-0-0-arguments +next-step-0-tool-calls-tuple-0-0-name ::= "\"superSecretTool\"" +next-step-0-tool-calls-tuple-0-0-name-kv ::= "\"name\"" space ":" space next-step-0-tool-calls-tuple-0-0-name +next-step-0-tool-calls-tuple-0-1 ::= "{" space 
next-step-0-tool-calls-tuple-0-1-name-kv "," space next-step-0-tool-calls-tuple-0-1-arguments-kv "}" space +next-step-0-tool-calls-tuple-0-1-arguments ::= "{" space next-step-0-tool-calls-tuple-0-1-arguments-text-kv "}" space +next-step-0-tool-calls-tuple-0-1-arguments-kv ::= "\"arguments\"" space ":" space next-step-0-tool-calls-tuple-0-1-arguments +next-step-0-tool-calls-tuple-0-1-arguments-text-kv ::= "\"text\"" space ":" space string +next-step-0-tool-calls-tuple-0-1-name ::= "\"say\"" +next-step-0-tool-calls-tuple-0-1-name-kv ::= "\"name\"" space ":" space next-step-0-tool-calls-tuple-0-1-name next-step-1 ::= "{" space next-step-1-result-kv "}" space next-step-1-result-kv ::= "\"result\"" space ":" space integer next-step-kv ::= "\"next_step\"" space ":" space next-step @@ -1370,7 +1491,7 @@ space ::= " "? Prompt: ```json -You are a function calling AI agent with self-recursion. You can call only one function at a time and analyse data you get from function response. You are provided with function signatures within XML tags. The current date is: 2024-03-29. You may use agentic frameworks for reasoning and planning to help with user query. Please call a function and wait for function results to be provided to you in the next iteration. Don't make assumptions about what values to plug into function arguments. Once you have called a function, results will be fed back to you within XML tags. Don't make assumptions about tool results if XML tags are not present since function hasn't been executed yet. Analyze the data once you get the results and call another function. At each iteration please continue adding the your analysis to previous summary. Your final response should directly answer the user query with an anlysis or summary of the results of function calls. Here are the available tools: ['{"type":"function","function":{"name":"add","description":"Adds two numbers","parameters":{"properties":{"a":{"type":"integer"},"b":{"type":"integer"}},"required":["a","b"]}}}', '{"type":"function","function":{"name":"say","description":"Says something out loud (TTS)","parameters":{"properties":{"text":{"description":"The text to say out loud","type":"string"}},"required":["text"]}}}'] If the provided function signatures doesn't have the function you must call, you may write executable python code in markdown syntax and call code_interpreter() function as follows: {"arguments": {"code_markdown": , "name": "code_interpreter"}} Make sure that the json object above with code markdown block is parseable with json.loads() and the XML block with XML ElementTree. Use the following pydantic model json schema for each tool call you will make: {'properties': {'arguments': {'title': 'Arguments', 'type': 'object'}, 'name': {'title': 'Name', 'type': 'string'}}, 'required': ['arguments', 'name'], 'title': 'FunctionCall', 'type': 'object'} At the very first turn you don't have so you shouldn't not make up the results. +You are a function calling AI agent with self-recursion. You can call only one function at a time and analyse data you get from function response. You are provided with function signatures within XML tags. The current date is: 2024-03-30. You may use agentic frameworks for reasoning and planning to help with user query. Please call a function and wait for function results to be provided to you in the next iteration. Don't make assumptions about what values to plug into function arguments. Once you have called a function, results will be fed back to you within XML tags. 
Don't make assumptions about tool results if XML tags are not present since function hasn't been executed yet. Analyze the data once you get the results and call another function. At each iteration please continue adding the your analysis to previous summary. Your final response should directly answer the user query with an anlysis or summary of the results of function calls. Here are the available tools: ['{"type":"function","function":{"name":"superSecretTool","description":"Adds two numbers","parameters":{"properties":{"a":{"type":"integer"},"b":{"type":"integer"}},"required":["a","b"]}}}', '{"type":"function","function":{"name":"say","description":"Says something out loud (TTS)","parameters":{"properties":{"text":{"description":"The text to say out loud","type":"string"}},"required":["text"]}}}'] If the provided function signatures doesn't have the function you must call, you may write executable python code in markdown syntax and call code_interpreter() function as follows: {"arguments": {"code_markdown": , "name": "code_interpreter"}} Make sure that the json object above with code markdown block is parseable with json.loads() and the XML block with XML ElementTree. Use the following pydantic model json schema for each tool call you will make: {'properties': {'arguments': {'title': 'Arguments', 'type': 'object'}, 'name': {'title': 'Name', 'type': 'string'}}, 'required': ['arguments', 'name'], 'title': 'FunctionCall', 'type': 'object'} At the very first turn you don't have so you shouldn't not make up the results. Please keep a running summary with analysis of previous function results and summaries from previous iterations. Do not stop calling functions until the task has been accomplished or you've reached max iteration of 10. Calling multiple functions at once can overload the system and increase cost so call one function at a time please. 
@@ -1386,18 +1507,11 @@ For each function call return a valid json object (using doulbe quotes) with fun Grammar: ```js -add-tool-call ::= "{" space add-tool-call-name-kv "," space add-tool-call-arguments-kv "}" space -add-tool-call-arguments ::= "{" space add-tool-call-arguments-a-kv "," space add-tool-call-arguments-b-kv "}" space -add-tool-call-arguments-a-kv ::= "\"a\"" space ":" space integer -add-tool-call-arguments-b-kv ::= "\"b\"" space ":" space integer -add-tool-call-arguments-kv ::= "\"arguments\"" space ":" space add-tool-call-arguments -add-tool-call-name ::= "\"add\"" -add-tool-call-name-kv ::= "\"name\"" space ":" space add-tool-call-name content ::= [^<] | "<" [^t<] | "" +superSecretTool-tool-call ::= "{" space superSecretTool-tool-call-name-kv "," space superSecretTool-tool-call-arguments-kv "}" space +superSecretTool-tool-call-arguments ::= "{" space superSecretTool-tool-call-arguments-a-kv "," space superSecretTool-tool-call-arguments-b-kv "}" space +superSecretTool-tool-call-arguments-a-kv ::= "\"a\"" space ":" space integer +superSecretTool-tool-call-arguments-b-kv ::= "\"b\"" space ":" space integer +superSecretTool-tool-call-arguments-kv ::= "\"arguments\"" space ":" space superSecretTool-tool-call-arguments +superSecretTool-tool-call-name ::= "\"superSecretTool\"" +superSecretTool-tool-call-name-kv ::= "\"name\"" space ":" space superSecretTool-tool-call-name +tool-call ::= "" space (superSecretTool-tool-call | say-tool-call) space "" ``` diff --git a/examples/openai/test_chat_handlers.py b/examples/openai/test_chat_handlers.py new file mode 100644 index 0000000000000..7b44d29ca3d87 --- /dev/null +++ b/examples/openai/test_chat_handlers.py @@ -0,0 +1,251 @@ +# +# +# python -m examples.openai.test_chat_handlers | tee examples/openai/test_chat_handlers.md + +import json + +from examples.openai.api import FunctionCall, Message, Tool, ToolCall, ToolFunction +from examples.openai.prompting import ChatHandlerArgs, ChatTemplate, ToolsPromptStyle, get_chat_handler + +TEST_ARG_A = 2535 +TEST_ARG_B = 32222000403 +TEST_SUM = 32222002938 + +QUESTION = "Add two numbers for the purpose of this test." +ANSWER = "The sum of 2535 and 32222000403 is 42." + +PROMPT_MESSAGE = Message( + role="user", + content=QUESTION, +) +ASSIST_MESSAGE = Message( + role="assistant", + content=ANSWER, +) +TOOL_NAME = "superSecretTool" +TOOL_CALL = ToolCall( + id="call_531873", + type="function", + function=FunctionCall( + name=TOOL_NAME, + arguments={ + "a": TEST_ARG_A, + "b": TEST_ARG_B + } + ) +) +TOOL_CALL_MESSAGE = Message( + role="assistant", + content=None, + tool_calls=[TOOL_CALL], +) + +TEST_THOUGHT = "I've thought a lot about this." 
+THOUGHTFUL_TOOL_CALL_MESSAGE = Message( + role="assistant", + content=TEST_THOUGHT, + tool_calls=[TOOL_CALL], +) + +# UNDERSCORE_ESCAPED_TOOL_CALL_MESSAGE = Message(**{ +# **TOOL_CALL_MESSAGE.model_dump(), +# "tool_calls": [ +# json.loads(tc.model_dump_json().replace("_", "\\_")) +# for tc in TOOL_CALL_MESSAGE.tool_calls +# ], +# }) +TOOL_MESSAGE = Message( + role="tool", + name=TOOL_NAME, + tool_call_id="call_531873", + content=f'{TEST_SUM}', + tool_calls=None +) +TEST_MESSAGES = [ + PROMPT_MESSAGE, + TOOL_CALL_MESSAGE, + TOOL_MESSAGE, + ASSIST_MESSAGE, +] +TEST_MESSAGES_THOUGHT = [ + PROMPT_MESSAGE, + THOUGHTFUL_TOOL_CALL_MESSAGE, + TOOL_MESSAGE, + ASSIST_MESSAGE, +] + + +TEST_TOOLS = [ + Tool( + type="function", + function=ToolFunction( + name=TOOL_NAME, + description="Adds two numbers", + parameters={ + "properties": { + "a": {"type": "integer"}, + "b": {"type": "integer"}, + }, + "required": ["a", "b"] + } + ) + ), + Tool( + type="function", + function=ToolFunction( + name="say", + description="Says something out loud (TTS)", + parameters={ + "properties": { + "text": { + "description": "The text to say out loud", + "type": "string" + }, + }, + "required": ["text"] + } + ) + ) +] + +TEST_OUTPUT_SCHEMA = {"type": "integer"} + +# Generate the JSON for TEST_TEMPLATES below by uncommenting this block: +# +# TEST_TEMPLATES = { +# 'mistral_instruct_v0_1': ChatTemplate.from_huggingface("mistralai/Mixtral-8x7B-Instruct-v0.1"), +# 'functionary_v2_2': ChatTemplate.from_huggingface("meetkai/functionary-small-v2.2"), +# 'hermes_2_pro_mistral': ChatTemplate.from_huggingface("NousResearch/Hermes-2-Pro-Mistral-7B"), +# 'llama2': ChatTemplate.from_huggingface("meta-llama/Llama-2-7b-chat-hf"), +# } +# print(json.dumps({k: v.model_dump() for k, v in TEST_TEMPLATES.items()}, indent=2)) +# exit(0) + +TEST_TEMPLATES = { + "mistral_instruct_v0_1": { + "template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}", + "eos_token": "", + "bos_token": "" + }, + "functionary_v2_2": { + "template": "{#v2.2#}\n{% for message in messages %}\n{% if message['role'] == 'user' or message['role'] == 'system' %}\n{{ '<|from|>' + message['role'] + '\n<|recipient|>all\n<|content|>' + message['content'] + '\n' }}{% elif message['role'] == 'tool' %}\n{{ '<|from|>' + message['name'] + '\n<|recipient|>all\n<|content|>' + message['content'] + '\n' }}{% else %}\n{% set contain_content='no'%}\n{% if message['content'] is not none %}\n{{ '<|from|>assistant\n<|recipient|>all\n<|content|>' + message['content'] }}{% set contain_content='yes'%}\n{% endif %}\n{% if 'tool_calls' in message and message['tool_calls'] is not none %}\n{% for tool_call in message['tool_calls'] %}\n{% set prompt='<|from|>assistant\n<|recipient|>' + tool_call['function']['name'] + '\n<|content|>' + tool_call['function']['arguments'] %}\n{% if loop.index == 1 and contain_content == \"no\" %}\n{{ prompt }}{% else %}\n{{ '\n' + prompt}}{% endif %}\n{% endfor %}\n{% endif %}\n{{ '<|stop|>\n' }}{% endif %}\n{% endfor %}\n{% if add_generation_prompt %}{{ '<|from|>assistant\n<|recipient|>' }}{% endif %}", + "eos_token": "", + "bos_token": "" + }, + 
"hermes_2_pro_mistral": { + "template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}", + "eos_token": "<|im_end|>", + "bos_token": "" + }, + "llama2": { + "template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}", + "eos_token": "", + "bos_token": "" + }, +} +TEST_TEMPLATES = {k: ChatTemplate(**v) for k, v in TEST_TEMPLATES.items()} + +if __name__ == "__main__": + + failures = [] + + print(f'\nMessages:\n\n```js\n{json.dumps([m.model_dump() for m in TEST_MESSAGES], indent=2)}\n```\n') + + for model_name, chat_template in TEST_TEMPLATES.items(): + + # if model_name == 'hermes_2_pro_mistral': + # print("Skipping hermes_2_pro_mistral") + # continue + def check_finds(msgs, strings_to_find): + prompt = chat_template.render(msgs, add_generation_prompt=True) + for s in strings_to_find: + if str(s) not in prompt: + failures.append(f"Missing {s} in prompt for {model_name}:\n{prompt}") + + check_finds([PROMPT_MESSAGE], (QUESTION,)) + check_finds([ASSIST_MESSAGE], (ANSWER,)) + check_finds([TOOL_CALL_MESSAGE], (TEST_ARG_A, TEST_ARG_B, TOOL_NAME)) + check_finds([THOUGHTFUL_TOOL_CALL_MESSAGE], (TEST_THOUGHT, TEST_ARG_A, TEST_ARG_B, TOOL_NAME,)) + check_finds([TOOL_MESSAGE], (TEST_SUM,)) + if chat_template.potentially_supports_parallel_calls: + check_finds([TOOL_MESSAGE], (TOOL_NAME,)) + + print(f"\n# {model_name}\n") + print(f'\nTemplate:\n\n```js\n{chat_template.template}\n```\n') + + print(f'\nPrompt:\n\n```js\n{chat_template.render(TEST_MESSAGES, add_generation_prompt=True)}\n```\n') + + argss = { + "with tools": ChatHandlerArgs( + chat_template=chat_template, #ChatTemplate.from_gguf(GGUFKeyValues(model)), + response_schema=TEST_OUTPUT_SCHEMA, + tools=TEST_TOOLS, + ), + "without tools": ChatHandlerArgs( + chat_template=chat_template, #ChatTemplate.from_gguf(GGUFKeyValues(model)), + response_schema=TEST_OUTPUT_SCHEMA, + tools=[], + ), + } + + for style in ToolsPromptStyle: + if (style == ToolsPromptStyle.TYPESCRIPT_FUNCTIONARY_V2) != (model_name.startswith("functionary")): + continue + + if style == ToolsPromptStyle.TOOLS_MIXTRAL and model_name != "mistral_instruct_v0_1": + continue + + if model_name == "mistral_instruct_v0_1" and style not in (ToolsPromptStyle.TOOLS_THOUGHTFUL_STEPS, ToolsPromptStyle.TOOLS_MIXTRAL): + continue + + print(f'\n## {style}\n') + + for tn, args in argss.items(): + ch = get_chat_handler(args, parallel_calls=True, tool_style=style) + + print(f'\n### {tn}\n') + + print(f'\nPrompt:\n\n```json\n{ch.output_format_prompt.content}\n```\n') + + print(f'\nGrammar:\n\n```js\n{ch.grammar}\n```\n') + + if failures: + for f in failures: + print(f'{f}\n\n') + + assert not failures + # 
test_templates([ + # Message(**{ + # "role": "user", + # "name": None, + # "tool_call_id": None, + # "content": "What is the sum of 2535 squared and 32222000403 then multiplied by one and a half. What's a third of the result?", + # "tool_calls": None + # }), + # Message(**{ + # "role": "assistant", + # # "name": None, + # "tool_call_id": None, + # "content": "?", + # "tool_calls": [ + # { + # # "id": "call_531873", + # "type": "function", + # "function": { + # "name": TOOL_NAME, + # "arguments": { + # "a": 2535, + # "b": 32222000403 + # } + # } + # } + # ] + # }), + # Message(**{ + # "role": "tool", + # "name": TOOL_NAME, + # "tool_call_id": "call_531873", + # "content": "32222002938", + # "tool_calls": None + # }) + # ]) From ad2f4c119a59cc5187643f3996b3b2d678ca4d36 Mon Sep 17 00:00:00 2001 From: ochafik Date: Sat, 30 Mar 2024 01:10:14 +0000 Subject: [PATCH 30/68] Update test_chat_handlers.py --- examples/openai/test_chat_handlers.py | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/examples/openai/test_chat_handlers.py b/examples/openai/test_chat_handlers.py index 7b44d29ca3d87..33173ef064889 100644 --- a/examples/openai/test_chat_handlers.py +++ b/examples/openai/test_chat_handlers.py @@ -1,8 +1,9 @@ # # # python -m examples.openai.test_chat_handlers | tee examples/openai/test_chat_handlers.md - + import json +import sys from examples.openai.api import FunctionCall, Message, Tool, ToolCall, ToolFunction from examples.openai.prompting import ChatHandlerArgs, ChatTemplate, ToolsPromptStyle, get_chat_handler @@ -143,15 +144,26 @@ "bos_token": "" }, } +MODELS_WITH_PARALLEL_CALLS = set(["functionary_v2_2"]) TEST_TEMPLATES = {k: ChatTemplate(**v) for k, v in TEST_TEMPLATES.items()} if __name__ == "__main__": - + failures = [] print(f'\nMessages:\n\n```js\n{json.dumps([m.model_dump() for m in TEST_MESSAGES], indent=2)}\n```\n') + def check(b: bool, msg: str): + if not b: + sys.stderr.write(f'FAILURE: {msg}\n\n') + failures.append(msg) + + functionary_v2_2 = TEST_TEMPLATES["functionary_v2_2"] + check(functionary_v2_2.inferred_tool_style == ToolsPromptStyle.TYPESCRIPT_FUNCTIONARY_V2, "functionary_v2_2 should be inferred as TYPESCRIPT_FUNCTIONARY_V2") + for model_name, chat_template in TEST_TEMPLATES.items(): + check(chat_template.potentially_supports_parallel_calls == (model_name in MODELS_WITH_PARALLEL_CALLS), + f"{model_name} should {'not ' if model_name not in MODELS_WITH_PARALLEL_CALLS else ''} be detected as potentially supporting parallel calls") # if model_name == 'hermes_2_pro_mistral': # print("Skipping hermes_2_pro_mistral") @@ -159,8 +171,7 @@ def check_finds(msgs, strings_to_find): prompt = chat_template.render(msgs, add_generation_prompt=True) for s in strings_to_find: - if str(s) not in prompt: - failures.append(f"Missing {s} in prompt for {model_name}:\n{prompt}") + check(str(s) in prompt, f"Missing {s} in prompt for {model_name}:\n{prompt}") check_finds([PROMPT_MESSAGE], (QUESTION,)) check_finds([ASSIST_MESSAGE], (ANSWER,)) @@ -187,7 +198,7 @@ def check_finds(msgs, strings_to_find): tools=[], ), } - + for style in ToolsPromptStyle: if (style == ToolsPromptStyle.TYPESCRIPT_FUNCTIONARY_V2) != (model_name.startswith("functionary")): continue @@ -202,9 +213,9 @@ def check_finds(msgs, strings_to_find): for tn, args in argss.items(): ch = get_chat_handler(args, parallel_calls=True, tool_style=style) - + print(f'\n### {tn}\n') - + print(f'\nPrompt:\n\n```json\n{ch.output_format_prompt.content}\n```\n') 
print(f'\nGrammar:\n\n```js\n{ch.grammar}\n```\n') From 3c3eff52aa7226f2422c141ff2dabb9b80580e1b Mon Sep 17 00:00:00 2001 From: ochafik Date: Sat, 30 Mar 2024 01:15:46 +0000 Subject: [PATCH 31/68] openai: quiet + update prompt output --- examples/openai/prompting.py | 6 +- examples/openai/test_chat_handlers.md | 108 -------------------------- 2 files changed, 3 insertions(+), 111 deletions(-) diff --git a/examples/openai/prompting.py b/examples/openai/prompting.py index ab7aaa25ff55c..f274df6a8f562 100644 --- a/examples/openai/prompting.py +++ b/examples/openai/prompting.py @@ -71,7 +71,7 @@ def __init__(self, template: str, eos_token: str, bos_token: str): super().__init__(template=template, eos_token=eos_token, bos_token=bos_token) env = jinja2.Environment(loader=jinja2.BaseLoader(), trim_blocks=True, lstrip_blocks=True) self._template = env.from_string(template) - print(template) + # print(template) # self.expects_strict_user_assistant_alternance = "{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception" in template @@ -94,13 +94,13 @@ def test(messages: list[Message]): def succeeds(messages: list[Message], strings_to_find = ()): try: result = test(messages) - print(result) + # print(result) for s in strings_to_find: if s not in result: return False return True except Exception as e: - print(e) + # print(e) return False # if self.inferred_tool_style == ToolsPromptStyle.TYPESCRIPT_FUNCTIONARY_V2: diff --git a/examples/openai/test_chat_handlers.md b/examples/openai/test_chat_handlers.md index 3448d601249eb..6f80964e98d09 100644 --- a/examples/openai/test_chat_handlers.md +++ b/examples/openai/test_chat_handlers.md @@ -1,111 +1,3 @@ -{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %} -Conversation roles must alternate user/assistant/user/assistant/... -[INST] Hey [/INST]I, Robot -unsupported operand type(s) for +: 'NoneType' and 'str' -Conversation roles must alternate user/assistant/user/assistant/... -Conversation roles must alternate user/assistant/user/assistant/... 
-{#v2.2#} -{% for message in messages %} -{% if message['role'] == 'user' or message['role'] == 'system' %} -{{ '<|from|>' + message['role'] + ' -<|recipient|>all -<|content|>' + message['content'] + ' -' }}{% elif message['role'] == 'tool' %} -{{ '<|from|>' + message['name'] + ' -<|recipient|>all -<|content|>' + message['content'] + ' -' }}{% else %} -{% set contain_content='no'%} -{% if message['content'] is not none %} -{{ '<|from|>assistant -<|recipient|>all -<|content|>' + message['content'] }}{% set contain_content='yes'%} -{% endif %} -{% if 'tool_calls' in message and message['tool_calls'] is not none %} -{% for tool_call in message['tool_calls'] %} -{% set prompt='<|from|>assistant -<|recipient|>' + tool_call['function']['name'] + ' -<|content|>' + tool_call['function']['arguments'] %} -{% if loop.index == 1 and contain_content == "no" %} -{{ prompt }}{% else %} -{{ ' -' + prompt}}{% endif %} -{% endfor %} -{% endif %} -{{ '<|stop|> -' }}{% endif %} -{% endfor %} -{% if add_generation_prompt %}{{ '<|from|>assistant -<|recipient|>' }}{% endif %} -<|from|>assistant -<|recipient|>all -<|content|>I, Robot<|stop|> -<|from|>user -<|recipient|>all -<|content|>Hey -<|from|>assistant -<|recipient|> -<|from|>user -<|recipient|>all -<|content|>Hey -<|stop|> -<|from|>assistant -<|recipient|> -<|from|>user -<|recipient|>all -<|content|>Hey -<|from|>assistant -<|recipient|>all -<|content|>I, Robot<|stop|> -<|from|>additioner -<|recipient|>all -<|content|>Tool result -<|from|>assistant -<|recipient|> -<|from|>user -<|recipient|>all -<|content|>Hey -<|from|>assistant -<|recipient|>all -<|content|>I, Robot<|stop|> -<|from|>additioner -<|recipient|>all -<|content|>Tool result -<|from|>assistant -<|recipient|> -{% for message in messages %}{{'<|im_start|>' + message['role'] + ' -' + message['content'] + '<|im_end|>' + ' -'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant -' }}{% endif %} -<|im_start|>assistant -I, Robot<|im_end|> -<|im_start|>user -Hey<|im_end|> -<|im_start|>assistant - -can only concatenate str (not "NoneType") to str -<|im_start|>user -Hey<|im_end|> -<|im_start|>assistant -I, Robot<|im_end|> -<|im_start|>tool -Tool result<|im_end|> -<|im_start|>assistant - -<|im_start|>user -Hey<|im_end|> -<|im_start|>assistant -I, Robot<|im_end|> -<|im_start|>tool -Tool result<|im_end|> -<|im_start|>assistant - -{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\n' + system_message + '\n<>\n\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %} -Conversation roles must alternate user/assistant/user/assistant/... -[INST] Hey [/INST] I, Robot -'None' has no attribute 'strip' -Conversation roles must alternate user/assistant/user/assistant/... -Conversation roles must alternate user/assistant/user/assistant/... 
Messages: From 6935503b530ac5d0d4a1fb9bcd26f1fb6957950f Mon Sep 17 00:00:00 2001 From: ochafik Date: Sat, 30 Mar 2024 01:50:36 +0000 Subject: [PATCH 32/68] openai: refactor chat handler vs. template --- examples/openai/prompting.py | 115 +++-- examples/openai/server.py | 2 +- examples/openai/test_chat_handlers.md | 695 +++++++++++++++++++++++--- examples/openai/test_chat_handlers.py | 56 ++- 4 files changed, 717 insertions(+), 151 deletions(-) diff --git a/examples/openai/prompting.py b/examples/openai/prompting.py index f274df6a8f562..222e133f0de72 100644 --- a/examples/openai/prompting.py +++ b/examples/openai/prompting.py @@ -88,12 +88,9 @@ def __init__(self, template: str, eos_token: str, bos_token: str): def probe_template_capabilities(self): - def test(messages: list[Message]): - return self._template.render(messages=messages, eos_token=self.eos_token, bos_token=self.bos_token, raise_exception=raise_exception, add_generation_prompt=True) - def succeeds(messages: list[Message], strings_to_find = ()): try: - result = test(messages) + result = self.raw_render(messages, add_generation_prompt=True) # print(result) for s in strings_to_find: if s not in result: @@ -133,8 +130,8 @@ def extract_prefix_suffix_from_template(self): delimiter = '<%$[SAMPLE]$%>' user_msg = Message(role="user", content="Hey") - empty_prompt = self.render([user_msg], add_generation_prompt=True).strip() - planted_prompt = self.render([user_msg, Message(role="assistant", content=delimiter)], add_generation_prompt=False).strip() + empty_prompt = self.raw_render([user_msg], add_generation_prompt=True).strip() + planted_prompt = self.raw_render([user_msg, Message(role="assistant", content=delimiter)], add_generation_prompt=False).strip() assert planted_prompt.startswith(empty_prompt), f"Planted prompt does not start with empty prompt: {planted_prompt} vs {empty_prompt}" [prefix, suffix] = planted_prompt[len(empty_prompt):].split(delimiter) @@ -181,10 +178,59 @@ def from_huggingface(model_id: str): bos_token = tokenizer.bos_token, eos_token = tokenizer.eos_token) - def render(self, messages: list[Message], add_generation_prompt: bool, omit_bos: bool = False): + def raw_render(self, messages: list[Message], add_generation_prompt: bool, omit_bos: bool = False): + result = self._template.render( + messages=messages, + eos_token=self.eos_token, + bos_token='' if omit_bos else self.bos_token, + raise_exception=raise_exception, + add_generation_prompt=add_generation_prompt, + ) + return result + +class ChatHandlerArgs(BaseModel): + chat_template: ChatTemplate + response_schema: Optional[dict] = None + tools: Optional[list[Tool]] = None + +class ChatHandler(ABC): + def __init__(self, args: ChatHandlerArgs, style: Optional[ToolsPromptStyle]): + self.args = args + self.style = style + self.output_format_prompt: Optional[Message] = None + self.grammar: Optional[str] = None + + @abstractmethod + def parse(self, s: str) -> Optional[Message]: + raise NotImplementedError() + + def render_prompt(self, messages: list[Message]) -> str: def normalize(m: Message): + if self.style == ToolsPromptStyle.TOOLS_THOUGHTFUL_STEPS: + if m.tool_calls: + m = Message( + role=m.role, + content=json.dumps({ + _THOUGHT_KEY: m.content or '', + "next_step": { + "tool_calls": [tc.model_dump() for tc in m.tool_calls] + } + }, indent=2) + ) + else: + m = Message( + role=m.role, + content=json.dumps({ + _THOUGHT_KEY: '', + "next_step": { + "result": m.content + } + }, indent=2) + ) + # Fall through to benefit from role normalization + if m.tool_calls: - if not 
self.formats_tool_call or not self.formats_tool_call_content: + if not self.args.chat_template.formats_tool_call or not self.args.chat_template.formats_tool_call_content: return Message( role=m.role, content='\n'.join([ @@ -195,7 +241,7 @@ def normalize(m: Message): ]) ]) ) - elif self.expects_stringified_function_arguments: + elif self.args.chat_template.expects_stringified_function_arguments: return Message( role=m.role, content=m.content, @@ -215,7 +261,7 @@ def normalize(m: Message): ) else: return m - elif self.expects_strict_user_assistant_alternance and m.role not in ('user', 'assistant'): + elif self.args.chat_template.expects_strict_user_assistant_alternance and m.role not in ('user', 'assistant'): if m.role == "system": return Message(role="user", content=f'[SYS]{m.content}[/SYS]') elif m.role == "tool": @@ -228,7 +274,7 @@ def normalize(m: Message): messages=[normalize(m) for m in messages] - if self.expects_strict_user_assistant_alternance: + if self.args.chat_template.expects_strict_user_assistant_alternance: new_messages=[] current_role = 'user' current_content = [] @@ -237,7 +283,7 @@ def flush(): nonlocal current_content nonlocal current_role - if self.expects_strict_user_assistant_alternance or current_content: + if self.args.chat_template.expects_strict_user_assistant_alternance or current_content: new_messages.append(Message( role=current_role, content='\n'.join(current_content) @@ -263,7 +309,7 @@ def flush(): messages = [m.model_dump() for m in messages] # if self.inferred_tool_style == ToolsPromptStyle.TYPESCRIPT_FUNCTIONARY_V2: - if self.expects_stringified_function_arguments: + if self.args.chat_template.expects_stringified_function_arguments: messages = [ { **m, @@ -281,33 +327,14 @@ def flush(): for m in messages ] - result = self._template.render( + return self.args.chat_template.raw_render( messages=messages, - eos_token=self.eos_token, - bos_token='' if omit_bos else self.bos_token, - raise_exception=raise_exception, - add_generation_prompt=add_generation_prompt, + add_generation_prompt=True, ) - return result - -class ChatHandlerArgs(BaseModel): - chat_template: ChatTemplate - response_schema: Optional[dict] = None - tools: Optional[list[Tool]] = None - -class ChatHandler(ABC): - def __init__(self, args: ChatHandlerArgs): - self.args = args - self.output_format_prompt: Optional[Message] = None - self.grammar: Optional[str] = None - - @abstractmethod - def parse(self, s: str) -> Optional[Message]: - raise NotImplementedError() class NoToolsChatHandler(ChatHandler): def __init__(self, args: ChatHandlerArgs): - super().__init__(args) + super().__init__(args, None) assert not args.tools if args.response_schema: @@ -327,8 +354,8 @@ def parse(self, s: str) -> Optional[Message]: return Message(role="assistant", content=s) class ToolCallTagsChatHandler(ChatHandler): - def __init__(self, args: ChatHandlerArgs, escapes_underscores: bool, parallel_calls: bool): - super().__init__(args) + def __init__(self, args: ChatHandlerArgs, style: Optional[ToolsPromptStyle], escapes_underscores: bool, parallel_calls: bool): + super().__init__(args, style) converter = SchemaConverter(prop_order={}, allow_fetch=False, dotall=False, raw_pattern=False) tool_rules = [] @@ -404,8 +431,8 @@ def parse(self, s: str) -> Optional[Message]: class TemplatedToolsChatHandler(ToolCallTagsChatHandler): - def __init__(self, args: ChatHandlerArgs, template: str, parallel_calls: bool, escapes_underscores: bool = False): - super().__init__(args, escapes_underscores=escapes_underscores, 
parallel_calls=parallel_calls) + def __init__(self, args: ChatHandlerArgs, template: str, parallel_calls: bool, escapes_underscores: bool = False, style: Optional[ToolsPromptStyle] = None): + super().__init__(args, style=style, escapes_underscores=escapes_underscores, parallel_calls=parallel_calls) assert '{tools}' in template, 'Template must contain "{tools}"' self.output_format_prompt = Message( @@ -418,7 +445,7 @@ def __init__(self, args: ChatHandlerArgs, template: str, parallel_calls: bool, e class Hermes2ProToolsChatHandler(ToolCallTagsChatHandler): def __init__(self, args: ChatHandlerArgs, parallel_calls: bool): - super().__init__(args, escapes_underscores=False, parallel_calls=parallel_calls) + super().__init__(args, style=ToolsPromptStyle.TOOLS_HERMES_2_PRO, escapes_underscores=False, parallel_calls=parallel_calls) # Hackily import https://github.com/NousResearch/Hermes-Function-Calling path = str(Path(__file__).parent / "hermes_function_calling") @@ -434,7 +461,7 @@ def __init__(self, args: ChatHandlerArgs, parallel_calls: bool): class FunctionaryToolsChatHandler(ChatHandler): def __init__(self, args: ChatHandlerArgs, parallel_calls: bool): - super().__init__(args) + super().__init__(args, ToolsPromptStyle.TYPESCRIPT_FUNCTIONARY_V2) self.output_format_prompt = Message( role="system", @@ -541,9 +568,9 @@ def _make_bespoke_schema(response_schema, tool_call_schema, parallel_calls): # "required": ["next_step"] } -class BespokeToolsChatHandler(ChatHandler): +class ThoughtfulStepsToolsChatHandler(ChatHandler): def __init__(self, args: ChatHandlerArgs, parallel_calls: bool): - super().__init__(args) + super().__init__(args, ToolsPromptStyle.TOOLS_THOUGHTFUL_STEPS) # args.response_schema = args.response_schema or {} converter = SchemaConverter(prop_order={}, allow_fetch=False, dotall=False, raw_pattern=False) @@ -660,7 +687,7 @@ def get_chat_handler(args: ChatHandlerArgs, parallel_calls: bool, tool_style: Op return NoToolsChatHandler(args) elif tool_style == ToolsPromptStyle.TOOLS_THOUGHTFUL_STEPS: - return BespokeToolsChatHandler(args, parallel_calls=parallel_calls) + return ThoughtfulStepsToolsChatHandler(args, parallel_calls=parallel_calls) elif tool_style == ToolsPromptStyle.TYPESCRIPT_FUNCTIONARY_V2: return FunctionaryToolsChatHandler(args, parallel_calls=parallel_calls) diff --git a/examples/openai/server.py b/examples/openai/server.py index 6d19f12f1677b..474f07489a615 100644 --- a/examples/openai/server.py +++ b/examples/openai/server.py @@ -140,7 +140,7 @@ async def chat_completions(request: Request, chat_request: ChatCompletionRequest if chat_handler.output_format_prompt: messages = chat_template.add_system_prompt(messages, chat_handler.output_format_prompt) - prompt = chat_template.render(messages, add_generation_prompt=True) + prompt = chat_handler.render_prompt(messages) if verbose: sys.stderr.write(f'\n# REQUEST:\n\n{chat_request.model_dump_json(indent=2)}\n\n') diff --git a/examples/openai/test_chat_handlers.md b/examples/openai/test_chat_handlers.md index 6f80964e98d09..f5fc81c446ece 100644 --- a/examples/openai/test_chat_handlers.md +++ b/examples/openai/test_chat_handlers.md @@ -47,23 +47,6 @@ Messages: ``` -# mistral_instruct_v0_1 - - -Template: - -```js -{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 
'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %} -``` - - -Prompt: - -```js -[INST] Add two numbers for the purpose of this test. [/INST]{"id": "call_531873", "type": "function", "function": {"name": "superSecretTool", "arguments": {"a": 2535, "b": 32222000403}}}[INST] [TOOL(name=superSecretTool, id=call_531873)]32222002938[/TOOL] [/INST]The sum of 2535 and 32222000403 is 42. -``` - - ## ToolsPromptStyle.TOOLS_THOUGHTFUL_STEPS @@ -211,6 +194,55 @@ thought-about-next-step-only-kv ::= "\"thought_about_next_step_only\"" space ":" ``` +# mistral_instruct_v0_1 + + +Template: + +```js +{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %} +``` + + +Prompt: + +```js +[INST] { + "thought_about_next_step_only": "", + "next_step": { + "result": "Add two numbers for the purpose of this test." + } +} [/INST]{ + "thought_about_next_step_only": "", + "next_step": { + "tool_calls": [ + { + "id": "call_531873", + "type": "function", + "function": { + "name": "superSecretTool", + "arguments": { + "a": 2535, + "b": 32222000403 + } + } + } + ] + } +}[INST] [TOOL(name=None, id=None)]{ + "thought_about_next_step_only": "", + "next_step": { + "result": "32222002938" + } +}[/TOOL] [/INST]{ + "thought_about_next_step_only": "", + "next_step": { + "result": "The sum of 2535 and 32222000403 is 42." + } +} +``` + + ### without tools @@ -233,6 +265,23 @@ space ::= " "? ``` +# mistral_instruct_v0_1 + + +Template: + +```js +{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %} +``` + + +Prompt: + +```js +[INST] Add two numbers for the purpose of this test. [/INST]{"id": "call_531873", "type": "function", "function": {"name": "superSecretTool", "arguments": {"a": 2535, "b": 32222000403}}}[INST] [TOOL(name=superSecretTool, id=call_531873)]32222002938[/TOOL] [/INST]The sum of 2535 and 32222000403 is 42. +``` + + ## ToolsPromptStyle.TOOLS_MIXTRAL @@ -322,6 +371,23 @@ tool-call ::= "" space (superSecretTool-tool-call | say-t ``` +# mistral_instruct_v0_1 + + +Template: + +```js +{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %} +``` + + +Prompt: + +```js +[INST] Add two numbers for the purpose of this test. 
[/INST]{"id": "call_531873", "type": "function", "function": {"name": "superSecretTool", "arguments": {"a": 2535, "b": 32222000403}}}[INST] [TOOL(name=superSecretTool, id=call_531873)]32222002938[/TOOL] [/INST]The sum of 2535 and 32222000403 is 42. +``` + + ### without tools @@ -344,65 +410,20 @@ space ::= " "? ``` -# functionary_v2_2 +# mistral_instruct_v0_1 Template: ```js -{#v2.2#} -{% for message in messages %} -{% if message['role'] == 'user' or message['role'] == 'system' %} -{{ '<|from|>' + message['role'] + ' -<|recipient|>all -<|content|>' + message['content'] + ' -' }}{% elif message['role'] == 'tool' %} -{{ '<|from|>' + message['name'] + ' -<|recipient|>all -<|content|>' + message['content'] + ' -' }}{% else %} -{% set contain_content='no'%} -{% if message['content'] is not none %} -{{ '<|from|>assistant -<|recipient|>all -<|content|>' + message['content'] }}{% set contain_content='yes'%} -{% endif %} -{% if 'tool_calls' in message and message['tool_calls'] is not none %} -{% for tool_call in message['tool_calls'] %} -{% set prompt='<|from|>assistant -<|recipient|>' + tool_call['function']['name'] + ' -<|content|>' + tool_call['function']['arguments'] %} -{% if loop.index == 1 and contain_content == "no" %} -{{ prompt }}{% else %} -{{ ' -' + prompt}}{% endif %} -{% endfor %} -{% endif %} -{{ '<|stop|> -' }}{% endif %} -{% endfor %} -{% if add_generation_prompt %}{{ '<|from|>assistant -<|recipient|>' }}{% endif %} +{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %} ``` Prompt: ```js -<|from|>user -<|recipient|>all -<|content|>Add two numbers for the purpose of this test. -<|from|>assistant -<|recipient|>all -<|content|>{"id": "call_531873", "type": "function", "function": {"name": "superSecretTool", "arguments": {"a": 2535, "b": 32222000403}}}<|stop|> -<|from|>superSecretTool -<|recipient|>all -<|content|>32222002938 -<|from|>assistant -<|recipient|>all -<|content|>The sum of 2535 and 32222000403 is 42.<|stop|> -<|from|>assistant -<|recipient|> +[INST] Add two numbers for the purpose of this test. [/INST]{"id": "call_531873", "type": "function", "function": {"name": "superSecretTool", "arguments": {"a": 2535, "b": 32222000403}}}[INST] [TOOL(name=superSecretTool, id=call_531873)]32222002938[/TOOL] [/INST]The sum of 2535 and 32222000403 is 42. 
``` @@ -460,6 +481,68 @@ tool-call-without-start ::= superSecretTool-call | say-call ``` +# functionary_v2_2 + + +Template: + +```js +{#v2.2#} +{% for message in messages %} +{% if message['role'] == 'user' or message['role'] == 'system' %} +{{ '<|from|>' + message['role'] + ' +<|recipient|>all +<|content|>' + message['content'] + ' +' }}{% elif message['role'] == 'tool' %} +{{ '<|from|>' + message['name'] + ' +<|recipient|>all +<|content|>' + message['content'] + ' +' }}{% else %} +{% set contain_content='no'%} +{% if message['content'] is not none %} +{{ '<|from|>assistant +<|recipient|>all +<|content|>' + message['content'] }}{% set contain_content='yes'%} +{% endif %} +{% if 'tool_calls' in message and message['tool_calls'] is not none %} +{% for tool_call in message['tool_calls'] %} +{% set prompt='<|from|>assistant +<|recipient|>' + tool_call['function']['name'] + ' +<|content|>' + tool_call['function']['arguments'] %} +{% if loop.index == 1 and contain_content == "no" %} +{{ prompt }}{% else %} +{{ ' +' + prompt}}{% endif %} +{% endfor %} +{% endif %} +{{ '<|stop|> +' }}{% endif %} +{% endfor %} +{% if add_generation_prompt %}{{ '<|from|>assistant +<|recipient|>' }}{% endif %} +``` + + +Prompt: + +```js +<|from|>user +<|recipient|>all +<|content|>Add two numbers for the purpose of this test. +<|from|>assistant +<|recipient|>all +<|content|>{"id": "call_531873", "type": "function", "function": {"name": "superSecretTool", "arguments": {"a": 2535, "b": 32222000403}}}<|stop|> +<|from|>superSecretTool +<|recipient|>all +<|content|>32222002938 +<|from|>assistant +<|recipient|>all +<|content|>The sum of 2535 and 32222000403 is 42.<|stop|> +<|from|>assistant +<|recipient|> +``` + + ### without tools @@ -482,32 +565,65 @@ space ::= " "? ``` -# hermes_2_pro_mistral +# functionary_v2_2 Template: ```js -{% for message in messages %}{{'<|im_start|>' + message['role'] + ' -' + message['content'] + '<|im_end|>' + ' -'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant +{#v2.2#} +{% for message in messages %} +{% if message['role'] == 'user' or message['role'] == 'system' %} +{{ '<|from|>' + message['role'] + ' +<|recipient|>all +<|content|>' + message['content'] + ' +' }}{% elif message['role'] == 'tool' %} +{{ '<|from|>' + message['name'] + ' +<|recipient|>all +<|content|>' + message['content'] + ' +' }}{% else %} +{% set contain_content='no'%} +{% if message['content'] is not none %} +{{ '<|from|>assistant +<|recipient|>all +<|content|>' + message['content'] }}{% set contain_content='yes'%} +{% endif %} +{% if 'tool_calls' in message and message['tool_calls'] is not none %} +{% for tool_call in message['tool_calls'] %} +{% set prompt='<|from|>assistant +<|recipient|>' + tool_call['function']['name'] + ' +<|content|>' + tool_call['function']['arguments'] %} +{% if loop.index == 1 and contain_content == "no" %} +{{ prompt }}{% else %} +{{ ' +' + prompt}}{% endif %} +{% endfor %} +{% endif %} +{{ '<|stop|> ' }}{% endif %} +{% endfor %} +{% if add_generation_prompt %}{{ '<|from|>assistant +<|recipient|>' }}{% endif %} ``` Prompt: ```js -<|im_start|>user -Add two numbers for the purpose of this test.<|im_end|> -<|im_start|>assistant -{"id": "call_531873", "type": "function", "function": {"name": "superSecretTool", "arguments": {"a": 2535, "b": 32222000403}}}<|im_end|> -<|im_start|>tool -32222002938<|im_end|> -<|im_start|>assistant -The sum of 2535 and 32222000403 is 42.<|im_end|> -<|im_start|>assistant - +<|from|>user +<|recipient|>all +<|content|>Add two numbers for the purpose of 
this test. +<|from|>assistant +<|recipient|>all +<|content|>{"id": "call_531873", "type": "function", "function": {"name": "superSecretTool", "arguments": {"a": 2535, "b": 32222000403}}}<|stop|> +<|from|>superSecretTool +<|recipient|>all +<|content|>32222002938 +<|from|>assistant +<|recipient|>all +<|content|>The sum of 2535 and 32222000403 is 42.<|stop|> +<|from|>assistant +<|recipient|> ``` @@ -595,6 +711,35 @@ tool-call ::= "" space (superSecretTool-tool-call | say-tool-call) s ``` +# hermes_2_pro_mistral + + +Template: + +```js +{% for message in messages %}{{'<|im_start|>' + message['role'] + ' +' + message['content'] + '<|im_end|>' + ' +'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant +' }}{% endif %} +``` + + +Prompt: + +```js +<|im_start|>user +Add two numbers for the purpose of this test.<|im_end|> +<|im_start|>assistant +{"id": "call_531873", "type": "function", "function": {"name": "superSecretTool", "arguments": {"a": 2535, "b": 32222000403}}}<|im_end|> +<|im_start|>tool +32222002938<|im_end|> +<|im_start|>assistant +The sum of 2535 and 32222000403 is 42.<|im_end|> +<|im_start|>assistant + +``` + + ### without tools @@ -617,6 +762,35 @@ space ::= " "? ``` +# hermes_2_pro_mistral + + +Template: + +```js +{% for message in messages %}{{'<|im_start|>' + message['role'] + ' +' + message['content'] + '<|im_end|>' + ' +'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant +' }}{% endif %} +``` + + +Prompt: + +```js +<|im_start|>user +Add two numbers for the purpose of this test.<|im_end|> +<|im_start|>assistant +{"id": "call_531873", "type": "function", "function": {"name": "superSecretTool", "arguments": {"a": 2535, "b": 32222000403}}}<|im_end|> +<|im_start|>tool +32222002938<|im_end|> +<|im_start|>assistant +The sum of 2535 and 32222000403 is 42.<|im_end|> +<|im_start|>assistant + +``` + + ## ToolsPromptStyle.TOOLS_LONG @@ -706,6 +880,35 @@ tool-call ::= "" space (superSecretTool-tool-call | say-tool-call) s ``` +# hermes_2_pro_mistral + + +Template: + +```js +{% for message in messages %}{{'<|im_start|>' + message['role'] + ' +' + message['content'] + '<|im_end|>' + ' +'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant +' }}{% endif %} +``` + + +Prompt: + +```js +<|im_start|>user +Add two numbers for the purpose of this test.<|im_end|> +<|im_start|>assistant +{"id": "call_531873", "type": "function", "function": {"name": "superSecretTool", "arguments": {"a": 2535, "b": 32222000403}}}<|im_end|> +<|im_start|>tool +32222002938<|im_end|> +<|im_start|>assistant +The sum of 2535 and 32222000403 is 42.<|im_end|> +<|im_start|>assistant + +``` + + ### without tools @@ -728,6 +931,35 @@ space ::= " "? 
``` +# hermes_2_pro_mistral + + +Template: + +```js +{% for message in messages %}{{'<|im_start|>' + message['role'] + ' +' + message['content'] + '<|im_end|>' + ' +'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant +' }}{% endif %} +``` + + +Prompt: + +```js +<|im_start|>user +Add two numbers for the purpose of this test.<|im_end|> +<|im_start|>assistant +{"id": "call_531873", "type": "function", "function": {"name": "superSecretTool", "arguments": {"a": 2535, "b": 32222000403}}}<|im_end|> +<|im_start|>tool +32222002938<|im_end|> +<|im_start|>assistant +The sum of 2535 and 32222000403 is 42.<|im_end|> +<|im_start|>assistant + +``` + + ## ToolsPromptStyle.TOOLS_THOUGHTFUL_STEPS @@ -875,6 +1107,67 @@ thought-about-next-step-only-kv ::= "\"thought_about_next_step_only\"" space ":" ``` +# hermes_2_pro_mistral + + +Template: + +```js +{% for message in messages %}{{'<|im_start|>' + message['role'] + ' +' + message['content'] + '<|im_end|>' + ' +'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant +' }}{% endif %} +``` + + +Prompt: + +```js +<|im_start|>user +{ + "thought_about_next_step_only": "", + "next_step": { + "result": "Add two numbers for the purpose of this test." + } +}<|im_end|> +<|im_start|>assistant +{ + "thought_about_next_step_only": "", + "next_step": { + "tool_calls": [ + { + "id": "call_531873", + "type": "function", + "function": { + "name": "superSecretTool", + "arguments": { + "a": 2535, + "b": 32222000403 + } + } + } + ] + } +}<|im_end|> +<|im_start|>tool +{ + "thought_about_next_step_only": "", + "next_step": { + "result": "32222002938" + } +}<|im_end|> +<|im_start|>assistant +{ + "thought_about_next_step_only": "", + "next_step": { + "result": "The sum of 2535 and 32222000403 is 42." + } +}<|im_end|> +<|im_start|>assistant + +``` + + ### without tools @@ -897,6 +1190,35 @@ space ::= " "? ``` +# hermes_2_pro_mistral + + +Template: + +```js +{% for message in messages %}{{'<|im_start|>' + message['role'] + ' +' + message['content'] + '<|im_end|>' + ' +'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant +' }}{% endif %} +``` + + +Prompt: + +```js +<|im_start|>user +Add two numbers for the purpose of this test.<|im_end|> +<|im_start|>assistant +{"id": "call_531873", "type": "function", "function": {"name": "superSecretTool", "arguments": {"a": 2535, "b": 32222000403}}}<|im_end|> +<|im_start|>tool +32222002938<|im_end|> +<|im_start|>assistant +The sum of 2535 and 32222000403 is 42.<|im_end|> +<|im_start|>assistant + +``` + + ## ToolsPromptStyle.TOOLS_HERMES_2_PRO @@ -949,6 +1271,35 @@ tool-call ::= "" space (superSecretTool-tool-call | say-tool-call) s ``` +# hermes_2_pro_mistral + + +Template: + +```js +{% for message in messages %}{{'<|im_start|>' + message['role'] + ' +' + message['content'] + '<|im_end|>' + ' +'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant +' }}{% endif %} +``` + + +Prompt: + +```js +<|im_start|>user +Add two numbers for the purpose of this test.<|im_end|> +<|im_start|>assistant +{"id": "call_531873", "type": "function", "function": {"name": "superSecretTool", "arguments": {"a": 2535, "b": 32222000403}}}<|im_end|> +<|im_start|>tool +32222002938<|im_end|> +<|im_start|>assistant +The sum of 2535 and 32222000403 is 42.<|im_end|> +<|im_start|>assistant + +``` + + ### without tools @@ -971,20 +1322,32 @@ space ::= " "? 
``` -# llama2 +# hermes_2_pro_mistral Template: ```js -{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\n' + system_message + '\n<>\n\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %} +{% for message in messages %}{{'<|im_start|>' + message['role'] + ' +' + message['content'] + '<|im_end|>' + ' +'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant +' }}{% endif %} ``` Prompt: ```js -[INST] Add two numbers for the purpose of this test. [/INST] {"id": "call_531873", "type": "function", "function": {"name": "superSecretTool", "arguments": {"a": 2535, "b": 32222000403}}} [INST] [TOOL(name=superSecretTool, id=call_531873)]32222002938[/TOOL] [/INST] The sum of 2535 and 32222000403 is 42. +<|im_start|>user +Add two numbers for the purpose of this test.<|im_end|> +<|im_start|>assistant +{"id": "call_531873", "type": "function", "function": {"name": "superSecretTool", "arguments": {"a": 2535, "b": 32222000403}}}<|im_end|> +<|im_start|>tool +32222002938<|im_end|> +<|im_start|>assistant +The sum of 2535 and 32222000403 is 42.<|im_end|> +<|im_start|>assistant + ``` @@ -1072,6 +1435,23 @@ tool-call ::= "" space (superSecretTool-tool-call | say-tool-call) s ``` +# llama2 + + +Template: + +```js +{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\n' + system_message + '\n<>\n\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %} +``` + + +Prompt: + +```js +[INST] Add two numbers for the purpose of this test. [/INST] {"id": "call_531873", "type": "function", "function": {"name": "superSecretTool", "arguments": {"a": 2535, "b": 32222000403}}} [INST] [TOOL(name=superSecretTool, id=call_531873)]32222002938[/TOOL] [/INST] The sum of 2535 and 32222000403 is 42. +``` + + ### without tools @@ -1094,6 +1474,23 @@ space ::= " "? 
``` +# llama2 + + +Template: + +```js +{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\n' + system_message + '\n<>\n\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %} +``` + + +Prompt: + +```js +[INST] Add two numbers for the purpose of this test. [/INST] {"id": "call_531873", "type": "function", "function": {"name": "superSecretTool", "arguments": {"a": 2535, "b": 32222000403}}} [INST] [TOOL(name=superSecretTool, id=call_531873)]32222002938[/TOOL] [/INST] The sum of 2535 and 32222000403 is 42. +``` + + ## ToolsPromptStyle.TOOLS_LONG @@ -1183,6 +1580,23 @@ tool-call ::= "" space (superSecretTool-tool-call | say-tool-call) s ``` +# llama2 + + +Template: + +```js +{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\n' + system_message + '\n<>\n\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %} +``` + + +Prompt: + +```js +[INST] Add two numbers for the purpose of this test. [/INST] {"id": "call_531873", "type": "function", "function": {"name": "superSecretTool", "arguments": {"a": 2535, "b": 32222000403}}} [INST] [TOOL(name=superSecretTool, id=call_531873)]32222002938[/TOOL] [/INST] The sum of 2535 and 32222000403 is 42. +``` + + ### without tools @@ -1205,6 +1619,23 @@ space ::= " "? ``` +# llama2 + + +Template: + +```js +{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\n' + system_message + '\n<>\n\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %} +``` + + +Prompt: + +```js +[INST] Add two numbers for the purpose of this test. 
[/INST] {"id": "call_531873", "type": "function", "function": {"name": "superSecretTool", "arguments": {"a": 2535, "b": 32222000403}}} [INST] [TOOL(name=superSecretTool, id=call_531873)]32222002938[/TOOL] [/INST] The sum of 2535 and 32222000403 is 42. +``` + + ## ToolsPromptStyle.TOOLS_THOUGHTFUL_STEPS @@ -1352,6 +1783,55 @@ thought-about-next-step-only-kv ::= "\"thought_about_next_step_only\"" space ":" ``` +# llama2 + + +Template: + +```js +{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\n' + system_message + '\n<>\n\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %} +``` + + +Prompt: + +```js +[INST] { + "thought_about_next_step_only": "", + "next_step": { + "result": "Add two numbers for the purpose of this test." + } +} [/INST] { + "thought_about_next_step_only": "", + "next_step": { + "tool_calls": [ + { + "id": "call_531873", + "type": "function", + "function": { + "name": "superSecretTool", + "arguments": { + "a": 2535, + "b": 32222000403 + } + } + } + ] + } +} [INST] [TOOL(name=None, id=None)]{ + "thought_about_next_step_only": "", + "next_step": { + "result": "32222002938" + } +}[/TOOL] [/INST] { + "thought_about_next_step_only": "", + "next_step": { + "result": "The sum of 2535 and 32222000403 is 42." + } +} +``` + + ### without tools @@ -1374,6 +1854,23 @@ space ::= " "? ``` +# llama2 + + +Template: + +```js +{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\n' + system_message + '\n<>\n\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %} +``` + + +Prompt: + +```js +[INST] Add two numbers for the purpose of this test. [/INST] {"id": "call_531873", "type": "function", "function": {"name": "superSecretTool", "arguments": {"a": 2535, "b": 32222000403}}} [INST] [TOOL(name=superSecretTool, id=call_531873)]32222002938[/TOOL] [/INST] The sum of 2535 and 32222000403 is 42. 
+``` + + ## ToolsPromptStyle.TOOLS_HERMES_2_PRO @@ -1426,6 +1923,23 @@ tool-call ::= "" space (superSecretTool-tool-call | say-tool-call) s ``` +# llama2 + + +Template: + +```js +{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\n' + system_message + '\n<>\n\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %} +``` + + +Prompt: + +```js +[INST] Add two numbers for the purpose of this test. [/INST] {"id": "call_531873", "type": "function", "function": {"name": "superSecretTool", "arguments": {"a": 2535, "b": 32222000403}}} [INST] [TOOL(name=superSecretTool, id=call_531873)]32222002938[/TOOL] [/INST] The sum of 2535 and 32222000403 is 42. +``` + + ### without tools @@ -1447,3 +1961,20 @@ root ::= ("-"? integral-part) space space ::= " "? ``` + +# llama2 + + +Template: + +```js +{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\n' + system_message + '\n<>\n\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %} +``` + + +Prompt: + +```js +[INST] Add two numbers for the purpose of this test. [/INST] {"id": "call_531873", "type": "function", "function": {"name": "superSecretTool", "arguments": {"a": 2535, "b": 32222000403}}} [INST] [TOOL(name=superSecretTool, id=call_531873)]32222002938[/TOOL] [/INST] The sum of 2535 and 32222000403 is 42. 
+``` + diff --git a/examples/openai/test_chat_handlers.py b/examples/openai/test_chat_handlers.py index 33173ef064889..fc4b1680bddae 100644 --- a/examples/openai/test_chat_handlers.py +++ b/examples/openai/test_chat_handlers.py @@ -165,27 +165,6 @@ def check(b: bool, msg: str): check(chat_template.potentially_supports_parallel_calls == (model_name in MODELS_WITH_PARALLEL_CALLS), f"{model_name} should {'not ' if model_name not in MODELS_WITH_PARALLEL_CALLS else ''} be detected as potentially supporting parallel calls") - # if model_name == 'hermes_2_pro_mistral': - # print("Skipping hermes_2_pro_mistral") - # continue - def check_finds(msgs, strings_to_find): - prompt = chat_template.render(msgs, add_generation_prompt=True) - for s in strings_to_find: - check(str(s) in prompt, f"Missing {s} in prompt for {model_name}:\n{prompt}") - - check_finds([PROMPT_MESSAGE], (QUESTION,)) - check_finds([ASSIST_MESSAGE], (ANSWER,)) - check_finds([TOOL_CALL_MESSAGE], (TEST_ARG_A, TEST_ARG_B, TOOL_NAME)) - check_finds([THOUGHTFUL_TOOL_CALL_MESSAGE], (TEST_THOUGHT, TEST_ARG_A, TEST_ARG_B, TOOL_NAME,)) - check_finds([TOOL_MESSAGE], (TEST_SUM,)) - if chat_template.potentially_supports_parallel_calls: - check_finds([TOOL_MESSAGE], (TOOL_NAME,)) - - print(f"\n# {model_name}\n") - print(f'\nTemplate:\n\n```js\n{chat_template.template}\n```\n') - - print(f'\nPrompt:\n\n```js\n{chat_template.render(TEST_MESSAGES, add_generation_prompt=True)}\n```\n') - argss = { "with tools": ChatHandlerArgs( chat_template=chat_template, #ChatTemplate.from_gguf(GGUFKeyValues(model)), @@ -199,6 +178,13 @@ def check_finds(msgs, strings_to_find): ), } + print(f"\n# {model_name}\n") + + if chat_template.potentially_supports_parallel_calls: + print("\n**Might Support Parallel Tool Calls**\n") + + print(f'\nTemplate:\n\n```js\n{chat_template.template}\n```\n') + for style in ToolsPromptStyle: if (style == ToolsPromptStyle.TYPESCRIPT_FUNCTIONARY_V2) != (model_name.startswith("functionary")): continue @@ -209,17 +195,39 @@ def check_finds(msgs, strings_to_find): if model_name == "mistral_instruct_v0_1" and style not in (ToolsPromptStyle.TOOLS_THOUGHTFUL_STEPS, ToolsPromptStyle.TOOLS_MIXTRAL): continue - print(f'\n## {style}\n') + print(f'\n## {model_name} / {style.name}\n') + - for tn, args in argss.items(): + for tool_situation, args in argss.items(): ch = get_chat_handler(args, parallel_calls=True, tool_style=style) - print(f'\n### {tn}\n') + print(f'\n### {model_name} / {style.name} / {tool_situation}\n') + + print(f'\nPrompt:\n\n```js\n{ch.render_prompt(TEST_MESSAGES)}\n```\n') print(f'\nPrompt:\n\n```json\n{ch.output_format_prompt.content}\n```\n') print(f'\nGrammar:\n\n```js\n{ch.grammar}\n```\n') + + # if model_name == 'hermes_2_pro_mistral': + # print("Skipping hermes_2_pro_mistral") + # continue + def check_finds(msgs, strings_to_find): + prompt = ch.render_prompt(msgs) + for s in strings_to_find: + check(str(s) in prompt, f"Missing {s} in prompt for {model_name}:\n{prompt}") + + check_finds([PROMPT_MESSAGE], (QUESTION,)) + check_finds([ASSIST_MESSAGE], (ANSWER,)) + check_finds([TOOL_CALL_MESSAGE], (TEST_ARG_A, TEST_ARG_B, TOOL_NAME)) + check_finds([THOUGHTFUL_TOOL_CALL_MESSAGE], (TEST_THOUGHT, TEST_ARG_A, TEST_ARG_B, TOOL_NAME,)) + check_finds([TOOL_MESSAGE], (TEST_SUM,)) + if chat_template.potentially_supports_parallel_calls: + check_finds([TOOL_MESSAGE], (TOOL_NAME,)) + + + if failures: for f in failures: print(f'{f}\n\n') From d9f30f86c8b1201f7916c67d770bc5c1f393461a Mon Sep 17 00:00:00 2001 From: ochafik Date: Sat, 30 Mar 
2024 01:50:44 +0000 Subject: [PATCH 33/68] Update test_chat_handlers.md --- examples/openai/test_chat_handlers.md | 702 +++++++++----------------- 1 file changed, 236 insertions(+), 466 deletions(-) diff --git a/examples/openai/test_chat_handlers.md b/examples/openai/test_chat_handlers.md index f5fc81c446ece..f01bbce21ffa8 100644 --- a/examples/openai/test_chat_handlers.md +++ b/examples/openai/test_chat_handlers.md @@ -47,10 +47,59 @@ Messages: ``` -## ToolsPromptStyle.TOOLS_THOUGHTFUL_STEPS +# mistral_instruct_v0_1 + + +Template: + +```js +{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %} +``` + +## mistral_instruct_v0_1 / TOOLS_THOUGHTFUL_STEPS -### with tools + +### mistral_instruct_v0_1 / TOOLS_THOUGHTFUL_STEPS / with tools + + +Prompt: + +```js +[INST] { + "thought_about_next_step_only": "", + "next_step": { + "result": "Add two numbers for the purpose of this test." + } +} [/INST]{ + "thought_about_next_step_only": "", + "next_step": { + "tool_calls": [ + { + "id": "call_531873", + "type": "function", + "function": { + "name": "superSecretTool", + "arguments": { + "a": 2535, + "b": 32222000403 + } + } + } + ] + } +}[INST] [TOOL(name=None, id=None)]{ + "thought_about_next_step_only": "", + "next_step": { + "result": "32222002938" + } +}[/TOOL] [/INST]{ + "thought_about_next_step_only": "", + "next_step": { + "result": "The sum of 2535 and 32222000403 is 42." + } +} +``` Prompt: @@ -194,58 +243,16 @@ thought-about-next-step-only-kv ::= "\"thought_about_next_step_only\"" space ":" ``` -# mistral_instruct_v0_1 - - -Template: - -```js -{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %} -``` +### mistral_instruct_v0_1 / TOOLS_THOUGHTFUL_STEPS / without tools Prompt: ```js -[INST] { - "thought_about_next_step_only": "", - "next_step": { - "result": "Add two numbers for the purpose of this test." - } -} [/INST]{ - "thought_about_next_step_only": "", - "next_step": { - "tool_calls": [ - { - "id": "call_531873", - "type": "function", - "function": { - "name": "superSecretTool", - "arguments": { - "a": 2535, - "b": 32222000403 - } - } - } - ] - } -}[INST] [TOOL(name=None, id=None)]{ - "thought_about_next_step_only": "", - "next_step": { - "result": "32222002938" - } -}[/TOOL] [/INST]{ - "thought_about_next_step_only": "", - "next_step": { - "result": "The sum of 2535 and 32222000403 is 42." - } -} +[INST] Add two numbers for the purpose of this test. [/INST]{"id": "call_531873", "type": "function", "function": {"name": "superSecretTool", "arguments": {"a": 2535, "b": 32222000403}}}[INST] [TOOL(name=superSecretTool, id=call_531873)]32222002938[/TOOL] [/INST]The sum of 2535 and 32222000403 is 42. 
``` -### without tools - - Prompt: ```json @@ -265,14 +272,10 @@ space ::= " "? ``` -# mistral_instruct_v0_1 - +## mistral_instruct_v0_1 / TOOLS_MIXTRAL -Template: -```js -{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %} -``` +### mistral_instruct_v0_1 / TOOLS_MIXTRAL / with tools Prompt: @@ -282,12 +285,6 @@ Prompt: ``` -## ToolsPromptStyle.TOOLS_MIXTRAL - - -### with tools - - Prompt: ```json @@ -371,14 +368,7 @@ tool-call ::= "" space (superSecretTool-tool-call | say-t ``` -# mistral_instruct_v0_1 - - -Template: - -```js -{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %} -``` +### mistral_instruct_v0_1 / TOOLS_MIXTRAL / without tools Prompt: @@ -388,9 +378,6 @@ Prompt: ``` -### without tools - - Prompt: ```json @@ -410,27 +397,75 @@ space ::= " "? ``` -# mistral_instruct_v0_1 +# functionary_v2_2 + + +**Might Support Parallel Tool Calls** Template: ```js -{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %} +{#v2.2#} +{% for message in messages %} +{% if message['role'] == 'user' or message['role'] == 'system' %} +{{ '<|from|>' + message['role'] + ' +<|recipient|>all +<|content|>' + message['content'] + ' +' }}{% elif message['role'] == 'tool' %} +{{ '<|from|>' + message['name'] + ' +<|recipient|>all +<|content|>' + message['content'] + ' +' }}{% else %} +{% set contain_content='no'%} +{% if message['content'] is not none %} +{{ '<|from|>assistant +<|recipient|>all +<|content|>' + message['content'] }}{% set contain_content='yes'%} +{% endif %} +{% if 'tool_calls' in message and message['tool_calls'] is not none %} +{% for tool_call in message['tool_calls'] %} +{% set prompt='<|from|>assistant +<|recipient|>' + tool_call['function']['name'] + ' +<|content|>' + tool_call['function']['arguments'] %} +{% if loop.index == 1 and contain_content == "no" %} +{{ prompt }}{% else %} +{{ ' +' + prompt}}{% endif %} +{% endfor %} +{% endif %} +{{ '<|stop|> +' }}{% endif %} +{% endfor %} +{% if add_generation_prompt %}{{ '<|from|>assistant +<|recipient|>' }}{% endif %} ``` -Prompt: +## functionary_v2_2 / TYPESCRIPT_FUNCTIONARY_V2 -```js -[INST] Add two numbers for the purpose of this test. 
[/INST]{"id": "call_531873", "type": "function", "function": {"name": "superSecretTool", "arguments": {"a": 2535, "b": 32222000403}}}[INST] [TOOL(name=superSecretTool, id=call_531873)]32222002938[/TOOL] [/INST]The sum of 2535 and 32222000403 is 42. -``` +### functionary_v2_2 / TYPESCRIPT_FUNCTIONARY_V2 / with tools -## ToolsPromptStyle.TYPESCRIPT_FUNCTIONARY_V2 +Prompt: -### with tools +```js +<|from|>user +<|recipient|>all +<|content|>Add two numbers for the purpose of this test. +<|from|>assistant +<|recipient|>all +<|content|>{"id": "call_531873", "type": "function", "function": {"name": "superSecretTool", "arguments": {"a": 2535, "b": 32222000403}}}<|stop|> +<|from|>superSecretTool +<|recipient|>all +<|content|>32222002938 +<|from|>assistant +<|recipient|>all +<|content|>The sum of 2535 and 32222000403 is 42.<|stop|> +<|from|>assistant +<|recipient|> +``` Prompt: @@ -481,46 +516,7 @@ tool-call-without-start ::= superSecretTool-call | say-call ``` -# functionary_v2_2 - - -Template: - -```js -{#v2.2#} -{% for message in messages %} -{% if message['role'] == 'user' or message['role'] == 'system' %} -{{ '<|from|>' + message['role'] + ' -<|recipient|>all -<|content|>' + message['content'] + ' -' }}{% elif message['role'] == 'tool' %} -{{ '<|from|>' + message['name'] + ' -<|recipient|>all -<|content|>' + message['content'] + ' -' }}{% else %} -{% set contain_content='no'%} -{% if message['content'] is not none %} -{{ '<|from|>assistant -<|recipient|>all -<|content|>' + message['content'] }}{% set contain_content='yes'%} -{% endif %} -{% if 'tool_calls' in message and message['tool_calls'] is not none %} -{% for tool_call in message['tool_calls'] %} -{% set prompt='<|from|>assistant -<|recipient|>' + tool_call['function']['name'] + ' -<|content|>' + tool_call['function']['arguments'] %} -{% if loop.index == 1 and contain_content == "no" %} -{{ prompt }}{% else %} -{{ ' -' + prompt}}{% endif %} -{% endfor %} -{% endif %} -{{ '<|stop|> -' }}{% endif %} -{% endfor %} -{% if add_generation_prompt %}{{ '<|from|>assistant -<|recipient|>' }}{% endif %} -``` +### functionary_v2_2 / TYPESCRIPT_FUNCTIONARY_V2 / without tools Prompt: @@ -543,9 +539,6 @@ Prompt: ``` -### without tools - - Prompt: ```json @@ -565,72 +558,39 @@ space ::= " "? 
``` -# functionary_v2_2 +# hermes_2_pro_mistral Template: ```js -{#v2.2#} -{% for message in messages %} -{% if message['role'] == 'user' or message['role'] == 'system' %} -{{ '<|from|>' + message['role'] + ' -<|recipient|>all -<|content|>' + message['content'] + ' -' }}{% elif message['role'] == 'tool' %} -{{ '<|from|>' + message['name'] + ' -<|recipient|>all -<|content|>' + message['content'] + ' -' }}{% else %} -{% set contain_content='no'%} -{% if message['content'] is not none %} -{{ '<|from|>assistant -<|recipient|>all -<|content|>' + message['content'] }}{% set contain_content='yes'%} -{% endif %} -{% if 'tool_calls' in message and message['tool_calls'] is not none %} -{% for tool_call in message['tool_calls'] %} -{% set prompt='<|from|>assistant -<|recipient|>' + tool_call['function']['name'] + ' -<|content|>' + tool_call['function']['arguments'] %} -{% if loop.index == 1 and contain_content == "no" %} -{{ prompt }}{% else %} -{{ ' -' + prompt}}{% endif %} -{% endfor %} -{% endif %} -{{ '<|stop|> +{% for message in messages %}{{'<|im_start|>' + message['role'] + ' +' + message['content'] + '<|im_end|>' + ' +'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant ' }}{% endif %} -{% endfor %} -{% if add_generation_prompt %}{{ '<|from|>assistant -<|recipient|>' }}{% endif %} ``` -Prompt: +## hermes_2_pro_mistral / TOOLS_SHORT -```js -<|from|>user -<|recipient|>all -<|content|>Add two numbers for the purpose of this test. -<|from|>assistant -<|recipient|>all -<|content|>{"id": "call_531873", "type": "function", "function": {"name": "superSecretTool", "arguments": {"a": 2535, "b": 32222000403}}}<|stop|> -<|from|>superSecretTool -<|recipient|>all -<|content|>32222002938 -<|from|>assistant -<|recipient|>all -<|content|>The sum of 2535 and 32222000403 is 42.<|stop|> -<|from|>assistant -<|recipient|> -``` +### hermes_2_pro_mistral / TOOLS_SHORT / with tools -## ToolsPromptStyle.TOOLS_SHORT +Prompt: -### with tools +```js +<|im_start|>user +Add two numbers for the purpose of this test.<|im_end|> +<|im_start|>assistant +{"id": "call_531873", "type": "function", "function": {"name": "superSecretTool", "arguments": {"a": 2535, "b": 32222000403}}}<|im_end|> +<|im_start|>tool +32222002938<|im_end|> +<|im_start|>assistant +The sum of 2535 and 32222000403 is 42.<|im_end|> +<|im_start|>assistant + +``` Prompt: @@ -711,17 +671,7 @@ tool-call ::= "" space (superSecretTool-tool-call | say-tool-call) s ``` -# hermes_2_pro_mistral - - -Template: - -```js -{% for message in messages %}{{'<|im_start|>' + message['role'] + ' -' + message['content'] + '<|im_end|>' + ' -'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant -' }}{% endif %} -``` +### hermes_2_pro_mistral / TOOLS_SHORT / without tools Prompt: @@ -740,9 +690,6 @@ The sum of 2535 and 32222000403 is 42.<|im_end|> ``` -### without tools - - Prompt: ```json @@ -762,17 +709,10 @@ space ::= " "? 
``` -# hermes_2_pro_mistral - +## hermes_2_pro_mistral / TOOLS_LONG -Template: -```js -{% for message in messages %}{{'<|im_start|>' + message['role'] + ' -' + message['content'] + '<|im_end|>' + ' -'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant -' }}{% endif %} -``` +### hermes_2_pro_mistral / TOOLS_LONG / with tools Prompt: @@ -791,12 +731,6 @@ The sum of 2535 and 32222000403 is 42.<|im_end|> ``` -## ToolsPromptStyle.TOOLS_LONG - - -### with tools - - Prompt: ```json @@ -880,17 +814,7 @@ tool-call ::= "" space (superSecretTool-tool-call | say-tool-call) s ``` -# hermes_2_pro_mistral - - -Template: - -```js -{% for message in messages %}{{'<|im_start|>' + message['role'] + ' -' + message['content'] + '<|im_end|>' + ' -'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant -' }}{% endif %} -``` +### hermes_2_pro_mistral / TOOLS_LONG / without tools Prompt: @@ -909,9 +833,6 @@ The sum of 2535 and 32222000403 is 42.<|im_end|> ``` -### without tools - - Prompt: ```json @@ -931,41 +852,60 @@ space ::= " "? ``` -# hermes_2_pro_mistral +## hermes_2_pro_mistral / TOOLS_THOUGHTFUL_STEPS -Template: - -```js -{% for message in messages %}{{'<|im_start|>' + message['role'] + ' -' + message['content'] + '<|im_end|>' + ' -'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant -' }}{% endif %} -``` +### hermes_2_pro_mistral / TOOLS_THOUGHTFUL_STEPS / with tools Prompt: ```js <|im_start|>user -Add two numbers for the purpose of this test.<|im_end|> +{ + "thought_about_next_step_only": "", + "next_step": { + "result": "Add two numbers for the purpose of this test." + } +}<|im_end|> <|im_start|>assistant -{"id": "call_531873", "type": "function", "function": {"name": "superSecretTool", "arguments": {"a": 2535, "b": 32222000403}}}<|im_end|> +{ + "thought_about_next_step_only": "", + "next_step": { + "tool_calls": [ + { + "id": "call_531873", + "type": "function", + "function": { + "name": "superSecretTool", + "arguments": { + "a": 2535, + "b": 32222000403 + } + } + } + ] + } +}<|im_end|> <|im_start|>tool -32222002938<|im_end|> +{ + "thought_about_next_step_only": "", + "next_step": { + "result": "32222002938" + } +}<|im_end|> <|im_start|>assistant -The sum of 2535 and 32222000403 is 42.<|im_end|> +{ + "thought_about_next_step_only": "", + "next_step": { + "result": "The sum of 2535 and 32222000403 is 42." + } +}<|im_end|> <|im_start|>assistant ``` -## ToolsPromptStyle.TOOLS_THOUGHTFUL_STEPS - - -### with tools - - Prompt: ```json @@ -1103,74 +1043,29 @@ string ::= "\"" ( [^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]) )* "\"" space -thought-about-next-step-only-kv ::= "\"thought_about_next_step_only\"" space ":" space string -``` - - -# hermes_2_pro_mistral - +thought-about-next-step-only-kv ::= "\"thought_about_next_step_only\"" space ":" space string +``` -Template: -```js -{% for message in messages %}{{'<|im_start|>' + message['role'] + ' -' + message['content'] + '<|im_end|>' + ' -'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant -' }}{% endif %} -``` +### hermes_2_pro_mistral / TOOLS_THOUGHTFUL_STEPS / without tools Prompt: ```js <|im_start|>user -{ - "thought_about_next_step_only": "", - "next_step": { - "result": "Add two numbers for the purpose of this test." 
- } -}<|im_end|> +Add two numbers for the purpose of this test.<|im_end|> <|im_start|>assistant -{ - "thought_about_next_step_only": "", - "next_step": { - "tool_calls": [ - { - "id": "call_531873", - "type": "function", - "function": { - "name": "superSecretTool", - "arguments": { - "a": 2535, - "b": 32222000403 - } - } - } - ] - } -}<|im_end|> +{"id": "call_531873", "type": "function", "function": {"name": "superSecretTool", "arguments": {"a": 2535, "b": 32222000403}}}<|im_end|> <|im_start|>tool -{ - "thought_about_next_step_only": "", - "next_step": { - "result": "32222002938" - } -}<|im_end|> +32222002938<|im_end|> <|im_start|>assistant -{ - "thought_about_next_step_only": "", - "next_step": { - "result": "The sum of 2535 and 32222000403 is 42." - } -}<|im_end|> +The sum of 2535 and 32222000403 is 42.<|im_end|> <|im_start|>assistant ``` -### without tools - - Prompt: ```json @@ -1190,17 +1085,10 @@ space ::= " "? ``` -# hermes_2_pro_mistral - +## hermes_2_pro_mistral / TOOLS_HERMES_2_PRO -Template: -```js -{% for message in messages %}{{'<|im_start|>' + message['role'] + ' -' + message['content'] + '<|im_end|>' + ' -'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant -' }}{% endif %} -``` +### hermes_2_pro_mistral / TOOLS_HERMES_2_PRO / with tools Prompt: @@ -1219,12 +1107,6 @@ The sum of 2535 and 32222000403 is 42.<|im_end|> ``` -## ToolsPromptStyle.TOOLS_HERMES_2_PRO - - -### with tools - - Prompt: ```json @@ -1271,17 +1153,7 @@ tool-call ::= "" space (superSecretTool-tool-call | say-tool-call) s ``` -# hermes_2_pro_mistral - - -Template: - -```js -{% for message in messages %}{{'<|im_start|>' + message['role'] + ' -' + message['content'] + '<|im_end|>' + ' -'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant -' }}{% endif %} -``` +### hermes_2_pro_mistral / TOOLS_HERMES_2_PRO / without tools Prompt: @@ -1300,9 +1172,6 @@ The sum of 2535 and 32222000403 is 42.<|im_end|> ``` -### without tools - - Prompt: ```json @@ -1322,39 +1191,27 @@ space ::= " "? 
``` -# hermes_2_pro_mistral +# llama2 Template: ```js -{% for message in messages %}{{'<|im_start|>' + message['role'] + ' -' + message['content'] + '<|im_end|>' + ' -'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant -' }}{% endif %} +{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\n' + system_message + '\n<>\n\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %} ``` -Prompt: - -```js -<|im_start|>user -Add two numbers for the purpose of this test.<|im_end|> -<|im_start|>assistant -{"id": "call_531873", "type": "function", "function": {"name": "superSecretTool", "arguments": {"a": 2535, "b": 32222000403}}}<|im_end|> -<|im_start|>tool -32222002938<|im_end|> -<|im_start|>assistant -The sum of 2535 and 32222000403 is 42.<|im_end|> -<|im_start|>assistant +## llama2 / TOOLS_SHORT -``` +### llama2 / TOOLS_SHORT / with tools -## ToolsPromptStyle.TOOLS_SHORT +Prompt: -### with tools +```js +[INST] Add two numbers for the purpose of this test. [/INST] {"id": "call_531873", "type": "function", "function": {"name": "superSecretTool", "arguments": {"a": 2535, "b": 32222000403}}} [INST] [TOOL(name=superSecretTool, id=call_531873)]32222002938[/TOOL] [/INST] The sum of 2535 and 32222000403 is 42. +``` Prompt: @@ -1435,14 +1292,7 @@ tool-call ::= "" space (superSecretTool-tool-call | say-tool-call) s ``` -# llama2 - - -Template: - -```js -{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\n' + system_message + '\n<>\n\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %} -``` +### llama2 / TOOLS_SHORT / without tools Prompt: @@ -1452,9 +1302,6 @@ Prompt: ``` -### without tools - - Prompt: ```json @@ -1474,14 +1321,10 @@ space ::= " "? 
``` -# llama2 - +## llama2 / TOOLS_LONG -Template: -```js -{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\n' + system_message + '\n<>\n\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %} -``` +### llama2 / TOOLS_LONG / with tools Prompt: @@ -1491,12 +1334,6 @@ Prompt: ``` -## ToolsPromptStyle.TOOLS_LONG - - -### with tools - - Prompt: ```json @@ -1580,14 +1417,7 @@ tool-call ::= "" space (superSecretTool-tool-call | say-tool-call) s ``` -# llama2 - - -Template: - -```js -{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\n' + system_message + '\n<>\n\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %} -``` +### llama2 / TOOLS_LONG / without tools Prompt: @@ -1597,9 +1427,6 @@ Prompt: ``` -### without tools - - Prompt: ```json @@ -1619,29 +1446,51 @@ space ::= " "? ``` -# llama2 - +## llama2 / TOOLS_THOUGHTFUL_STEPS -Template: -```js -{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\n' + system_message + '\n<>\n\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %} -``` +### llama2 / TOOLS_THOUGHTFUL_STEPS / with tools Prompt: ```js -[INST] Add two numbers for the purpose of this test. [/INST] {"id": "call_531873", "type": "function", "function": {"name": "superSecretTool", "arguments": {"a": 2535, "b": 32222000403}}} [INST] [TOOL(name=superSecretTool, id=call_531873)]32222002938[/TOOL] [/INST] The sum of 2535 and 32222000403 is 42. +[INST] { + "thought_about_next_step_only": "", + "next_step": { + "result": "Add two numbers for the purpose of this test." 
+ } +} [/INST] { + "thought_about_next_step_only": "", + "next_step": { + "tool_calls": [ + { + "id": "call_531873", + "type": "function", + "function": { + "name": "superSecretTool", + "arguments": { + "a": 2535, + "b": 32222000403 + } + } + } + ] + } +} [INST] [TOOL(name=None, id=None)]{ + "thought_about_next_step_only": "", + "next_step": { + "result": "32222002938" + } +}[/TOOL] [/INST] { + "thought_about_next_step_only": "", + "next_step": { + "result": "The sum of 2535 and 32222000403 is 42." + } +} ``` -## ToolsPromptStyle.TOOLS_THOUGHTFUL_STEPS - - -### with tools - - Prompt: ```json @@ -1783,58 +1632,16 @@ thought-about-next-step-only-kv ::= "\"thought_about_next_step_only\"" space ":" ``` -# llama2 - - -Template: - -```js -{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\n' + system_message + '\n<>\n\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %} -``` +### llama2 / TOOLS_THOUGHTFUL_STEPS / without tools Prompt: ```js -[INST] { - "thought_about_next_step_only": "", - "next_step": { - "result": "Add two numbers for the purpose of this test." - } -} [/INST] { - "thought_about_next_step_only": "", - "next_step": { - "tool_calls": [ - { - "id": "call_531873", - "type": "function", - "function": { - "name": "superSecretTool", - "arguments": { - "a": 2535, - "b": 32222000403 - } - } - } - ] - } -} [INST] [TOOL(name=None, id=None)]{ - "thought_about_next_step_only": "", - "next_step": { - "result": "32222002938" - } -}[/TOOL] [/INST] { - "thought_about_next_step_only": "", - "next_step": { - "result": "The sum of 2535 and 32222000403 is 42." - } -} +[INST] Add two numbers for the purpose of this test. [/INST] {"id": "call_531873", "type": "function", "function": {"name": "superSecretTool", "arguments": {"a": 2535, "b": 32222000403}}} [INST] [TOOL(name=superSecretTool, id=call_531873)]32222002938[/TOOL] [/INST] The sum of 2535 and 32222000403 is 42. ``` -### without tools - - Prompt: ```json @@ -1854,14 +1661,10 @@ space ::= " "? 
``` -# llama2 - +## llama2 / TOOLS_HERMES_2_PRO -Template: -```js -{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\n' + system_message + '\n<>\n\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %} -``` +### llama2 / TOOLS_HERMES_2_PRO / with tools Prompt: @@ -1871,12 +1674,6 @@ Prompt: ``` -## ToolsPromptStyle.TOOLS_HERMES_2_PRO - - -### with tools - - Prompt: ```json @@ -1923,14 +1720,7 @@ tool-call ::= "" space (superSecretTool-tool-call | say-tool-call) s ``` -# llama2 - - -Template: - -```js -{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\n' + system_message + '\n<>\n\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %} -``` +### llama2 / TOOLS_HERMES_2_PRO / without tools Prompt: @@ -1940,9 +1730,6 @@ Prompt: ``` -### without tools - - Prompt: ```json @@ -1961,20 +1748,3 @@ root ::= ("-"? integral-part) space space ::= " "? ``` - -# llama2 - - -Template: - -```js -{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\n' + system_message + '\n<>\n\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %} -``` - - -Prompt: - -```js -[INST] Add two numbers for the purpose of this test. [/INST] {"id": "call_531873", "type": "function", "function": {"name": "superSecretTool", "arguments": {"a": 2535, "b": 32222000403}}} [INST] [TOOL(name=superSecretTool, id=call_531873)]32222002938[/TOOL] [/INST] The sum of 2535 and 32222000403 is 42. 
-``` - From da2067a0d63aa9c2eef6175245f9ab517885e4b6 Mon Sep 17 00:00:00 2001 From: ochafik Date: Sat, 30 Mar 2024 01:55:08 +0000 Subject: [PATCH 34/68] openai: only special-format assistant in thoughtful mode --- examples/openai/prompting.py | 2 +- examples/openai/test_chat_handlers.md | 42 ++++----------------------- 2 files changed, 7 insertions(+), 37 deletions(-) diff --git a/examples/openai/prompting.py b/examples/openai/prompting.py index 222e133f0de72..b5c19e8fd6d20 100644 --- a/examples/openai/prompting.py +++ b/examples/openai/prompting.py @@ -206,7 +206,7 @@ def parse(self, s: str) -> Optional[Message]: def render_prompt(self, messages: list[Message]) -> str: def normalize(m: Message): - if self.style == ToolsPromptStyle.TOOLS_THOUGHTFUL_STEPS: + if self.style == ToolsPromptStyle.TOOLS_THOUGHTFUL_STEPS and m.role == "assistant": if m.tool_calls: m = Message( role=m.role, diff --git a/examples/openai/test_chat_handlers.md b/examples/openai/test_chat_handlers.md index f01bbce21ffa8..28146dac08bd1 100644 --- a/examples/openai/test_chat_handlers.md +++ b/examples/openai/test_chat_handlers.md @@ -66,12 +66,7 @@ Template: Prompt: ```js -[INST] { - "thought_about_next_step_only": "", - "next_step": { - "result": "Add two numbers for the purpose of this test." - } -} [/INST]{ +[INST] Add two numbers for the purpose of this test. [/INST]{ "thought_about_next_step_only": "", "next_step": { "tool_calls": [ @@ -88,12 +83,7 @@ Prompt: } ] } -}[INST] [TOOL(name=None, id=None)]{ - "thought_about_next_step_only": "", - "next_step": { - "result": "32222002938" - } -}[/TOOL] [/INST]{ +}[INST] [TOOL(name=superSecretTool, id=call_531873)]32222002938[/TOOL] [/INST]{ "thought_about_next_step_only": "", "next_step": { "result": "The sum of 2535 and 32222000403 is 42." @@ -862,12 +852,7 @@ Prompt: ```js <|im_start|>user -{ - "thought_about_next_step_only": "", - "next_step": { - "result": "Add two numbers for the purpose of this test." - } -}<|im_end|> +Add two numbers for the purpose of this test.<|im_end|> <|im_start|>assistant { "thought_about_next_step_only": "", @@ -888,12 +873,7 @@ Prompt: } }<|im_end|> <|im_start|>tool -{ - "thought_about_next_step_only": "", - "next_step": { - "result": "32222002938" - } -}<|im_end|> +32222002938<|im_end|> <|im_start|>assistant { "thought_about_next_step_only": "", @@ -1455,12 +1435,7 @@ space ::= " "? Prompt: ```js -[INST] { - "thought_about_next_step_only": "", - "next_step": { - "result": "Add two numbers for the purpose of this test." - } -} [/INST] { +[INST] Add two numbers for the purpose of this test. [/INST] { "thought_about_next_step_only": "", "next_step": { "tool_calls": [ @@ -1477,12 +1452,7 @@ Prompt: } ] } -} [INST] [TOOL(name=None, id=None)]{ - "thought_about_next_step_only": "", - "next_step": { - "result": "32222002938" - } -}[/TOOL] [/INST] { +} [INST] [TOOL(name=superSecretTool, id=call_531873)]32222002938[/TOOL] [/INST] { "thought_about_next_step_only": "", "next_step": { "result": "The sum of 2535 and 32222000403 is 42." 
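
For context between these two patches: below is a minimal, self-contained sketch of the behaviour patch 34 narrows, assuming the `Message` shape and the TOOLS_THOUGHTFUL_STEPS style shown in the prompting.py diff above. The standalone `normalize_for_thoughtful_steps` helper and the `__main__` demo are illustrative only and are not code from the patch. After the change, only assistant turns are re-serialized into the thought/next_step JSON envelope, while user and tool turns keep their plain content, which is what the updated test_chat_handlers.md output reflects.

```python
import json
from dataclasses import dataclass
from typing import Optional


@dataclass
class Message:
    role: str
    content: Optional[str] = None
    tool_calls: Optional[list] = None


def normalize_for_thoughtful_steps(m: Message) -> Message:
    # Mirrors the intent of the patch above: only assistant turns are wrapped
    # in the thought/next_step envelope; user and tool turns pass through as-is.
    if m.role != "assistant":
        return m
    if m.tool_calls:
        body = {
            "thought_about_next_step_only": m.content or "",
            "next_step": {"tool_calls": m.tool_calls},
        }
    else:
        body = {
            "thought_about_next_step_only": "",
            "next_step": {"result": m.content},
        }
    return Message(role=m.role, content=json.dumps(body, indent=2))


if __name__ == "__main__":
    user = Message(role="user", content="Add two numbers for the purpose of this test.")
    assistant = Message(role="assistant", content="The sum of 2535 and 32222000403 is 42.")
    print(normalize_for_thoughtful_steps(user).content)       # unchanged plain text
    print(normalize_for_thoughtful_steps(assistant).content)  # wrapped JSON envelope
```
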
From 09de4eb9ed0ee043fdd1a1093138ef3ebcc88662 Mon Sep 17 00:00:00 2001 From: ochafik Date: Sat, 30 Mar 2024 01:57:05 +0000 Subject: [PATCH 35/68] openai: actually use thoughtful examples in tests --- examples/openai/test_chat_handlers.md | 51 ++++++++++++++++++--------- examples/openai/test_chat_handlers.py | 4 +-- 2 files changed, 37 insertions(+), 18 deletions(-) diff --git a/examples/openai/test_chat_handlers.md b/examples/openai/test_chat_handlers.md index 28146dac08bd1..a770998926391 100644 --- a/examples/openai/test_chat_handlers.md +++ b/examples/openai/test_chat_handlers.md @@ -14,7 +14,7 @@ Messages: "role": "assistant", "name": null, "tool_call_id": null, - "content": null, + "content": "I've thought a lot about this.", "tool_calls": [ { "id": "call_531873", @@ -67,7 +67,7 @@ Prompt: ```js [INST] Add two numbers for the purpose of this test. [/INST]{ - "thought_about_next_step_only": "", + "thought_about_next_step_only": "I've thought a lot about this.", "next_step": { "tool_calls": [ { @@ -239,7 +239,8 @@ thought-about-next-step-only-kv ::= "\"thought_about_next_step_only\"" space ":" Prompt: ```js -[INST] Add two numbers for the purpose of this test. [/INST]{"id": "call_531873", "type": "function", "function": {"name": "superSecretTool", "arguments": {"a": 2535, "b": 32222000403}}}[INST] [TOOL(name=superSecretTool, id=call_531873)]32222002938[/TOOL] [/INST]The sum of 2535 and 32222000403 is 42. +[INST] Add two numbers for the purpose of this test. [/INST]I've thought a lot about this. +{"id": "call_531873", "type": "function", "function": {"name": "superSecretTool", "arguments": {"a": 2535, "b": 32222000403}}}[INST] [TOOL(name=superSecretTool, id=call_531873)]32222002938[/TOOL] [/INST]The sum of 2535 and 32222000403 is 42. ``` @@ -271,7 +272,8 @@ space ::= " "? Prompt: ```js -[INST] Add two numbers for the purpose of this test. [/INST]{"id": "call_531873", "type": "function", "function": {"name": "superSecretTool", "arguments": {"a": 2535, "b": 32222000403}}}[INST] [TOOL(name=superSecretTool, id=call_531873)]32222002938[/TOOL] [/INST]The sum of 2535 and 32222000403 is 42. +[INST] Add two numbers for the purpose of this test. [/INST]I've thought a lot about this. +{"id": "call_531873", "type": "function", "function": {"name": "superSecretTool", "arguments": {"a": 2535, "b": 32222000403}}}[INST] [TOOL(name=superSecretTool, id=call_531873)]32222002938[/TOOL] [/INST]The sum of 2535 and 32222000403 is 42. ``` @@ -364,7 +366,8 @@ tool-call ::= "" space (superSecretTool-tool-call | say-t Prompt: ```js -[INST] Add two numbers for the purpose of this test. [/INST]{"id": "call_531873", "type": "function", "function": {"name": "superSecretTool", "arguments": {"a": 2535, "b": 32222000403}}}[INST] [TOOL(name=superSecretTool, id=call_531873)]32222002938[/TOOL] [/INST]The sum of 2535 and 32222000403 is 42. +[INST] Add two numbers for the purpose of this test. [/INST]I've thought a lot about this. +{"id": "call_531873", "type": "function", "function": {"name": "superSecretTool", "arguments": {"a": 2535, "b": 32222000403}}}[INST] [TOOL(name=superSecretTool, id=call_531873)]32222002938[/TOOL] [/INST]The sum of 2535 and 32222000403 is 42. ``` @@ -446,7 +449,8 @@ Prompt: <|content|>Add two numbers for the purpose of this test. <|from|>assistant <|recipient|>all -<|content|>{"id": "call_531873", "type": "function", "function": {"name": "superSecretTool", "arguments": {"a": 2535, "b": 32222000403}}}<|stop|> +<|content|>I've thought a lot about this. 
+{"id": "call_531873", "type": "function", "function": {"name": "superSecretTool", "arguments": {"a": 2535, "b": 32222000403}}}<|stop|> <|from|>superSecretTool <|recipient|>all <|content|>32222002938 @@ -517,7 +521,8 @@ Prompt: <|content|>Add two numbers for the purpose of this test. <|from|>assistant <|recipient|>all -<|content|>{"id": "call_531873", "type": "function", "function": {"name": "superSecretTool", "arguments": {"a": 2535, "b": 32222000403}}}<|stop|> +<|content|>I've thought a lot about this. +{"id": "call_531873", "type": "function", "function": {"name": "superSecretTool", "arguments": {"a": 2535, "b": 32222000403}}}<|stop|> <|from|>superSecretTool <|recipient|>all <|content|>32222002938 @@ -573,6 +578,7 @@ Prompt: <|im_start|>user Add two numbers for the purpose of this test.<|im_end|> <|im_start|>assistant +I've thought a lot about this. {"id": "call_531873", "type": "function", "function": {"name": "superSecretTool", "arguments": {"a": 2535, "b": 32222000403}}}<|im_end|> <|im_start|>tool 32222002938<|im_end|> @@ -670,6 +676,7 @@ Prompt: <|im_start|>user Add two numbers for the purpose of this test.<|im_end|> <|im_start|>assistant +I've thought a lot about this. {"id": "call_531873", "type": "function", "function": {"name": "superSecretTool", "arguments": {"a": 2535, "b": 32222000403}}}<|im_end|> <|im_start|>tool 32222002938<|im_end|> @@ -711,6 +718,7 @@ Prompt: <|im_start|>user Add two numbers for the purpose of this test.<|im_end|> <|im_start|>assistant +I've thought a lot about this. {"id": "call_531873", "type": "function", "function": {"name": "superSecretTool", "arguments": {"a": 2535, "b": 32222000403}}}<|im_end|> <|im_start|>tool 32222002938<|im_end|> @@ -813,6 +821,7 @@ Prompt: <|im_start|>user Add two numbers for the purpose of this test.<|im_end|> <|im_start|>assistant +I've thought a lot about this. {"id": "call_531873", "type": "function", "function": {"name": "superSecretTool", "arguments": {"a": 2535, "b": 32222000403}}}<|im_end|> <|im_start|>tool 32222002938<|im_end|> @@ -855,7 +864,7 @@ Prompt: Add two numbers for the purpose of this test.<|im_end|> <|im_start|>assistant { - "thought_about_next_step_only": "", + "thought_about_next_step_only": "I've thought a lot about this.", "next_step": { "tool_calls": [ { @@ -1036,6 +1045,7 @@ Prompt: <|im_start|>user Add two numbers for the purpose of this test.<|im_end|> <|im_start|>assistant +I've thought a lot about this. {"id": "call_531873", "type": "function", "function": {"name": "superSecretTool", "arguments": {"a": 2535, "b": 32222000403}}}<|im_end|> <|im_start|>tool 32222002938<|im_end|> @@ -1077,6 +1087,7 @@ Prompt: <|im_start|>user Add two numbers for the purpose of this test.<|im_end|> <|im_start|>assistant +I've thought a lot about this. {"id": "call_531873", "type": "function", "function": {"name": "superSecretTool", "arguments": {"a": 2535, "b": 32222000403}}}<|im_end|> <|im_start|>tool 32222002938<|im_end|> @@ -1142,6 +1153,7 @@ Prompt: <|im_start|>user Add two numbers for the purpose of this test.<|im_end|> <|im_start|>assistant +I've thought a lot about this. {"id": "call_531873", "type": "function", "function": {"name": "superSecretTool", "arguments": {"a": 2535, "b": 32222000403}}}<|im_end|> <|im_start|>tool 32222002938<|im_end|> @@ -1190,7 +1202,8 @@ Template: Prompt: ```js -[INST] Add two numbers for the purpose of this test. 
[/INST] {"id": "call_531873", "type": "function", "function": {"name": "superSecretTool", "arguments": {"a": 2535, "b": 32222000403}}} [INST] [TOOL(name=superSecretTool, id=call_531873)]32222002938[/TOOL] [/INST] The sum of 2535 and 32222000403 is 42. +[INST] Add two numbers for the purpose of this test. [/INST] I've thought a lot about this. +{"id": "call_531873", "type": "function", "function": {"name": "superSecretTool", "arguments": {"a": 2535, "b": 32222000403}}} [INST] [TOOL(name=superSecretTool, id=call_531873)]32222002938[/TOOL] [/INST] The sum of 2535 and 32222000403 is 42. ``` @@ -1278,7 +1291,8 @@ tool-call ::= "" space (superSecretTool-tool-call | say-tool-call) s Prompt: ```js -[INST] Add two numbers for the purpose of this test. [/INST] {"id": "call_531873", "type": "function", "function": {"name": "superSecretTool", "arguments": {"a": 2535, "b": 32222000403}}} [INST] [TOOL(name=superSecretTool, id=call_531873)]32222002938[/TOOL] [/INST] The sum of 2535 and 32222000403 is 42. +[INST] Add two numbers for the purpose of this test. [/INST] I've thought a lot about this. +{"id": "call_531873", "type": "function", "function": {"name": "superSecretTool", "arguments": {"a": 2535, "b": 32222000403}}} [INST] [TOOL(name=superSecretTool, id=call_531873)]32222002938[/TOOL] [/INST] The sum of 2535 and 32222000403 is 42. ``` @@ -1310,7 +1324,8 @@ space ::= " "? Prompt: ```js -[INST] Add two numbers for the purpose of this test. [/INST] {"id": "call_531873", "type": "function", "function": {"name": "superSecretTool", "arguments": {"a": 2535, "b": 32222000403}}} [INST] [TOOL(name=superSecretTool, id=call_531873)]32222002938[/TOOL] [/INST] The sum of 2535 and 32222000403 is 42. +[INST] Add two numbers for the purpose of this test. [/INST] I've thought a lot about this. +{"id": "call_531873", "type": "function", "function": {"name": "superSecretTool", "arguments": {"a": 2535, "b": 32222000403}}} [INST] [TOOL(name=superSecretTool, id=call_531873)]32222002938[/TOOL] [/INST] The sum of 2535 and 32222000403 is 42. ``` @@ -1403,7 +1418,8 @@ tool-call ::= "" space (superSecretTool-tool-call | say-tool-call) s Prompt: ```js -[INST] Add two numbers for the purpose of this test. [/INST] {"id": "call_531873", "type": "function", "function": {"name": "superSecretTool", "arguments": {"a": 2535, "b": 32222000403}}} [INST] [TOOL(name=superSecretTool, id=call_531873)]32222002938[/TOOL] [/INST] The sum of 2535 and 32222000403 is 42. +[INST] Add two numbers for the purpose of this test. [/INST] I've thought a lot about this. +{"id": "call_531873", "type": "function", "function": {"name": "superSecretTool", "arguments": {"a": 2535, "b": 32222000403}}} [INST] [TOOL(name=superSecretTool, id=call_531873)]32222002938[/TOOL] [/INST] The sum of 2535 and 32222000403 is 42. ``` @@ -1436,7 +1452,7 @@ Prompt: ```js [INST] Add two numbers for the purpose of this test. [/INST] { - "thought_about_next_step_only": "", + "thought_about_next_step_only": "I've thought a lot about this.", "next_step": { "tool_calls": [ { @@ -1608,7 +1624,8 @@ thought-about-next-step-only-kv ::= "\"thought_about_next_step_only\"" space ":" Prompt: ```js -[INST] Add two numbers for the purpose of this test. [/INST] {"id": "call_531873", "type": "function", "function": {"name": "superSecretTool", "arguments": {"a": 2535, "b": 32222000403}}} [INST] [TOOL(name=superSecretTool, id=call_531873)]32222002938[/TOOL] [/INST] The sum of 2535 and 32222000403 is 42. +[INST] Add two numbers for the purpose of this test. [/INST] I've thought a lot about this. 
+{"id": "call_531873", "type": "function", "function": {"name": "superSecretTool", "arguments": {"a": 2535, "b": 32222000403}}} [INST] [TOOL(name=superSecretTool, id=call_531873)]32222002938[/TOOL] [/INST] The sum of 2535 and 32222000403 is 42. ``` @@ -1640,7 +1657,8 @@ space ::= " "? Prompt: ```js -[INST] Add two numbers for the purpose of this test. [/INST] {"id": "call_531873", "type": "function", "function": {"name": "superSecretTool", "arguments": {"a": 2535, "b": 32222000403}}} [INST] [TOOL(name=superSecretTool, id=call_531873)]32222002938[/TOOL] [/INST] The sum of 2535 and 32222000403 is 42. +[INST] Add two numbers for the purpose of this test. [/INST] I've thought a lot about this. +{"id": "call_531873", "type": "function", "function": {"name": "superSecretTool", "arguments": {"a": 2535, "b": 32222000403}}} [INST] [TOOL(name=superSecretTool, id=call_531873)]32222002938[/TOOL] [/INST] The sum of 2535 and 32222000403 is 42. ``` @@ -1696,7 +1714,8 @@ tool-call ::= "" space (superSecretTool-tool-call | say-tool-call) s Prompt: ```js -[INST] Add two numbers for the purpose of this test. [/INST] {"id": "call_531873", "type": "function", "function": {"name": "superSecretTool", "arguments": {"a": 2535, "b": 32222000403}}} [INST] [TOOL(name=superSecretTool, id=call_531873)]32222002938[/TOOL] [/INST] The sum of 2535 and 32222000403 is 42. +[INST] Add two numbers for the purpose of this test. [/INST] I've thought a lot about this. +{"id": "call_531873", "type": "function", "function": {"name": "superSecretTool", "arguments": {"a": 2535, "b": 32222000403}}} [INST] [TOOL(name=superSecretTool, id=call_531873)]32222002938[/TOOL] [/INST] The sum of 2535 and 32222000403 is 42. ``` diff --git a/examples/openai/test_chat_handlers.py b/examples/openai/test_chat_handlers.py index fc4b1680bddae..ee724e002d21f 100644 --- a/examples/openai/test_chat_handlers.py +++ b/examples/openai/test_chat_handlers.py @@ -151,7 +151,7 @@ failures = [] - print(f'\nMessages:\n\n```js\n{json.dumps([m.model_dump() for m in TEST_MESSAGES], indent=2)}\n```\n') + print(f'\nMessages:\n\n```js\n{json.dumps([m.model_dump() for m in TEST_MESSAGES_THOUGHT], indent=2)}\n```\n') def check(b: bool, msg: str): if not b: @@ -203,7 +203,7 @@ def check(b: bool, msg: str): print(f'\n### {model_name} / {style.name} / {tool_situation}\n') - print(f'\nPrompt:\n\n```js\n{ch.render_prompt(TEST_MESSAGES)}\n```\n') + print(f'\nPrompt:\n\n```js\n{ch.render_prompt(TEST_MESSAGES_THOUGHT)}\n```\n') print(f'\nPrompt:\n\n```json\n{ch.output_format_prompt.content}\n```\n') From 19811a4011f18396c347fc58189cb1c1f7ed3253 Mon Sep 17 00:00:00 2001 From: ochafik Date: Sat, 30 Mar 2024 02:24:04 +0000 Subject: [PATCH 36/68] openai: tests didn't catch output format --- examples/openai/prompting.py | 25 +- examples/openai/server.py | 2 - examples/openai/test_chat_handlers.md | 666 ++++++++++++++++++++++++-- examples/openai/test_chat_handlers.py | 2 +- 4 files changed, 648 insertions(+), 47 deletions(-) diff --git a/examples/openai/prompting.py b/examples/openai/prompting.py index b5c19e8fd6d20..1485ccf6e5f78 100644 --- a/examples/openai/prompting.py +++ b/examples/openai/prompting.py @@ -148,16 +148,6 @@ def strip_suffix(self, s: str) -> str: sys.stderr.write(f"Expected suffix ({self._suffix}) not found: {s}\n") return s - def add_system_prompt(self, messages: list[Message], system_prompt: Message) -> list[Message]: - assert system_prompt.role == "system" - # TODO: add to last system message, or create a new one just before the last user message - system_message 
= next(((i, m) for i, m in enumerate(messages) if m.role == "system"), None) - if system_message is not None: - (i, m) = system_message - return messages[:i] + [Message(role="system", content=system_prompt.content + '\n' + m.content)] + messages[i+1:] - else: - return [system_prompt] + messages - @staticmethod def from_gguf(metadata: GGUFKeyValues): if Keys.Tokenizer.CHAT_TEMPLATE not in metadata: @@ -204,7 +194,22 @@ def __init__(self, args: ChatHandlerArgs, style: Optional[ToolsPromptStyle]): def parse(self, s: str) -> Optional[Message]: raise NotImplementedError() + + def add_system_prompt(self, messages: list[Message], system_prompt: Message) -> list[Message]: + assert system_prompt.role == "system" + # TODO: add to last system message, or create a new one just before the last user message + system_message = next(((i, m) for i, m in enumerate(messages) if m.role == "system"), None) + if system_message is not None: + (i, m) = system_message + return messages[:i] + [Message(role="system", content=system_prompt.content + '\n' + m.content)] + messages[i+1:] + else: + return [system_prompt] + messages + def render_prompt(self, messages: list[Message]) -> str: + + if self.output_format_prompt: + messages = self.add_system_prompt(messages, self.output_format_prompt) + def normalize(m: Message): if self.style == ToolsPromptStyle.TOOLS_THOUGHTFUL_STEPS and m.role == "assistant": if m.tool_calls: diff --git a/examples/openai/server.py b/examples/openai/server.py index 474f07489a615..aa2dba211b9c7 100644 --- a/examples/openai/server.py +++ b/examples/openai/server.py @@ -137,8 +137,6 @@ async def chat_completions(request: Request, chat_request: ChatCompletionRequest ) messages = chat_request.messages - if chat_handler.output_format_prompt: - messages = chat_template.add_system_prompt(messages, chat_handler.output_format_prompt) prompt = chat_handler.render_prompt(messages) diff --git a/examples/openai/test_chat_handlers.md b/examples/openai/test_chat_handlers.md index a770998926391..8cec828d06605 100644 --- a/examples/openai/test_chat_handlers.md +++ b/examples/openai/test_chat_handlers.md @@ -66,7 +66,105 @@ Template: Prompt: ```js -[INST] Add two numbers for the purpose of this test. [/INST]{ +[INST] [SYS]You are a function calling AI model. 
+Here are the tools available: +{ + "type": "function", + "function": { + "name": "superSecretTool", + "description": "Adds two numbers", + "parameters": { + "properties": { + "a": { + "type": "integer" + }, + "b": { + "type": "integer" + } + }, + "required": [ + "a", + "b" + ] + } + } +} +{ + "type": "function", + "function": { + "name": "say", + "description": "Says something out loud (TTS)", + "parameters": { + "properties": { + "text": { + "description": "The text to say out loud", + "type": "string" + } + }, + "required": [ + "text" + ] + } + } +} +Please respond in JSON format with the following schema: { + "type": "object", + "properties": { + "thought_about_next_step_only": { + "title": "Thought about next step", + "type": "string" + }, + "next_step": { + "title": "Next Step: either a result or one or more tool calls to achieve the original goal", + "oneOf": [ + { + "properties": { + "tool_calls": { + "prefixItems": [ + { + "properties": { + "name": { + "title": "Name of the tool to call", + "type": "string" + }, + "arguments": { + "title": "Arguments to pass to the tool", + "type": "object" + } + }, + "required": [ + "name", + "arguments" + ] + } + ] + } + }, + "required": [ + "tool_calls" + ] + }, + { + "title": "Result (achieving original goal)", + "properties": { + "result": { + "type": "integer" + } + }, + "required": [ + "result" + ] + } + ] + } + }, + "required": [ + "original_goal", + "thought_about_next_step_only", + "next_step" + ] +}[/SYS] +Add two numbers for the purpose of this test. [/INST]{ "thought_about_next_step_only": "I've thought a lot about this.", "next_step": { "tool_calls": [ @@ -92,7 +190,7 @@ Prompt: ``` -Prompt: +Output format prompt: ```json You are a function calling AI model. @@ -239,12 +337,15 @@ thought-about-next-step-only-kv ::= "\"thought_about_next_step_only\"" space ":" Prompt: ```js -[INST] Add two numbers for the purpose of this test. [/INST]I've thought a lot about this. +[INST] [SYS]Please respond in JSON format with the following schema: { + "type": "integer" +}[/SYS] +Add two numbers for the purpose of this test. [/INST]I've thought a lot about this. {"id": "call_531873", "type": "function", "function": {"name": "superSecretTool", "arguments": {"a": 2535, "b": 32222000403}}}[INST] [TOOL(name=superSecretTool, id=call_531873)]32222002938[/TOOL] [/INST]The sum of 2535 and 32222000403 is 42. ``` -Prompt: +Output format prompt: ```json Please respond in JSON format with the following schema: { @@ -272,12 +373,59 @@ space ::= " "? Prompt: ```js -[INST] Add two numbers for the purpose of this test. [/INST]I've thought a lot about this. +[INST] [SYS]Call one or more functions to assist with the user query, every time this is possible. Don't make assumptions about what values to plug into functions. Here are the available tools: + +{ + "type": "function", + "function": { + "name": "superSecretTool", + "description": "Adds two numbers", + "parameters": { + "properties": { + "a": { + "type": "integer" + }, + "b": { + "type": "integer" + } + }, + "required": [ + "a", + "b" + ] + } + } +} +{ + "type": "function", + "function": { + "name": "say", + "description": "Says something out loud (TTS)", + "parameters": { + "properties": { + "text": { + "description": "The text to say out loud", + "type": "string" + } + }, + "required": [ + "text" + ] + } + } +} + + +To call each function, give its name and arguments within XML tags as follows: + +{"name": , "arguments": } +[/SYS] +Add two numbers for the purpose of this test. 
[/INST]I've thought a lot about this. {"id": "call_531873", "type": "function", "function": {"name": "superSecretTool", "arguments": {"a": 2535, "b": 32222000403}}}[INST] [TOOL(name=superSecretTool, id=call_531873)]32222002938[/TOOL] [/INST]The sum of 2535 and 32222000403 is 42. ``` -Prompt: +Output format prompt: ```json Call one or more functions to assist with the user query, every time this is possible. Don't make assumptions about what values to plug into functions. Here are the available tools: @@ -366,12 +514,15 @@ tool-call ::= "" space (superSecretTool-tool-call | say-t Prompt: ```js -[INST] Add two numbers for the purpose of this test. [/INST]I've thought a lot about this. +[INST] [SYS]Please respond in JSON format with the following schema: { + "type": "integer" +}[/SYS] +Add two numbers for the purpose of this test. [/INST]I've thought a lot about this. {"id": "call_531873", "type": "function", "function": {"name": "superSecretTool", "arguments": {"a": 2535, "b": 32222000403}}}[INST] [TOOL(name=superSecretTool, id=call_531873)]32222002938[/TOOL] [/INST]The sum of 2535 and 32222000403 is 42. ``` -Prompt: +Output format prompt: ```json Please respond in JSON format with the following schema: { @@ -444,6 +595,22 @@ Template: Prompt: ```js +<|from|>system +<|recipient|>all +<|content|>// Supported function definitions that should be called when necessary. +namespace functions { +// Adds two numbers +type superSecretTool = (_: { +a: number, +b: number +}) => any; + +// Says something out loud (TTS) +type say = (_: { +// The text to say out loud +text: string +}) => any; +} // namespace functions <|from|>user <|recipient|>all <|content|>Add two numbers for the purpose of this test. @@ -462,7 +629,7 @@ Prompt: ``` -Prompt: +Output format prompt: ```json // Supported function definitions that should be called when necessary. @@ -516,6 +683,11 @@ tool-call-without-start ::= superSecretTool-call | say-call Prompt: ```js +<|from|>system +<|recipient|>all +<|content|>Please respond in JSON format with the following schema: { + "type": "integer" +} <|from|>user <|recipient|>all <|content|>Add two numbers for the purpose of this test. 
@@ -534,7 +706,7 @@ Prompt: ``` -Prompt: +Output format prompt: ```json Please respond in JSON format with the following schema: { @@ -575,6 +747,49 @@ Template: Prompt: ```js +<|im_start|>system +Here are the tools available: + +{ + "type": "function", + "function": { + "name": "superSecretTool", + "description": "Adds two numbers", + "parameters": { + "properties": { + "a": { + "type": "integer" + }, + "b": { + "type": "integer" + } + }, + "required": [ + "a", + "b" + ] + } + } +} +{ + "type": "function", + "function": { + "name": "say", + "description": "Says something out loud (TTS)", + "parameters": { + "properties": { + "text": { + "description": "The text to say out loud", + "type": "string" + } + }, + "required": [ + "text" + ] + } + } +} +<|im_end|> <|im_start|>user Add two numbers for the purpose of this test.<|im_end|> <|im_start|>assistant @@ -589,7 +804,7 @@ The sum of 2535 and 32222000403 is 42.<|im_end|> ``` -Prompt: +Output format prompt: ```json Here are the tools available: @@ -673,6 +888,10 @@ tool-call ::= "" space (superSecretTool-tool-call | say-tool-call) s Prompt: ```js +<|im_start|>system +Please respond in JSON format with the following schema: { + "type": "integer" +}<|im_end|> <|im_start|>user Add two numbers for the purpose of this test.<|im_end|> <|im_start|>assistant @@ -687,7 +906,7 @@ The sum of 2535 and 32222000403 is 42.<|im_end|> ``` -Prompt: +Output format prompt: ```json Please respond in JSON format with the following schema: { @@ -715,6 +934,54 @@ space ::= " "? Prompt: ```js +<|im_start|>system +Call one or more functions to assist with the user query, every time this is possible. Don't make assumptions about what values to plug into functions. Here are the available tools: + +{ + "type": "function", + "function": { + "name": "superSecretTool", + "description": "Adds two numbers", + "parameters": { + "properties": { + "a": { + "type": "integer" + }, + "b": { + "type": "integer" + } + }, + "required": [ + "a", + "b" + ] + } + } +} +{ + "type": "function", + "function": { + "name": "say", + "description": "Says something out loud (TTS)", + "parameters": { + "properties": { + "text": { + "description": "The text to say out loud", + "type": "string" + } + }, + "required": [ + "text" + ] + } + } +} + + +To call each function, give its name and arguments within XML tags as follows: + +{"name": , "arguments": } +<|im_end|> <|im_start|>user Add two numbers for the purpose of this test.<|im_end|> <|im_start|>assistant @@ -729,7 +996,7 @@ The sum of 2535 and 32222000403 is 42.<|im_end|> ``` -Prompt: +Output format prompt: ```json Call one or more functions to assist with the user query, every time this is possible. Don't make assumptions about what values to plug into functions. Here are the available tools: @@ -818,6 +1085,10 @@ tool-call ::= "" space (superSecretTool-tool-call | say-tool-call) s Prompt: ```js +<|im_start|>system +Please respond in JSON format with the following schema: { + "type": "integer" +}<|im_end|> <|im_start|>user Add two numbers for the purpose of this test.<|im_end|> <|im_start|>assistant @@ -832,7 +1103,7 @@ The sum of 2535 and 32222000403 is 42.<|im_end|> ``` -Prompt: +Output format prompt: ```json Please respond in JSON format with the following schema: { @@ -860,6 +1131,105 @@ space ::= " "? Prompt: ```js +<|im_start|>system +You are a function calling AI model. 
+Here are the tools available: +{ + "type": "function", + "function": { + "name": "superSecretTool", + "description": "Adds two numbers", + "parameters": { + "properties": { + "a": { + "type": "integer" + }, + "b": { + "type": "integer" + } + }, + "required": [ + "a", + "b" + ] + } + } +} +{ + "type": "function", + "function": { + "name": "say", + "description": "Says something out loud (TTS)", + "parameters": { + "properties": { + "text": { + "description": "The text to say out loud", + "type": "string" + } + }, + "required": [ + "text" + ] + } + } +} +Please respond in JSON format with the following schema: { + "type": "object", + "properties": { + "thought_about_next_step_only": { + "title": "Thought about next step", + "type": "string" + }, + "next_step": { + "title": "Next Step: either a result or one or more tool calls to achieve the original goal", + "oneOf": [ + { + "properties": { + "tool_calls": { + "prefixItems": [ + { + "properties": { + "name": { + "title": "Name of the tool to call", + "type": "string" + }, + "arguments": { + "title": "Arguments to pass to the tool", + "type": "object" + } + }, + "required": [ + "name", + "arguments" + ] + } + ] + } + }, + "required": [ + "tool_calls" + ] + }, + { + "title": "Result (achieving original goal)", + "properties": { + "result": { + "type": "integer" + } + }, + "required": [ + "result" + ] + } + ] + } + }, + "required": [ + "original_goal", + "thought_about_next_step_only", + "next_step" + ] +}<|im_end|> <|im_start|>user Add two numbers for the purpose of this test.<|im_end|> <|im_start|>assistant @@ -895,7 +1265,7 @@ Add two numbers for the purpose of this test.<|im_end|> ``` -Prompt: +Output format prompt: ```json You are a function calling AI model. @@ -1042,6 +1412,10 @@ thought-about-next-step-only-kv ::= "\"thought_about_next_step_only\"" space ":" Prompt: ```js +<|im_start|>system +Please respond in JSON format with the following schema: { + "type": "integer" +}<|im_end|> <|im_start|>user Add two numbers for the purpose of this test.<|im_end|> <|im_start|>assistant @@ -1056,7 +1430,7 @@ The sum of 2535 and 32222000403 is 42.<|im_end|> ``` -Prompt: +Output format prompt: ```json Please respond in JSON format with the following schema: { @@ -1084,6 +1458,17 @@ space ::= " "? Prompt: ```js +<|im_start|>system +You are a function calling AI agent with self-recursion. You can call only one function at a time and analyse data you get from function response. You are provided with function signatures within XML tags. The current date is: 2024-03-30. You may use agentic frameworks for reasoning and planning to help with user query. Please call a function and wait for function results to be provided to you in the next iteration. Don't make assumptions about what values to plug into function arguments. Once you have called a function, results will be fed back to you within XML tags. Don't make assumptions about tool results if XML tags are not present since function hasn't been executed yet. Analyze the data once you get the results and call another function. At each iteration please continue adding the your analysis to previous summary. Your final response should directly answer the user query with an anlysis or summary of the results of function calls. 
Here are the available tools: ['{"type":"function","function":{"name":"superSecretTool","description":"Adds two numbers","parameters":{"properties":{"a":{"type":"integer"},"b":{"type":"integer"}},"required":["a","b"]}}}', '{"type":"function","function":{"name":"say","description":"Says something out loud (TTS)","parameters":{"properties":{"text":{"description":"The text to say out loud","type":"string"}},"required":["text"]}}}'] If the provided function signatures doesn't have the function you must call, you may write executable python code in markdown syntax and call code_interpreter() function as follows: {"arguments": {"code_markdown": , "name": "code_interpreter"}} Make sure that the json object above with code markdown block is parseable with json.loads() and the XML block with XML ElementTree. Use the following pydantic model json schema for each tool call you will make: {'properties': {'arguments': {'title': 'Arguments', 'type': 'object'}, 'name': {'title': 'Name', 'type': 'string'}}, 'required': ['arguments', 'name'], 'title': 'FunctionCall', 'type': 'object'} At the very first turn you don't have so you shouldn't not make up the results. +Please keep a running summary with analysis of previous function results and summaries from previous iterations. +Do not stop calling functions until the task has been accomplished or you've reached max iteration of 10. +Calling multiple functions at once can overload the system and increase cost so call one function at a time please. +If you plan to continue with analysis, always call another function. +For each function call return a valid json object (using doulbe quotes) with function name and arguments within XML tags as follows: + +{"arguments": , "name": } + +<|im_end|> <|im_start|>user Add two numbers for the purpose of this test.<|im_end|> <|im_start|>assistant @@ -1098,7 +1483,7 @@ The sum of 2535 and 32222000403 is 42.<|im_end|> ``` -Prompt: +Output format prompt: ```json You are a function calling AI agent with self-recursion. You can call only one function at a time and analyse data you get from function response. You are provided with function signatures within XML tags. The current date is: 2024-03-30. You may use agentic frameworks for reasoning and planning to help with user query. Please call a function and wait for function results to be provided to you in the next iteration. Don't make assumptions about what values to plug into function arguments. Once you have called a function, results will be fed back to you within XML tags. Don't make assumptions about tool results if XML tags are not present since function hasn't been executed yet. Analyze the data once you get the results and call another function. At each iteration please continue adding the your analysis to previous summary. Your final response should directly answer the user query with an anlysis or summary of the results of function calls. 
Here are the available tools: ['{"type":"function","function":{"name":"superSecretTool","description":"Adds two numbers","parameters":{"properties":{"a":{"type":"integer"},"b":{"type":"integer"}},"required":["a","b"]}}}', '{"type":"function","function":{"name":"say","description":"Says something out loud (TTS)","parameters":{"properties":{"text":{"description":"The text to say out loud","type":"string"}},"required":["text"]}}}'] If the provided function signatures doesn't have the function you must call, you may write executable python code in markdown syntax and call code_interpreter() function as follows: {"arguments": {"code_markdown": , "name": "code_interpreter"}} Make sure that the json object above with code markdown block is parseable with json.loads() and the XML block with XML ElementTree. Use the following pydantic model json schema for each tool call you will make: {'properties': {'arguments': {'title': 'Arguments', 'type': 'object'}, 'name': {'title': 'Name', 'type': 'string'}}, 'required': ['arguments', 'name'], 'title': 'FunctionCall', 'type': 'object'} At the very first turn you don't have so you shouldn't not make up the results. @@ -1150,6 +1535,10 @@ tool-call ::= "" space (superSecretTool-tool-call | say-tool-call) s Prompt: ```js +<|im_start|>system +Please respond in JSON format with the following schema: { + "type": "integer" +}<|im_end|> <|im_start|>user Add two numbers for the purpose of this test.<|im_end|> <|im_start|>assistant @@ -1164,7 +1553,7 @@ The sum of 2535 and 32222000403 is 42.<|im_end|> ``` -Prompt: +Output format prompt: ```json Please respond in JSON format with the following schema: { @@ -1202,12 +1591,54 @@ Template: Prompt: ```js -[INST] Add two numbers for the purpose of this test. [/INST] I've thought a lot about this. +[INST] [SYS]Here are the tools available: + +{ + "type": "function", + "function": { + "name": "superSecretTool", + "description": "Adds two numbers", + "parameters": { + "properties": { + "a": { + "type": "integer" + }, + "b": { + "type": "integer" + } + }, + "required": [ + "a", + "b" + ] + } + } +} +{ + "type": "function", + "function": { + "name": "say", + "description": "Says something out loud (TTS)", + "parameters": { + "properties": { + "text": { + "description": "The text to say out loud", + "type": "string" + } + }, + "required": [ + "text" + ] + } + } +} +[/SYS] +Add two numbers for the purpose of this test. [/INST] I've thought a lot about this. {"id": "call_531873", "type": "function", "function": {"name": "superSecretTool", "arguments": {"a": 2535, "b": 32222000403}}} [INST] [TOOL(name=superSecretTool, id=call_531873)]32222002938[/TOOL] [/INST] The sum of 2535 and 32222000403 is 42. ``` -Prompt: +Output format prompt: ```json Here are the tools available: @@ -1291,12 +1722,15 @@ tool-call ::= "" space (superSecretTool-tool-call | say-tool-call) s Prompt: ```js -[INST] Add two numbers for the purpose of this test. [/INST] I've thought a lot about this. +[INST] [SYS]Please respond in JSON format with the following schema: { + "type": "integer" +}[/SYS] +Add two numbers for the purpose of this test. [/INST] I've thought a lot about this. {"id": "call_531873", "type": "function", "function": {"name": "superSecretTool", "arguments": {"a": 2535, "b": 32222000403}}} [INST] [TOOL(name=superSecretTool, id=call_531873)]32222002938[/TOOL] [/INST] The sum of 2535 and 32222000403 is 42. ``` -Prompt: +Output format prompt: ```json Please respond in JSON format with the following schema: { @@ -1324,12 +1758,59 @@ space ::= " "? 
Prompt: ```js -[INST] Add two numbers for the purpose of this test. [/INST] I've thought a lot about this. +[INST] [SYS]Call one or more functions to assist with the user query, every time this is possible. Don't make assumptions about what values to plug into functions. Here are the available tools: + +{ + "type": "function", + "function": { + "name": "superSecretTool", + "description": "Adds two numbers", + "parameters": { + "properties": { + "a": { + "type": "integer" + }, + "b": { + "type": "integer" + } + }, + "required": [ + "a", + "b" + ] + } + } +} +{ + "type": "function", + "function": { + "name": "say", + "description": "Says something out loud (TTS)", + "parameters": { + "properties": { + "text": { + "description": "The text to say out loud", + "type": "string" + } + }, + "required": [ + "text" + ] + } + } +} + + +To call each function, give its name and arguments within XML tags as follows: + +{"name": , "arguments": } +[/SYS] +Add two numbers for the purpose of this test. [/INST] I've thought a lot about this. {"id": "call_531873", "type": "function", "function": {"name": "superSecretTool", "arguments": {"a": 2535, "b": 32222000403}}} [INST] [TOOL(name=superSecretTool, id=call_531873)]32222002938[/TOOL] [/INST] The sum of 2535 and 32222000403 is 42. ``` -Prompt: +Output format prompt: ```json Call one or more functions to assist with the user query, every time this is possible. Don't make assumptions about what values to plug into functions. Here are the available tools: @@ -1418,12 +1899,15 @@ tool-call ::= "" space (superSecretTool-tool-call | say-tool-call) s Prompt: ```js -[INST] Add two numbers for the purpose of this test. [/INST] I've thought a lot about this. +[INST] [SYS]Please respond in JSON format with the following schema: { + "type": "integer" +}[/SYS] +Add two numbers for the purpose of this test. [/INST] I've thought a lot about this. {"id": "call_531873", "type": "function", "function": {"name": "superSecretTool", "arguments": {"a": 2535, "b": 32222000403}}} [INST] [TOOL(name=superSecretTool, id=call_531873)]32222002938[/TOOL] [/INST] The sum of 2535 and 32222000403 is 42. ``` -Prompt: +Output format prompt: ```json Please respond in JSON format with the following schema: { @@ -1451,7 +1935,105 @@ space ::= " "? Prompt: ```js -[INST] Add two numbers for the purpose of this test. [/INST] { +[INST] [SYS]You are a function calling AI model. 
+Here are the tools available: +{ + "type": "function", + "function": { + "name": "superSecretTool", + "description": "Adds two numbers", + "parameters": { + "properties": { + "a": { + "type": "integer" + }, + "b": { + "type": "integer" + } + }, + "required": [ + "a", + "b" + ] + } + } +} +{ + "type": "function", + "function": { + "name": "say", + "description": "Says something out loud (TTS)", + "parameters": { + "properties": { + "text": { + "description": "The text to say out loud", + "type": "string" + } + }, + "required": [ + "text" + ] + } + } +} +Please respond in JSON format with the following schema: { + "type": "object", + "properties": { + "thought_about_next_step_only": { + "title": "Thought about next step", + "type": "string" + }, + "next_step": { + "title": "Next Step: either a result or one or more tool calls to achieve the original goal", + "oneOf": [ + { + "properties": { + "tool_calls": { + "prefixItems": [ + { + "properties": { + "name": { + "title": "Name of the tool to call", + "type": "string" + }, + "arguments": { + "title": "Arguments to pass to the tool", + "type": "object" + } + }, + "required": [ + "name", + "arguments" + ] + } + ] + } + }, + "required": [ + "tool_calls" + ] + }, + { + "title": "Result (achieving original goal)", + "properties": { + "result": { + "type": "integer" + } + }, + "required": [ + "result" + ] + } + ] + } + }, + "required": [ + "original_goal", + "thought_about_next_step_only", + "next_step" + ] +}[/SYS] +Add two numbers for the purpose of this test. [/INST] { "thought_about_next_step_only": "I've thought a lot about this.", "next_step": { "tool_calls": [ @@ -1477,7 +2059,7 @@ Prompt: ``` -Prompt: +Output format prompt: ```json You are a function calling AI model. @@ -1624,12 +2206,15 @@ thought-about-next-step-only-kv ::= "\"thought_about_next_step_only\"" space ":" Prompt: ```js -[INST] Add two numbers for the purpose of this test. [/INST] I've thought a lot about this. +[INST] [SYS]Please respond in JSON format with the following schema: { + "type": "integer" +}[/SYS] +Add two numbers for the purpose of this test. [/INST] I've thought a lot about this. {"id": "call_531873", "type": "function", "function": {"name": "superSecretTool", "arguments": {"a": 2535, "b": 32222000403}}} [INST] [TOOL(name=superSecretTool, id=call_531873)]32222002938[/TOOL] [/INST] The sum of 2535 and 32222000403 is 42. ``` -Prompt: +Output format prompt: ```json Please respond in JSON format with the following schema: { @@ -1657,12 +2242,22 @@ space ::= " "? Prompt: ```js -[INST] Add two numbers for the purpose of this test. [/INST] I've thought a lot about this. +[INST] [SYS]You are a function calling AI agent with self-recursion. You can call only one function at a time and analyse data you get from function response. You are provided with function signatures within XML tags. The current date is: 2024-03-30. You may use agentic frameworks for reasoning and planning to help with user query. Please call a function and wait for function results to be provided to you in the next iteration. Don't make assumptions about what values to plug into function arguments. Once you have called a function, results will be fed back to you within XML tags. Don't make assumptions about tool results if XML tags are not present since function hasn't been executed yet. Analyze the data once you get the results and call another function. At each iteration please continue adding the your analysis to previous summary. 
Your final response should directly answer the user query with an anlysis or summary of the results of function calls. Here are the available tools: ['{"type":"function","function":{"name":"superSecretTool","description":"Adds two numbers","parameters":{"properties":{"a":{"type":"integer"},"b":{"type":"integer"}},"required":["a","b"]}}}', '{"type":"function","function":{"name":"say","description":"Says something out loud (TTS)","parameters":{"properties":{"text":{"description":"The text to say out loud","type":"string"}},"required":["text"]}}}'] If the provided function signatures doesn't have the function you must call, you may write executable python code in markdown syntax and call code_interpreter() function as follows: {"arguments": {"code_markdown": , "name": "code_interpreter"}} Make sure that the json object above with code markdown block is parseable with json.loads() and the XML block with XML ElementTree. Use the following pydantic model json schema for each tool call you will make: {'properties': {'arguments': {'title': 'Arguments', 'type': 'object'}, 'name': {'title': 'Name', 'type': 'string'}}, 'required': ['arguments', 'name'], 'title': 'FunctionCall', 'type': 'object'} At the very first turn you don't have so you shouldn't not make up the results. +Please keep a running summary with analysis of previous function results and summaries from previous iterations. +Do not stop calling functions until the task has been accomplished or you've reached max iteration of 10. +Calling multiple functions at once can overload the system and increase cost so call one function at a time please. +If you plan to continue with analysis, always call another function. +For each function call return a valid json object (using doulbe quotes) with function name and arguments within XML tags as follows: + +{"arguments": , "name": } + +[/SYS] +Add two numbers for the purpose of this test. [/INST] I've thought a lot about this. {"id": "call_531873", "type": "function", "function": {"name": "superSecretTool", "arguments": {"a": 2535, "b": 32222000403}}} [INST] [TOOL(name=superSecretTool, id=call_531873)]32222002938[/TOOL] [/INST] The sum of 2535 and 32222000403 is 42. ``` -Prompt: +Output format prompt: ```json You are a function calling AI agent with self-recursion. You can call only one function at a time and analyse data you get from function response. You are provided with function signatures within XML tags. The current date is: 2024-03-30. You may use agentic frameworks for reasoning and planning to help with user query. Please call a function and wait for function results to be provided to you in the next iteration. Don't make assumptions about what values to plug into function arguments. Once you have called a function, results will be fed back to you within XML tags. Don't make assumptions about tool results if XML tags are not present since function hasn't been executed yet. Analyze the data once you get the results and call another function. At each iteration please continue adding the your analysis to previous summary. Your final response should directly answer the user query with an anlysis or summary of the results of function calls. 
Here are the available tools: ['{"type":"function","function":{"name":"superSecretTool","description":"Adds two numbers","parameters":{"properties":{"a":{"type":"integer"},"b":{"type":"integer"}},"required":["a","b"]}}}', '{"type":"function","function":{"name":"say","description":"Says something out loud (TTS)","parameters":{"properties":{"text":{"description":"The text to say out loud","type":"string"}},"required":["text"]}}}'] If the provided function signatures doesn't have the function you must call, you may write executable python code in markdown syntax and call code_interpreter() function as follows: {"arguments": {"code_markdown": , "name": "code_interpreter"}} Make sure that the json object above with code markdown block is parseable with json.loads() and the XML block with XML ElementTree. Use the following pydantic model json schema for each tool call you will make: {'properties': {'arguments': {'title': 'Arguments', 'type': 'object'}, 'name': {'title': 'Name', 'type': 'string'}}, 'required': ['arguments', 'name'], 'title': 'FunctionCall', 'type': 'object'} At the very first turn you don't have so you shouldn't not make up the results. @@ -1714,12 +2309,15 @@ tool-call ::= "" space (superSecretTool-tool-call | say-tool-call) s Prompt: ```js -[INST] Add two numbers for the purpose of this test. [/INST] I've thought a lot about this. +[INST] [SYS]Please respond in JSON format with the following schema: { + "type": "integer" +}[/SYS] +Add two numbers for the purpose of this test. [/INST] I've thought a lot about this. {"id": "call_531873", "type": "function", "function": {"name": "superSecretTool", "arguments": {"a": 2535, "b": 32222000403}}} [INST] [TOOL(name=superSecretTool, id=call_531873)]32222002938[/TOOL] [/INST] The sum of 2535 and 32222000403 is 42. 
``` -Prompt: +Output format prompt: ```json Please respond in JSON format with the following schema: { diff --git a/examples/openai/test_chat_handlers.py b/examples/openai/test_chat_handlers.py index ee724e002d21f..7d4c89c87cc7c 100644 --- a/examples/openai/test_chat_handlers.py +++ b/examples/openai/test_chat_handlers.py @@ -205,7 +205,7 @@ def check(b: bool, msg: str): print(f'\nPrompt:\n\n```js\n{ch.render_prompt(TEST_MESSAGES_THOUGHT)}\n```\n') - print(f'\nPrompt:\n\n```json\n{ch.output_format_prompt.content}\n```\n') + print(f'\nOutput format prompt:\n\n```json\n{ch.output_format_prompt.content}\n```\n') print(f'\nGrammar:\n\n```js\n{ch.grammar}\n```\n') From 22fe86d8b803d4f4903fbbb256e55c0d5eb0b14f Mon Sep 17 00:00:00 2001 From: ochafik Date: Sat, 30 Mar 2024 02:57:36 +0000 Subject: [PATCH 37/68] openai tools: TS signatures work well too at a fraction of the eval cost --- examples/openai/prompting.py | 8 +- examples/openai/test_chat_handlers.md | 784 +++++--------------------- 2 files changed, 159 insertions(+), 633 deletions(-) diff --git a/examples/openai/prompting.py b/examples/openai/prompting.py index 1485ccf6e5f78..29a48ef43f5e2 100644 --- a/examples/openai/prompting.py +++ b/examples/openai/prompting.py @@ -608,7 +608,8 @@ def __init__(self, args: ChatHandlerArgs, parallel_calls: bool): content='\n'.join([ 'You are a function calling AI model.', 'Here are the tools available:', - _tools_schema_signatures(self.args.tools, indent=2), + # _tools_schema_signatures(self.args.tools, indent=2), + _tools_typescript_signatures(self.args.tools), _please_respond_with_schema( _make_bespoke_schema( response_schema, @@ -713,9 +714,10 @@ def get_chat_handler(args: ChatHandlerArgs, parallel_calls: bool, tool_style: Op _ts_converter = SchemaToTypeScriptConverter() +# os.environ.get('NO_TS') def _please_respond_with_schema(schema: dict) -> str: - sig = json.dumps(schema, indent=2) - # sig = _ts_converter.visit(schema) + # sig = json.dumps(schema, indent=2) + sig = _ts_converter.visit(schema) return f'Please respond in JSON format with the following schema: {sig}' def _tools_typescript_signatures(tools: list[Tool]) -> str: diff --git a/examples/openai/test_chat_handlers.md b/examples/openai/test_chat_handlers.md index 8cec828d06605..8779441e2f292 100644 --- a/examples/openai/test_chat_handlers.md +++ b/examples/openai/test_chat_handlers.md @@ -68,101 +68,29 @@ Prompt: ```js [INST] [SYS]You are a function calling AI model. 
Here are the tools available: -{ - "type": "function", - "function": { - "name": "superSecretTool", - "description": "Adds two numbers", - "parameters": { - "properties": { - "a": { - "type": "integer" - }, - "b": { - "type": "integer" - } - }, - "required": [ - "a", - "b" - ] - } - } -} -{ - "type": "function", - "function": { - "name": "say", - "description": "Says something out loud (TTS)", - "parameters": { - "properties": { - "text": { - "description": "The text to say out loud", - "type": "string" - } - }, - "required": [ - "text" - ] - } - } -} +namespace functions { +// Adds two numbers +type superSecretTool = (_: { +a: number, +b: number +}) => any; + +// Says something out loud (TTS) +type say = (_: { +// The text to say out loud +text: string +}) => any; +} // namespace functions Please respond in JSON format with the following schema: { - "type": "object", - "properties": { - "thought_about_next_step_only": { - "title": "Thought about next step", - "type": "string" - }, - "next_step": { - "title": "Next Step: either a result or one or more tool calls to achieve the original goal", - "oneOf": [ - { - "properties": { - "tool_calls": { - "prefixItems": [ - { - "properties": { - "name": { - "title": "Name of the tool to call", - "type": "string" - }, - "arguments": { - "title": "Arguments to pass to the tool", - "type": "object" - } - }, - "required": [ - "name", - "arguments" - ] - } - ] - } - }, - "required": [ - "tool_calls" - ] - }, - { - "title": "Result (achieving original goal)", - "properties": { - "result": { - "type": "integer" - } - }, - "required": [ - "result" - ] - } - ] - } - }, - "required": [ - "original_goal", - "thought_about_next_step_only", - "next_step" - ] +thought_about_next_step_only: string, +next_step: { +tool_calls: [{ +name: string, +arguments: any +}][] +}|{ +result: number +} }[/SYS] Add two numbers for the purpose of this test. [/INST]{ "thought_about_next_step_only": "I've thought a lot about this.", @@ -195,101 +123,29 @@ Output format prompt: ```json You are a function calling AI model. 
Here are the tools available: -{ - "type": "function", - "function": { - "name": "superSecretTool", - "description": "Adds two numbers", - "parameters": { - "properties": { - "a": { - "type": "integer" - }, - "b": { - "type": "integer" - } - }, - "required": [ - "a", - "b" - ] - } - } -} -{ - "type": "function", - "function": { - "name": "say", - "description": "Says something out loud (TTS)", - "parameters": { - "properties": { - "text": { - "description": "The text to say out loud", - "type": "string" - } - }, - "required": [ - "text" - ] - } - } -} +namespace functions { +// Adds two numbers +type superSecretTool = (_: { +a: number, +b: number +}) => any; + +// Says something out loud (TTS) +type say = (_: { +// The text to say out loud +text: string +}) => any; +} // namespace functions Please respond in JSON format with the following schema: { - "type": "object", - "properties": { - "thought_about_next_step_only": { - "title": "Thought about next step", - "type": "string" - }, - "next_step": { - "title": "Next Step: either a result or one or more tool calls to achieve the original goal", - "oneOf": [ - { - "properties": { - "tool_calls": { - "prefixItems": [ - { - "properties": { - "name": { - "title": "Name of the tool to call", - "type": "string" - }, - "arguments": { - "title": "Arguments to pass to the tool", - "type": "object" - } - }, - "required": [ - "name", - "arguments" - ] - } - ] - } - }, - "required": [ - "tool_calls" - ] - }, - { - "title": "Result (achieving original goal)", - "properties": { - "result": { - "type": "integer" - } - }, - "required": [ - "result" - ] - } - ] - } - }, - "required": [ - "original_goal", - "thought_about_next_step_only", - "next_step" - ] +thought_about_next_step_only: string, +next_step: { +tool_calls: [{ +name: string, +arguments: any +}][] +}|{ +result: number +} } ``` @@ -337,9 +193,7 @@ thought-about-next-step-only-kv ::= "\"thought_about_next_step_only\"" space ":" Prompt: ```js -[INST] [SYS]Please respond in JSON format with the following schema: { - "type": "integer" -}[/SYS] +[INST] [SYS]Please respond in JSON format with the following schema: number[/SYS] Add two numbers for the purpose of this test. [/INST]I've thought a lot about this. {"id": "call_531873", "type": "function", "function": {"name": "superSecretTool", "arguments": {"a": 2535, "b": 32222000403}}}[INST] [TOOL(name=superSecretTool, id=call_531873)]32222002938[/TOOL] [/INST]The sum of 2535 and 32222000403 is 42. ``` @@ -348,9 +202,7 @@ Add two numbers for the purpose of this test. [/INST]I've thought a lot about th Output format prompt: ```json -Please respond in JSON format with the following schema: { - "type": "integer" -} +Please respond in JSON format with the following schema: number ``` @@ -514,9 +366,7 @@ tool-call ::= "" space (superSecretTool-tool-call | say-t Prompt: ```js -[INST] [SYS]Please respond in JSON format with the following schema: { - "type": "integer" -}[/SYS] +[INST] [SYS]Please respond in JSON format with the following schema: number[/SYS] Add two numbers for the purpose of this test. [/INST]I've thought a lot about this. {"id": "call_531873", "type": "function", "function": {"name": "superSecretTool", "arguments": {"a": 2535, "b": 32222000403}}}[INST] [TOOL(name=superSecretTool, id=call_531873)]32222002938[/TOOL] [/INST]The sum of 2535 and 32222000403 is 42. ``` @@ -525,9 +375,7 @@ Add two numbers for the purpose of this test. 
[/INST]I've thought a lot about th Output format prompt: ```json -Please respond in JSON format with the following schema: { - "type": "integer" -} +Please respond in JSON format with the following schema: number ``` @@ -685,9 +533,7 @@ Prompt: ```js <|from|>system <|recipient|>all -<|content|>Please respond in JSON format with the following schema: { - "type": "integer" -} +<|content|>Please respond in JSON format with the following schema: number <|from|>user <|recipient|>all <|content|>Add two numbers for the purpose of this test. @@ -709,9 +555,7 @@ Prompt: Output format prompt: ```json -Please respond in JSON format with the following schema: { - "type": "integer" -} +Please respond in JSON format with the following schema: number ``` @@ -889,9 +733,7 @@ Prompt: ```js <|im_start|>system -Please respond in JSON format with the following schema: { - "type": "integer" -}<|im_end|> +Please respond in JSON format with the following schema: number<|im_end|> <|im_start|>user Add two numbers for the purpose of this test.<|im_end|> <|im_start|>assistant @@ -909,9 +751,7 @@ The sum of 2535 and 32222000403 is 42.<|im_end|> Output format prompt: ```json -Please respond in JSON format with the following schema: { - "type": "integer" -} +Please respond in JSON format with the following schema: number ``` @@ -1086,9 +926,7 @@ Prompt: ```js <|im_start|>system -Please respond in JSON format with the following schema: { - "type": "integer" -}<|im_end|> +Please respond in JSON format with the following schema: number<|im_end|> <|im_start|>user Add two numbers for the purpose of this test.<|im_end|> <|im_start|>assistant @@ -1106,9 +944,7 @@ The sum of 2535 and 32222000403 is 42.<|im_end|> Output format prompt: ```json -Please respond in JSON format with the following schema: { - "type": "integer" -} +Please respond in JSON format with the following schema: number ``` @@ -1134,101 +970,29 @@ Prompt: <|im_start|>system You are a function calling AI model. 
Here are the tools available: -{ - "type": "function", - "function": { - "name": "superSecretTool", - "description": "Adds two numbers", - "parameters": { - "properties": { - "a": { - "type": "integer" - }, - "b": { - "type": "integer" - } - }, - "required": [ - "a", - "b" - ] - } - } -} -{ - "type": "function", - "function": { - "name": "say", - "description": "Says something out loud (TTS)", - "parameters": { - "properties": { - "text": { - "description": "The text to say out loud", - "type": "string" - } - }, - "required": [ - "text" - ] - } - } -} +namespace functions { +// Adds two numbers +type superSecretTool = (_: { +a: number, +b: number +}) => any; + +// Says something out loud (TTS) +type say = (_: { +// The text to say out loud +text: string +}) => any; +} // namespace functions Please respond in JSON format with the following schema: { - "type": "object", - "properties": { - "thought_about_next_step_only": { - "title": "Thought about next step", - "type": "string" - }, - "next_step": { - "title": "Next Step: either a result or one or more tool calls to achieve the original goal", - "oneOf": [ - { - "properties": { - "tool_calls": { - "prefixItems": [ - { - "properties": { - "name": { - "title": "Name of the tool to call", - "type": "string" - }, - "arguments": { - "title": "Arguments to pass to the tool", - "type": "object" - } - }, - "required": [ - "name", - "arguments" - ] - } - ] - } - }, - "required": [ - "tool_calls" - ] - }, - { - "title": "Result (achieving original goal)", - "properties": { - "result": { - "type": "integer" - } - }, - "required": [ - "result" - ] - } - ] - } - }, - "required": [ - "original_goal", - "thought_about_next_step_only", - "next_step" - ] +thought_about_next_step_only: string, +next_step: { +tool_calls: [{ +name: string, +arguments: any +}][] +}|{ +result: number +} }<|im_end|> <|im_start|>user Add two numbers for the purpose of this test.<|im_end|> @@ -1270,101 +1034,29 @@ Output format prompt: ```json You are a function calling AI model. 
Here are the tools available: -{ - "type": "function", - "function": { - "name": "superSecretTool", - "description": "Adds two numbers", - "parameters": { - "properties": { - "a": { - "type": "integer" - }, - "b": { - "type": "integer" - } - }, - "required": [ - "a", - "b" - ] - } - } -} -{ - "type": "function", - "function": { - "name": "say", - "description": "Says something out loud (TTS)", - "parameters": { - "properties": { - "text": { - "description": "The text to say out loud", - "type": "string" - } - }, - "required": [ - "text" - ] - } - } -} +namespace functions { +// Adds two numbers +type superSecretTool = (_: { +a: number, +b: number +}) => any; + +// Says something out loud (TTS) +type say = (_: { +// The text to say out loud +text: string +}) => any; +} // namespace functions Please respond in JSON format with the following schema: { - "type": "object", - "properties": { - "thought_about_next_step_only": { - "title": "Thought about next step", - "type": "string" - }, - "next_step": { - "title": "Next Step: either a result or one or more tool calls to achieve the original goal", - "oneOf": [ - { - "properties": { - "tool_calls": { - "prefixItems": [ - { - "properties": { - "name": { - "title": "Name of the tool to call", - "type": "string" - }, - "arguments": { - "title": "Arguments to pass to the tool", - "type": "object" - } - }, - "required": [ - "name", - "arguments" - ] - } - ] - } - }, - "required": [ - "tool_calls" - ] - }, - { - "title": "Result (achieving original goal)", - "properties": { - "result": { - "type": "integer" - } - }, - "required": [ - "result" - ] - } - ] - } - }, - "required": [ - "original_goal", - "thought_about_next_step_only", - "next_step" - ] +thought_about_next_step_only: string, +next_step: { +tool_calls: [{ +name: string, +arguments: any +}][] +}|{ +result: number +} } ``` @@ -1413,9 +1105,7 @@ Prompt: ```js <|im_start|>system -Please respond in JSON format with the following schema: { - "type": "integer" -}<|im_end|> +Please respond in JSON format with the following schema: number<|im_end|> <|im_start|>user Add two numbers for the purpose of this test.<|im_end|> <|im_start|>assistant @@ -1433,9 +1123,7 @@ The sum of 2535 and 32222000403 is 42.<|im_end|> Output format prompt: ```json -Please respond in JSON format with the following schema: { - "type": "integer" -} +Please respond in JSON format with the following schema: number ``` @@ -1536,9 +1224,7 @@ Prompt: ```js <|im_start|>system -Please respond in JSON format with the following schema: { - "type": "integer" -}<|im_end|> +Please respond in JSON format with the following schema: number<|im_end|> <|im_start|>user Add two numbers for the purpose of this test.<|im_end|> <|im_start|>assistant @@ -1556,9 +1242,7 @@ The sum of 2535 and 32222000403 is 42.<|im_end|> Output format prompt: ```json -Please respond in JSON format with the following schema: { - "type": "integer" -} +Please respond in JSON format with the following schema: number ``` @@ -1722,9 +1406,7 @@ tool-call ::= "" space (superSecretTool-tool-call | say-tool-call) s Prompt: ```js -[INST] [SYS]Please respond in JSON format with the following schema: { - "type": "integer" -}[/SYS] +[INST] [SYS]Please respond in JSON format with the following schema: number[/SYS] Add two numbers for the purpose of this test. [/INST] I've thought a lot about this. 
{"id": "call_531873", "type": "function", "function": {"name": "superSecretTool", "arguments": {"a": 2535, "b": 32222000403}}} [INST] [TOOL(name=superSecretTool, id=call_531873)]32222002938[/TOOL] [/INST] The sum of 2535 and 32222000403 is 42. ``` @@ -1733,9 +1415,7 @@ Add two numbers for the purpose of this test. [/INST] I've thought a lot about t Output format prompt: ```json -Please respond in JSON format with the following schema: { - "type": "integer" -} +Please respond in JSON format with the following schema: number ``` @@ -1899,9 +1579,7 @@ tool-call ::= "" space (superSecretTool-tool-call | say-tool-call) s Prompt: ```js -[INST] [SYS]Please respond in JSON format with the following schema: { - "type": "integer" -}[/SYS] +[INST] [SYS]Please respond in JSON format with the following schema: number[/SYS] Add two numbers for the purpose of this test. [/INST] I've thought a lot about this. {"id": "call_531873", "type": "function", "function": {"name": "superSecretTool", "arguments": {"a": 2535, "b": 32222000403}}} [INST] [TOOL(name=superSecretTool, id=call_531873)]32222002938[/TOOL] [/INST] The sum of 2535 and 32222000403 is 42. ``` @@ -1910,9 +1588,7 @@ Add two numbers for the purpose of this test. [/INST] I've thought a lot about t Output format prompt: ```json -Please respond in JSON format with the following schema: { - "type": "integer" -} +Please respond in JSON format with the following schema: number ``` @@ -1937,101 +1613,29 @@ Prompt: ```js [INST] [SYS]You are a function calling AI model. Here are the tools available: -{ - "type": "function", - "function": { - "name": "superSecretTool", - "description": "Adds two numbers", - "parameters": { - "properties": { - "a": { - "type": "integer" - }, - "b": { - "type": "integer" - } - }, - "required": [ - "a", - "b" - ] - } - } -} -{ - "type": "function", - "function": { - "name": "say", - "description": "Says something out loud (TTS)", - "parameters": { - "properties": { - "text": { - "description": "The text to say out loud", - "type": "string" - } - }, - "required": [ - "text" - ] - } - } -} +namespace functions { +// Adds two numbers +type superSecretTool = (_: { +a: number, +b: number +}) => any; + +// Says something out loud (TTS) +type say = (_: { +// The text to say out loud +text: string +}) => any; +} // namespace functions Please respond in JSON format with the following schema: { - "type": "object", - "properties": { - "thought_about_next_step_only": { - "title": "Thought about next step", - "type": "string" - }, - "next_step": { - "title": "Next Step: either a result or one or more tool calls to achieve the original goal", - "oneOf": [ - { - "properties": { - "tool_calls": { - "prefixItems": [ - { - "properties": { - "name": { - "title": "Name of the tool to call", - "type": "string" - }, - "arguments": { - "title": "Arguments to pass to the tool", - "type": "object" - } - }, - "required": [ - "name", - "arguments" - ] - } - ] - } - }, - "required": [ - "tool_calls" - ] - }, - { - "title": "Result (achieving original goal)", - "properties": { - "result": { - "type": "integer" - } - }, - "required": [ - "result" - ] - } - ] - } - }, - "required": [ - "original_goal", - "thought_about_next_step_only", - "next_step" - ] +thought_about_next_step_only: string, +next_step: { +tool_calls: [{ +name: string, +arguments: any +}][] +}|{ +result: number +} }[/SYS] Add two numbers for the purpose of this test. 
[/INST] { "thought_about_next_step_only": "I've thought a lot about this.", @@ -2064,101 +1668,29 @@ Output format prompt: ```json You are a function calling AI model. Here are the tools available: -{ - "type": "function", - "function": { - "name": "superSecretTool", - "description": "Adds two numbers", - "parameters": { - "properties": { - "a": { - "type": "integer" - }, - "b": { - "type": "integer" - } - }, - "required": [ - "a", - "b" - ] - } - } -} -{ - "type": "function", - "function": { - "name": "say", - "description": "Says something out loud (TTS)", - "parameters": { - "properties": { - "text": { - "description": "The text to say out loud", - "type": "string" - } - }, - "required": [ - "text" - ] - } - } -} +namespace functions { +// Adds two numbers +type superSecretTool = (_: { +a: number, +b: number +}) => any; + +// Says something out loud (TTS) +type say = (_: { +// The text to say out loud +text: string +}) => any; +} // namespace functions Please respond in JSON format with the following schema: { - "type": "object", - "properties": { - "thought_about_next_step_only": { - "title": "Thought about next step", - "type": "string" - }, - "next_step": { - "title": "Next Step: either a result or one or more tool calls to achieve the original goal", - "oneOf": [ - { - "properties": { - "tool_calls": { - "prefixItems": [ - { - "properties": { - "name": { - "title": "Name of the tool to call", - "type": "string" - }, - "arguments": { - "title": "Arguments to pass to the tool", - "type": "object" - } - }, - "required": [ - "name", - "arguments" - ] - } - ] - } - }, - "required": [ - "tool_calls" - ] - }, - { - "title": "Result (achieving original goal)", - "properties": { - "result": { - "type": "integer" - } - }, - "required": [ - "result" - ] - } - ] - } - }, - "required": [ - "original_goal", - "thought_about_next_step_only", - "next_step" - ] +thought_about_next_step_only: string, +next_step: { +tool_calls: [{ +name: string, +arguments: any +}][] +}|{ +result: number +} } ``` @@ -2206,9 +1738,7 @@ thought-about-next-step-only-kv ::= "\"thought_about_next_step_only\"" space ":" Prompt: ```js -[INST] [SYS]Please respond in JSON format with the following schema: { - "type": "integer" -}[/SYS] +[INST] [SYS]Please respond in JSON format with the following schema: number[/SYS] Add two numbers for the purpose of this test. [/INST] I've thought a lot about this. {"id": "call_531873", "type": "function", "function": {"name": "superSecretTool", "arguments": {"a": 2535, "b": 32222000403}}} [INST] [TOOL(name=superSecretTool, id=call_531873)]32222002938[/TOOL] [/INST] The sum of 2535 and 32222000403 is 42. ``` @@ -2217,9 +1747,7 @@ Add two numbers for the purpose of this test. [/INST] I've thought a lot about t Output format prompt: ```json -Please respond in JSON format with the following schema: { - "type": "integer" -} +Please respond in JSON format with the following schema: number ``` @@ -2309,9 +1837,7 @@ tool-call ::= "" space (superSecretTool-tool-call | say-tool-call) s Prompt: ```js -[INST] [SYS]Please respond in JSON format with the following schema: { - "type": "integer" -}[/SYS] +[INST] [SYS]Please respond in JSON format with the following schema: number[/SYS] Add two numbers for the purpose of this test. [/INST] I've thought a lot about this. 
{"id": "call_531873", "type": "function", "function": {"name": "superSecretTool", "arguments": {"a": 2535, "b": 32222000403}}} [INST] [TOOL(name=superSecretTool, id=call_531873)]32222002938[/TOOL] [/INST] The sum of 2535 and 32222000403 is 42. ``` @@ -2320,9 +1846,7 @@ Add two numbers for the purpose of this test. [/INST] I've thought a lot about t Output format prompt: ```json -Please respond in JSON format with the following schema: { - "type": "integer" -} +Please respond in JSON format with the following schema: number ``` From 6e52a9ce48a8f3dce5be1d2e5dfbdd2a9e7ef18d Mon Sep 17 00:00:00 2001 From: ochafik Date: Mon, 8 Apr 2024 19:18:01 +0100 Subject: [PATCH 38/68] Update test_chat_handlers.md --- examples/openai/test_chat_handlers.md | 720 -------------------------- 1 file changed, 720 deletions(-) diff --git a/examples/openai/test_chat_handlers.md b/examples/openai/test_chat_handlers.md index 8779441e2f292..ffbf700103dcb 100644 --- a/examples/openai/test_chat_handlers.md +++ b/examples/openai/test_chat_handlers.md @@ -1139,723 +1139,3 @@ space ::= " "? ## hermes_2_pro_mistral / TOOLS_HERMES_2_PRO - -### hermes_2_pro_mistral / TOOLS_HERMES_2_PRO / with tools - - -Prompt: - -```js -<|im_start|>system -You are a function calling AI agent with self-recursion. You can call only one function at a time and analyse data you get from function response. You are provided with function signatures within XML tags. The current date is: 2024-03-30. You may use agentic frameworks for reasoning and planning to help with user query. Please call a function and wait for function results to be provided to you in the next iteration. Don't make assumptions about what values to plug into function arguments. Once you have called a function, results will be fed back to you within XML tags. Don't make assumptions about tool results if XML tags are not present since function hasn't been executed yet. Analyze the data once you get the results and call another function. At each iteration please continue adding the your analysis to previous summary. Your final response should directly answer the user query with an anlysis or summary of the results of function calls. Here are the available tools: ['{"type":"function","function":{"name":"superSecretTool","description":"Adds two numbers","parameters":{"properties":{"a":{"type":"integer"},"b":{"type":"integer"}},"required":["a","b"]}}}', '{"type":"function","function":{"name":"say","description":"Says something out loud (TTS)","parameters":{"properties":{"text":{"description":"The text to say out loud","type":"string"}},"required":["text"]}}}'] If the provided function signatures doesn't have the function you must call, you may write executable python code in markdown syntax and call code_interpreter() function as follows: {"arguments": {"code_markdown": , "name": "code_interpreter"}} Make sure that the json object above with code markdown block is parseable with json.loads() and the XML block with XML ElementTree. Use the following pydantic model json schema for each tool call you will make: {'properties': {'arguments': {'title': 'Arguments', 'type': 'object'}, 'name': {'title': 'Name', 'type': 'string'}}, 'required': ['arguments', 'name'], 'title': 'FunctionCall', 'type': 'object'} At the very first turn you don't have so you shouldn't not make up the results. -Please keep a running summary with analysis of previous function results and summaries from previous iterations. 
-Do not stop calling functions until the task has been accomplished or you've reached max iteration of 10. -Calling multiple functions at once can overload the system and increase cost so call one function at a time please. -If you plan to continue with analysis, always call another function. -For each function call return a valid json object (using doulbe quotes) with function name and arguments within XML tags as follows: - -{"arguments": , "name": } - -<|im_end|> -<|im_start|>user -Add two numbers for the purpose of this test.<|im_end|> -<|im_start|>assistant -I've thought a lot about this. -{"id": "call_531873", "type": "function", "function": {"name": "superSecretTool", "arguments": {"a": 2535, "b": 32222000403}}}<|im_end|> -<|im_start|>tool -32222002938<|im_end|> -<|im_start|>assistant -The sum of 2535 and 32222000403 is 42.<|im_end|> -<|im_start|>assistant - -``` - - -Output format prompt: - -```json -You are a function calling AI agent with self-recursion. You can call only one function at a time and analyse data you get from function response. You are provided with function signatures within XML tags. The current date is: 2024-03-30. You may use agentic frameworks for reasoning and planning to help with user query. Please call a function and wait for function results to be provided to you in the next iteration. Don't make assumptions about what values to plug into function arguments. Once you have called a function, results will be fed back to you within XML tags. Don't make assumptions about tool results if XML tags are not present since function hasn't been executed yet. Analyze the data once you get the results and call another function. At each iteration please continue adding the your analysis to previous summary. Your final response should directly answer the user query with an anlysis or summary of the results of function calls. Here are the available tools: ['{"type":"function","function":{"name":"superSecretTool","description":"Adds two numbers","parameters":{"properties":{"a":{"type":"integer"},"b":{"type":"integer"}},"required":["a","b"]}}}', '{"type":"function","function":{"name":"say","description":"Says something out loud (TTS)","parameters":{"properties":{"text":{"description":"The text to say out loud","type":"string"}},"required":["text"]}}}'] If the provided function signatures doesn't have the function you must call, you may write executable python code in markdown syntax and call code_interpreter() function as follows: {"arguments": {"code_markdown": , "name": "code_interpreter"}} Make sure that the json object above with code markdown block is parseable with json.loads() and the XML block with XML ElementTree. Use the following pydantic model json schema for each tool call you will make: {'properties': {'arguments': {'title': 'Arguments', 'type': 'object'}, 'name': {'title': 'Name', 'type': 'string'}}, 'required': ['arguments', 'name'], 'title': 'FunctionCall', 'type': 'object'} At the very first turn you don't have so you shouldn't not make up the results. -Please keep a running summary with analysis of previous function results and summaries from previous iterations. -Do not stop calling functions until the task has been accomplished or you've reached max iteration of 10. -Calling multiple functions at once can overload the system and increase cost so call one function at a time please. -If you plan to continue with analysis, always call another function. 
-For each function call return a valid json object (using doulbe quotes) with function name and arguments within XML tags as follows: - -{"arguments": , "name": } - - -``` - - -Grammar: - -```js -content ::= [^<] | "<" [^t<] | "" -``` - - -### hermes_2_pro_mistral / TOOLS_HERMES_2_PRO / without tools - - -Prompt: - -```js -<|im_start|>system -Please respond in JSON format with the following schema: number<|im_end|> -<|im_start|>user -Add two numbers for the purpose of this test.<|im_end|> -<|im_start|>assistant -I've thought a lot about this. -{"id": "call_531873", "type": "function", "function": {"name": "superSecretTool", "arguments": {"a": 2535, "b": 32222000403}}}<|im_end|> -<|im_start|>tool -32222002938<|im_end|> -<|im_start|>assistant -The sum of 2535 and 32222000403 is 42.<|im_end|> -<|im_start|>assistant - -``` - - -Output format prompt: - -```json -Please respond in JSON format with the following schema: number -``` - - -Grammar: - -```js -decimal-part ::= [0-9] [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? -integral-part ::= [0-9] | [1-9] [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? -root ::= ("-"? integral-part) space -space ::= " "? -``` - - -# llama2 - - -Template: - -```js -{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\n' + system_message + '\n<>\n\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %} -``` - - -## llama2 / TOOLS_SHORT - - -### llama2 / TOOLS_SHORT / with tools - - -Prompt: - -```js -[INST] [SYS]Here are the tools available: - -{ - "type": "function", - "function": { - "name": "superSecretTool", - "description": "Adds two numbers", - "parameters": { - "properties": { - "a": { - "type": "integer" - }, - "b": { - "type": "integer" - } - }, - "required": [ - "a", - "b" - ] - } - } -} -{ - "type": "function", - "function": { - "name": "say", - "description": "Says something out loud (TTS)", - "parameters": { - "properties": { - "text": { - "description": "The text to say out loud", - "type": "string" - } - }, - "required": [ - "text" - ] - } - } -} -[/SYS] -Add two numbers for the purpose of this test. [/INST] I've thought a lot about this. -{"id": "call_531873", "type": "function", "function": {"name": "superSecretTool", "arguments": {"a": 2535, "b": 32222000403}}} [INST] [TOOL(name=superSecretTool, id=call_531873)]32222002938[/TOOL] [/INST] The sum of 2535 and 32222000403 is 42. 
-``` - - -Output format prompt: - -```json -Here are the tools available: - -{ - "type": "function", - "function": { - "name": "superSecretTool", - "description": "Adds two numbers", - "parameters": { - "properties": { - "a": { - "type": "integer" - }, - "b": { - "type": "integer" - } - }, - "required": [ - "a", - "b" - ] - } - } -} -{ - "type": "function", - "function": { - "name": "say", - "description": "Says something out loud (TTS)", - "parameters": { - "properties": { - "text": { - "description": "The text to say out loud", - "type": "string" - } - }, - "required": [ - "text" - ] - } - } -} - -``` - - -Grammar: - -```js -content ::= [^<] | "<" [^t<] | "" -``` - - -### llama2 / TOOLS_SHORT / without tools - - -Prompt: - -```js -[INST] [SYS]Please respond in JSON format with the following schema: number[/SYS] -Add two numbers for the purpose of this test. [/INST] I've thought a lot about this. -{"id": "call_531873", "type": "function", "function": {"name": "superSecretTool", "arguments": {"a": 2535, "b": 32222000403}}} [INST] [TOOL(name=superSecretTool, id=call_531873)]32222002938[/TOOL] [/INST] The sum of 2535 and 32222000403 is 42. -``` - - -Output format prompt: - -```json -Please respond in JSON format with the following schema: number -``` - - -Grammar: - -```js -decimal-part ::= [0-9] [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? -integral-part ::= [0-9] | [1-9] [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? -root ::= ("-"? integral-part) space -space ::= " "? -``` - - -## llama2 / TOOLS_LONG - - -### llama2 / TOOLS_LONG / with tools - - -Prompt: - -```js -[INST] [SYS]Call one or more functions to assist with the user query, every time this is possible. Don't make assumptions about what values to plug into functions. Here are the available tools: - -{ - "type": "function", - "function": { - "name": "superSecretTool", - "description": "Adds two numbers", - "parameters": { - "properties": { - "a": { - "type": "integer" - }, - "b": { - "type": "integer" - } - }, - "required": [ - "a", - "b" - ] - } - } -} -{ - "type": "function", - "function": { - "name": "say", - "description": "Says something out loud (TTS)", - "parameters": { - "properties": { - "text": { - "description": "The text to say out loud", - "type": "string" - } - }, - "required": [ - "text" - ] - } - } -} - - -To call each function, give its name and arguments within XML tags as follows: - -{"name": , "arguments": } -[/SYS] -Add two numbers for the purpose of this test. [/INST] I've thought a lot about this. -{"id": "call_531873", "type": "function", "function": {"name": "superSecretTool", "arguments": {"a": 2535, "b": 32222000403}}} [INST] [TOOL(name=superSecretTool, id=call_531873)]32222002938[/TOOL] [/INST] The sum of 2535 and 32222000403 is 42. -``` - - -Output format prompt: - -```json -Call one or more functions to assist with the user query, every time this is possible. Don't make assumptions about what values to plug into functions. 
Here are the available tools: - -{ - "type": "function", - "function": { - "name": "superSecretTool", - "description": "Adds two numbers", - "parameters": { - "properties": { - "a": { - "type": "integer" - }, - "b": { - "type": "integer" - } - }, - "required": [ - "a", - "b" - ] - } - } -} -{ - "type": "function", - "function": { - "name": "say", - "description": "Says something out loud (TTS)", - "parameters": { - "properties": { - "text": { - "description": "The text to say out loud", - "type": "string" - } - }, - "required": [ - "text" - ] - } - } -} - - -To call each function, give its name and arguments within XML tags as follows: - -{"name": , "arguments": } - -``` - - -Grammar: - -```js -content ::= [^<] | "<" [^t<] | "" -``` - - -### llama2 / TOOLS_LONG / without tools - - -Prompt: - -```js -[INST] [SYS]Please respond in JSON format with the following schema: number[/SYS] -Add two numbers for the purpose of this test. [/INST] I've thought a lot about this. -{"id": "call_531873", "type": "function", "function": {"name": "superSecretTool", "arguments": {"a": 2535, "b": 32222000403}}} [INST] [TOOL(name=superSecretTool, id=call_531873)]32222002938[/TOOL] [/INST] The sum of 2535 and 32222000403 is 42. -``` - - -Output format prompt: - -```json -Please respond in JSON format with the following schema: number -``` - - -Grammar: - -```js -decimal-part ::= [0-9] [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? -integral-part ::= [0-9] | [1-9] [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? -root ::= ("-"? integral-part) space -space ::= " "? -``` - - -## llama2 / TOOLS_THOUGHTFUL_STEPS - - -### llama2 / TOOLS_THOUGHTFUL_STEPS / with tools - - -Prompt: - -```js -[INST] [SYS]You are a function calling AI model. -Here are the tools available: -namespace functions { -// Adds two numbers -type superSecretTool = (_: { -a: number, -b: number -}) => any; - -// Says something out loud (TTS) -type say = (_: { -// The text to say out loud -text: string -}) => any; -} // namespace functions -Please respond in JSON format with the following schema: { -thought_about_next_step_only: string, -next_step: { -tool_calls: [{ -name: string, -arguments: any -}][] -}|{ -result: number -} -}[/SYS] -Add two numbers for the purpose of this test. [/INST] { - "thought_about_next_step_only": "I've thought a lot about this.", - "next_step": { - "tool_calls": [ - { - "id": "call_531873", - "type": "function", - "function": { - "name": "superSecretTool", - "arguments": { - "a": 2535, - "b": 32222000403 - } - } - } - ] - } -} [INST] [TOOL(name=superSecretTool, id=call_531873)]32222002938[/TOOL] [/INST] { - "thought_about_next_step_only": "", - "next_step": { - "result": "The sum of 2535 and 32222000403 is 42." - } -} -``` - - -Output format prompt: - -```json -You are a function calling AI model. -Here are the tools available: -namespace functions { -// Adds two numbers -type superSecretTool = (_: { -a: number, -b: number -}) => any; - -// Says something out loud (TTS) -type say = (_: { -// The text to say out loud -text: string -}) => any; -} // namespace functions -Please respond in JSON format with the following schema: { -thought_about_next_step_only: string, -next_step: { -tool_calls: [{ -name: string, -arguments: any -}][] -}|{ -result: number -} -} -``` - - -Grammar: - -```js -decimal-part ::= [0-9] [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? -integer ::= ("-"? integral-part) space -integral-part ::= [0-9] | [1-9] [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? 
[0-9]? [0-9]? [0-9]? [0-9]? [0-9]? -next-step ::= next-step-0 | next-step-1 -next-step-0 ::= "{" space next-step-0-tool-calls-kv "}" space -next-step-0-tool-calls ::= "[" space next-step-0-tool-calls-tuple-0 "]" space -next-step-0-tool-calls-kv ::= "\"tool_calls\"" space ":" space next-step-0-tool-calls -next-step-0-tool-calls-tuple-0 ::= next-step-0-tool-calls-tuple-0-0 | next-step-0-tool-calls-tuple-0-1 -next-step-0-tool-calls-tuple-0-0 ::= "{" space next-step-0-tool-calls-tuple-0-0-name-kv "," space next-step-0-tool-calls-tuple-0-0-arguments-kv "}" space -next-step-0-tool-calls-tuple-0-0-arguments ::= "{" space next-step-0-tool-calls-tuple-0-0-arguments-a-kv "," space next-step-0-tool-calls-tuple-0-0-arguments-b-kv "}" space -next-step-0-tool-calls-tuple-0-0-arguments-a-kv ::= "\"a\"" space ":" space integer -next-step-0-tool-calls-tuple-0-0-arguments-b-kv ::= "\"b\"" space ":" space integer -next-step-0-tool-calls-tuple-0-0-arguments-kv ::= "\"arguments\"" space ":" space next-step-0-tool-calls-tuple-0-0-arguments -next-step-0-tool-calls-tuple-0-0-name ::= "\"superSecretTool\"" -next-step-0-tool-calls-tuple-0-0-name-kv ::= "\"name\"" space ":" space next-step-0-tool-calls-tuple-0-0-name -next-step-0-tool-calls-tuple-0-1 ::= "{" space next-step-0-tool-calls-tuple-0-1-name-kv "," space next-step-0-tool-calls-tuple-0-1-arguments-kv "}" space -next-step-0-tool-calls-tuple-0-1-arguments ::= "{" space next-step-0-tool-calls-tuple-0-1-arguments-text-kv "}" space -next-step-0-tool-calls-tuple-0-1-arguments-kv ::= "\"arguments\"" space ":" space next-step-0-tool-calls-tuple-0-1-arguments -next-step-0-tool-calls-tuple-0-1-arguments-text-kv ::= "\"text\"" space ":" space string -next-step-0-tool-calls-tuple-0-1-name ::= "\"say\"" -next-step-0-tool-calls-tuple-0-1-name-kv ::= "\"name\"" space ":" space next-step-0-tool-calls-tuple-0-1-name -next-step-1 ::= "{" space next-step-1-result-kv "}" space -next-step-1-result-kv ::= "\"result\"" space ":" space integer -next-step-kv ::= "\"next_step\"" space ":" space next-step -root ::= "{" space thought-about-next-step-only-kv "," space next-step-kv "}" space -space ::= " "? -string ::= "\"" ( - [^"\\] | - "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]) - )* "\"" space -thought-about-next-step-only-kv ::= "\"thought_about_next_step_only\"" space ":" space string -``` - - -### llama2 / TOOLS_THOUGHTFUL_STEPS / without tools - - -Prompt: - -```js -[INST] [SYS]Please respond in JSON format with the following schema: number[/SYS] -Add two numbers for the purpose of this test. [/INST] I've thought a lot about this. -{"id": "call_531873", "type": "function", "function": {"name": "superSecretTool", "arguments": {"a": 2535, "b": 32222000403}}} [INST] [TOOL(name=superSecretTool, id=call_531873)]32222002938[/TOOL] [/INST] The sum of 2535 and 32222000403 is 42. -``` - - -Output format prompt: - -```json -Please respond in JSON format with the following schema: number -``` - - -Grammar: - -```js -decimal-part ::= [0-9] [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? -integral-part ::= [0-9] | [1-9] [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? -root ::= ("-"? integral-part) space -space ::= " "? -``` - - -## llama2 / TOOLS_HERMES_2_PRO - - -### llama2 / TOOLS_HERMES_2_PRO / with tools - - -Prompt: - -```js -[INST] [SYS]You are a function calling AI agent with self-recursion. You can call only one function at a time and analyse data you get from function response. 
You are provided with function signatures within XML tags. The current date is: 2024-03-30. You may use agentic frameworks for reasoning and planning to help with user query. Please call a function and wait for function results to be provided to you in the next iteration. Don't make assumptions about what values to plug into function arguments. Once you have called a function, results will be fed back to you within XML tags. Don't make assumptions about tool results if XML tags are not present since function hasn't been executed yet. Analyze the data once you get the results and call another function. At each iteration please continue adding the your analysis to previous summary. Your final response should directly answer the user query with an anlysis or summary of the results of function calls. Here are the available tools: ['{"type":"function","function":{"name":"superSecretTool","description":"Adds two numbers","parameters":{"properties":{"a":{"type":"integer"},"b":{"type":"integer"}},"required":["a","b"]}}}', '{"type":"function","function":{"name":"say","description":"Says something out loud (TTS)","parameters":{"properties":{"text":{"description":"The text to say out loud","type":"string"}},"required":["text"]}}}'] If the provided function signatures doesn't have the function you must call, you may write executable python code in markdown syntax and call code_interpreter() function as follows: {"arguments": {"code_markdown": , "name": "code_interpreter"}} Make sure that the json object above with code markdown block is parseable with json.loads() and the XML block with XML ElementTree. Use the following pydantic model json schema for each tool call you will make: {'properties': {'arguments': {'title': 'Arguments', 'type': 'object'}, 'name': {'title': 'Name', 'type': 'string'}}, 'required': ['arguments', 'name'], 'title': 'FunctionCall', 'type': 'object'} At the very first turn you don't have so you shouldn't not make up the results. -Please keep a running summary with analysis of previous function results and summaries from previous iterations. -Do not stop calling functions until the task has been accomplished or you've reached max iteration of 10. -Calling multiple functions at once can overload the system and increase cost so call one function at a time please. -If you plan to continue with analysis, always call another function. -For each function call return a valid json object (using doulbe quotes) with function name and arguments within XML tags as follows: - -{"arguments": , "name": } - -[/SYS] -Add two numbers for the purpose of this test. [/INST] I've thought a lot about this. -{"id": "call_531873", "type": "function", "function": {"name": "superSecretTool", "arguments": {"a": 2535, "b": 32222000403}}} [INST] [TOOL(name=superSecretTool, id=call_531873)]32222002938[/TOOL] [/INST] The sum of 2535 and 32222000403 is 42. -``` - - -Output format prompt: - -```json -You are a function calling AI agent with self-recursion. You can call only one function at a time and analyse data you get from function response. You are provided with function signatures within XML tags. The current date is: 2024-03-30. You may use agentic frameworks for reasoning and planning to help with user query. Please call a function and wait for function results to be provided to you in the next iteration. Don't make assumptions about what values to plug into function arguments. Once you have called a function, results will be fed back to you within XML tags. 
Don't make assumptions about tool results if XML tags are not present since function hasn't been executed yet. Analyze the data once you get the results and call another function. At each iteration please continue adding the your analysis to previous summary. Your final response should directly answer the user query with an anlysis or summary of the results of function calls. Here are the available tools: ['{"type":"function","function":{"name":"superSecretTool","description":"Adds two numbers","parameters":{"properties":{"a":{"type":"integer"},"b":{"type":"integer"}},"required":["a","b"]}}}', '{"type":"function","function":{"name":"say","description":"Says something out loud (TTS)","parameters":{"properties":{"text":{"description":"The text to say out loud","type":"string"}},"required":["text"]}}}'] If the provided function signatures doesn't have the function you must call, you may write executable python code in markdown syntax and call code_interpreter() function as follows: {"arguments": {"code_markdown": , "name": "code_interpreter"}} Make sure that the json object above with code markdown block is parseable with json.loads() and the XML block with XML ElementTree. Use the following pydantic model json schema for each tool call you will make: {'properties': {'arguments': {'title': 'Arguments', 'type': 'object'}, 'name': {'title': 'Name', 'type': 'string'}}, 'required': ['arguments', 'name'], 'title': 'FunctionCall', 'type': 'object'} At the very first turn you don't have so you shouldn't not make up the results. -Please keep a running summary with analysis of previous function results and summaries from previous iterations. -Do not stop calling functions until the task has been accomplished or you've reached max iteration of 10. -Calling multiple functions at once can overload the system and increase cost so call one function at a time please. -If you plan to continue with analysis, always call another function. -For each function call return a valid json object (using doulbe quotes) with function name and arguments within XML tags as follows: - -{"arguments": , "name": } - - -``` - - -Grammar: - -```js -content ::= [^<] | "<" [^t<] | "" -``` - - -### llama2 / TOOLS_HERMES_2_PRO / without tools - - -Prompt: - -```js -[INST] [SYS]Please respond in JSON format with the following schema: number[/SYS] -Add two numbers for the purpose of this test. [/INST] I've thought a lot about this. -{"id": "call_531873", "type": "function", "function": {"name": "superSecretTool", "arguments": {"a": 2535, "b": 32222000403}}} [INST] [TOOL(name=superSecretTool, id=call_531873)]32222002938[/TOOL] [/INST] The sum of 2535 and 32222000403 is 42. -``` - - -Output format prompt: - -```json -Please respond in JSON format with the following schema: number -``` - - -Grammar: - -```js -decimal-part ::= [0-9] [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? -integral-part ::= [0-9] | [1-9] [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? -root ::= ("-"? integral-part) space -space ::= " "? 
-``` - From 701a66d80f5fbe1fc74f214f1f6f43708e26d57f Mon Sep 17 00:00:00 2001 From: ochafik Date: Tue, 9 Apr 2024 02:14:08 +0100 Subject: [PATCH 39/68] agent: fix response_format --- examples/agent/README.md | 15 +++++++-- examples/agent/agent.py | 11 ++++--- examples/openai/api.py | 4 +-- examples/openai/prompting.py | 8 +++-- examples/openai/server.py | 6 ++-- examples/openai/ts_converter.py | 54 +++++++++++++++++++++++++++++++-- 6 files changed, 81 insertions(+), 17 deletions(-) diff --git a/examples/agent/README.md b/examples/agent/README.md index 8e9dec4695254..48a4e99316d59 100644 --- a/examples/agent/README.md +++ b/examples/agent/README.md @@ -138,19 +138,28 @@ If you'd like to debug each binary separately (rather than have an agent spawing ```bash # C++ server make -j server -./server --model mixtral.gguf --port 8081 +./server \ + --model mixtral.gguf \ + --metrics \ + -ctk q4_0 \ + -ctv f16 \ + -c 32768 \ + --port 8081 # OpenAI compatibility layer python -m examples.openai \ - --port 8080 + --port 8080 \ --endpoint http://localhost:8081 \ - --template_hf_model_id_fallback mistralai/Mixtral-8x7B-Instruct-v0.1 + --template-hf-model-id-fallback mistralai/Mixtral-8x7B-Instruct-v0.1 # Or have the OpenAI compatibility layer spawn the C++ server under the hood: # python -m examples.openai --model mixtral.gguf # Agent itself: python -m examples.agent --endpoint http://localhost:8080 \ + --tools examples/agent/tools/example_summaries.py \ + --format PyramidalSummary \ + --goal "Create a pyramidal summary of Mankind's recent advancements" ``` ## Use existing tools (WIP) diff --git a/examples/agent/agent.py b/examples/agent/agent.py index e7373eb285970..7249a1fbd3972 100644 --- a/examples/agent/agent.py +++ b/examples/agent/agent.py @@ -10,7 +10,7 @@ from examples.json_schema_to_grammar import SchemaConverter from examples.agent.tools.std_tools import StandardTools -from examples.openai.api import ChatCompletionRequest, ChatCompletionResponse, Message, Tool, ToolFunction +from examples.openai.api import ChatCompletionRequest, ChatCompletionResponse, Message, ResponseFormat, Tool, ToolFunction from examples.agent.utils import collect_functions, load_module from examples.openai.prompting import ToolsPromptStyle @@ -46,7 +46,7 @@ def completion_with_tool_usage( else: type_adapter = TypeAdapter(response_model) schema = type_adapter.json_schema() - response_format={"type": "json_object", "schema": schema } + response_format=ResponseFormat(type="json_object", schema=schema) tool_map = {fn.__name__: fn for fn in tools} tools_schemas = [ @@ -77,14 +77,15 @@ def completion_with_tool_usage( if auth: headers["Authorization"] = auth response = requests.post( - endpoint, + f'{endpoint}/v1/chat/completions', headers=headers, json=request.model_dump(), ) if response.status_code != 200: raise Exception(f"Request failed ({response.status_code}): {response.text}") - response = ChatCompletionResponse(**response.json()) + response_json = response.json() + response = ChatCompletionResponse(**response_json) if verbose: sys.stderr.write(f'# RESPONSE: {response.model_dump_json(indent=2)}\n') if response.error: @@ -169,7 +170,7 @@ def main( if not endpoint: server_port = 8080 server_host = 'localhost' - endpoint: str = f'http://{server_host}:{server_port}/v1/chat/completions' + endpoint = f'http://{server_host}:{server_port}' if verbose: sys.stderr.write(f"# Starting C++ server with model {model} on {endpoint}\n") cmd = [ diff --git a/examples/openai/api.py b/examples/openai/api.py index 
7780d8bc4e848..8f74cf99fdcb4 100644 --- a/examples/openai/api.py +++ b/examples/openai/api.py @@ -28,8 +28,8 @@ class Tool(BaseModel): function: ToolFunction class ResponseFormat(BaseModel): - type: str - json_schema: Optional[Any] = None + type: Literal["json_object"] + schema: Optional[Dict] = None class LlamaCppParams(BaseModel): n_predict: Optional[int] = None diff --git a/examples/openai/prompting.py b/examples/openai/prompting.py index 29a48ef43f5e2..b3b4848a09d59 100644 --- a/examples/openai/prompting.py +++ b/examples/openai/prompting.py @@ -712,15 +712,19 @@ def get_chat_handler(args: ChatHandlerArgs, parallel_calls: bool, tool_style: Op else: raise ValueError(f"Unsupported tool call style: {args.chat_template.tool_style}") -_ts_converter = SchemaToTypeScriptConverter() - # os.environ.get('NO_TS') def _please_respond_with_schema(schema: dict) -> str: # sig = json.dumps(schema, indent=2) + _ts_converter = SchemaToTypeScriptConverter() + _ts_converter.resolve_refs(schema, 'schema') sig = _ts_converter.visit(schema) return f'Please respond in JSON format with the following schema: {sig}' def _tools_typescript_signatures(tools: list[Tool]) -> str: + _ts_converter = SchemaToTypeScriptConverter() + for tool in tools: + _ts_converter.resolve_refs(tool.function.parameters, tool.function.name) + return 'namespace functions {\n' + '\n'.join( '// ' + tool.function.description.replace('\n', '\n// ') + '\n' + '' 'type ' + tool.function.name + ' = (_: ' + _ts_converter.visit(tool.function.parameters) + ") => any;\n" diff --git a/examples/openai/server.py b/examples/openai/server.py index aa2dba211b9c7..21e69fa92b619 100644 --- a/examples/openai/server.py +++ b/examples/openai/server.py @@ -73,7 +73,7 @@ def main( ] server_process = subprocess.Popen(cmd, stdout=sys.stderr) atexit.register(server_process.kill) - endpoint = f"http://{server_host}:{server_port}/completions" + endpoint = f"http://{server_host}:{server_port}" # print(chat_template.render([ @@ -125,7 +125,7 @@ async def chat_completions(request: Request, chat_request: ChatCompletionRequest if chat_request.response_format is not None: assert chat_request.response_format.type == "json_object", f"Unsupported response format: {chat_request.response_format.type}" - response_schema = chat_request.response_format.json_schema or {} + response_schema = chat_request.response_format.schema or {} else: response_schema = None @@ -164,7 +164,7 @@ async def chat_completions(request: Request, chat_request: ChatCompletionRequest async with httpx.AsyncClient() as client: response = await client.post( - f"{endpoint}", + f'{endpoint}/completions', json=data, headers=headers, timeout=None) diff --git a/examples/openai/ts_converter.py b/examples/openai/ts_converter.py index 7ba5c439f86d9..3c04bab7dd15a 100644 --- a/examples/openai/ts_converter.py +++ b/examples/openai/ts_converter.py @@ -14,6 +14,56 @@ class SchemaToTypeScriptConverter: # // where to get weather. # location: string, # }) => any; + + def __init__(self): + self._refs = {} + self._refs_being_resolved = set() + + def resolve_refs(self, schema: dict, url: str): + ''' + Resolves all $ref fields in the given schema, fetching any remote schemas, + replacing $ref with absolute reference URL and populating self._refs with the + respective referenced (sub)schema dictionaries. 
+ ''' + def visit(n: dict): + if isinstance(n, list): + return [visit(x) for x in n] + elif isinstance(n, dict): + ref = n.get('$ref') + if ref is not None and ref not in self._refs: + if ref.startswith('https://'): + assert self._allow_fetch, 'Fetching remote schemas is not allowed (use --allow-fetch for force)' + import requests + + frag_split = ref.split('#') + base_url = frag_split[0] + + target = self._refs.get(base_url) + if target is None: + target = self.resolve_refs(requests.get(ref).json(), base_url) + self._refs[base_url] = target + + if len(frag_split) == 1 or frag_split[-1] == '': + return target + elif ref.startswith('#/'): + target = schema + ref = f'{url}{ref}' + n['$ref'] = ref + else: + raise ValueError(f'Unsupported ref {ref}') + + for sel in ref.split('#')[-1].split('/')[1:]: + assert target is not None and sel in target, f'Error resolving ref {ref}: {sel} not in {target}' + target = target[sel] + + self._refs[ref] = target + else: + for v in n.values(): + visit(v) + + return n + return visit(schema) + def _desc_comment(self, schema: dict): desc = schema.get("description", "").replace("\n", "\n// ") if 'description' in schema else None return f'// {desc}\n' if desc else '' @@ -78,7 +128,7 @@ def add_component(comp_schema, is_required): else: add_component(t, is_required=True) - return self._build_object_rule(properties, required, additional_properties=[]) + return self._build_object_rule(properties, required, additional_properties={}) elif schema_type in (None, 'array') and ('items' in schema or 'prefixItems' in schema): items = schema.get('items') or schema['prefixItems'] @@ -94,4 +144,4 @@ def add_component(comp_schema, is_required): return 'any' else: - return 'number' if schema_type == 'integer' else schema_type + return 'number' if schema_type == 'integer' else schema_type or 'any' From b447a743fb432fdbcb59eea11dffd805158c29b3 Mon Sep 17 00:00:00 2001 From: ochafik Date: Tue, 9 Apr 2024 09:41:45 +0100 Subject: [PATCH 40/68] agent: revert to json schemas (ts not ready for refs) --- examples/openai/prompting.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/examples/openai/prompting.py b/examples/openai/prompting.py index b3b4848a09d59..3de25206949ea 100644 --- a/examples/openai/prompting.py +++ b/examples/openai/prompting.py @@ -608,8 +608,8 @@ def __init__(self, args: ChatHandlerArgs, parallel_calls: bool): content='\n'.join([ 'You are a function calling AI model.', 'Here are the tools available:', - # _tools_schema_signatures(self.args.tools, indent=2), - _tools_typescript_signatures(self.args.tools), + _tools_schema_signatures(self.args.tools, indent=2), + # _tools_typescript_signatures(self.args.tools), _please_respond_with_schema( _make_bespoke_schema( response_schema, @@ -714,16 +714,16 @@ def get_chat_handler(args: ChatHandlerArgs, parallel_calls: bool, tool_style: Op # os.environ.get('NO_TS') def _please_respond_with_schema(schema: dict) -> str: - # sig = json.dumps(schema, indent=2) - _ts_converter = SchemaToTypeScriptConverter() - _ts_converter.resolve_refs(schema, 'schema') - sig = _ts_converter.visit(schema) + sig = json.dumps(schema, indent=2) + # _ts_converter = SchemaToTypeScriptConverter() + # # _ts_converter.resolve_refs(schema, 'schema') + # sig = _ts_converter.visit(schema) return f'Please respond in JSON format with the following schema: {sig}' def _tools_typescript_signatures(tools: list[Tool]) -> str: _ts_converter = SchemaToTypeScriptConverter() - for tool in tools: - 
_ts_converter.resolve_refs(tool.function.parameters, tool.function.name) + # for tool in tools: + # _ts_converter.resolve_refs(tool.function.parameters, tool.function.name) return 'namespace functions {\n' + '\n'.join( '// ' + tool.function.description.replace('\n', '\n// ') + '\n' + '' From 85820f4401f030165b28c2a7d1af9fd2652a5070 Mon Sep 17 00:00:00 2001 From: Olivier Chafik Date: Tue, 9 Apr 2024 21:03:32 +0100 Subject: [PATCH 41/68] agent: fix sandbox dockerfile --- examples/agent/run_sandboxed_tools.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/agent/run_sandboxed_tools.sh b/examples/agent/run_sandboxed_tools.sh index 8eddb5d92ef8a..5dbe19246af97 100755 --- a/examples/agent/run_sandboxed_tools.sh +++ b/examples/agent/run_sandboxed_tools.sh @@ -52,14 +52,14 @@ echo " RUN mkdir /src /data # Copy resources in increasing likelihood of change, to keep as much as possible cached - COPY fastify-requirements.txt /root + COPY fastify-requirements.txt /root/ RUN pip install -r /root/fastify-requirements.txt - COPY script-requirements.txt /root + COPY script-requirements.txt /root/ RUN pip install -r /root/script-requirements.txt - COPY fastify.py utils.py /root + COPY fastify.py utils.py /root/examples/agent/ WORKDIR /data - ENTRYPOINT PYTHONPATH=/src python /root/fastify.py --port=$PORT '/src/$( basename "$script" )' + ENTRYPOINT PYTHONPATH=/src:/root python -m examples.agent.fastify --port=$PORT '/src/$( basename "$script" )' " | docker build "$BUILD_DIR" -f - -t "$LLAMA_IMAGE_NAME" echo "#" From 6880f1d4c01d7a2ffeaf8453966ed1d0f60fbb49 Mon Sep 17 00:00:00 2001 From: Olivier Chafik Date: Tue, 9 Apr 2024 23:40:11 +0100 Subject: [PATCH 42/68] agent: support basic openapi tools (incl. from fastify sandbox) --- examples/agent/README.md | 24 ++++-- examples/agent/agent.py | 34 ++++---- examples/agent/fastify.py | 2 +- examples/agent/openapi_client.py | 88 +++++++++++++++++++++ examples/agent/tools/unsafe_python_tools.py | 30 +++++-- examples/openai/prompting.py | 26 +++--- 6 files changed, 167 insertions(+), 37 deletions(-) create mode 100644 examples/agent/openapi_client.py diff --git a/examples/agent/README.md b/examples/agent/README.md index 48a4e99316d59..6cdabd0e2b525 100644 --- a/examples/agent/README.md +++ b/examples/agent/README.md @@ -108,15 +108,25 @@ The agent can use tools written in Python, or (soon) exposed under OpenAPI endpo so we provide a script to run them in a Docker-sandboxed environment, exposed as an OpenAPI server: ```bash - examples/openai/run_sandboxed_tools.sh \ - examples/agent/tools/unsafe_python_tools.py 6666 & + PORT=9999 examples/openai/run_sandboxed_tools.sh \ + examples/agent/tools/unsafe_python_tools.py & - python -m examples.openai.reactor \ - --model ~/AI/Models/mixtral-8x7b-instruct-v0.1.Q4_K_M.gguf \ - --tools http://localhost:6666 \ + python -m examples.agent \ + --tools http://localhost:9999 \ --goal "Whats cos(123) / 23 * 12.6 ?" ``` +
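As a quick sanity check that the sandbox is reachable before involving the agent, the wrapped tool can be called directly. This is a rough sketch, assuming the server started above is listening on port 9999 and that `fastify.py` exposes `execute_python`'s `source` argument as a query parameter (which is what `openapi_client.py` expects); it should return the same value the agent reports below.

```python
import requests

# Hypothetical direct call to the sandboxed tool endpoint (assumed to listen on port 9999).
resp = requests.post(
    'http://localhost:9999/execute_python',
    params={'source': 'import math\nresult = math.cos(123) / 23 * 12.6'},
)
resp.raise_for_status()
print(resp.json())  # expected to contain something like {'result': -0.4864525314920599}
```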
+<details>
+<summary>Show output</summary>
+
+ ```
+ 💭 Calculate the expression using Python
+ ⚙️ execute_python(source="import math\nresult = math.cos(123) / 23 * 12.6") -> {'result': -0.4864525314920599}
+ ➡️ "-0.4864525314920599"
+ ```
+
+</details>
+ - [fastify.py](./fastify.py) turns a python module into an OpenAPI endpoint using FastAPI - [run_sandboxed_tools.sh](./run_sandboxed_tools.sh) builds and runs a Docker environment with fastify inside it, and exposes its port locally @@ -125,7 +135,6 @@ so we provide a script to run them in a Docker-sandboxed environment, exposed as ```bash python -m examples.agent \ - --model ~/AI/Models/mixtral-8x7b-instruct-v0.1.Q4_K_M.gguf \ --tools examples/agent/tools/example_summaries.py \ --format PyramidalSummary \ --goal "Create a pyramidal summary of Mankind's recent advancements" @@ -156,7 +165,8 @@ python -m examples.openai \ # python -m examples.openai --model mixtral.gguf # Agent itself: -python -m examples.agent --endpoint http://localhost:8080 \ +python -m examples.agent \ + --endpoint http://localhost:8080 \ --tools examples/agent/tools/example_summaries.py \ --format PyramidalSummary \ --goal "Create a pyramidal summary of Mankind's recent advancements" diff --git a/examples/agent/agent.py b/examples/agent/agent.py index 7249a1fbd3972..ca5d2bd9c602b 100644 --- a/examples/agent/agent.py +++ b/examples/agent/agent.py @@ -1,21 +1,25 @@ import atexit +import os from pathlib import Path import subprocess import sys from time import sleep import typer -from pydantic import Json, TypeAdapter +from pydantic import BaseModel, Json, TypeAdapter from typing import Annotated, Callable, List, Union, Optional, Type import json, requests -from examples.json_schema_to_grammar import SchemaConverter +from examples.agent.openapi_client import OpenAPIMethod, openapi_methods_from_endpoint from examples.agent.tools.std_tools import StandardTools from examples.openai.api import ChatCompletionRequest, ChatCompletionResponse, Message, ResponseFormat, Tool, ToolFunction from examples.agent.utils import collect_functions, load_module from examples.openai.prompting import ToolsPromptStyle def _get_params_schema(fn: Callable, verbose): - converter = SchemaConverter(prop_order={}, allow_fetch=False, dotall=False, raw_pattern=False) + if isinstance(fn, OpenAPIMethod): + return fn.parameters_schema + + # converter = SchemaConverter(prop_order={}, allow_fetch=False, dotall=False, raw_pattern=False) schema = TypeAdapter(fn).json_schema() # Do NOT call converter.resolve_refs(schema) here. Let the server resolve local refs. 
if verbose: @@ -81,9 +85,7 @@ def completion_with_tool_usage( headers=headers, json=request.model_dump(), ) - if response.status_code != 200: - raise Exception(f"Request failed ({response.status_code}): {response.text}") - + response.raise_for_status() response_json = response.json() response = ChatCompletionResponse(**response_json) if verbose: @@ -101,8 +103,9 @@ def completion_with_tool_usage( if content: print(f'💭 {content}') - pretty_call = f'{tool_call.function.name}({", ".join(f"{k}={v}" for k, v in tool_call.function.arguments.items())})' + pretty_call = f'{tool_call.function.name}({", ".join(f"{k}={v.model_dump_json() if isinstance(v, BaseModel) else json.dumps(v)}" for k, v in tool_call.function.arguments.items())})' sys.stdout.write(f'⚙️ {pretty_call}') + sys.stdout.flush() tool_result = tool_map[tool_call.function.name](**tool_call.function.arguments) sys.stdout.write(f" -> {tool_result}\n") messages.append(Message( @@ -188,13 +191,16 @@ def main( tool_functions = [] types = {} for f in tools: - module = load_module(f) - tool_functions.extend(collect_functions(module)) - types.update({ - k: v - for k, v in module.__dict__.items() - if isinstance(v, type) - }) + if f.startswith('http://') or f.startswith('https://'): + tool_functions.extend(openapi_methods_from_endpoint(f)) + else: + module = load_module(f) + tool_functions.extend(collect_functions(module)) + types.update({ + k: v + for k, v in module.__dict__.items() + if isinstance(v, type) + }) if std_tools: tool_functions.extend(collect_functions(StandardTools)) diff --git a/examples/agent/fastify.py b/examples/agent/fastify.py index ccffe9d84a4b9..0cfd5f86888bd 100644 --- a/examples/agent/fastify.py +++ b/examples/agent/fastify.py @@ -27,7 +27,7 @@ def bind_functions(app, module): print(f'INFO: Binding /{k}') try: - app.post(k)(v) + app.post('/' + k)(v) except Exception as e: print(f'WARNING: Failed to bind /{k}\n\t{e}') diff --git a/examples/agent/openapi_client.py b/examples/agent/openapi_client.py new file mode 100644 index 0000000000000..0a6980b73f19e --- /dev/null +++ b/examples/agent/openapi_client.py @@ -0,0 +1,88 @@ + +import json +import requests +import urllib + + +class OpenAPIMethod: + def __init__(self, url, name, descriptor, catalog): + self.url = url + self.__name__ = name + + assert 'post' in descriptor, 'Only POST methods are supported' + post_descriptor = descriptor['post'] + + self.__doc__ = post_descriptor['description'] + parameters = post_descriptor.get('parameters', []) + request_body = post_descriptor.get('requestBody') + + self.parameters = {p['name']: p for p in parameters} + assert all(param['in'] == 'query' for param in self.parameters.values()), f'Only query path parameters are supported (path: {path}, descriptor: {json.dumps(descriptor)})' + + self.body = None + self.body_name = None + if request_body: + assert 'application/json' in request_body['content'], f'Only application/json is supported for request body (path: {path}, descriptor: {json.dumps(descriptor)})' + self.body = dict( + required=request_body['required'], + schema=request_body['content']['application/json']['schema'], + ) + + self.body_name = 'body' + i = 2 + while self.body_name in self.parameters: + self.body_name = f'body{i}' + i += 1 + + self.parameters_schema = dict( + type='object', + properties={ + **({ + self.body_name: self.body['schema'] + } if self.body else {}), + **{ + name: param['schema'] + for name, param in self.parameters.items() + } + }, + components=catalog.get('components'), + required=[name for name, param in 
self.parameters.items() if param['required']] + ([self.body_name] if self.body and self.body['required'] else []) + ) + + def __call__(self, **kwargs): + if self.body: + body = kwargs.pop(self.body_name, None) + if self.body['required']: + assert body is not None, f'Missing required body parameter: {self.body_name}' + else: + body = None + + query_params = {} + for name, param in self.parameters.items(): + value = kwargs.pop(name, None) + if param['required']: + assert value is not None, f'Missing required parameter: {name}' + + assert param['in'] == 'query', 'Only query parameters are supported' + query_params[name] = value + + params = "&".join(f"{name}={urllib.parse.quote(value)}" for name, value in query_params.items()) + url = f'{self.url}?{params}' + response = requests.post(url, json=body) + response.raise_for_status() + response_json = response.json() + + return response_json + + +def openapi_methods_from_endpoint(url): + catalog_url = f'{url}/openapi.json' + catalog_response = requests.get(catalog_url) + catalog_response.raise_for_status() + catalog = catalog_response.json() + + methods = [ + OpenAPIMethod(url=f'{url}{path}', name=path.replace('/', ' ').strip().replace(' ', '_'), descriptor=descriptor, catalog=catalog) + for path, descriptor in catalog['paths'].items() + ] + return methods diff --git a/examples/agent/tools/unsafe_python_tools.py b/examples/agent/tools/unsafe_python_tools.py index 2b2d60e51f888..4a8a103c5a008 100644 --- a/examples/agent/tools/unsafe_python_tools.py +++ b/examples/agent/tools/unsafe_python_tools.py @@ -1,8 +1,28 @@ -import math +import json +import sys +import types +from typing import Dict, Union -def eval_python_expression(expr: str) -> float: +def execute_python(source: str) -> Union[Dict, str]: """ - Evaluate a Python expression reliably. - This can be used to compute complex nested mathematical expressions, or any python, really. + Evaluate a Python program and return the globals it declared. + Can be used to compute mathematical expressions. + + Args: + source: contain valid, executable and pure Python code. Should also import any required Python packages. + For example: "import math\nresult = math.cos(2) * 10" + + Returns: + dict | str: A dictionary containing variables declared, or an error message if an exception occurred. 
""" - return eval(expr) + namespace = {} + sys.stderr.write(f"Executing Python program:\n{source}\n") + exec(source, namespace) + results = { + k: v + for k, v in namespace.items() + if not k.startswith('_') and not isinstance(v, type) and not callable(v) and not isinstance(v, types.ModuleType) + } + sys.stderr.write(f"Results: {json.dumps(results, indent=2)}\n") + + return results diff --git a/examples/openai/prompting.py b/examples/openai/prompting.py index 3de25206949ea..10f68fdce1aa1 100644 --- a/examples/openai/prompting.py +++ b/examples/openai/prompting.py @@ -54,7 +54,7 @@ class ChatTemplate(BaseModel): template: str eos_token: str bos_token: str - + inferred_tool_style: Annotated[Optional['ToolsPromptStyle'], Field(exclude=True)] = None expects_stringified_function_arguments: Annotated[Optional[bool], Field(exclude=True)] = None expects_strict_user_assistant_alternance: Annotated[Optional[bool], Field(exclude=True)] = None @@ -103,7 +103,7 @@ def succeeds(messages: list[Message], strings_to_find = ()): # if self.inferred_tool_style == ToolsPromptStyle.TYPESCRIPT_FUNCTIONARY_V2: user_msg = Message(role="user", content="Hey") assistant_msg = Message(role="assistant", content="I, Robot") - + self.expects_strict_user_assistant_alternance = not succeeds([assistant_msg, user_msg]) and succeeds([user_msg, assistant_msg]) thought = "Precious thought" @@ -193,7 +193,7 @@ def __init__(self, args: ChatHandlerArgs, style: Optional[ToolsPromptStyle]): @abstractmethod def parse(self, s: str) -> Optional[Message]: raise NotImplementedError() - + def add_system_prompt(self, messages: list[Message], system_prompt: Message) -> list[Message]: assert system_prompt.role == "system" @@ -233,7 +233,7 @@ def normalize(m: Message): }, indent=2) ) # Fall through to benefit from role normalization - + if m.tool_calls: if not self.args.chat_template.formats_tool_call or not self.args.chat_template.formats_tool_call_content: return Message( @@ -276,9 +276,9 @@ def normalize(m: Message): return Message(role="user", content=f'[{m.role.upper()}]{m.content}[/{m.role.upper()}]') else: return m - + messages=[normalize(m) for m in messages] - + if self.args.chat_template.expects_strict_user_assistant_alternance: new_messages=[] current_role = 'user' @@ -580,7 +580,13 @@ def __init__(self, args: ChatHandlerArgs, parallel_calls: bool): # args.response_schema = args.response_schema or {} converter = SchemaConverter(prop_order={}, allow_fetch=False, dotall=False, raw_pattern=False) - response_schema = args.response_schema or {"type": "string"} + response_schema = converter.resolve_refs(args.response_schema or {"type": "string"}, 'response') + tool_parameter_schemas = { + tool.function.name: converter.resolve_refs(tool.function.parameters, tool.function.name) + for tool in self.args.tools + } + # sys.stderr.write(f"# RESOLVED RESPONSE SCHEMA: {json.dumps(response_schema, indent=2)}\n") + # sys.stderr.write(f"# RESOLVED TOOL PARAMETER SCHEMA: {json.dumps(tool_parameter_schemas, indent=2)}\n") converter.visit( _make_bespoke_schema( response_schema, @@ -589,12 +595,12 @@ def __init__(self, args: ChatHandlerArgs, parallel_calls: bool): { "type": "object", "properties": { - "name": {"const": tool.function.name}, - "arguments": tool.function.parameters, + "name": {"const": tool_name}, + "arguments": tool_parameters, }, "required": ["name", "arguments"] } - for tool in self.args.tools + for tool_name, tool_parameters in tool_parameter_schemas.items() ] }, parallel_calls=parallel_calls, From 
0532680f4060f651f1d953385ff0f3048ea2668a Mon Sep 17 00:00:00 2001 From: Olivier Chafik Date: Tue, 9 Apr 2024 23:50:08 +0100 Subject: [PATCH 43/68] agent: nits --- examples/agent/fastify-requirements.txt | 2 +- examples/agent/requirements.txt | 2 +- examples/agent/run_sandboxed_tools.sh | 2 +- examples/openai/api.py | 2 +- examples/openai/llama_cpp_server_api.py | 2 +- examples/openai/prompt1.txt | 43 ------------------------- examples/openai/requirements.txt | 2 +- examples/openai/server.py | 2 +- examples/openai/test_chat_handlers.py | 39 ++-------------------- gguf-py/examples/reader.py | 1 - gguf-py/gguf/gguf_reader.py | 1 + 11 files changed, 10 insertions(+), 88 deletions(-) delete mode 100644 examples/openai/prompt1.txt diff --git a/examples/agent/fastify-requirements.txt b/examples/agent/fastify-requirements.txt index abd7fe8d1f62f..28604117432f8 100644 --- a/examples/agent/fastify-requirements.txt +++ b/examples/agent/fastify-requirements.txt @@ -2,4 +2,4 @@ fastapi[all] pydantic sse-starlette uvicorn[all] -typer[all] \ No newline at end of file +typer[all] diff --git a/examples/agent/requirements.txt b/examples/agent/requirements.txt index 01aab7cae824d..6ff121e93ec28 100644 --- a/examples/agent/requirements.txt +++ b/examples/agent/requirements.txt @@ -5,4 +5,4 @@ pydantic requests sse-starlette uvicorn[all] -typer[all] \ No newline at end of file +typer[all] diff --git a/examples/agent/run_sandboxed_tools.sh b/examples/agent/run_sandboxed_tools.sh index 5dbe19246af97..2fde295686e50 100755 --- a/examples/agent/run_sandboxed_tools.sh +++ b/examples/agent/run_sandboxed_tools.sh @@ -71,4 +71,4 @@ docker run \ --mount "type=bind,source=$( realpath "$script_folder" ),target=/src,readonly" \ --mount "type=bind,source=$( realpath "$DATA_DIR" ),target=/data" \ -p "$PORT:$PORT" \ - -it "$LLAMA_IMAGE_NAME" \ No newline at end of file + -it "$LLAMA_IMAGE_NAME" diff --git a/examples/openai/api.py b/examples/openai/api.py index 8f74cf99fdcb4..2de0ea686e9e0 100644 --- a/examples/openai/api.py +++ b/examples/openai/api.py @@ -87,4 +87,4 @@ class ChatCompletionResponse(BaseModel): choices: list[Choice] usage: Usage system_fingerprint: str - error: Optional[CompletionError] = None \ No newline at end of file + error: Optional[CompletionError] = None diff --git a/examples/openai/llama_cpp_server_api.py b/examples/openai/llama_cpp_server_api.py index d7cd08c4446d0..db934919d310e 100644 --- a/examples/openai/llama_cpp_server_api.py +++ b/examples/openai/llama_cpp_server_api.py @@ -9,4 +9,4 @@ class LlamaCppServerCompletionRequest(LlamaCppParams): cache_prompt: Optional[bool] = None grammar: Optional[str] = None - json_schema: Optional[Json] = None \ No newline at end of file + json_schema: Optional[Json] = None diff --git a/examples/openai/prompt1.txt b/examples/openai/prompt1.txt deleted file mode 100644 index afae47380a46a..0000000000000 --- a/examples/openai/prompt1.txt +++ /dev/null @@ -1,43 +0,0 @@ -<|im_start|>system -Role: - You are a function calling AI agent with self-recursion. - You can call only one function at a time and analyse data you get from function response. - You are provided with function signatures within XML tags. - The current date is: March 25, 2024. - -Objective: - You may use agentic frameworks for reasoning and planning to help with user query. - Please call a function and wait for function results to be provided to you in the next iteration. - Don't make assumptions about what values to plug into function arguments. 
- Once you have called a function, results will be fed back to you within XML tags. - Don't make assumptions about tool results if XML tags are not present since function hasn't been executed yet. - Analyze the data once you get the results and call another function. - At each iteration please continue adding the your analysis to previous summary. - Your final response should directly answer the user query with an anlysis or summary of the results of function calls. - -Tools: - Here are the available tools: - - {"type":"function","function":{"name":"get_current_weather","description":"Get the current weather","parameters":{"type":"object","properties":{"location":{"type":"string","description":"The city and state, e.g. San Francisco, CA"},"format":{"type":"string","enum":["celsius","fahrenheit"],"description":"The temperature unit to use. Infer this from the users location."}},"required":["location","format"]}}} - {"type":"function","function":{"name":"get_n_day_weather_forecast","description":"Get an N-day weather forecast","parameters":{"type":"object","properties":{"location":{"type":"string","description":"The city and state, e.g. San Francisco, CA"},"format":{"type":"string","enum":["celsius","fahrenheit"],"description":"The temperature unit to use. Infer this from the users location."},"num_days":{"type":"integer","description":"The number of days to forecast"}},"required":["location","format","num_days"]}}} - - If the provided function signatures doesn't have the function you must call, you may write executable python code in markdown syntax and call code_interpreter() function as follows: - - {"arguments": {"code_markdown": , "name": "code_interpreter"}} - - Make sure that the json object above with code markdown block is parseable with json.loads() and the XML block with XML ElementTree. - -Instructions: - At the very first turn you don't have so you shouldn't not make up the results. - Please keep a running summary with analysis of previous function results and summaries from previous iterations. - Do not stop calling functions until the task has been accomplished or you've reached max iteration of 10. - Calling multiple functions at once can overload the system and increase cost so call one function at a time please. - If you plan to continue with analysis, always call another function. 
- For each function call return a valid json object (using doulbe quotes) with function name and arguments within XML tags as follows: - - {"arguments": , "name": } - -<|im_end|> -<|im_start|>user -what is the weather going to be like in San Francisco and Glasgow over the next 4 days (temperature in celsius for both)<|im_end|> -<|im_start|>assistant \ No newline at end of file diff --git a/examples/openai/requirements.txt b/examples/openai/requirements.txt index b092bf19f9ba7..368dacf9a727d 100644 --- a/examples/openai/requirements.txt +++ b/examples/openai/requirements.txt @@ -4,4 +4,4 @@ jinja2 pydantic sse-starlette uvicorn[all] -typer[all] \ No newline at end of file +typer[all] diff --git a/examples/openai/server.py b/examples/openai/server.py index 21e69fa92b619..7bacc55b41083 100644 --- a/examples/openai/server.py +++ b/examples/openai/server.py @@ -139,7 +139,7 @@ async def chat_completions(request: Request, chat_request: ChatCompletionRequest messages = chat_request.messages prompt = chat_handler.render_prompt(messages) - + if verbose: sys.stderr.write(f'\n# REQUEST:\n\n{chat_request.model_dump_json(indent=2)}\n\n') # sys.stderr.write(f'\n# MESSAGES:\n\n{TypeAdapter(list[Message]).dump_json(messages)}\n\n') diff --git a/examples/openai/test_chat_handlers.py b/examples/openai/test_chat_handlers.py index 7d4c89c87cc7c..50b39f47cbbd5 100644 --- a/examples/openai/test_chat_handlers.py +++ b/examples/openai/test_chat_handlers.py @@ -202,14 +202,14 @@ def check(b: bool, msg: str): ch = get_chat_handler(args, parallel_calls=True, tool_style=style) print(f'\n### {model_name} / {style.name} / {tool_situation}\n') - + print(f'\nPrompt:\n\n```js\n{ch.render_prompt(TEST_MESSAGES_THOUGHT)}\n```\n') print(f'\nOutput format prompt:\n\n```json\n{ch.output_format_prompt.content}\n```\n') print(f'\nGrammar:\n\n```js\n{ch.grammar}\n```\n') - + # if model_name == 'hermes_2_pro_mistral': # print("Skipping hermes_2_pro_mistral") # continue @@ -233,38 +233,3 @@ def check_finds(msgs, strings_to_find): print(f'{f}\n\n') assert not failures - # test_templates([ - # Message(**{ - # "role": "user", - # "name": None, - # "tool_call_id": None, - # "content": "What is the sum of 2535 squared and 32222000403 then multiplied by one and a half. 
What's a third of the result?", - # "tool_calls": None - # }), - # Message(**{ - # "role": "assistant", - # # "name": None, - # "tool_call_id": None, - # "content": "?", - # "tool_calls": [ - # { - # # "id": "call_531873", - # "type": "function", - # "function": { - # "name": TOOL_NAME, - # "arguments": { - # "a": 2535, - # "b": 32222000403 - # } - # } - # } - # ] - # }), - # Message(**{ - # "role": "tool", - # "name": TOOL_NAME, - # "tool_call_id": "call_531873", - # "content": "32222002938", - # "tool_calls": None - # }) - # ]) diff --git a/gguf-py/examples/reader.py b/gguf-py/examples/reader.py index 62e0769dacee2..aaebe05d586d2 100644 --- a/gguf-py/examples/reader.py +++ b/gguf-py/examples/reader.py @@ -3,7 +3,6 @@ from pathlib import Path from gguf.gguf_reader import GGUFReader - sys.path.insert(0, str(Path(__file__).parent.parent)) diff --git a/gguf-py/gguf/gguf_reader.py b/gguf-py/gguf/gguf_reader.py index 33afac552ca75..3500c761328d3 100644 --- a/gguf-py/gguf/gguf_reader.py +++ b/gguf-py/gguf/gguf_reader.py @@ -50,6 +50,7 @@ class ReaderField(NamedTuple): types: list[GGUFValueType] = [] + class ReaderTensor(NamedTuple): name: str tensor_type: GGMLQuantizationType From a634e03abae6e55a7d712def087bc3d7bdd8fa19 Mon Sep 17 00:00:00 2001 From: ochafik Date: Wed, 10 Apr 2024 01:03:39 +0100 Subject: [PATCH 44/68] agent: cache_prompt=True --- examples/agent/agent.py | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/agent/agent.py b/examples/agent/agent.py index ca5d2bd9c602b..651047674fb7b 100644 --- a/examples/agent/agent.py +++ b/examples/agent/agent.py @@ -71,6 +71,7 @@ def completion_with_tool_usage( messages=messages, response_format=response_format, tools=tools_schemas, + cache_prompt=True, **kwargs, ) if verbose: From 9fe269e24abd1ba80a747a41bebf66f37e757817 Mon Sep 17 00:00:00 2001 From: ochafik Date: Wed, 10 Apr 2024 01:08:07 +0100 Subject: [PATCH 45/68] openai: nit --- examples/openai/server.py | 38 -------------------------------------- 1 file changed, 38 deletions(-) diff --git a/examples/openai/server.py b/examples/openai/server.py index 7bacc55b41083..e12a1d7371d4f 100644 --- a/examples/openai/server.py +++ b/examples/openai/server.py @@ -75,44 +75,6 @@ def main( atexit.register(server_process.kill) endpoint = f"http://{server_host}:{server_port}" - - # print(chat_template.render([ - # Message(**{ - # "role": "user", - # "name": None, - # "tool_call_id": None, - # "content": "What is the sum of 2535 squared and 32222000403 then multiplied by one and a half. 
What's a third of the result?", - # "tool_calls": None - # }), - # Message(**{ - # "role": "assistant", - # # "name": None, - # "tool_call_id": None, - # "content": "?", - # "tool_calls": [ - # { - # # "id": "call_531873", - # "type": "function", - # "function": { - # "name": "add", - # "arguments": { - # "a": 2535, - # "b": 32222000403 - # } - # } - # } - # ] - # }), - # Message(**{ - # "role": "tool", - # "name": "add", - # "tool_call_id": "call_531873", - # "content": "32222002938", - # "tool_calls": None - # }) - # ], add_generation_prompt=True)) - # exit(0) - app = FastAPI() @app.post("/v1/chat/completions") From a61ebebaa0d873997f6ac3ebb33d22a7655c6d97 Mon Sep 17 00:00:00 2001 From: ochafik Date: Wed, 10 Apr 2024 01:22:09 +0100 Subject: [PATCH 46/68] agent: hint at math import in python tool --- examples/agent/tools/unsafe_python_tools.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/agent/tools/unsafe_python_tools.py b/examples/agent/tools/unsafe_python_tools.py index 4a8a103c5a008..0a3a2cb846885 100644 --- a/examples/agent/tools/unsafe_python_tools.py +++ b/examples/agent/tools/unsafe_python_tools.py @@ -6,7 +6,7 @@ def execute_python(source: str) -> Union[Dict, str]: """ Evaluate a Python program and return the globals it declared. - Can be used to compute mathematical expressions. + Can be used to compute mathematical expressions (e.g. after importing math module). Args: source: contain valid, executable and pure Python code. Should also import any required Python packages. From 24e34f174b2361a8cabfe7756050ca8095e52db5 Mon Sep 17 00:00:00 2001 From: ochafik Date: Wed, 10 Apr 2024 01:29:20 +0100 Subject: [PATCH 47/68] agent: nit --- examples/agent/agent.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/agent/agent.py b/examples/agent/agent.py index 651047674fb7b..8e95a7c40c22e 100644 --- a/examples/agent/agent.py +++ b/examples/agent/agent.py @@ -18,7 +18,7 @@ def _get_params_schema(fn: Callable, verbose): if isinstance(fn, OpenAPIMethod): return fn.parameters_schema - + # converter = SchemaConverter(prop_order={}, allow_fetch=False, dotall=False, raw_pattern=False) schema = TypeAdapter(fn).json_schema() # Do NOT call converter.resolve_refs(schema) here. Let the server resolve local refs. 
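The hunk above ends in `_get_params_schema`, which derives each tool's parameter schema directly from the Python signature via Pydantic's `TypeAdapter`. A minimal sketch of inspecting what would be advertised for a tool shaped like the test suite's `superSecretTool` (the exact schema layout depends on the Pydantic version in use):

```python
import json
from pydantic import TypeAdapter

def superSecretTool(a: int, b: int) -> int:
    """Adds two numbers"""
    return a + b

# Mirrors what _get_params_schema does for plain Python tools; expect an object
# schema with integer properties "a" and "b", both required.
print(json.dumps(TypeAdapter(superSecretTool).json_schema(), indent=2))
```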
From 1475b1eefa44d3c14076d29038398c13efb5d67a Mon Sep 17 00:00:00 2001 From: ochafik Date: Wed, 10 Apr 2024 08:05:03 +0100 Subject: [PATCH 48/68] agent: fix killing of subprocesses subprocesses again --- examples/agent/agent.py | 7 ++----- examples/openai/server.py | 8 +++++--- examples/openai/subprocesses.py | 30 ++++++++++++++++++++++++++++++ 3 files changed, 37 insertions(+), 8 deletions(-) create mode 100644 examples/openai/subprocesses.py diff --git a/examples/agent/agent.py b/examples/agent/agent.py index 8e95a7c40c22e..1cbc254fe2bdf 100644 --- a/examples/agent/agent.py +++ b/examples/agent/agent.py @@ -1,7 +1,4 @@ -import atexit -import os from pathlib import Path -import subprocess import sys from time import sleep import typer @@ -14,6 +11,7 @@ from examples.openai.api import ChatCompletionRequest, ChatCompletionResponse, Message, ResponseFormat, Tool, ToolFunction from examples.agent.utils import collect_functions, load_module from examples.openai.prompting import ToolsPromptStyle +from examples.openai.subprocesses import spawn_subprocess def _get_params_schema(fn: Callable, verbose): if isinstance(fn, OpenAPIMethod): @@ -185,8 +183,7 @@ def main( *([f'--context-length={context_length}'] if context_length else []), *([f'--style={style.value}'] if style else []), ] - server_process = subprocess.Popen(cmd, stdout=sys.stderr) - atexit.register(server_process.kill) + spawn_subprocess(cmd) sleep(5) tool_functions = [] diff --git a/examples/openai/server.py b/examples/openai/server.py index e12a1d7371d4f..a23f3bb4bb477 100644 --- a/examples/openai/server.py +++ b/examples/openai/server.py @@ -1,7 +1,7 @@ # https://gist.github.com/ochafik/a3d4a5b9e52390544b205f37fb5a0df3 # pip install "fastapi[all]" "uvicorn[all]" sse-starlette jsonargparse jinja2 pydantic -import json, sys, subprocess, atexit +import json, sys from pathlib import Path import time @@ -22,6 +22,8 @@ from typing import Annotated, Optional import typer +from examples.openai.subprocesses import spawn_subprocess + def generate_id(prefix): return f"{prefix}{random.randint(0, 1 << 32)}" @@ -71,8 +73,8 @@ def main( "-c", f"{context_length}", *([] if verbose else ["--log-disable"]), ] - server_process = subprocess.Popen(cmd, stdout=sys.stderr) - atexit.register(server_process.kill) + + spawn_subprocess(cmd) endpoint = f"http://{server_host}:{server_port}" app = FastAPI() diff --git a/examples/openai/subprocesses.py b/examples/openai/subprocesses.py new file mode 100644 index 0000000000000..33ee8a50715eb --- /dev/null +++ b/examples/openai/subprocesses.py @@ -0,0 +1,30 @@ + +import atexit +import os +import signal +import subprocess +import sys + + +def _cleanup_process(p): + pid = p.pid + + if sys.platform == 'win32': + os.system(f'taskkill /PID {pid} /T /F') + else: + pgid = os.getpgid(pid) + os.killpg(pgid, signal.SIGTERM) + + p.wait() + if p.poll() is None: + os.killpg(pgid, signal.SIGKILL) + +def spawn_subprocess(cmd, **kwargs): + server_process = subprocess.Popen( + cmd, + stdout=sys.stderr, + start_new_session=True, + **kwargs + ) + atexit.register(_cleanup_process, server_process) + return server_process From 6c003786306c63406d0bdad6d94ea84395941b94 Mon Sep 17 00:00:00 2001 From: ochafik Date: Wed, 10 Apr 2024 01:08:07 +0100 Subject: [PATCH 49/68] agent: nits --- examples/openai/requirements.txt | 2 +- examples/openai/server.py | 11 ++--------- 2 files changed, 3 insertions(+), 10 deletions(-) diff --git a/examples/openai/requirements.txt b/examples/openai/requirements.txt index 368dacf9a727d..fad994e0d9f09 100644 --- 
a/examples/openai/requirements.txt +++ b/examples/openai/requirements.txt @@ -1,5 +1,5 @@ fastapi[all] -gguf +# gguf jinja2 pydantic sse-starlette diff --git a/examples/openai/server.py b/examples/openai/server.py index a23f3bb4bb477..b03d7e098d671 100644 --- a/examples/openai/server.py +++ b/examples/openai/server.py @@ -1,18 +1,11 @@ -# https://gist.github.com/ochafik/a3d4a5b9e52390544b205f37fb5a0df3 -# pip install "fastapi[all]" "uvicorn[all]" sse-starlette jsonargparse jinja2 pydantic - import json, sys from pathlib import Path import time -from pydantic import TypeAdapter - -sys.path.insert(0, str(Path(__file__).parent.parent.parent)) - from examples.openai.llama_cpp_server_api import LlamaCppServerCompletionRequest from examples.openai.gguf_kvs import GGUFKeyValues, Keys -from examples.openai.api import ChatCompletionResponse, Choice, Message, ChatCompletionRequest, Usage -from examples.openai.prompting import ChatHandlerArgs, ChatTemplate, ToolsPromptStyle, get_chat_handler, ChatHandler +from examples.openai.api import ChatCompletionResponse, Choice, ChatCompletionRequest, Usage +from examples.openai.prompting import ChatHandlerArgs, ChatTemplate, ToolsPromptStyle, get_chat_handler from fastapi import FastAPI, Request from fastapi.responses import JSONResponse From 082d54db142604c8ff076c0ba4cf7ae96ac0a6ed Mon Sep 17 00:00:00 2001 From: ochafik Date: Wed, 10 Apr 2024 08:37:09 +0100 Subject: [PATCH 50/68] agent: rename fake weather tools --- examples/agent/README.md | 2 +- .../tools/{example_weather_tools.py => fake_weather_tools.py} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename examples/agent/tools/{example_weather_tools.py => fake_weather_tools.py} (100%) diff --git a/examples/agent/README.md b/examples/agent/README.md index 6cdabd0e2b525..606ad4c0071ce 100644 --- a/examples/agent/README.md +++ b/examples/agent/README.md @@ -29,7 +29,7 @@ python -m examples.agent \ ```bash python -m examples.agent \ - --tools examples/agent/tools/example_weather_tools.py \ + --tools examples/agent/tools/fake_weather_tools.py \ --goal "What is the weather going to be like in San Francisco and Glasgow over the next 4 days." \ --greedy ``` diff --git a/examples/agent/tools/example_weather_tools.py b/examples/agent/tools/fake_weather_tools.py similarity index 100% rename from examples/agent/tools/example_weather_tools.py rename to examples/agent/tools/fake_weather_tools.py From f9afb041e2db04a93845e2ab505e768c7a5cb626 Mon Sep 17 00:00:00 2001 From: ochafik Date: Wed, 10 Apr 2024 09:14:24 +0100 Subject: [PATCH 51/68] agent: python tool: test serializability of variables --- examples/agent/tools/unsafe_python_tools.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/examples/agent/tools/unsafe_python_tools.py b/examples/agent/tools/unsafe_python_tools.py index 0a3a2cb846885..3473a92540bdf 100644 --- a/examples/agent/tools/unsafe_python_tools.py +++ b/examples/agent/tools/unsafe_python_tools.py @@ -3,6 +3,13 @@ import types from typing import Dict, Union +def _is_serializable(obj) -> bool: + try: + json.dumps(obj) + return True + except Exception as e: + return False + def execute_python(source: str) -> Union[Dict, str]: """ Evaluate a Python program and return the globals it declared. 
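The `_is_serializable` guard added in the hunk above exists because `execute_python` reports declared globals as JSON, and `json.dumps` fails on values such as modules, open file handles, or arbitrary objects. A small sketch of the failure mode being filtered out:

```python
import json
import math

try:
    # A module object sneaks into the globals of an executed snippet.
    json.dumps({'math': math})
except TypeError as e:
    print(e)  # e.g. "Object of type module is not JSON serializable"
```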
@@ -21,7 +28,11 @@ def execute_python(source: str) -> Union[Dict, str]: results = { k: v for k, v in namespace.items() - if not k.startswith('_') and not isinstance(v, type) and not callable(v) and not isinstance(v, types.ModuleType) + if not k.startswith('_') \ + and not isinstance(v, type) \ + and not isinstance(v, types.ModuleType) \ + and not callable(v) \ + and _is_serializable(v) } sys.stderr.write(f"Results: {json.dumps(results, indent=2)}\n") From a98f48315cbce19db868740426c63ad3aa12b7d1 Mon Sep 17 00:00:00 2001 From: ochafik Date: Wed, 10 Apr 2024 09:27:25 +0100 Subject: [PATCH 52/68] agent: python tool: return errors --- examples/agent/tools/unsafe_python_tools.py | 33 ++++++++++++--------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/examples/agent/tools/unsafe_python_tools.py b/examples/agent/tools/unsafe_python_tools.py index 3473a92540bdf..b187f219e989d 100644 --- a/examples/agent/tools/unsafe_python_tools.py +++ b/examples/agent/tools/unsafe_python_tools.py @@ -22,18 +22,23 @@ def execute_python(source: str) -> Union[Dict, str]: Returns: dict | str: A dictionary containing variables declared, or an error message if an exception occurred. """ - namespace = {} - sys.stderr.write(f"Executing Python program:\n{source}\n") - exec(source, namespace) - results = { - k: v - for k, v in namespace.items() - if not k.startswith('_') \ - and not isinstance(v, type) \ - and not isinstance(v, types.ModuleType) \ - and not callable(v) \ - and _is_serializable(v) - } - sys.stderr.write(f"Results: {json.dumps(results, indent=2)}\n") + try: + namespace = {} + sys.stderr.write(f"Executing Python program:\n{source}\n") + exec(source, namespace) + results = { + k: v + for k, v in namespace.items() + if not k.startswith('_') \ + and not isinstance(v, type) \ + and not isinstance(v, types.ModuleType) \ + and not callable(v) \ + and _is_serializable(v) + } + sys.stderr.write(f"Results: {json.dumps(results, indent=2)}\n") + return results + except Exception as e: + msg = f"Error: {sys.exc_info()[1]}" + sys.stderr.write(f"{msg}\n") + return msg - return results From ea0c31b10b195e7fc566e03b0eef81770a37bc2e Mon Sep 17 00:00:00 2001 From: Olivier Chafik Date: Wed, 10 Apr 2024 11:39:35 +0100 Subject: [PATCH 53/68] agent: ensure DATA_DIR exists skip-checks:true --- examples/agent/README.md | 9 ++++++--- examples/agent/fastify.py | 2 ++ examples/agent/run_sandboxed_tools.sh | 4 +++- 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/examples/agent/README.md b/examples/agent/README.md index 606ad4c0071ce..c2df9c8f55526 100644 --- a/examples/agent/README.md +++ b/examples/agent/README.md @@ -108,7 +108,10 @@ The agent can use tools written in Python, or (soon) exposed under OpenAPI endpo so we provide a script to run them in a Docker-sandboxed environment, exposed as an OpenAPI server: ```bash - PORT=9999 examples/openai/run_sandboxed_tools.sh \ + # With limactl, the default sandbox location ~/.llama.cpp/sandbox won't be writable + # (see https://github.com/lima-vm/lima/discussions/393) + # export DATA_DIR=/tmp/lima/llama.cpp/sandbox + PORT=9999 examples/agent/run_sandboxed_tools.sh \ examples/agent/tools/unsafe_python_tools.py & python -m examples.agent \ @@ -127,11 +130,11 @@ so we provide a script to run them in a Docker-sandboxed environment, exposed as
- - [fastify.py](./fastify.py) turns a python module into an OpenAPI endpoint using FastAPI + - [fastify.py](./fastify.py) turns a python module into an [OpenAPI](https://www.openapis.org/) endpoint using [FastAPI](https://fastapi.tiangolo.com/) - [run_sandboxed_tools.sh](./run_sandboxed_tools.sh) builds and runs a Docker environment with fastify inside it, and exposes its port locally -- Beyond just "tools", output format can be constrained using JSON schemas or Pydantic types +- Beyond just "tools", output format can be constrained using [JSON schemas](https://json-schema.org/) or [Pydantic](https://docs.pydantic.dev/latest/) types ```bash python -m examples.agent \ diff --git a/examples/agent/fastify.py b/examples/agent/fastify.py index 0cfd5f86888bd..02d475b409237 100644 --- a/examples/agent/fastify.py +++ b/examples/agent/fastify.py @@ -3,6 +3,7 @@ This is useful in combination w/ the examples/agent/run_sandboxed_tools.sh ''' +import os import fastapi, uvicorn import typer from typing import Type, List @@ -37,6 +38,7 @@ def main(files: List[str], host: str = '0.0.0.0', port: int = 8000): for f in files: bind_functions(app, load_module(f)) + print(f'INFO: CWD = {os.getcwd()}') uvicorn.run(app, host=host, port=port) if __name__ == '__main__': diff --git a/examples/agent/run_sandboxed_tools.sh b/examples/agent/run_sandboxed_tools.sh index 2fde295686e50..4f502e12dbe89 100755 --- a/examples/agent/run_sandboxed_tools.sh +++ b/examples/agent/run_sandboxed_tools.sh @@ -20,6 +20,8 @@ BUILD_DIR=$(mktemp -d) DATA_DIR="${DATA_DIR:-$HOME/.llama.cpp/sandbox}" SCRIPT_DIR=$( cd "$(dirname "$0")" ; pwd ) +mkdir -p "$DATA_DIR" + REQUIREMENTS_FILE="${REQUIREMENTS_FILE:-}" if [[ -z "$REQUIREMENTS_FILE" && -f "$script_folder/requirements.txt" ]]; then REQUIREMENTS_FILE="$script_folder/requirements.txt" @@ -69,6 +71,6 @@ set -x docker run \ "$@" \ --mount "type=bind,source=$( realpath "$script_folder" ),target=/src,readonly" \ - --mount "type=bind,source=$( realpath "$DATA_DIR" ),target=/data" \ + --mount "type=bind,source=$DATA_DIR,target=/data" \ -p "$PORT:$PORT" \ -it "$LLAMA_IMAGE_NAME" From 89dcc062a497e89ae36c275b88817beb41b15ee2 Mon Sep 17 00:00:00 2001 From: Olivier Chafik Date: Wed, 10 Apr 2024 19:45:13 +0100 Subject: [PATCH 54/68] agent: mypy type fixes mypy examples/agent/__main__.py mypy examples/agent/fastify.py mypy examples/openai/__main__.py --- examples/agent/agent.py | 25 ++++----- examples/agent/fastify.py | 2 +- examples/agent/openapi_client.py | 27 +++++----- examples/agent/tools/std_tools.py | 13 +++-- examples/agent/utils.py | 4 +- examples/json_schema_to_grammar.py | 12 ++--- examples/openai/api.py | 14 ++--- examples/openai/llama_cpp_server_api.py | 4 +- examples/openai/prompting.py | 68 ++++++++++++------------- examples/openai/server.py | 15 +++--- examples/openai/ts_converter.py | 23 +++++---- 11 files changed, 109 insertions(+), 98 deletions(-) diff --git a/examples/agent/agent.py b/examples/agent/agent.py index 1cbc254fe2bdf..a283e06287073 100644 --- a/examples/agent/agent.py +++ b/examples/agent/agent.py @@ -3,7 +3,7 @@ from time import sleep import typer from pydantic import BaseModel, Json, TypeAdapter -from typing import Annotated, Callable, List, Union, Optional, Type +from typing import Annotated, Any, Callable, Dict, List, Union, Optional, Type import json, requests from examples.agent.openapi_client import OpenAPIMethod, openapi_methods_from_endpoint @@ -13,7 +13,7 @@ from examples.openai.prompting import ToolsPromptStyle from examples.openai.subprocesses import 
spawn_subprocess -def _get_params_schema(fn: Callable, verbose): +def _get_params_schema(fn: Callable[[Any], Any], verbose): if isinstance(fn, OpenAPIMethod): return fn.parameters_schema @@ -26,9 +26,9 @@ def _get_params_schema(fn: Callable, verbose): def completion_with_tool_usage( *, - response_model: Optional[Union[Json, Type]]=None, + response_model: Optional[Union[Json[Any], type]]=None, max_iterations: Optional[int]=None, - tools: List[Callable], + tools: List[Callable[..., Any]], endpoint: str, messages: List[Message], auth: Optional[str], @@ -56,7 +56,7 @@ def completion_with_tool_usage( type="function", function=ToolFunction( name=fn.__name__, - description=fn.__doc__, + description=fn.__doc__ or '', parameters=_get_params_schema(fn, verbose=verbose) ) ) @@ -128,7 +128,7 @@ def completion_with_tool_usage( def main( goal: Annotated[str, typer.Option()], tools: Optional[List[str]] = None, - format: Annotated[str, typer.Option(help="The output format: either a Python type (e.g. 'float' or a Pydantic model defined in one of the tool files), or a JSON schema, e.g. '{\"format\": \"date\"}'")] = None, + format: Annotated[Optional[str], typer.Option(help="The output format: either a Python type (e.g. 'float' or a Pydantic model defined in one of the tool files), or a JSON schema, e.g. '{\"format\": \"date\"}'")] = None, max_iterations: Optional[int] = 10, std_tools: Optional[bool] = False, auth: Optional[str] = None, @@ -136,7 +136,7 @@ def main( verbose: bool = False, style: Optional[ToolsPromptStyle] = None, - model: Annotated[Optional[Path], typer.Option("--model", "-m")] = "models/7B/ggml-model-f16.gguf", + model: Annotated[str, typer.Option("--model", "-m")] = "models/7B/ggml-model-f16.gguf", endpoint: Optional[str] = None, context_length: Optional[int] = None, # endpoint: str = 'http://localhost:8080/v1/chat/completions', @@ -187,8 +187,8 @@ def main( sleep(5) tool_functions = [] - types = {} - for f in tools: + types: Dict[str, type] = {} + for f in (tools or []): if f.startswith('http://') or f.startswith('https://'): tool_functions.extend(openapi_methods_from_endpoint(f)) else: @@ -203,7 +203,7 @@ def main( if std_tools: tool_functions.extend(collect_functions(StandardTools)) - response_model = None #str + response_model: Union[type, Json[Any]] = None #str if format: if format in types: response_model = types[format] @@ -246,10 +246,7 @@ def main( seed=seed, n_probs=n_probs, min_keep=min_keep, - messages=[{ - "role": "user", - "content": goal, - }] + messages=[Message(role="user", content=goal)], ) print(result if response_model else f'➡️ {result}') # exit(0) diff --git a/examples/agent/fastify.py b/examples/agent/fastify.py index 02d475b409237..cf02ccc3102ff 100644 --- a/examples/agent/fastify.py +++ b/examples/agent/fastify.py @@ -17,7 +17,7 @@ def bind_functions(app, module): if k == k.capitalize(): continue v = getattr(module, k) - if not callable(v) or isinstance(v, Type): + if not callable(v) or isinstance(v, type): continue if not hasattr(v, '__annotations__'): continue diff --git a/examples/agent/openapi_client.py b/examples/agent/openapi_client.py index 0a6980b73f19e..d336c7436dec7 100644 --- a/examples/agent/openapi_client.py +++ b/examples/agent/openapi_client.py @@ -17,28 +17,29 @@ def __init__(self, url, name, descriptor, catalog): request_body = post_descriptor.get('requestBody') self.parameters = {p['name']: p for p in parameters} - assert all(param['in'] == 'query' for param in self.parameters.values()), f'Only query path parameters are supported (path: {path}, 
descriptor: {json.dumps(descriptor)})' + assert all(param['in'] == 'query' for param in self.parameters.values()), f'Only query path parameters are supported (path: {url}, descriptor: {json.dumps(descriptor)})' self.body = None - self.body_name = None if request_body: - assert 'application/json' in request_body['content'], f'Only application/json is supported for request body (path: {path}, descriptor: {json.dumps(descriptor)})' + assert 'application/json' in request_body['content'], f'Only application/json is supported for request body (path: {url}, descriptor: {json.dumps(descriptor)})' + + body_name = 'body' + i = 2 + while body_name in self.parameters: + body_name = f'body{i}' + i += 1 + self.body = dict( + name=body_name, required=request_body['required'], schema=request_body['content']['application/json']['schema'], ) - self.body_name = 'body' - i = 2 - while self.body_name in self.parameters: - self.body_name = f'body{i}' - i += 1 - self.parameters_schema = dict( type='object', properties={ **({ - self.body_name: self.body['schema'] + self.body['name']: self.body['schema'] } if self.body else {}), **{ name: param['schema'] @@ -46,14 +47,14 @@ def __init__(self, url, name, descriptor, catalog): } }, components=catalog.get('components'), - required=[name for name, param in self.parameters.items() if param['required']] + ([self.body_name] if self.body and self.body['required'] else []) + required=[name for name, param in self.parameters.items() if param['required']] + ([self.body['name']] if self.body and self.body['required'] else []) ) def __call__(self, **kwargs): if self.body: - body = kwargs.pop(self.body_name, None) + body = kwargs.pop(self.body['name'], None) if self.body['required']: - assert body is not None, f'Missing required body parameter: {self.body_name}' + assert body is not None, f'Missing required body parameter: {self.body["name"]}' else: body = None diff --git a/examples/agent/tools/std_tools.py b/examples/agent/tools/std_tools.py index 9093e8dc2cf4c..4d1e132a1e9ad 100644 --- a/examples/agent/tools/std_tools.py +++ b/examples/agent/tools/std_tools.py @@ -15,6 +15,9 @@ class Duration(BaseModel): months: Optional[int] = None years: Optional[int] = None + def __str__(self) -> str: + return f"{self.years} years, {self.months} months, {self.days} days, {self.hours} hours, {self.minutes} minutes, {self.seconds} seconds" + @property def get_total_seconds(self) -> int: return sum([ @@ -29,6 +32,10 @@ def get_total_seconds(self) -> int: class WaitForDuration(BaseModel): duration: Duration + def __call__(self): + sys.stderr.write(f"Waiting for {self.duration}...\n") + time.sleep(self.duration.get_total_seconds) + class WaitForDate(BaseModel): until: date @@ -43,7 +50,7 @@ def __call__(self): days, seconds = time_diff.days, time_diff.seconds - sys.stderr.write(f"Waiting for {days} days and {seconds} seconds until {d}...\n") + sys.stderr.write(f"Waiting for {days} days and {seconds} seconds until {self.until}...\n") time.sleep(days * 86400 + seconds) sys.stderr.write(f"Reached the target date: {self.until}\n") @@ -67,8 +74,8 @@ def wait(_for: Union[WaitForDuration, WaitForDate]) -> None: return _for() @staticmethod - def say_out_loud(something: str) -> str: + def say_out_loud(something: str) -> None: """ Just says something. 
Used to say each thought out loud """ - return subprocess.check_call(["say", something]) + subprocess.check_call(["say", something]) diff --git a/examples/agent/utils.py b/examples/agent/utils.py index 4eff7f6ad72a1..b381e8ef6a171 100644 --- a/examples/agent/utils.py +++ b/examples/agent/utils.py @@ -9,8 +9,10 @@ def load_source_as_module(source): i += 1 spec = importlib.util.spec_from_file_location(module_name, source) + assert spec, f'Failed to load {source} as module' module = importlib.util.module_from_spec(spec) sys.modules[module_name] = module + assert spec.loader, f'{source} spec has no loader' spec.loader.exec_module(module) return module @@ -29,7 +31,7 @@ def collect_functions(module): if k == k.capitalize(): continue v = getattr(module, k) - if not callable(v) or isinstance(v, Type): + if not callable(v) or isinstance(v, type): continue if not hasattr(v, '__annotations__'): continue diff --git a/examples/json_schema_to_grammar.py b/examples/json_schema_to_grammar.py index 826cd3f7271d1..7ce0e13c4da5a 100755 --- a/examples/json_schema_to_grammar.py +++ b/examples/json_schema_to_grammar.py @@ -55,9 +55,9 @@ def opt_repetitions(up_to_n, prefix_with_sep=False): class BuiltinRule: - def __init__(self, content: str, deps: list = None): + def __init__(self, content: str, deps: List[str]): self.content = content - self.deps = deps or [] + self.deps = deps _up_to_15_digits = _build_repetition('[0-9]', 0, 15) @@ -118,7 +118,7 @@ def __init__(self, *, prop_order, allow_fetch, dotall, raw_pattern): def _format_literal(self, literal): escaped = GRAMMAR_LITERAL_ESCAPE_RE.sub( - lambda m: GRAMMAR_LITERAL_ESCAPES.get(m.group(0)), literal + lambda m: GRAMMAR_LITERAL_ESCAPES[m.group(0)], literal ) return f'"{escaped}"' @@ -157,13 +157,13 @@ def _add_rule(self, name, rule): self._rules[key] = rule return key - def resolve_refs(self, schema: dict, url: str): + def resolve_refs(self, schema: Any, url: str): ''' Resolves all $ref fields in the given schema, fetching any remote schemas, replacing $ref with absolute reference URL and populating self._refs with the respective referenced (sub)schema dictionaries. 
''' - def visit(n: dict): + def visit(n: Any): if isinstance(n, list): return [visit(x) for x in n] elif isinstance(n, dict): @@ -223,7 +223,7 @@ def _visit_pattern(self, pattern, name): assert pattern.startswith('^') and pattern.endswith('$'), 'Pattern must start with "^" and end with "$"' pattern = pattern[1:-1] - sub_rule_ids = {} + sub_rule_ids: Dict[str, str] = {} i = 0 length = len(pattern) diff --git a/examples/openai/api.py b/examples/openai/api.py index 2de0ea686e9e0..49f4a5f7bb634 100644 --- a/examples/openai/api.py +++ b/examples/openai/api.py @@ -1,5 +1,5 @@ from abc import ABC -from typing import Any, Dict, Literal, Optional, Union +from typing import Any, Dict, List, Literal, Optional, Union from pydantic import BaseModel, Json, TypeAdapter class FunctionCall(BaseModel): @@ -16,7 +16,7 @@ class Message(BaseModel): name: Optional[str] = None tool_call_id: Optional[str] = None content: Optional[str] - tool_calls: Optional[list[ToolCall]] = None + tool_calls: Optional[List[ToolCall]] = None class ToolFunction(BaseModel): name: str @@ -29,7 +29,7 @@ class Tool(BaseModel): class ResponseFormat(BaseModel): type: Literal["json_object"] - schema: Optional[Dict] = None + schema: Optional[Json[Any]] = None # type: ignore class LlamaCppParams(BaseModel): n_predict: Optional[int] = None @@ -56,8 +56,8 @@ class LlamaCppParams(BaseModel): class ChatCompletionRequest(LlamaCppParams): model: str - tools: Optional[list[Tool]] = None - messages: list[Message] = None + tools: Optional[List[Tool]] = None + messages: Optional[List[Message]] = None prompt: Optional[str] = None response_format: Optional[ResponseFormat] = None @@ -67,7 +67,7 @@ class ChatCompletionRequest(LlamaCppParams): class Choice(BaseModel): index: int message: Message - logprobs: Optional[Json] = None + logprobs: Optional[Json[Any]] = None finish_reason: Union[Literal["stop"], Literal["tool_calls"]] class Usage(BaseModel): @@ -84,7 +84,7 @@ class ChatCompletionResponse(BaseModel): object: Literal["chat.completion"] created: int model: str - choices: list[Choice] + choices: List[Choice] usage: Usage system_fingerprint: str error: Optional[CompletionError] = None diff --git a/examples/openai/llama_cpp_server_api.py b/examples/openai/llama_cpp_server_api.py index db934919d310e..db1c860411ae8 100644 --- a/examples/openai/llama_cpp_server_api.py +++ b/examples/openai/llama_cpp_server_api.py @@ -1,4 +1,4 @@ -from typing import Optional +from typing import Any, Optional from pydantic import Json from examples.openai.api import LlamaCppParams @@ -9,4 +9,4 @@ class LlamaCppServerCompletionRequest(LlamaCppParams): cache_prompt: Optional[bool] = None grammar: Optional[str] = None - json_schema: Optional[Json] = None + json_schema: Optional[Json[Any]] = None diff --git a/examples/openai/prompting.py b/examples/openai/prompting.py index 10f68fdce1aa1..d64db02ff2d9e 100644 --- a/examples/openai/prompting.py +++ b/examples/openai/prompting.py @@ -6,12 +6,12 @@ import random import re import sys -from typing import Annotated, Optional -from pydantic import BaseModel, Field +from typing import Annotated, Any, Optional +from pydantic import BaseModel, Field, Json from examples.json_schema_to_grammar import SchemaConverter from examples.openai.api import Tool, Message, FunctionCall, ToolCall -from examples.openai.gguf_kvs import GGUFKeyValues, Keys +from examples.openai.gguf_kvs import GGUFKeyValues, Keys # type: ignore from examples.openai.ts_converter import SchemaToTypeScriptConverter # _THOUGHT_KEY = "thought" @@ -65,7 +65,7 @@ class 
ChatTemplate(BaseModel): @property def potentially_supports_parallel_calls(self) -> bool: - return self.formats_tool_result and self.formats_tool_name + return bool(self.formats_tool_result and self.formats_tool_name) def __init__(self, template: str, eos_token: str, bos_token: str): super().__init__(template=template, eos_token=eos_token, bos_token=bos_token) @@ -161,7 +161,7 @@ def from_gguf(metadata: GGUFKeyValues): @staticmethod def from_huggingface(model_id: str): - from transformers import LlamaTokenizer + from transformers import LlamaTokenizer # type: ignore tokenizer = LlamaTokenizer.from_pretrained(model_id) return ChatTemplate( template = tokenizer.chat_template or tokenizer.default_chat_template, @@ -170,7 +170,7 @@ def from_huggingface(model_id: str): def raw_render(self, messages: list[Message], add_generation_prompt: bool, omit_bos: bool = False): result = self._template.render( - messages=messages, + messages=[messages.model_dump() for messages in messages], eos_token=self.eos_token, bos_token='' if omit_bos else self.bos_token, raise_exception=raise_exception, @@ -180,7 +180,7 @@ def raw_render(self, messages: list[Message], add_generation_prompt: bool, omit_ class ChatHandlerArgs(BaseModel): chat_template: ChatTemplate - response_schema: Optional[dict] = None + response_schema: Optional[Json[Any]] = None tools: Optional[list[Tool]] = None class ChatHandler(ABC): @@ -199,9 +199,9 @@ def add_system_prompt(self, messages: list[Message], system_prompt: Message) -> assert system_prompt.role == "system" # TODO: add to last system message, or create a new one just before the last user message system_message = next(((i, m) for i, m in enumerate(messages) if m.role == "system"), None) - if system_message is not None: + if system_message: (i, m) = system_message - return messages[:i] + [Message(role="system", content=system_prompt.content + '\n' + m.content)] + messages[i+1:] + return messages[:i] + [Message(role="system", content=(system_prompt.content + '\n' if system_prompt.content else '') + (m.content or ''))] + messages[i+1:] else: return [system_prompt] + messages @@ -282,7 +282,7 @@ def normalize(m: Message): if self.args.chat_template.expects_strict_user_assistant_alternance: new_messages=[] current_role = 'user' - current_content = [] + current_content: list[str] = [] def flush(): nonlocal current_content @@ -311,24 +311,24 @@ def flush(): messages = new_messages # JSON! 
- messages = [m.model_dump() for m in messages] + # messages = [m.model_dump() for m in messages] # if self.inferred_tool_style == ToolsPromptStyle.TYPESCRIPT_FUNCTIONARY_V2: if self.args.chat_template.expects_stringified_function_arguments: messages = [ - { - **m, + Message(**{ + **m.model_dump(), "tool_calls": [ - { - **tc, + ToolCall(**{ + **tc.model_dump(), "function": { - "name": tc["function"]["name"], - "arguments": json.dumps(tc["function"]["arguments"]), + "name": tc.function.name, + "arguments": tc.function.arguments, } - } - for tc in m["tool_calls"] - ] if m.get("tool_calls") else None - } + }) + for tc in m.tool_calls + ] if m.tool_calls else None + }) for m in messages ] @@ -364,7 +364,7 @@ def __init__(self, args: ChatHandlerArgs, style: Optional[ToolsPromptStyle], esc converter = SchemaConverter(prop_order={}, allow_fetch=False, dotall=False, raw_pattern=False) tool_rules = [] - for tool in self.args.tools: + for tool in self.args.tools or []: parameters_schema = tool.function.parameters parameters_schema = converter.resolve_refs(parameters_schema, tool.function.name) @@ -416,7 +416,7 @@ def parse(self, s: str) -> Optional[Message]: if len(parts) == 1: return Message(role="assistant", content=s) else: - content = [] + content: list[str] = [] tool_calls = [] for i, part in enumerate(parts): if i % 2 == 0: @@ -431,8 +431,8 @@ def parse(self, s: str) -> Optional[Message]: id=gen_callid(), function=FunctionCall(**fc))) - content = '\n'.join(content).strip() - return Message(role="assistant", content=content if content else None, tool_calls=tool_calls) + content_str = '\n'.join(content).strip() + return Message(role="assistant", content=content_str if content_str else None, tool_calls=tool_calls) class TemplatedToolsChatHandler(ToolCallTagsChatHandler): @@ -444,7 +444,7 @@ def __init__(self, args: ChatHandlerArgs, template: str, parallel_calls: bool, e role="system", content=template.replace( '{tools}', - '\n'.join(json.dumps(tool.model_dump(), indent=2) for tool in self.args.tools), + '\n'.join(json.dumps(tool.model_dump(), indent=2) for tool in (self.args.tools or [])), ) ) @@ -456,11 +456,11 @@ def __init__(self, args: ChatHandlerArgs, parallel_calls: bool): path = str(Path(__file__).parent / "hermes_function_calling") if path not in sys.path: sys.path.insert(0, path) try: - from examples.openai.hermes_function_calling.prompter import PromptManager + from examples.openai.hermes_function_calling.prompter import PromptManager # type: ignore except ImportError: raise ImportError(f"Please `git clone https://github.com/NousResearch/Hermes-Function-Calling {path}`") - prompt = PromptManager().generate_prompt(user_prompt=[], tools=[tool.model_dump_json() for tool in args.tools]) + prompt = PromptManager().generate_prompt(user_prompt=[], tools=[tool.model_dump_json() for tool in args.tools or []]) assert len(prompt) == 1 and prompt[0]["role"] == "system" self.output_format_prompt = Message(**prompt[0]) @@ -471,7 +471,7 @@ def __init__(self, args: ChatHandlerArgs, parallel_calls: bool): self.output_format_prompt = Message( role="system", content= '// Supported function definitions that should be called when necessary.\n' + - _tools_typescript_signatures(args.tools) + _tools_typescript_signatures(args.tools or []) ) converter = SchemaConverter(prop_order={}, allow_fetch=False, dotall=False, raw_pattern=False) @@ -481,7 +481,7 @@ def __init__(self, args: ChatHandlerArgs, parallel_calls: bool): converter._format_literal(tool.function.name) + ' ' + 
converter._format_literal('\n<|content|>\n') + ' ' + converter.visit(tool.function.parameters, tool.function.name + '-args') + ' ' + converter._format_literal('\n')) - for i, tool in enumerate(self.args.tools) + for i, tool in enumerate(self.args.tools or []) ] not_from_rule = converter._add_rule('not_from', converter.not_literal("<|from|>")) @@ -583,7 +583,7 @@ def __init__(self, args: ChatHandlerArgs, parallel_calls: bool): response_schema = converter.resolve_refs(args.response_schema or {"type": "string"}, 'response') tool_parameter_schemas = { tool.function.name: converter.resolve_refs(tool.function.parameters, tool.function.name) - for tool in self.args.tools + for tool in self.args.tools or [] } # sys.stderr.write(f"# RESOLVED RESPONSE SCHEMA: {json.dumps(response_schema, indent=2)}\n") # sys.stderr.write(f"# RESOLVED TOOL PARAMETER SCHEMA: {json.dumps(tool_parameter_schemas, indent=2)}\n") @@ -614,7 +614,7 @@ def __init__(self, args: ChatHandlerArgs, parallel_calls: bool): content='\n'.join([ 'You are a function calling AI model.', 'Here are the tools available:', - _tools_schema_signatures(self.args.tools, indent=2), + _tools_schema_signatures(self.args.tools or [], indent=2), # _tools_typescript_signatures(self.args.tools), _please_respond_with_schema( _make_bespoke_schema( @@ -716,10 +716,10 @@ def get_chat_handler(args: ChatHandlerArgs, parallel_calls: bool, tool_style: Op elif tool_style == ToolsPromptStyle.TOOLS_HERMES_2_PRO: return Hermes2ProToolsChatHandler(args, parallel_calls=parallel_calls) else: - raise ValueError(f"Unsupported tool call style: {args.chat_template.tool_style}") + raise ValueError(f"Unsupported tool call style: {tool_style}") # os.environ.get('NO_TS') -def _please_respond_with_schema(schema: dict) -> str: +def _please_respond_with_schema(schema: Json[Any]) -> str: sig = json.dumps(schema, indent=2) # _ts_converter = SchemaToTypeScriptConverter() # # _ts_converter.resolve_refs(schema, 'schema') diff --git a/examples/openai/server.py b/examples/openai/server.py index b03d7e098d671..672b6176d7d8a 100644 --- a/examples/openai/server.py +++ b/examples/openai/server.py @@ -3,7 +3,7 @@ import time from examples.openai.llama_cpp_server_api import LlamaCppServerCompletionRequest -from examples.openai.gguf_kvs import GGUFKeyValues, Keys +from examples.openai.gguf_kvs import GGUFKeyValues, Keys # type: ignore from examples.openai.api import ChatCompletionResponse, Choice, ChatCompletionRequest, Usage from examples.openai.prompting import ChatHandlerArgs, ChatTemplate, ToolsPromptStyle, get_chat_handler @@ -21,12 +21,12 @@ def generate_id(prefix): return f"{prefix}{random.randint(0, 1 << 32)}" def main( - model: Annotated[Optional[Path], typer.Option("--model", "-m")] = "models/7B/ggml-model-f16.gguf", + model: Annotated[str, typer.Option("--model", "-m")] = "models/7B/ggml-model-f16.gguf", template_hf_model_id_fallback: Annotated[Optional[str], typer.Option(help="If the GGUF model does not contain a chat template, get it from this HuggingFace tokenizer")] = 'meta-llama/Llama-2-7b-chat-hf', # model_url: Annotated[Optional[str], typer.Option("--model-url", "-mu")] = None, host: str = "localhost", port: int = 8080, - parallel_calls: Optional[bool] = False, + parallel_calls: bool = False, style: Optional[ToolsPromptStyle] = None, auth: Optional[str] = None, verbose: bool = False, @@ -39,10 +39,11 @@ def main( if endpoint: sys.stderr.write(f"# WARNING: Unsure which model we're talking to, fetching its chat template from HuggingFace tokenizer of 
{template_hf_model_id_fallback}\n") + assert template_hf_model_id_fallback, "template_hf_model_id_fallback is required when using an endpoint" chat_template = ChatTemplate.from_huggingface(template_hf_model_id_fallback) else: - metadata = GGUFKeyValues(model) + metadata = GGUFKeyValues(Path(model)) if not context_length: context_length = metadata[Keys.LLM.CONTEXT_LENGTH] @@ -51,6 +52,7 @@ def main( chat_template = ChatTemplate.from_gguf(metadata) else: sys.stderr.write(f"# WARNING: Model does not contain a chat template, fetching it from HuggingFace tokenizer of {template_hf_model_id_fallback}\n") + assert template_hf_model_id_fallback, "template_hf_model_id_fallback is required when the model does not contain a chat template" chat_template = ChatTemplate.from_huggingface(template_hf_model_id_fallback) if verbose: @@ -93,9 +95,8 @@ async def chat_completions(request: Request, chat_request: ChatCompletionRequest verbose=verbose, ) - messages = chat_request.messages - - prompt = chat_handler.render_prompt(messages) + prompt = chat_handler.render_prompt(chat_request.messages) if chat_request.messages else chat_request.prompt + assert prompt is not None, "One of prompt or messages field is required" if verbose: sys.stderr.write(f'\n# REQUEST:\n\n{chat_request.model_dump_json(indent=2)}\n\n') diff --git a/examples/openai/ts_converter.py b/examples/openai/ts_converter.py index 3c04bab7dd15a..245e389c103c8 100644 --- a/examples/openai/ts_converter.py +++ b/examples/openai/ts_converter.py @@ -1,6 +1,8 @@ -from typing import Any, List, Set, Tuple, Union +from typing import Any, Dict, List, Set, Tuple, Union import json +from pydantic import Json + class SchemaToTypeScriptConverter: # TODO: comments for arguments! # // Get the price of a particular car model @@ -15,17 +17,18 @@ class SchemaToTypeScriptConverter: # location: string, # }) => any; - def __init__(self): - self._refs = {} - self._refs_being_resolved = set() + def __init__(self, allow_fetch: bool = True): + self._refs: Dict[str, Json[Any]] = {} + self._refs_being_resolved: Set[str] = set() + self._allow_fetch = allow_fetch - def resolve_refs(self, schema: dict, url: str): + def resolve_refs(self, schema: Json[Any], url: str): ''' Resolves all $ref fields in the given schema, fetching any remote schemas, replacing $ref with absolute reference URL and populating self._refs with the respective referenced (sub)schema dictionaries. 
''' - def visit(n: dict): + def visit(n: Json[Any]): if isinstance(n, list): return [visit(x) for x in n] elif isinstance(n, dict): @@ -64,7 +67,7 @@ def visit(n: dict): return n return visit(schema) - def _desc_comment(self, schema: dict): + def _desc_comment(self, schema: Json[Any]): desc = schema.get("description", "").replace("\n", "\n// ") if 'description' in schema else None return f'// {desc}\n' if desc else '' @@ -78,11 +81,11 @@ def _build_object_rule(self, properties: List[Tuple[str, Any]], required: Set[st f'{self._desc_comment(prop_schema)}{prop_name}{"" if prop_name in required else "?"}: {self.visit(prop_schema)}' for prop_name, prop_schema in properties ] + ( - [f"{self._desc_comment(additional_properties) if additional_properties else ''}[key: string]: {self.visit(additional_properties)}"] + [f"{self._desc_comment(additional_properties) if isinstance(additional_properties, dict) else ''}[key: string]: {self.visit(additional_properties)}"] if additional_properties is not None else [] )) + "\n}" - def visit(self, schema: dict): + def visit(self, schema: Json[Any]): def print_constant(v): return json.dumps(v) @@ -90,7 +93,7 @@ def print_constant(v): schema_format = schema.get('format') if 'oneOf' in schema or 'anyOf' in schema: - return '|'.join(self.visit(s) for s in schema.get('oneOf') or schema.get('anyOf')) + return '|'.join(self.visit(s) for s in schema.get('oneOf') or schema.get('anyOf') or []) elif isinstance(schema_type, list): return '|'.join(self.visit({'type': t}) for t in schema_type) From 0120f7cc954e012338814c835072859b1a07fb7d Mon Sep 17 00:00:00 2001 From: Olivier Chafik Date: Wed, 10 Apr 2024 19:47:01 +0100 Subject: [PATCH 55/68] agent: fix wait --std-tools --- examples/agent/agent.py | 51 ++++++++++++++----------- examples/agent/tools/std_tools.py | 62 ++++++++++++++++++------------- 2 files changed, 67 insertions(+), 46 deletions(-) diff --git a/examples/agent/agent.py b/examples/agent/agent.py index a283e06287073..03fb96dca6038 100644 --- a/examples/agent/agent.py +++ b/examples/agent/agent.py @@ -3,6 +3,7 @@ from time import sleep import typer from pydantic import BaseModel, Json, TypeAdapter +from pydantic_core import SchemaValidator, core_schema from typing import Annotated, Any, Callable, Dict, List, Union, Optional, Type import json, requests @@ -13,16 +14,12 @@ from examples.openai.prompting import ToolsPromptStyle from examples.openai.subprocesses import spawn_subprocess -def _get_params_schema(fn: Callable[[Any], Any], verbose): - if isinstance(fn, OpenAPIMethod): - return fn.parameters_schema - - # converter = SchemaConverter(prop_order={}, allow_fetch=False, dotall=False, raw_pattern=False) - schema = TypeAdapter(fn).json_schema() - # Do NOT call converter.resolve_refs(schema) here. Let the server resolve local refs. 
- if verbose: - sys.stderr.write(f'# PARAMS SCHEMA: {json.dumps(schema, indent=2)}\n') - return schema +def make_call_adapter(ta: TypeAdapter, fn: Callable[..., Any]): + args_validator = SchemaValidator(core_schema.call_schema( + arguments=ta.core_schema['arguments_schema'], + function=fn, + )) + return lambda **kwargs: args_validator.validate_python(kwargs) def completion_with_tool_usage( *, @@ -50,18 +47,28 @@ def completion_with_tool_usage( schema = type_adapter.json_schema() response_format=ResponseFormat(type="json_object", schema=schema) - tool_map = {fn.__name__: fn for fn in tools} - tools_schemas = [ - Tool( - type="function", - function=ToolFunction( - name=fn.__name__, - description=fn.__doc__ or '', - parameters=_get_params_schema(fn, verbose=verbose) + tool_map = {} + tools_schemas = [] + for fn in tools: + if isinstance(fn, OpenAPIMethod): + tool_map[fn.__name__] = fn + parameters_schema = fn.parameters_schema + else: + ta = TypeAdapter(fn) + tool_map[fn.__name__] = make_call_adapter(ta, fn) + parameters_schema = ta.json_schema() + if verbose: + sys.stderr.write(f'# PARAMS SCHEMA ({fn.__name__}): {json.dumps(parameters_schema, indent=2)}\n') + tools_schemas.append( + Tool( + type="function", + function=ToolFunction( + name=fn.__name__, + description=fn.__doc__ or '', + parameters=parameters_schema, + ) ) ) - for fn in tools - ] i = 0 while (max_iterations is None or i < max_iterations): @@ -106,7 +113,7 @@ def completion_with_tool_usage( sys.stdout.write(f'⚙️ {pretty_call}') sys.stdout.flush() tool_result = tool_map[tool_call.function.name](**tool_call.function.arguments) - sys.stdout.write(f" -> {tool_result}\n") + sys.stdout.write(f" → {tool_result}\n") messages.append(Message( tool_call_id=tool_call.id, role="tool", @@ -203,6 +210,8 @@ def main( if std_tools: tool_functions.extend(collect_functions(StandardTools)) + sys.stdout.write(f'🛠️ {", ".join(fn.__name__ for fn in tool_functions)}\n') + response_model: Union[type, Json[Any]] = None #str if format: if format in types: diff --git a/examples/agent/tools/std_tools.py b/examples/agent/tools/std_tools.py index 4d1e132a1e9ad..f4ee850365c6d 100644 --- a/examples/agent/tools/std_tools.py +++ b/examples/agent/tools/std_tools.py @@ -16,7 +16,18 @@ class Duration(BaseModel): years: Optional[int] = None def __str__(self) -> str: - return f"{self.years} years, {self.months} months, {self.days} days, {self.hours} hours, {self.minutes} minutes, {self.seconds} seconds" + return ', '.join([ + x + for x in [ + f"{self.years} years" if self.years else None, + f"{self.months} months" if self.months else None, + f"{self.days} days" if self.days else None, + f"{self.hours} hours" if self.hours else None, + f"{self.minutes} minutes" if self.minutes else None, + f"{self.seconds} seconds" if self.seconds else None, + ] + if x is not None + ]) @property def get_total_seconds(self) -> int: @@ -36,25 +47,6 @@ def __call__(self): sys.stderr.write(f"Waiting for {self.duration}...\n") time.sleep(self.duration.get_total_seconds) -class WaitForDate(BaseModel): - until: date - - def __call__(self): - # Get the current date - current_date = datetime.date.today() - - if self.until < current_date: - raise ValueError("Target date cannot be in the past.") - - time_diff = datetime.datetime.combine(self.until, datetime.time.min) - datetime.datetime.combine(current_date, datetime.time.min) - - days, seconds = time_diff.days, time_diff.seconds - - sys.stderr.write(f"Waiting for {days} days and {seconds} seconds until {self.until}...\n") - time.sleep(days * 
86400 + seconds) - sys.stderr.write(f"Reached the target date: {self.until}\n") - - class StandardTools: @staticmethod @@ -66,12 +58,32 @@ def ask_user(question: str) -> str: return typer.prompt(question) @staticmethod - def wait(_for: Union[WaitForDuration, WaitForDate]) -> None: - ''' - Wait for a certain amount of time before continuing. - This can be used to wait for a specific duration or until a specific date. + def wait_for_duration(duration: Duration) -> None: + 'Wait for a certain amount of time before continuing.' + + # sys.stderr.write(f"Waiting for {duration}...\n") + time.sleep(duration.get_total_seconds) + + @staticmethod + def wait_for_date(target_date: date) -> None: + f''' + Wait until a specific date is reached before continuing. + Today's date is {datetime.date.today()} ''' - return _for() + + # Get the current date + current_date = datetime.date.today() + + if target_date < current_date: + raise ValueError("Target date cannot be in the past.") + + time_diff = datetime.datetime.combine(target_date, datetime.time.min) - datetime.datetime.combine(current_date, datetime.time.min) + + days, seconds = time_diff.days, time_diff.seconds + + # sys.stderr.write(f"Waiting for {days} days and {seconds} seconds until {target_date}...\n") + time.sleep(days * 86400 + seconds) + # sys.stderr.write(f"Reached the target date: {target_date}\n") @staticmethod def say_out_loud(something: str) -> None: From 09c256594dc19467871b03588d07ccc7a4b35db4 Mon Sep 17 00:00:00 2001 From: ochafik Date: Sun, 21 Apr 2024 01:12:05 +0100 Subject: [PATCH 56/68] grammars: early exit when no next_candidates to reject --- llama.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/llama.cpp b/llama.cpp index 49f2b559e965e..33b94847065e6 100644 --- a/llama.cpp +++ b/llama.cpp @@ -12970,6 +12970,10 @@ static std::vector llama_grammar_reject_candidates_for_ } } + if (next_candidates.empty()) { + return rejects; + } + const auto * stack_pos_after = llama_grammar_match_char(stack_pos, 0).second; // update top of stack to next element, if any From 00c709eb4a2429bcee2fc3a58a050de76e2cef55 Mon Sep 17 00:00:00 2001 From: ochafik Date: Sun, 21 Apr 2024 15:52:16 +0100 Subject: [PATCH 57/68] grammars: cache decoded tokens --- llama.cpp | 33 +++++++++++++++++++++++++-------- llama.h | 7 ++++++- 2 files changed, 31 insertions(+), 9 deletions(-) diff --git a/llama.cpp b/llama.cpp index 33b94847065e6..704c5e24b40ca 100644 --- a/llama.cpp +++ b/llama.cpp @@ -13051,7 +13051,7 @@ struct llama_grammar * llama_grammar_init( } } while (true); - return new llama_grammar{ std::move(vec_rules), std::move(stacks), {} }; + return new llama_grammar{ std::move(vec_rules), std::move(stacks), {}, {}, {} }; } void llama_grammar_free(struct llama_grammar * grammar) { @@ -13059,7 +13059,7 @@ void llama_grammar_free(struct llama_grammar * grammar) { } struct llama_grammar * llama_grammar_copy(const struct llama_grammar * grammar) { - llama_grammar * result = new llama_grammar{ grammar->rules, grammar->stacks, grammar->partial_utf8 }; + llama_grammar * result = new llama_grammar{ grammar->rules, grammar->stacks, grammar->partial_utf8, grammar->token_pieces, grammar->token_codepoints }; // redirect elements in stacks to point to new rules for (size_t is = 0; is < result->stacks.size(); is++) { @@ -13540,7 +13540,7 @@ void llama_sample_repetition_penalties( } } -void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * candidates, const struct llama_grammar * grammar) { +void llama_sample_grammar(struct llama_context * 
ctx, llama_token_data_array * candidates, struct llama_grammar * grammar) {
     GGML_ASSERT(ctx);
 
     const int64_t t_start_sample_us = ggml_time_us();
@@ -13552,21 +13552,36 @@ void llama_sample_grammar(struct llama_context * c
         }
     }
 
+    if (grammar->token_codepoints.empty()) {
+        auto n_vocab = llama_n_vocab(llama_get_model(ctx));
+        grammar->token_codepoints.resize(n_vocab);
+        grammar->token_pieces.resize(n_vocab);
+        for (llama_token id = 0; id < n_vocab; ++id) {
+            const std::string piece = llama_token_to_piece(ctx, id, false);
+            grammar->token_pieces[id] = piece;
+            grammar->token_codepoints[id] = decode_utf8(piece, {0, 0});
+        }
+    }
+
     std::vector<std::pair<std::vector<uint32_t>, llama_partial_utf8>> candidates_decoded;
-    candidates_decoded.reserve(candidates->size);
+    if (grammar->partial_utf8.n_remain > 0) {
+        candidates_decoded.reserve(candidates->size);
+    }
 
     std::vector<llama_grammar_candidate> candidates_grammar;
     candidates_grammar.reserve(candidates->size);
 
     for (size_t i = 0; i < candidates->size; ++i) {
         const llama_token id = candidates->data[i].id;
-        const std::string piece = llama_token_to_piece(ctx, id, false);
-
+        const auto & piece = grammar->token_pieces[id];
         if (llama_token_is_eog(&ctx->model, id)) {
             if (!allow_eog) {
                 candidates->data[i].logit = -INFINITY;
             }
         } else if (piece.empty() || piece[0] == 0) {
             candidates->data[i].logit = -INFINITY;
+        } else if (grammar->partial_utf8.n_remain == 0){
+            const auto & decoded = grammar->token_codepoints.at(id);
+            candidates_grammar.push_back({ i, decoded.first.data(), decoded.second });
         } else {
             candidates_decoded.push_back(decode_utf8(piece, grammar->partial_utf8));
             candidates_grammar.push_back({ i, candidates_decoded.back().first.data(), candidates_decoded.back().second });
@@ -13763,10 +13778,12 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
         GGML_ASSERT(false);
     }
 
-    const std::string piece = llama_token_to_piece(ctx, token, false);
+    const auto & piece = grammar->token_pieces.at(token);
 
     // Note terminating 0 in decoded string
-    const auto   decoded     = decode_utf8(piece, grammar->partial_utf8);
+    const auto   decoded     = grammar->partial_utf8.n_remain == 0
+        ? grammar->token_codepoints[token]
+        : decode_utf8(piece, grammar->partial_utf8);
     const auto & code_points = decoded.first;
     std::vector<std::vector<const llama_grammar_element *>> tmp_new_stacks;
     for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
diff --git a/llama.h b/llama.h
index 8b1b15ed4ad55..13c5963539748 100644
--- a/llama.h
+++ b/llama.h
@@ -961,7 +961,7 @@ extern "C" {
     LLAMA_API void llama_sample_grammar(
             struct llama_context * ctx,
           llama_token_data_array * candidates,
-      const struct llama_grammar * grammar);
+            struct llama_grammar * grammar);
 
     /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
     /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
@@ -1099,6 +1099,11 @@ struct llama_grammar {
 
     // buffer for partially generated UTF-8 sequence from accepted tokens
     llama_partial_utf8                   partial_utf8;
+
+    // caching the token pieces & their decoded codepoints.
+    std::vector<std::string>             token_pieces;
+    std::vector<std::pair<std::vector<uint32_t>,
+                llama_partial_utf8>>     token_codepoints;
 };
 
 struct llama_grammar_candidate {
From 8d503ef48223e372455abd319d09bd37a089914e Mon Sep 17 00:00:00 2001
From: ochafik
Date: Sun, 21 Apr 2024 15:52:25 +0100
Subject: [PATCH 58/68] grammars: faster llama_grammar_copy

---
 llama.cpp | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index 704c5e24b40ca..aaae003991a15 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -13061,16 +13061,22 @@ void llama_grammar_free(struct llama_grammar * grammar) {
 
 struct llama_grammar * llama_grammar_copy(const struct llama_grammar * grammar) {
     llama_grammar * result = new llama_grammar{ grammar->rules, grammar->stacks, grammar->partial_utf8, grammar->token_pieces, grammar->token_codepoints };
 
+    std::unordered_map<const llama_grammar_element *, const llama_grammar_element *> element_map;
+    element_map.reserve(std::accumulate(
+        grammar->rules.begin(), grammar->rules.end(), 0,
+        [](size_t acc, const std::vector<llama_grammar_element> & rule) {
+            return acc + rule.size();
+        }));
+    for (size_t ir = 0; ir < grammar->rules.size(); ir++) {
+        for (size_t ie = 0; ie < grammar->rules[ir].size(); ie++) {
+            element_map[&grammar->rules[ir][ie]] = &result->rules[ir][ie];
+        }
+    }
+
     // redirect elements in stacks to point to new rules
     for (size_t is = 0; is < result->stacks.size(); is++) {
         for (size_t ie = 0; ie < result->stacks[is].size(); ie++) {
-            for (size_t ir0 = 0; ir0 < grammar->rules.size(); ir0++) {
-                for (size_t ir1 = 0; ir1 < grammar->rules[ir0].size(); ir1++) {
-                    if (grammar->stacks[is][ie] == &grammar->rules[ir0][ir1]) {
-                        result->stacks[is][ie] = &result->rules[ir0][ir1];
-                    }
-                }
-            }
+            result->stacks[is][ie] = element_map.at(grammar->stacks[is][ie]);
         }
     }

From 312e20b54a80693f82793f073b5651ebce41bed9 Mon Sep 17 00:00:00 2001
From: Olivier Chafik
Date: Tue, 30 Apr 2024 18:29:08 +0100
Subject: [PATCH 59/68] openai: update after merge typos

---
 examples/openai/api.py       |  4 ++--
 examples/openai/gguf_kvs.py  |  9 ++++-----
 examples/openai/prompting.py |  4 ++--
 examples/openai/server.py    | 14 ++++++++------
 4 files changed, 16 insertions(+), 15 deletions(-)

diff --git a/examples/openai/api.py b/examples/openai/api.py
index 49f4a5f7bb634..705c5654b5d1b 100644
--- a/examples/openai/api.py
+++ b/examples/openai/api.py
@@ -29,7 +29,7 @@ class Tool(BaseModel):
 
 class ResponseFormat(BaseModel):
     type: Literal["json_object"]
-    schema: Optional[Json[Any]] = None # type: ignore
+    schema: Optional[dict[str, Any]] = None # type: ignore
 
 class LlamaCppParams(BaseModel):
     n_predict: Optional[int] = None
@@ -67,7 +67,7 @@ class ChatCompletionRequest(LlamaCppParams):
 class Choice(BaseModel):
     index: int
     message: Message
-    logprobs: Optional[Json[Any]] = None
+    logprobs: Optional[dict[str, Any]] = None
     finish_reason: Union[Literal["stop"], Literal["tool_calls"]]
 
 class Usage(BaseModel):
diff --git a/examples/openai/gguf_kvs.py b/examples/openai/gguf_kvs.py
index 2eba427b33eec..4bb24b5e75d5f 100644
--- a/examples/openai/gguf_kvs.py
+++ b/examples/openai/gguf_kvs.py
@@ -8,13 +8,12 @@ class GGUFKeyValues:
     def __init__(self, model: Path):
-        reader = GGUFReader(model.as_posix())
-        self.fields = reader.fields
+        self.reader = GGUFReader(model.as_posix())
 
     def __getitem__(self, key: str):
         if '{arch}' in key:
             key = key.replace('{arch}', self[Keys.General.ARCHITECTURE])
-        return self.fields[key].read()
+        return self.reader.read_field(self.reader.fields[key])
 
     def __contains__(self, key: str):
-        return key in self.fields
+        return key in self.reader.fields
 
     def keys(self):
-        return self.fields.keys()
+        return 
self.reader.fields.keys() diff --git a/examples/openai/prompting.py b/examples/openai/prompting.py index d64db02ff2d9e..f1b7d17eb3489 100644 --- a/examples/openai/prompting.py +++ b/examples/openai/prompting.py @@ -180,7 +180,7 @@ def raw_render(self, messages: list[Message], add_generation_prompt: bool, omit_ class ChatHandlerArgs(BaseModel): chat_template: ChatTemplate - response_schema: Optional[Json[Any]] = None + response_schema: Optional[dict[str,Any]] = None tools: Optional[list[Tool]] = None class ChatHandler(ABC): @@ -719,7 +719,7 @@ def get_chat_handler(args: ChatHandlerArgs, parallel_calls: bool, tool_style: Op raise ValueError(f"Unsupported tool call style: {tool_style}") # os.environ.get('NO_TS') -def _please_respond_with_schema(schema: Json[Any]) -> str: +def _please_respond_with_schema(schema: dict[str, Any]) -> str: sig = json.dumps(schema, indent=2) # _ts_converter = SchemaToTypeScriptConverter() # # _ts_converter.resolve_refs(schema, 'schema') diff --git a/examples/openai/server.py b/examples/openai/server.py index 672b6176d7d8a..21903f81272e3 100644 --- a/examples/openai/server.py +++ b/examples/openai/server.py @@ -37,12 +37,8 @@ def main( ): import uvicorn - if endpoint: - sys.stderr.write(f"# WARNING: Unsure which model we're talking to, fetching its chat template from HuggingFace tokenizer of {template_hf_model_id_fallback}\n") - assert template_hf_model_id_fallback, "template_hf_model_id_fallback is required when using an endpoint" - chat_template = ChatTemplate.from_huggingface(template_hf_model_id_fallback) - - else: + chat_template = None + if model: metadata = GGUFKeyValues(Path(model)) if not context_length: @@ -58,6 +54,12 @@ def main( if verbose: sys.stderr.write(f"# CHAT TEMPLATE:\n\n{chat_template}\n\n") + if not chat_template: + sys.stderr.write(f"# WARNING: Unsure which model we're talking to, fetching its chat template from HuggingFace tokenizer of {template_hf_model_id_fallback}\n") + assert template_hf_model_id_fallback or chat_template, "template_hf_model_id_fallback is required when using an endpoint without a model" + chat_template = ChatTemplate.from_huggingface(template_hf_model_id_fallback) + + if not endpoint: if verbose: sys.stderr.write(f"# Starting C++ server with model {model} on {server_host}:{server_port}\n") cmd = [ From ca1a640da206359f36d0aa7b7bc1c97ac068a213 Mon Sep 17 00:00:00 2001 From: ochafik Date: Thu, 2 May 2024 03:20:00 +0100 Subject: [PATCH 60/68] server: tool call grammar-constraints fix --- common/json-schema-to-grammar.cpp | 138 ++++++++++++++++++++++-------- common/json-schema-to-grammar.h | 1 + examples/server/server.cpp | 2 +- examples/server/utils.hpp | 28 +++++- 4 files changed, 127 insertions(+), 42 deletions(-) diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp index 0f8f1b1d41bdc..393153b403775 100644 --- a/common/json-schema-to-grammar.cpp +++ b/common/json-schema-to-grammar.cpp @@ -11,6 +11,9 @@ using json = nlohmann::ordered_json; +const char * DOTALL = "[\\U00000000-\\U0010FFFF]"; +const char * DOT = "[^\\x0A\\x0D]"; + template static std::string join(Iterator begin, Iterator end, const std::string & separator); @@ -198,6 +201,29 @@ static std::string format_literal(const std::string & literal) { } +/* + not_literal('a') -> '[^a]' + not_literal('abc') -> '([^a] | "a" ([^b] | "b" ([^c])?)?)?' 
+*/ +static std::string not_literal(const std::string & literal, bool dotall = true) { + assert(literal.size() > 0); + std::stringstream out; + std::function recurse = [&](size_t i) { + const auto & c = literal[i]; + out << "[^" << c << "]"; + if (i < literal.size() - 1) { + out << " | " << format_literal(std::to_string(c)) << " ("; + recurse(i + 1); + out << ")?"; + } + }; + out << "("; + recurse(0); + out << ")" << (dotall ? DOTALL : DOT) << "*"; + return out.str(); +} + + class SchemaConverter { private: std::function _fetch_json; @@ -208,22 +234,6 @@ class SchemaConverter { std::vector _errors; std::vector _warnings; - std::string _add_rule(const std::string & name, const std::string & rule) { - std::string esc_name = regex_replace(name, INVALID_RULE_CHARS_RE, "-"); - if (_rules.find(esc_name) == _rules.end() || _rules[esc_name] == rule) { - _rules[esc_name] = rule; - return esc_name; - } else { - int i = 0; - while (_rules.find(esc_name + std::to_string(i)) != _rules.end() && _rules[esc_name + std::to_string(i)] != rule) { - i++; - } - std::string key = esc_name + std::to_string(i); - _rules[key] = rule; - return key; - } - } - std::string _generate_union_rule(const std::string & name, const std::vector & alt_schemas) { std::vector rules; for (size_t i = 0; i < alt_schemas.size(); i++) { @@ -256,11 +266,11 @@ class SchemaConverter { auto get_dot = [&]() { std::string rule; if (_dotall) { - rule = "[\\U00000000-\\U0010FFFF]"; + rule = DOTALL; } else { - rule = "[^\\x0A\\x0D]"; + rule = DOT; } - return _add_rule("dot", rule); + return add_rule("dot", rule); }; // Joins the sequence, merging consecutive literals together. @@ -377,7 +387,7 @@ class SchemaConverter { if (!sub_is_literal) { std::string & sub_id = sub_rule_ids[sub]; if (sub_id.empty()) { - sub_id = _add_rule(name + "-" + std::to_string(sub_rule_ids.size()), sub); + sub_id = add_rule(name + "-" + std::to_string(sub_rule_ids.size()), sub); } sub = sub_id; } @@ -423,7 +433,7 @@ class SchemaConverter { } return join_seq(); }; - return _add_rule(name, "\"\\\"\" " + to_rule(transform()) + " \"\\\"\" space"); + return add_rule(name, "\"\\\"\" " + to_rule(transform()) + " \"\\\"\" space"); } std::string _resolve_ref(const std::string & ref) { @@ -451,7 +461,7 @@ class SchemaConverter { const auto &prop_schema = kv.second; std::string prop_rule_name = visit(prop_schema, name + (name.empty() ? "" : "-") + prop_name); - prop_kv_rule_names[prop_name] = _add_rule( + prop_kv_rule_names[prop_name] = add_rule( name + (name.empty() ? "" : "-") + prop_name + "-kv", format_literal(json(prop_name).dump()) + " space \":\" space " + prop_rule_name ); @@ -464,7 +474,7 @@ class SchemaConverter { if (additional_properties.is_object() || (additional_properties.is_boolean() && additional_properties.get())) { std::string sub_name = name + (name.empty() ? "" : "-") + "additional"; std::string value_rule = visit(additional_properties.is_object() ? additional_properties : json::object(), sub_name + "-value"); - std::string kv_rule = _add_rule(sub_name + "-kv", _add_primitive("string", PRIMITIVE_RULES.at("string")) + " \":\" space " + value_rule); + std::string kv_rule = add_rule(sub_name + "-kv", _add_primitive("string", PRIMITIVE_RULES.at("string")) + " \":\" space " + value_rule); prop_kv_rule_names["*"] = kv_rule; optional_props.push_back("*"); } @@ -491,7 +501,7 @@ class SchemaConverter { std::string k = ks[0]; std::string kv_rule_name = prop_kv_rule_names[k]; if (k == "*") { - res = _add_rule( + res = add_rule( name + (name.empty() ? 
"" : "-") + "additional-kvs", kv_rule_name + " ( \",\" space " + kv_rule_name + " )*" ); @@ -501,7 +511,7 @@ class SchemaConverter { res = kv_rule_name; } if (ks.size() > 1) { - res += " " + _add_rule( + res += " " + add_rule( name + (name.empty() ? "" : "-") + k + "-rest", get_recursive_refs(std::vector(ks.begin() + 1, ks.end()), true) ); @@ -527,7 +537,7 @@ class SchemaConverter { } std::string _add_primitive(const std::string & name, const BuiltinRule & rule) { - auto n = _add_rule(name, rule.content); + auto n = add_rule(name, rule.content); for (const auto & dep : rule.deps) { BuiltinRule dep_rule; auto it = PRIMITIVE_RULES.find(dep); @@ -615,6 +625,22 @@ class SchemaConverter { visit_refs(schema); } + std::string add_rule(const std::string & name, const std::string & rule) { + std::string esc_name = regex_replace(name, INVALID_RULE_CHARS_RE, "-"); + if (_rules.find(esc_name) == _rules.end() || _rules[esc_name] == rule) { + _rules[esc_name] = rule; + return esc_name; + } else { + int i = 0; + while (_rules.find(esc_name + std::to_string(i)) != _rules.end() && _rules[esc_name + std::to_string(i)] != rule) { + i++; + } + std::string key = esc_name + std::to_string(i); + _rules[key] = rule; + return key; + } + } + std::string _generate_constant_rule(const json & value) { return format_literal(value.dump()); } @@ -625,24 +651,24 @@ class SchemaConverter { std::string rule_name = is_reserved_name(name) ? name + "-" : name.empty() ? "root" : name; if (schema.contains("$ref")) { - return _add_rule(rule_name, _resolve_ref(schema["$ref"])); + return add_rule(rule_name, _resolve_ref(schema["$ref"])); } else if (schema.contains("oneOf") || schema.contains("anyOf")) { std::vector alt_schemas = schema.contains("oneOf") ? schema["oneOf"].get>() : schema["anyOf"].get>(); - return _add_rule(rule_name, _generate_union_rule(name, alt_schemas)); + return add_rule(rule_name, _generate_union_rule(name, alt_schemas)); } else if (schema_type.is_array()) { std::vector schema_types; for (const auto & t : schema_type) { schema_types.push_back({{"type", t}}); } - return _add_rule(rule_name, _generate_union_rule(name, schema_types)); + return add_rule(rule_name, _generate_union_rule(name, schema_types)); } else if (schema.contains("const")) { - return _add_rule(rule_name, _generate_constant_rule(schema["const"])); + return add_rule(rule_name, _generate_constant_rule(schema["const"])); } else if (schema.contains("enum")) { std::vector enum_values; for (const auto & v : schema["enum"]) { enum_values.push_back(_generate_constant_rule(v)); } - return _add_rule(rule_name, join(enum_values.begin(), enum_values.end(), " | ")); + return add_rule(rule_name, join(enum_values.begin(), enum_values.end(), " | ")); } else if ((schema_type.is_null() || schema_type == "object") && (schema.contains("properties") || (schema.contains("additionalProperties") && schema["additionalProperties"] != true))) { @@ -660,7 +686,7 @@ class SchemaConverter { properties.emplace_back(prop.key(), prop.value()); } } - return _add_rule(rule_name, + return add_rule(rule_name, _build_object_rule( properties, required, name, schema.contains("additionalProperties") ? 
schema["additionalProperties"] : json())); @@ -691,7 +717,7 @@ class SchemaConverter { add_component(t, true); } } - return _add_rule(rule_name, _build_object_rule(properties, required, hybrid_name, json())); + return add_rule(rule_name, _build_object_rule(properties, required, hybrid_name, json())); } else if ((schema_type.is_null() || schema_type == "array") && (schema.contains("items") || schema.contains("prefixItems"))) { json items = schema.contains("items") ? schema["items"] : schema["prefixItems"]; if (items.is_array()) { @@ -703,14 +729,14 @@ class SchemaConverter { rule += visit(items[i], name + (name.empty() ? "" : "-") + "tuple-" + std::to_string(i)); } rule += " \"]\" space"; - return _add_rule(rule_name, rule); + return add_rule(rule_name, rule); } else { std::string item_rule_name = visit(items, name + (name.empty() ? "" : "-") + "item"); int min_items = schema.contains("minItems") ? schema["minItems"].get() : 0; json max_items_json = schema.contains("maxItems") ? schema["maxItems"] : json(); int max_items = max_items_json.is_number_integer() ? max_items_json.get() : std::numeric_limits::max(); - return _add_rule(rule_name, "\"[\" space " + build_repetition(item_rule_name, min_items, max_items, "\",\" space") + " \"]\" space"); + return add_rule(rule_name, "\"[\" space " + build_repetition(item_rule_name, min_items, max_items, "\",\" space") + " \"]\" space"); } } else if ((schema_type.is_null() || schema_type == "string") && schema.contains("pattern")) { return _visit_pattern(schema["pattern"], rule_name); @@ -718,14 +744,14 @@ class SchemaConverter { return _add_primitive(rule_name == "root" ? "root" : schema_format, PRIMITIVE_RULES.at("uuid")); } else if ((schema_type.is_null() || schema_type == "string") && STRING_FORMAT_RULES.find(schema_format + "-string") != STRING_FORMAT_RULES.end()) { auto prim_name = schema_format + "-string"; - return _add_rule(rule_name, _add_primitive(prim_name, STRING_FORMAT_RULES.at(prim_name))); + return add_rule(rule_name, _add_primitive(prim_name, STRING_FORMAT_RULES.at(prim_name))); } else if (schema_type == "string" && (schema.contains("minLength") || schema.contains("maxLength"))) { std::string char_rule = _add_primitive("char", PRIMITIVE_RULES.at("char")); int min_len = schema.contains("minLength") ? schema["minLength"].get() : 0; int max_len = schema.contains("maxLength") ? 
schema["maxLength"].get() : std::numeric_limits::max(); - return _add_rule(rule_name, "\"\\\"\" " + build_repetition(char_rule, min_len, max_len) + " \"\\\"\" space"); + return add_rule(rule_name, "\"\\\"\" " + build_repetition(char_rule, min_len, max_len) + " \"\\\"\" space"); } else if (schema.empty() || schema_type == "object") { - return _add_rule(rule_name, _add_primitive("object", PRIMITIVE_RULES.at("object"))); + return add_rule(rule_name, _add_primitive("object", PRIMITIVE_RULES.at("object"))); } else { if (!schema_type.is_string() || PRIMITIVE_RULES.find(schema_type.get()) == PRIMITIVE_RULES.end()) { _errors.push_back("Unrecognized schema: " + schema.dump()); @@ -762,3 +788,39 @@ std::string json_schema_to_grammar(const json & schema) { converter.check_errors(); return converter.format_grammar(); } + +std::string tool_call_grammar(const json & tools) { + SchemaConverter converter([](const std::string &) { return json::object(); }, /* dotall= */ false); + + std::vector tool_rules; + + for (const auto & tool : tools) { + const auto & function = tool["function"]; + std::string name = function["name"]; + std::string description = function["description"]; + auto parameters_copy = function["parameters"]; + converter.resolve_refs(parameters_copy, name); + + tool_rules.push_back(converter.visit(json { + {"type", "object"}, + {"description", description}, + {"properties", json { + {"name", json {{"const", name}}}, + {"arguments", parameters_copy}, + }}, + {"required", json::array({"name", "arguments"})}, + }, name + "-tool-call")); + } + + converter.add_rule( + "root", + not_literal("") + " | " + + converter.add_rule( + "tool_call", + "\"\" " + + join(tool_rules.begin(), tool_rules.end(), " | ") + + " \"\"")); + + converter.check_errors(); + return converter.format_grammar(); +} diff --git a/common/json-schema-to-grammar.h b/common/json-schema-to-grammar.h index e1abed3037582..5594051b1f6de 100644 --- a/common/json-schema-to-grammar.h +++ b/common/json-schema-to-grammar.h @@ -1,4 +1,5 @@ #pragma once #include "json.hpp" +std::string tool_call_grammar(const nlohmann::ordered_json & tools); std::string json_schema_to_grammar(const nlohmann::ordered_json& schema); diff --git a/examples/server/server.cpp b/examples/server/server.cpp index f60530cf3db56..da45ab11b2acf 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -3031,7 +3031,7 @@ int main(int argc, char ** argv) { chat.push_back({{"role", "assistant"}, {"content", "Hi there"}}); chat.push_back({{"role", "user"}, {"content", "How are you?"}}); - const std::string chat_example = format_chat(ctx_server.model, sparams.chat_template, chat); + const std::string chat_example = format_chat(ctx_server.model, sparams.chat_template, chat, ""); LOG_INFO("chat template", { {"chat_example", chat_example}, diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index 1a22125028204..336307a56dda5 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -4,6 +4,7 @@ #include "common.h" #include "json.hpp" +#include "json-schema-to-grammar.h" #include #include @@ -122,7 +123,7 @@ inline bool verify_custom_template(const std::string & tmpl) { } // Format given chat. 
If tmpl is empty, we take the template from model metadata -inline std::string format_chat(const struct llama_model * model, const std::string & tmpl, const std::vector & messages) { +inline std::string format_chat(const struct llama_model * model, const std::string & tmpl, const std::vector & messages, const std::string & tools_tag) { size_t alloc_size = 0; // vector holding all allocated string to be passed to llama_chat_apply_template std::vector str(messages.size() * 2); @@ -137,6 +138,20 @@ inline std::string format_chat(const struct llama_model * model, const std::stri chat[i].content = str[i*2 + 1].c_str(); } + if (!tools_tag.empty()) { + alloc_size += tools_tag.size(); + if (chat.empty()) { + str.resize(2); + str[0] = "user"; + str[1] = tools_tag; + chat.push_back({str[0].c_str(), str[1].c_str()}); + } else { + auto & content = str[str.size() - 1]; + content += tools_tag; + chat[chat.size() - 1].content = content.c_str(); + } + } + const char * ptr_tmpl = tmpl.empty() ? nullptr : tmpl.c_str(); std::vector buf(alloc_size * 2); @@ -372,8 +387,15 @@ static json oaicompat_completion_params_parse( llama_params["temperature"] = json_value(body, "temperature", 0.0); llama_params["top_p"] = json_value(body, "top_p", 1.0); + std::string tools_tag; + if (body.contains("tools") && body["tools"].is_array()) { + const auto & tools = body["tools"]; + llama_params["grammar"] = tool_call_grammar(tools); + tools_tag = (std::stringstream() << "\n\n" << tools.dump(2) << "").str(); + } + // Apply chat template to the list of messages - llama_params["prompt"] = format_chat(model, chat_template, body["messages"]); + llama_params["prompt"] = format_chat(model, chat_template, body["messages"], tools_tag); // Handle "stop" field if (body.contains("stop") && body["stop"].is_string()) { @@ -408,7 +430,7 @@ static json oaicompat_completion_params_parse( } // Params supported by OAI but unsupported by llama.cpp - static const std::vector unsupported_params { "tools", "tool_choice" }; + static const std::vector unsupported_params { "tool_choice" }; for (auto & param : unsupported_params) { if (body.contains(param)) { throw std::runtime_error("Unsupported param: " + param); From 2b2127c2a31d3456fd639b6a15bbb6b271920fea Mon Sep 17 00:00:00 2001 From: ochafik Date: Thu, 2 May 2024 03:20:25 +0100 Subject: [PATCH 61/68] agent: url params --- examples/agent/agent.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/examples/agent/agent.py b/examples/agent/agent.py index 03fb96dca6038..ebb51e1110a75 100644 --- a/examples/agent/agent.py +++ b/examples/agent/agent.py @@ -143,7 +143,11 @@ def main( verbose: bool = False, style: Optional[ToolsPromptStyle] = None, - model: Annotated[str, typer.Option("--model", "-m")] = "models/7B/ggml-model-f16.gguf", + model: Optional[Annotated[str, typer.Option("--model", "-m")]] = None,# = "models/7B/ggml-model-f16.gguf", + model_url: Optional[Annotated[str, typer.Option("--model-url", "-mu")]] = None, + hf_repo: Optional[Annotated[str, typer.Option("--hf-repo", "-hfr")]] = None, + hf_file: Optional[Annotated[str, typer.Option("--hf-file", "-hff")]] = None, + endpoint: Optional[str] = None, context_length: Optional[int] = None, # endpoint: str = 'http://localhost:8080/v1/chat/completions', From e41b6ceee9f0b9bbae28f5c608dff3e3f6fb4864 Mon Sep 17 00:00:00 2001 From: ochafik Date: Thu, 2 May 2024 04:54:58 +0100 Subject: [PATCH 62/68] server: update tool calling, introduce system prompt for json schema --- common/json-schema-to-grammar.cpp | 99 
 common/json-schema-to-grammar.cpp | 99 ++++++++++++++++++++++++---------
 common/json-schema-to-grammar.h   |  2 +-
 examples/server/utils.hpp         | 66 ++++++++++++---------
 3 files changed, 109 insertions(+), 58 deletions(-)

diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp
index 393153b403775..4c09587a03f52 100644
--- a/common/json-schema-to-grammar.cpp
+++ b/common/json-schema-to-grammar.cpp
@@ -200,28 +200,28 @@ static std::string format_literal(const std::string & literal) {
     return "\"" + escaped + "\"";
 }

-/* not_literal('a') -> '[^a]' not_literal('abc') -> '([^a] | "a" ([^b] | "b" ([^c])?)?)?' */
-static std::string not_literal(const std::string & literal, bool dotall = true) {
-    assert(literal.size() > 0);
-    std::stringstream out;
-    std::function<void(size_t)> recurse = [&](size_t i) {
-        const auto & c = literal[i];
-        out << "[^" << c << "]";
-        if (i < literal.size() - 1) {
-            out << " | " << format_literal(std::to_string(c)) << " (";
-            recurse(i + 1);
-            out << ")?";
-        }
-    };
-    out << "(";
-    recurse(0);
-    out << ")" << (dotall ? DOTALL : DOT) << "*";
-    return out.str();
-}
+// static std::string not_literal(const std::string & literal, bool dotall = true) {
+//     assert(literal.size() > 0);
+//     std::stringstream out;
+//     std::function<void(size_t)> recurse = [&](size_t i) {
+//         const char & c = literal[i];
+//         out << "[^" << c << "]";
+//         out << " " << (dotall ? DOTALL : DOT) << "*";
+//         if (i < literal.size() - 1) {
+//             out << " | " << format_literal(literal.substr(i, 1)) << " (";
+//             recurse(i + 1);
+//             out << ")?";
+//         }
+//     };
+//     out << "(";
+//     recurse(0);
+//     out << ")";
+//     return out.str();
+// }

 class SchemaConverter {
@@ -625,17 +625,57 @@ class SchemaConverter {
         visit_refs(schema);
     }

+/*
+  reply ::= prefix tool-call*
+
+  prefix ::= [^<] prefix
+        | "<" [^t] prefix
+        | "<t" [^o] prefix
+        | ...
+        | "<tool_call" [^>] prefix
+        |
+
+*/
+
+    std::string not_literal(const std::string & literal) {
+        auto rule_name = _find_rule_name("not" + literal, "!!!");
+        std::stringstream out;
+        for (size_t i = 0, n = literal.size(); i < n; i++) {
+            out << " | ";
+            if (i > 0) {
+                out << format_literal(literal.substr(0, i)) << " ";
+            }
+            out << "[^" << literal[i] << "] " << rule_name.c_str();
+        }
+        _rules[rule_name] = out.str();
+        return rule_name;
+    }
+
+    std::string _escape_name(const std::string & name) {
+        return regex_replace(name, INVALID_RULE_CHARS_RE, "-");
+    }
+    std::string _find_rule_name(const std::string & name, const std::string & rule) {
+        auto esc_name = _escape_name(name);
+        int i = 0;
+        while (_rules.find(esc_name + std::to_string(i)) != _rules.end() && _rules[esc_name + std::to_string(i)] != rule) {
+            i++;
+        }
+        return esc_name + std::to_string(i);
+    }
     std::string add_rule(const std::string & name, const std::string & rule) {
-        std::string esc_name = regex_replace(name, INVALID_RULE_CHARS_RE, "-");
+        auto esc_name = _escape_name(name);
         if (_rules.find(esc_name) == _rules.end() || _rules[esc_name] == rule) {
             _rules[esc_name] = rule;
             return esc_name;
         } else {
-            int i = 0;
-            while (_rules.find(esc_name + std::to_string(i)) != _rules.end() && _rules[esc_name + std::to_string(i)] != rule) {
-                i++;
-            }
-            std::string key = esc_name + std::to_string(i);
+            auto key = _find_rule_name(esc_name, rule);
             _rules[key] = rule;
             return key;
         }
@@ -789,7 +829,7 @@ std::string json_schema_to_grammar(const json & schema) {
     return converter.format_grammar();
 }

-std::string tool_call_grammar(const json & tools) {
+std::string tool_call_grammar(const json & tools, bool allow_parallel_calls) {
     SchemaConverter converter([](const std::string &) { return json::object(); }, /* dotall= */ false);

     std::vector<std::string> tool_rules;
@@ -814,12 +854,13 @@ std::string tool_call_grammar(const json & tools) {
     converter.add_rule(
         "root",
-        not_literal("<tool_call>") + " | "
-            + converter.add_rule(
+        converter.not_literal("<tool_call>") + " " +
+        converter.add_rule(
             "tool_call",
-            "\"<tool_call>\" "
+            "\"<tool_call>\" ("
             + join(tool_rules.begin(), tool_rules.end(), " | ")
-            + " \"</tool_call>\""));
+            + ") \"</tool_call>\""
+        ) + (allow_parallel_calls ? "*" : "?"));

     converter.check_errors();
     return converter.format_grammar();
diff --git a/common/json-schema-to-grammar.h b/common/json-schema-to-grammar.h
index 5594051b1f6de..024825151d5ac 100644
--- a/common/json-schema-to-grammar.h
+++ b/common/json-schema-to-grammar.h
@@ -1,5 +1,5 @@
 #pragma once
 #include "json.hpp"
-std::string tool_call_grammar(const nlohmann::ordered_json & tools);
+std::string tool_call_grammar(const nlohmann::ordered_json & tools, bool allow_parallel_calls = false);
 std::string json_schema_to_grammar(const nlohmann::ordered_json& schema);
diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp
index 336307a56dda5..3bcfb2252daf3 100644
--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
@@ -123,7 +123,7 @@ inline bool verify_custom_template(const std::string & tmpl) {
 }

 // Format given chat. If tmpl is empty, we take the template from model metadata
-inline std::string format_chat(const struct llama_model * model, const std::string & tmpl, const std::vector<json> & messages, const std::string & tools_tag) {
+inline std::string format_chat(const struct llama_model * model, const std::string & tmpl, const std::vector<json> & messages, const std::string & extra_system_message) {
     size_t alloc_size = 0;
     // vector holding all allocated string to be passed to llama_chat_apply_template
     std::vector<std::string> str(messages.size() * 2);
@@ -138,18 +138,12 @@ inline std::string format_chat(const struct llama_model * model, const std::stri
         chat[i].content = str[i*2 + 1].c_str();
     }

-    if (!tools_tag.empty()) {
-        alloc_size += tools_tag.size();
-        if (chat.empty()) {
-            str.resize(2);
-            str[0] = "user";
-            str[1] = tools_tag;
-            chat.push_back({str[0].c_str(), str[1].c_str()});
-        } else {
-            auto & content = str[str.size() - 1];
-            content += tools_tag;
-            chat[chat.size() - 1].content = content.c_str();
-        }
+    if (!extra_system_message.empty()) {
+        alloc_size += extra_system_message.size();
+
+        llama_chat_message msg { "system", extra_system_message.c_str() };
+        chat.insert(chat.begin(), msg);
+        // chat.push_back(msg);
     }

     const char * ptr_tmpl = tmpl.empty() ? nullptr : tmpl.c_str();
     std::vector<char> buf(alloc_size * 2);
@@ -387,15 +381,42 @@ static json oaicompat_completion_params_parse(
     llama_params["temperature"] = json_value(body, "temperature", 0.0);
     llama_params["top_p"] = json_value(body, "top_p", 1.0);

-    std::string tools_tag;
-    if (body.contains("tools") && body["tools"].is_array()) {
+    std::string extra_system_message;
+
+    // Handle "response_format" field
+    if (body.contains("response_format")) {
+        json response_format = json_value(body, "response_format", json::object());
+        std::string response_type = json_value(response_format, "type", std::string());
+        if (response_type == "json_object") {
+            llama_params["json_schema"] = json_value(response_format, "schema", json::object());
+            extra_system_message = (std::stringstream()
+                << "You are a helpful assistant that answers in JSON. Here's the json schema you must adhere to:\n<schema>\n"
+                << llama_params["json_schema"].dump().c_str()
+                << "\n</schema>"
+            ).str();
+        } else if (!response_type.empty() && response_type != "text") {
+            throw std::runtime_error("response_format type must be one of \"text\" or \"json_object\", but got: " + response_type);
+        }
+    } else if (body.contains("tools") && body["tools"].is_array()) {
         const auto & tools = body["tools"];
         llama_params["grammar"] = tool_call_grammar(tools);
-        tools_tag = (std::stringstream() << "\n\n<tools>" << tools.dump(2) << "</tools>").str();
+
+        extra_system_message = (std::stringstream()
+            << "You are a function calling AI model. You are provided with function signatures within <tools></tools> XML tags. "
+            << "You may call one or more functions to assist with the user query. "
+            << "Don't make assumptions about what values to plug into functions. "
+            << "Here are the available tools: <tools>"
+            << tools.dump().c_str()
+            << "</tools>\n"
+            << "For each function call return a json object with function name and arguments within <tool_call></tool_call> XML tags as follows:"
+            << "<tool_call>"
+            << "{\"arguments\": <args-dict>, \"name\": <function-name>}"
+            << "</tool_call>"
+        ).str();
     }

     // Apply chat template to the list of messages
-    llama_params["prompt"] = format_chat(model, chat_template, body["messages"], tools_tag);
+    llama_params["prompt"] = format_chat(model, chat_template, body["messages"], extra_system_message);

     // Handle "stop" field
     if (body.contains("stop") && body["stop"].is_string()) {
@@ -404,17 +425,6 @@ static json oaicompat_completion_params_parse(
         llama_params["stop"] = json_value(body, "stop", json::array());
     }

-    // Handle "response_format" field
-    if (body.contains("response_format")) {
-        json response_format = json_value(body, "response_format", json::object());
-        std::string response_type = json_value(response_format, "type", std::string());
-        if (response_type == "json_object") {
-            llama_params["json_schema"] = json_value(response_format, "schema", json::object());
-        } else if (!response_type.empty() && response_type != "text") {
-            throw std::runtime_error("response_format type must be one of \"text\" or \"json_object\", but got: " + response_type);
-        }
-    }
-
     // Handle "n" field
     int n_choices = json_value(body, "n", 1);
     if (n_choices != 1) {

From a1d64cfb924c6edff412ecca5a330f74939a9192 Mon Sep 17 00:00:00 2001
From: ochafik
Date: Sat, 18 May 2024 18:19:27 +0100
Subject: [PATCH 63/68] openai: function call arguments must be returned stringified!
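
The OpenAI chat-completions API defines a tool call's "function.arguments" as a
JSON-encoded string rather than a JSON object, so the server serializes the
parsed arguments before returning them and clients decode them before
dispatching. As a rough illustration only (the "tool_call" and "tool_map" names
below are placeholders mirroring examples/agent/agent.py, not part of any API):

    import json

    # arguments arrive as a string such as '{"location": "Paris"}'
    args = json.loads(tool_call.function.arguments)
    # dispatch to the matching local Python function
    result = tool_map[tool_call.function.name](**args)
    # re-encode as a string when a tool call is sent back to the API
    payload = json.dumps(args)
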
--- examples/agent/agent.py | 5 ++-- examples/openai/api.py | 3 ++- examples/openai/prompting.py | 52 +++++++----------------------------- 3 files changed, 14 insertions(+), 46 deletions(-) diff --git a/examples/agent/agent.py b/examples/agent/agent.py index ebb51e1110a75..bf2ee907ea5d4 100644 --- a/examples/agent/agent.py +++ b/examples/agent/agent.py @@ -109,10 +109,11 @@ def completion_with_tool_usage( if content: print(f'💭 {content}') - pretty_call = f'{tool_call.function.name}({", ".join(f"{k}={v.model_dump_json() if isinstance(v, BaseModel) else json.dumps(v)}" for k, v in tool_call.function.arguments.items())})' + args = json.loads(tool_call.function.arguments) + pretty_call = f'{tool_call.function.name}({", ".join(f"{k}={v.model_dump_json() if isinstance(v, BaseModel) else json.dumps(v)}" for k, v in args.items())})' sys.stdout.write(f'⚙️ {pretty_call}') sys.stdout.flush() - tool_result = tool_map[tool_call.function.name](**tool_call.function.arguments) + tool_result = tool_map[tool_call.function.name](**args) sys.stdout.write(f" → {tool_result}\n") messages.append(Message( tool_call_id=tool_call.id, diff --git a/examples/openai/api.py b/examples/openai/api.py index 705c5654b5d1b..cafe12752378e 100644 --- a/examples/openai/api.py +++ b/examples/openai/api.py @@ -4,7 +4,8 @@ class FunctionCall(BaseModel): name: str - arguments: Union[Dict[str, Any], str] + arguments: str + # arguments: Union[Dict[str, Any], str] class ToolCall(BaseModel): id: Optional[str] = None diff --git a/examples/openai/prompting.py b/examples/openai/prompting.py index f1b7d17eb3489..386d540e870a2 100644 --- a/examples/openai/prompting.py +++ b/examples/openai/prompting.py @@ -56,7 +56,6 @@ class ChatTemplate(BaseModel): bos_token: str inferred_tool_style: Annotated[Optional['ToolsPromptStyle'], Field(exclude=True)] = None - expects_stringified_function_arguments: Annotated[Optional[bool], Field(exclude=True)] = None expects_strict_user_assistant_alternance: Annotated[Optional[bool], Field(exclude=True)] = None formats_tool_call: Annotated[Optional[bool], Field(exclude=True)] = None formats_tool_call_content: Annotated[Optional[bool], Field(exclude=True)] = None @@ -108,7 +107,7 @@ def succeeds(messages: list[Message], strings_to_find = ()): thought = "Precious thought" fn_name = "callMeMaybe" - toolcall = ToolCall(id="call_531873", type="function", function=FunctionCall(name=fn_name, arguments={"lol": 123})) + toolcall = ToolCall(id="call_531873", type="function", function=FunctionCall(name=fn_name, arguments=json.dumps({"lol": 123}))) toolcall_msg = Message(role="assistant", content=None, tool_calls=[toolcall]) tool_result = "Tool result" tool_name = "additioner" @@ -119,8 +118,6 @@ def succeeds(messages: list[Message], strings_to_find = ()): self.formats_tool_call = succeeds([user_msg, toolcall_msg], (fn_name,)) if self.formats_tool_call: self.formats_tool_call_content = succeeds([user_msg, toolcall_content_msg], (thought,)) - self.expects_stringified_function_arguments = \ - not succeeds([user_msg, toolcall_content_msg]) and succeeds([user_msg, stringified_toolcall_msg], (fn_name,)) self.formats_tool_result = succeeds([user_msg, assistant_msg, tool_msg], (tool_result,)) self.formats_tool_name = succeeds([user_msg, assistant_msg, tool_msg], (tool_name,)) @@ -246,24 +243,6 @@ def normalize(m: Message): ]) ]) ) - elif self.args.chat_template.expects_stringified_function_arguments: - return Message( - role=m.role, - content=m.content, - name=m.name, - tool_call_id=m.tool_call_id, - tool_calls=[ - ToolCall( - 
id=tc.id, - type=tc.type, - function=FunctionCall( - name=tc.function.name, - arguments=json.dumps(tc.function.arguments) - ) - ) - for tc in m.tool_calls - ], - ) else: return m elif self.args.chat_template.expects_strict_user_assistant_alternance and m.role not in ('user', 'assistant'): @@ -313,25 +292,6 @@ def flush(): # JSON! # messages = [m.model_dump() for m in messages] - # if self.inferred_tool_style == ToolsPromptStyle.TYPESCRIPT_FUNCTIONARY_V2: - if self.args.chat_template.expects_stringified_function_arguments: - messages = [ - Message(**{ - **m.model_dump(), - "tool_calls": [ - ToolCall(**{ - **tc.model_dump(), - "function": { - "name": tc.function.name, - "arguments": tc.function.arguments, - } - }) - for tc in m.tool_calls - ] if m.tool_calls else None - }) - for m in messages - ] - return self.args.chat_template.raw_render( messages=messages, add_generation_prompt=True, @@ -429,7 +389,9 @@ def parse(self, s: str) -> Optional[Message]: tool_calls.append( ToolCall( id=gen_callid(), - function=FunctionCall(**fc))) + function=FunctionCall( + name=fc["name"], + arguments=json.dumps(fc["arguments"])))) content_str = '\n'.join(content).strip() return Message(role="assistant", content=content_str if content_str else None, tool_calls=tool_calls) @@ -653,7 +615,11 @@ def parse(self, s: str) -> Optional[Message]: role="assistant", content=data.get(_THOUGHT_KEY), tool_calls=[ - ToolCall(id=gen_callid(), function=FunctionCall(**tc)) + ToolCall( + id=gen_callid(), + function=FunctionCall( + name=tc["name"], + arguments=json.dumps(tc["arguments"]))) for tc in next_step['tool_calls'] ] ) From 5ea637e42cc40fc047344026f72d784a7d50c1c6 Mon Sep 17 00:00:00 2001 From: Olivier Chafik Date: Tue, 21 May 2024 18:12:36 +0100 Subject: [PATCH 64/68] openai: fix merge --- examples/server/utils.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index 8fbc42fbb8b3f..3cdbe0a09d235 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -385,7 +385,7 @@ static json oaicompat_completion_params_parse( std::string extra_system_message; // Apply chat template to the list of messages - llama_params["prompt"] = format_chat(model, chat_template, body.at("messages")); + llama_params["prompt"] = format_chat(model, chat_template, body.at("messages"), extra_system_message); // Handle "stop" field if (body.contains("stop") && body.at("stop").is_string()) { From c8458fa5f71c87a9f565d6adafafcedad4575e88 Mon Sep 17 00:00:00 2001 From: ochafik Date: Wed, 22 May 2024 03:51:20 +0100 Subject: [PATCH 65/68] openai: make content optional for tool call grammar gen --- common/json-schema-to-grammar.cpp | 23 ++++++++++++++--------- common/json-schema-to-grammar.h | 2 +- 2 files changed, 15 insertions(+), 10 deletions(-) diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp index 8ed8c85ecd0e8..b4a838c549e63 100644 --- a/common/json-schema-to-grammar.cpp +++ b/common/json-schema-to-grammar.cpp @@ -829,7 +829,7 @@ std::string json_schema_to_grammar(const json & schema) { return converter.format_grammar(); } -std::string tool_call_grammar(const json & tools, bool allow_parallel_calls) { +std::string tool_call_grammar(const json & tools, bool allow_parallel_calls, bool allow_content) { SchemaConverter converter([](const std::string &) { return json::object(); }, /* dotall= */ false); std::vector tool_rules; @@ -837,7 +837,7 @@ std::string tool_call_grammar(const json & tools, bool allow_parallel_calls) { for (const 
auto & tool : tools) {
         const auto & function = tool["function"];
         std::string name = function["name"];
-        std::string description = function["description"];
+        std::string description = function.contains("description") ? function["description"] : "";
         auto parameters_copy = function["parameters"];
         converter.resolve_refs(parameters_copy, name);
@@ -854,13 +854,18 @@ std::string tool_call_grammar(const json & tools, bool allow_parallel_calls) {
     converter.add_rule(
         "root",
-        converter.not_literal("<tool_call>") + " " +
-        converter.add_rule(
+        (allow_content ? converter.not_literal("<tool_call>") + " | " : "") +
+        build_repetition(
+            converter.add_rule(
+                "tool_call",
+                "\"<tool_call>\" ("
+                + join(tool_rules.begin(), tool_rules.end(), " | ")
+                + ") \"</tool_call>\""
+            ),
+            allow_content ? 0 : 1,
+            allow_parallel_calls ? std::numeric_limits<int>::max() : 1,
+            " \"\\n\" "
+        ));

     converter.check_errors();
     return converter.format_grammar();
diff --git a/common/json-schema-to-grammar.h b/common/json-schema-to-grammar.h
index 77e66cb2c8831..e0219cecef809 100644
--- a/common/json-schema-to-grammar.h
+++ b/common/json-schema-to-grammar.h
@@ -5,5 +5,5 @@
 #define JSON_ASSERT GGML_ASSERT
 #include "json.hpp"

-std::string tool_call_grammar(const nlohmann::ordered_json & tools, bool allow_parallel_calls = false);
+std::string tool_call_grammar(const nlohmann::ordered_json & tools, bool allow_parallel_calls = false, bool allow_content = true);
 std::string json_schema_to_grammar(const nlohmann::ordered_json& schema);

From a39e6e0758cfd4292d43d603a9566d469f9f269b Mon Sep 17 00:00:00 2001
From: ochafik
Date: Wed, 22 May 2024 03:51:49 +0100
Subject: [PATCH 66/68] openai: pretty indent json response

---
 examples/server/server.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 822c8e2b206c7..56d5bbd1856de 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -3535,7 +3535,7 @@ int main(int argc, char ** argv) {
                 if (!result.error && result.stop) {
                     json result_oai = format_final_response_oaicompat(data, result.data, completion_id);

-                    res.set_content(result_oai.dump(-1, ' ', false, json::error_handler_t::replace), "application/json; charset=utf-8");
+                    res.set_content(result_oai.dump(2, ' ', false, json::error_handler_t::replace), "application/json; charset=utf-8");
                 } else {
                     res_error(res, result.data);
                 }

From 793f4ff3f543d6fc5d79eaad0e6f186b4418c017 Mon Sep 17 00:00:00 2001
From: ochafik
Date: Wed, 22 May 2024 04:11:48 +0100
Subject: [PATCH 67/68] agent: support OpenAI: --endpoint https://api.openai.com --auth "Bearer $OPENAI_API_KEY"

---
 examples/agent/agent.py | 65 ++++++++++++++++++++++++++++++++++--
 examples/openai/api.py | 5 +--
 examples/openai/prompting.py | 2 +-
 3 files changed, 66 insertions(+), 6 deletions(-)

diff --git a/examples/agent/agent.py b/examples/agent/agent.py
index bf2ee907ea5d4..b3a5c6d4b9c23 100644
--- a/examples/agent/agent.py
+++ b/examples/agent/agent.py
@@ -30,6 +30,7 @@ def completion_with_tool_usage(
     messages: List[Message],
     auth: Optional[str],
     verbose: bool,
+    assume_llama_cpp_server: bool = False,
     **kwargs):
     '''
     Creates a chat completion using an OpenAI-compatible endpoint w/ JSON schema support
@@ -75,7 +76,7 @@ def completion_with_tool_usage(
     request = ChatCompletionRequest(
         messages=messages,
         response_format=response_format,
-        tools=tools_schemas,
+        tools=tools_schemas if tools_schemas else None,
cache_prompt=True, **kwargs, ) @@ -86,10 +87,65 @@ def completion_with_tool_usage( } if auth: headers["Authorization"] = auth + + def drop_nones(o): + if isinstance(o, BaseModel): + return drop_nones(o.model_dump()) + if isinstance(o, list): + return [drop_nones(i) for i in o if i is not None] + if isinstance(o, dict): + return { + k: drop_nones(v) + for k, v in o.items() + if v is not None + } + return o + + if assume_llama_cpp_server: + body = request.model_dump() + else: + # request_dict = request.model_dump() + # body = drop_nones(request) + tools_arg = None + tool_choice = request.tool_choice + response_format = None + if request.tools: + tools_arg = drop_nones(request.tools) + if request.response_format: + response_format = { + 'type': request.response_format.type, + } + if request.response_format.schema: + assert tools_arg is None + assert tool_choice is None + tools_arg = [{ + "type": "function", + "function": { + "name": "output", + "description": "A JSON object", + "parameters": request.response_format.schema, + } + }] + tool_choice = "output" + + body = drop_nones(dict( + messages=drop_nones(request.messages), + model=request.model, + tools=tools_arg, + tool_choice=tool_choice, + temperature=request.temperature, + response_format=response_format, + )) + + if verbose: + sys.stderr.write(f'# POSTing to {endpoint}/v1/chat/completions\n') + sys.stderr.write(f'# HEADERS: {headers}\n') + sys.stderr.write(f'# BODY: {json.dumps(body, indent=2)}\n') + response = requests.post( f'{endpoint}/v1/chat/completions', headers=headers, - json=request.model_dump(), + json=body, ) response.raise_for_status() response_json = response.json() @@ -143,6 +199,7 @@ def main( parallel_calls: Optional[bool] = False, verbose: bool = False, style: Optional[ToolsPromptStyle] = None, + assume_llama_cpp_server: Optional[bool] = None, model: Optional[Annotated[str, typer.Option("--model", "-m")]] = None,# = "models/7B/ggml-model-f16.gguf", model_url: Optional[Annotated[str, typer.Option("--model-url", "-mu")]] = None, @@ -184,6 +241,7 @@ def main( if not endpoint: server_port = 8080 server_host = 'localhost' + assume_llama_cpp_server = True endpoint = f'http://{server_host}:{server_port}' if verbose: sys.stderr.write(f"# Starting C++ server with model {model} on {endpoint}\n") @@ -231,13 +289,14 @@ def main( result = completion_with_tool_usage( - model="...", + model="gpt-4o", endpoint=endpoint, response_model=response_model, max_iterations=max_iterations, tools=tool_functions, auth=auth, verbose=verbose, + assume_llama_cpp_server=assume_llama_cpp_server or False, n_predict=n_predict, top_k=top_k, diff --git a/examples/openai/api.py b/examples/openai/api.py index cafe12752378e..9508592c89467 100644 --- a/examples/openai/api.py +++ b/examples/openai/api.py @@ -21,8 +21,8 @@ class Message(BaseModel): class ToolFunction(BaseModel): name: str - description: str parameters: dict[str, Any] + description: Optional[str] = None class Tool(BaseModel): type: str @@ -58,6 +58,7 @@ class LlamaCppParams(BaseModel): class ChatCompletionRequest(LlamaCppParams): model: str tools: Optional[List[Tool]] = None + tool_choice: Optional[str] = None messages: Optional[List[Message]] = None prompt: Optional[str] = None response_format: Optional[ResponseFormat] = None @@ -87,5 +88,5 @@ class ChatCompletionResponse(BaseModel): model: str choices: List[Choice] usage: Usage - system_fingerprint: str + system_fingerprint: Optional[str] = None error: Optional[CompletionError] = None diff --git a/examples/openai/prompting.py 
b/examples/openai/prompting.py index 386d540e870a2..6aef7e437060d 100644 --- a/examples/openai/prompting.py +++ b/examples/openai/prompting.py @@ -698,7 +698,7 @@ def _tools_typescript_signatures(tools: list[Tool]) -> str: # _ts_converter.resolve_refs(tool.function.parameters, tool.function.name) return 'namespace functions {\n' + '\n'.join( - '// ' + tool.function.description.replace('\n', '\n// ') + '\n' + '' + '// ' + (tool.function.description or '').replace('\n', '\n// ') + '\n' + '' 'type ' + tool.function.name + ' = (_: ' + _ts_converter.visit(tool.function.parameters) + ") => any;\n" for tool in tools ) + '} // namespace functions' From a1c4aac384aed889870aa9d44bf562d505eaf7a3 Mon Sep 17 00:00:00 2001 From: ochafik Date: Wed, 22 May 2024 04:15:14 +0100 Subject: [PATCH 68/68] server: ultra basic tools, tool_choice, tool_calls support --- examples/server/tests/features/steps/steps.py | 43 ++++++++++--- examples/server/utils.hpp | 63 ++++++++++++++++--- 2 files changed, 91 insertions(+), 15 deletions(-) diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py index 26d9359d7f3f8..5c59d079bbfaa 100644 --- a/examples/server/tests/features/steps/steps.py +++ b/examples/server/tests/features/steps/steps.py @@ -66,6 +66,8 @@ def step_server_config(context, server_fqdn, server_port): context.server_seed = None context.user_api_key = None context.response_format = None + context.tools = None + context.tool_choice = None context.temperature = None context.tasks_result = [] @@ -337,6 +339,13 @@ def step_max_tokens(context, max_tokens): def step_response_format(context, response_format): context.response_format = json.loads(response_format) +@step('tools {tools}') +def step_tools(context, tools): + context.tools = json.loads(tools) + +@step('tool choice {tool_choice}') +def step_tool_choice(context, tool_choice): + context.tool_choice = tool_choice @step('{temperature:f} temperature') def step_temperature(context, temperature): @@ -471,6 +480,11 @@ async def step_oai_chat_completions(context, api_error): response_format=context.response_format if hasattr(context, 'response_format') else None, + tools=context.tools + if hasattr(context, 'tools') else None, + + tool_choice=context.tool_choice, + user_api_key=context.user_api_key if hasattr(context, 'user_api_key') else None, @@ -541,6 +555,9 @@ async def step_oai_chat_completions(context): if hasattr(context, 'enable_streaming') else None, response_format=context.response_format if hasattr(context, 'response_format') else None, + tools=context.tools + if hasattr(context, 'tools') else None, + tool_choice=context.tool_choice, user_api_key=context.user_api_key if hasattr(context, 'user_api_key') else None) @@ -554,16 +571,18 @@ async def step_oai_chat_completions(context): context.base_url, '/chat/completions', True, # async_client - model=context.model - if hasattr(context, 'model') else None, - n_predict=context.n_predict - if hasattr(context, 'n_predict') else None, + model=context.model, + # if hasattr(context, 'model') else None, + n_predict=context.n_predict, + # if hasattr(context, 'n_predict') else None, enable_streaming=context.enable_streaming if hasattr(context, 'enable_streaming') else None, - response_format=context.response_format - if hasattr(context, 'response_format') else None, - user_api_key=context.user_api_key - if hasattr(context, 'user_api_key') else None) + response_format=context.response_format, + # if hasattr(context, 'response_format') else None, + tools=context.tools,# 
if hasattr(context, 'tools') else None,
+            tool_choice=context.tool_choice, # if hasattr(context, 'tool_choice') else None,
+            user_api_key=context.user_api_key)
+            # if hasattr(context, 'user_api_key') else None)


@step('all prompts are predicted')
@@ -908,6 +927,8 @@ async def oai_chat_completions(user_prompt,
                                n_predict=None,
                                enable_streaming=None,
                                response_format=None,
+                               tools=None,
+                               tool_choice=None,
                                user_api_key=None,
                                expect_api_error=None):
    if debug:
@@ -935,6 +956,10 @@ async def oai_chat_completions(user_prompt,
     }
     if response_format is not None:
         payload['response_format'] = response_format
+    if tools is not None:
+        payload['tools'] = tools
+    if tool_choice is not None:
+        payload['tool_choice'] = tool_choice
     completion_response = {
         'content': '',
         'timings': {
@@ -996,6 +1021,8 @@ async def oai_chat_completions(user_prompt,
                 max_tokens=n_predict,
                 stream=enable_streaming,
                 response_format=payload.get('response_format'),
+                tools=payload.get('tools'),
+                tool_choice=payload.get('tool_choice'),
                 seed=seed,
                 temperature=payload['temperature']
             )
diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp
index 3cdbe0a09d235..5baf7b5543dd3 100644
--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
@@ -12,6 +12,7 @@
 #include
 #include
 #include
+#include <regex>

 #define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo-0613"
@@ -410,19 +411,40 @@ static json oaicompat_completion_params_parse(
         }
     } else if (body.contains("tools") && body["tools"].is_array()) {
         const auto & tools = body["tools"];
-        llama_params["grammar"] = tool_call_grammar(tools);
-
+        bool built_grammar = false;
+        bool allow_parallel_calls = false;
+        bool allow_content = true;
+        if (body.contains("tool_choice") && body["tool_choice"].is_string() && body["tool_choice"] != "auto") {
+            std::string tool_choice = body["tool_choice"];
+            if (tool_choice == "required") {
+                allow_content = false;
+            } else {
+                for (const auto & tool : tools) {
+                    if (tool["name"] == tool_choice) {
+                        llama_params["grammar"] = tool_call_grammar(json::array({ tool }), allow_parallel_calls, /* allow_content= */ false);
+                        built_grammar = true;
+                        break;
+                    }
+                }
+            }
+        }
+        if (!built_grammar) {
+            llama_params["grammar"] = tool_call_grammar(tools, allow_parallel_calls, allow_content);
+        }
+        // TODO: pass a template file.
         extra_system_message = (std::stringstream()
             << "You are a function calling AI model. You are provided with function signatures within <tools></tools> XML tags. "
             << "You may call one or more functions to assist with the user query. "
-            << "Don't make assumptions about what values to plug into functions. "
+            // << "Don't make assumptions about what values to plug into functions. "
             << "Here are the available tools: <tools>"
-            << tools.dump().c_str()
+            << tools.dump(2).c_str()
             << "</tools>\n"
+            // << "To call a tool give a json object with function name and arguments within <tool_call></tool_call> XML tags as follows:"
             << "For each function call return a json object with function name and arguments within <tool_call></tool_call> XML tags as follows:"
             << "<tool_call>"
-            << "{\"arguments\": <args-dict>, \"name\": <function-name>}"
+            << "{\"name\": <function-name>, \"arguments\": <args-dict>}"
+            << "</tool_call>"
+            << "Don't explain which tools you're going to call, just call them."
         ).str();
     }
@@ -451,7 +473,7 @@ static json oaicompat_completion_params_parse(
     }

     // Params supported by OAI but unsupported by llama.cpp
-    static const std::vector<std::string> unsupported_params { "tool_choice" };
+    static const std::vector<std::string> unsupported_params;// { "tool_choice" };
     for (auto & param : unsupported_params) {
         if (body.contains(param)) {
             throw std::runtime_error("Unsupported param: " + param);
@@ -478,10 +500,36 @@ static json format_final_response_oaicompat(const json & request, json result, c
     int num_prompt_tokens = json_value(result, "tokens_evaluated", 0);
     std::string content = json_value(result, "content", std::string(""));
+
     std::string finish_reason = "length";
     if (stopped_word || stopped_eos) {
         finish_reason = "stop";
     }
+    json tool_calls;
+    json message_content;
+    if (request.contains("tools")) {
+        std::regex pattern("<tool_call>(.*?)</tool_call>");
+        std::sregex_iterator iter(content.begin(), content.end(), pattern);
+        std::sregex_iterator end;
+        while (iter != end) {
+            std::smatch match = *iter;
+            auto call = json::parse(match[1].str());
+            if (tool_calls.is_null()) {
+                tool_calls = json::array();
+            }
+            tool_calls.push_back({
+                {"function", {
+                    {"name", call["name"]},
+                    {"arguments", call["arguments"].dump()},
+                }},
+            });
+            finish_reason = "tool_calls";
+            ++iter;
+        }
+    }
+    if (tool_calls.is_null()) {
+        message_content = content;
+    }

     json choices =
         streaming ? json::array({json{{"finish_reason", finish_reason},
                                       {"index", 0},
                                       {"delta", json::object()}}})
                   : json::array({json{{"finish_reason", finish_reason},
                                       {"index", 0},
-                                      {"message", json{{"content", content},
+                                      {"message", json{{"content", message_content},
+                                                       {"tool_calls", tool_calls},
                                                        {"role", "assistant"}}}}});

    std::time_t t = std::time(0);