update README; fix bug for deepseek-chat (could not handle array content)
parent d7f0f63149
commit f6cc8995fb
@@ -80,4 +80,8 @@ examples/server/webui/dist
/.venv
__pycache__/
*/poetry.lock
poetry.toml

# Configs

config/config*.toml
README.md
@@ -21,29 +21,39 @@ cmake --build build --config Release

## How to Run

### Configuration

Switch to your own configuration first:
1. Replace `base_url`, `api_key`, etc. in `config/config_llm.toml` according to your need.
2. Fill in `args` after `"@modelcontextprotocol/server-filesystem"` for `filesystem` to control the access to files. For example:
1. Copy configuration files from `config/example` to `config`.
2. Replace `base_url`, `api_key`, etc. in `config/config_llm.toml` and other configurations in `config/config*.toml` according to your need.
> Note: `llama-server` in [llama.cpp](https://github.com/ggml-org/llama.cpp) also supports embedding models.
3. Fill in `args` after `"@modelcontextprotocol/server-filesystem"` for `filesystem` to control the access to files. For example:
```
[filesystem]
type = "stdio"
command = "npx"
args = ["-y",
"@modelcontextprotocol/server-filesystem",
"/Users/{username}/Desktop",
"/Users/{Username}/Desktop",
"other/path/to/your/files"]
```
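For reference, the LLM entries edited in step 2 live in `config/config_llm.toml` and follow the same shape as `config/example/config_llm.toml` shown later in this diff; a minimal entry looks like the following (the API key is a placeholder):
```
[qwen-max-latest]
model = "qwen-max-latest"                         # Model name
base_url = "https://dashscope.aliyuncs.com"       # Base url. Note: Don't add any endpoint behind
endpoint = "/compatible-mode/v1/chat/completions" # Endpoint of chat completions
api_key = "sk-..."                                # Your API Key
```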

Start an MCP server with tool `python_execute` on port 8818:
### `mcp_server`

(for tools, only `python_execute` is provided as an example for now)

Start an MCP server with tool `python_execute` on port 8895 (or pass the port as an argument):
```bash
./build/bin/mcp_server # Unix/MacOS
./build/bin/mcp_server <port> # Unix/MacOS
```

```shell
.\build\bin\Release\mcp_server.exe # Windows
.\build\bin\Release\mcp_server.exe <port> # Windows
```

Run agent `humanus` with tools `python_execute`, `filesystem` and `playwright` (for browser use):
### `humanus_cli`

Run with tools `python_execute`, `filesystem` and `playwright` (for browser use):

```bash
./build/bin/humanus_cli # Unix/MacOS

@@ -53,7 +63,9 @@ Run agent `humanus` with tools `python_execute`, `filesystem` and `playwright` (
.\build\bin\Release\humanus_cli.exe # Windows
```

Run experimental planning flow (only agent `humanus` as executor):
### `humanus_cli_plan` (WIP)

Run the planning flow (only agent `humanus` as executor):
```bash
./build/bin/humanus_cli_plan # Unix/MacOS
```

@@ -62,6 +74,43 @@ Run experimental planning flow (only agent `humanus` as executor):
.\build\bin\Release\humanus_cli_plan.exe # Windows
```

### `humanus_server` (WIP)

Run agents in the MCP server (running on port 8896 by default):
- `humanus_initialze`: Pass JSON configuration (like in `config/config.toml`) to initialize an agent for a session. (Only one agent is maintained per session/client.)
- `humanus_run`: Pass `prompt` to tell the agent what to do. (Only one task at a time.)
- `humanus_terminate`: Stop the current task.
- `humanus_status`: Get the current state and other information about the agent and the task. Returns:
  - `state`: Agent state.
  - `current_step`: Current step index of the agent.
  - `max_steps`: Maximum number of steps to execute without interaction with the user.
  - `prompt_tokens`: Prompt (input) token consumption.
  - `completion_tokens`: Completion (output) token consumption.
  - `log_buffer`: Logs in the buffer, like `humanus_cli`. Cleared after being fetched.
  - `result`: Explains what the agent did. Not empty if the task is finished.
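An illustrative `humanus_status` response assembled from these fields might look like the following (all values here are invented for illustration; the actual payload comes from the running server):
```json
{
  "state": "RUNNING",
  "current_step": 3,
  "max_steps": 30,
  "prompt_tokens": 5021,
  "completion_tokens": 1874,
  "log_buffer": ["Processing your request: ..."],
  "result": ""
}
```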

```bash
./build/bin/humanus_server <port> # Unix/MacOS
```

```shell
.\build\bin\Release\humanus_server.exe <port> # Windows
```

Configure it in Cursor:
```json
{
  "mcpServers": {
    "humanus": {
      "url": "http://localhost:8896/sse"
    }
  }
}
```

> What if we add `humanus` itself to `mcp_servers`? It might be interesting.

## Acknowledgement

<p align="center">

@@ -72,7 +121,7 @@ Run experimental planning flow (only agent `humanus` as executor):
## Cite

```
@misc{humanuscpp,
@misc{humanus_cpp,
  author = {Zihong Zhang and Zuchao Li},
  title = {humanus.cpp: A Lightweight C++ Framework for Local LLM Agents},
  year = {2025}
@@ -1,5 +1,13 @@
[humanus_cli]
llm = "qwen-max-latest"
llm = "qwen-max-latest" # Key in config_llm.toml
memory = "long-context" # Key in config_mem.toml
tools = ["filesystem", "playwright", "image_loader"] # Builtin tools configuration
mcp_servers = ["python_execute"] # Key in config_mcp.toml, all MCP tools provided by servers will be added
max_steps = 30 # Maximum automatic steps without user's check
duplicate_threshold = 2 # Used to detect repeating condition (will be checked by LCS)

[humanus_plan]
llm = "deepseek-chat"
memory = "long-context"
tools = ["filesystem", "playwright", "image_loader"]
mcp_servers = ["python_execute"]
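The `duplicate_threshold` comment above refers to an LCS-based check for an agent that has started repeating itself; a minimal, self-contained sketch of that idea (the repository's actual implementation may differ in details such as normalization and the similarity ratio used):
```cpp
#include <algorithm>
#include <string>
#include <vector>

// Length of the longest common subsequence of a and b (classic O(|a|*|b|) DP).
static size_t lcs_length(const std::string& a, const std::string& b) {
    std::vector<std::vector<size_t>> dp(a.size() + 1, std::vector<size_t>(b.size() + 1, 0));
    for (size_t i = 1; i <= a.size(); ++i) {
        for (size_t j = 1; j <= b.size(); ++j) {
            dp[i][j] = (a[i - 1] == b[j - 1]) ? dp[i - 1][j - 1] + 1
                                              : std::max(dp[i - 1][j], dp[i][j - 1]);
        }
    }
    return dp[a.size()][b.size()];
}

// Count how many earlier responses are near-duplicates of the newest one;
// if the count reaches duplicate_threshold, the agent is likely stuck in a loop.
static bool is_repeating(const std::vector<std::string>& responses,
                         int duplicate_threshold, double similarity = 0.9) {
    if (responses.size() < 2) return false;
    const std::string& last = responses.back();
    int duplicates = 0;
    for (size_t i = 0; i + 1 < responses.size(); ++i) {
        size_t longer = std::max(last.size(), responses[i].size());
        if (longer == 0) continue;
        if (static_cast<double>(lcs_length(last, responses[i])) / longer >= similarity) {
            ++duplicates;
        }
    }
    return duplicates >= duplicate_threshold;
}
```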
@@ -1,11 +1,11 @@
["nomic-embed-text-v1.5"]
provider = "oai"
base_url = "http://localhost:8080"
endpoint = "/v1/embeddings"
model = "nomic-embed-text-v1.5.f16.gguf"
api_key = ""
embeddings_dim = 768
max_retries = 3
provider = "oai" # Only support OAI-Compatible style for now
base_url = "http://localhost:8080" # Base url. Note: Don't add any endpoint behind
endpoint = "/v1/embeddings" # Endpoint of embeddings
model = "nomic-embed-text-v1.5.f16.gguf" # Model name
api_key = "" # Your API Key
embeddings_dim = 768 # Dimension of embeddings (refer to API docs)
max_retries = 3 # Maximum retry count

[qwen-text-embedding-v3]
provider = "oai"
@@ -1,3 +1,9 @@
[qwen-max]
model = "qwen-max" # Model name
base_url = "https://dashscope.aliyuncs.com" # Base url. Note: Don't add any endpoint behind
endpoint = "/compatible-mode/v1/chat/completions" # Endpoint of chat completions
api_key = "sk-cb1bb2a240d84182bb93f6dd0fe03600" # Your API Key

[qwen-max-latest]
model = "qwen-max-latest"
base_url = "https://dashscope.aliyuncs.com"

@@ -9,7 +15,7 @@ model = "qwen-vl-max-latest"
base_url = "https://dashscope.aliyuncs.com"
endpoint = "/compatible-mode/v1/chat/completions"
api_key = "sk-cb1bb2a240d84182bb93f6dd0fe03600"
enable_vision = true
enable_vision = true # This means the model could accept content item like {"image_url", {"url", "xxx"}}

["claude-3.5-sonnet"]
model = "anthropic/claude-3.5-sonnet"
@@ -3,6 +3,7 @@ type = "sse"
host = "localhost"
port = 8895
sse_endpoint = "/sse"
message_enpoint = "/message"

[puppeteer]
type = "stdio"

@@ -19,4 +20,4 @@ type = "stdio"
command = "npx"
args = ["-y",
"@modelcontextprotocol/server-filesystem",
"/Users/hyde/Desktop"]
"/Users/hyde/Desktop"] # Allowed paths
@@ -1,12 +1,12 @@
[default]
max_messages = 16
max_tokens_message = 32768
max_tokens_messages = 65536
max_tokens_context = 131072
retrieval_limit = 32
embedding_model = "qwen-text-embedding-v3"
vector_store = "hnswlib"
llm = "qwen-max-latest"
max_messages = 16 # Maximum number of messages in short-term memory
max_tokens_message = 32768 # Maximum number of tokens in single message
max_tokens_messages = 65536 # Maximum number of tokens in short-term memory
max_tokens_context = 131072 # Maximum number of tokens in context (used by `get_messages`)
retrieval_limit = 32 # Maximum number of results to retrieve from long-term memory
embedding_model = "qwen-text-embedding-v3" # Key in config_embd.toml
vector_store = "hnswlib" # Key in config_vec.toml
llm = "qwen-max-latest" # Key in config_llm.toml

[long-context]
max_messages = 32
@@ -0,0 +1,15 @@
[humanus_cli]
llm = "qwen-max-latest" # Key in config_llm.toml
memory = "default" # Key in config_mem.toml
tools = ["filesystem", "playwright", "image_loader"] # Builtin tools configuration
mcp_servers = ["python_execute"] # Key in config_mcp.toml, all MCP tools provided by servers will be added
max_steps = 30 # Maximum automatic steps without user's check
duplicate_threshold = 2 # Used to detect repeating condition (will be checked by LCS)

[humanus_plan]
llm = "deepseek-chat"
memory = "long-context"
tools = ["filesystem", "playwright", "image_loader"]
mcp_servers = ["python_execute"]
max_steps = 30
duplicate_threshold = 2
@@ -0,0 +1,17 @@
["nomic-embed-text-v1.5"]
provider = "oai" # Only support OAI-Compatible style for now
base_url = "http://localhost:8080" # Base url. Note: Don't add any endpoint behind
endpoint = "/v1/embeddings" # Endpoint of embeddings
model = "nomic-embed-text-v1.5.f16.gguf" # Model name
api_key = "" # Your API Key
embeddings_dim = 768 # Dimension of embeddings (refer to API docs)
max_retries = 3 # Maximum retry count

[qwen-text-embedding-v3]
provider = "oai"
base_url = "https://dashscope.aliyuncs.com"
endpoint = "/compatible-mode/v1/embeddings"
model = "text-embedding-v3"
api_key = "sk-"
embeddings_dim = 1024
max_retries = 3
@@ -0,0 +1,45 @@
[qwen-max]
model = "qwen-max" # Model name
base_url = "https://dashscope.aliyuncs.com" # Base url. Note: Don't add any endpoint behind
endpoint = "/compatible-mode/v1/chat/completions" # Endpoint of chat completions
api_key = "sk-" # Your API Key

[qwen-max-latest]
model = "qwen-max-latest" # Model name
base_url = "https://dashscope.aliyuncs.com" # Base url. Note: Don't add any endpoint behind
endpoint = "/compatible-mode/v1/chat/completions" # Endpoint of chat completions
api_key = "sk-" # Your API Key

[qwen-vl-max-latest]
model = "qwen-vl-max-latest"
base_url = "https://dashscope.aliyuncs.com"
endpoint = "/compatible-mode/v1/chat/completions"
api_key = "sk-"
enable_vision = true # This means the model could accept content item like {"image_url", {"url", "xxx"}}

["claude-3.5-sonnet"]
model = "anthropic/claude-3.5-sonnet"
base_url = "https://openrouter.ai"
endpoint = "/api/v1/chat/completions"
api_key = "sk-"
enable_vision = true

["claude-3.7-sonnet"]
model = "anthropic/claude-3.7-sonnet"
base_url = "https://openrouter.ai"
endpoint = "/api/v1/chat/completions"
api_key = "sk-"
enable_vision = true

[deepseek-chat]
model = "deepseek-chat"
base_url = "https://api.deepseek.com"
endpoint = "/v1/chat/completions"
api_key = "sk-"

[deepseek-r1]
model = "deepseek-reasoner"
base_url = "https://api.deepseek.com"
endpoint = "/v1/chat/completions"
api_key = "sk-"
enable_tool = false # The API provider does not support tool use. Use builtin tool hint template.
@@ -0,0 +1,23 @@
[python_execute]
type = "sse"
host = "localhost"
port = 8895
sse_endpoint = "/sse"
message_enpoint = "/message"

[puppeteer]
type = "stdio"
command = "npx"
args = ["-y", "@modelcontextprotocol/server-puppeteer"]

[playwright]
type = "stdio"
command = "npx"
args = ["-y", "@executeautomation/playwright-mcp-server"]

[filesystem]
type = "stdio"
command = "npx"
args = ["-y",
"@modelcontextprotocol/server-filesystem",
"/Users/{Username}/Desktop"] # Allowed paths
@@ -0,0 +1,19 @@
[default]
max_messages = 16 # Maximum number of messages in short-term memory
max_tokens_message = 32768 # Maximum number of tokens in single message
max_tokens_messages = 65536 # Maximum number of tokens in short-term memory
max_tokens_context = 131072 # Maximum number of tokens in context (used by `get_messages`)
retrieval_limit = 32 # Maximum number of results to retrieve from long-term memory
embedding_model = "qwen-text-embedding-v3" # Key in config_embd.toml
vector_store = "hnswlib" # Key in config_vec.toml
llm = "qwen-max-latest" # Key in config_llm.toml

[long-context]
max_messages = 32
max_tokens_message = 64000
max_tokens_messages = 128000
max_tokens_context = 128000
retrieval_limit = 32
embedding_model = "qwen-text-embedding-v3"
vector_store = "hnswlib"
llm = "qwen-max-latest"
@@ -0,0 +1,8 @@
[hnswlib]
provider = "hnswlib"
dim = 768 # Dimension of the elements
max_elements = 100 # Maximum number of elements, should be known beforehand
M = 16 # Tightly connected with internal dimensionality of the data
       # strongly affects the memory consumption
ef_construction = 200 # Controls index search speed/build speed tradeoff
metric = "L2" # Distance metric to use, can be L2 or IP
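These `[hnswlib]` keys map directly onto parameters of the hnswlib C++ library; a minimal sketch of how a vector store might build and query such an index (assuming the stock single-header hnswlib API; the repository's own wrapper may differ):
```cpp
#include "hnswlib/hnswlib.h"

#include <iostream>
#include <vector>

int main() {
    const int dim = 768;                 // Dimension of the elements
    const size_t max_elements = 100;     // Maximum number of elements, should be known beforehand
    const size_t M = 16;                 // Graph connectivity; strongly affects memory consumption
    const size_t ef_construction = 200;  // Index build speed / quality tradeoff

    hnswlib::L2Space space(dim);         // metric = "L2"; use hnswlib::InnerProductSpace for "IP"
    hnswlib::HierarchicalNSW<float> index(&space, max_elements, M, ef_construction);

    // Insert one embedding under label 0.
    std::vector<float> embedding(dim, 0.1f);
    index.addPoint(embedding.data(), 0);

    // Retrieve the nearest neighbor of a query embedding.
    std::vector<float> query(dim, 0.1f);
    auto results = index.searchKnn(query.data(), 1);  // priority_queue of (distance, label) pairs
    std::cout << "nearest label: " << results.top().second << std::endl;
    return 0;
}
```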
@@ -56,14 +56,14 @@ int main() {

    const auto& config_table = *config_data["humanus_cli"].as_table();

    Humanus agent = Humanus::load_from_toml(config_table);
    auto agent = std::make_shared<Humanus>(Humanus::load_from_toml(config_table));

    while (true) {
        if (agent.current_step == agent.max_steps) {
            std::cout << "Automatically paused after " << agent.max_steps << " steps." << std::endl;
        if (agent->current_step == agent->max_steps) {
            std::cout << "Automatically paused after " << agent->max_steps << " steps." << std::endl;
            std::cout << "Enter your prompt (enter an empty line to resume or 'exit' to quit): ";
            std::cout.flush();
            agent.reset(false);
            agent->reset(false);
        } else {
            std::cout << "Enter your prompt (or 'exit' to quit): ";
            std::cout.flush();

@@ -77,7 +77,7 @@ int main() {
        }

        logger->info("Processing your request: " + prompt);
        agent.run(prompt);
        agent->run(prompt);
    }

    return 0;
@@ -57,30 +57,26 @@ int main() {

    const auto& config_table = *config_data["humanus_plan"].as_table();

    Humanus agent = Humanus::load_from_toml(config_table);
    auto agent = std::make_shared<Humanus>(Humanus::load_from_toml(config_table));

    std::map<std::string, std::shared_ptr<BaseAgent>> agents;
    agents["default"] = std::make_shared<Humanus>(agent);
    agents["default"] = agent;

    auto flow = FlowFactory::create_flow(
        FlowType::PLANNING,
        nullptr, // llm
        nullptr, // planning_tool
        std::vector<std::string>{}, // executor_keys
        "", // active_plan_id
        agents, // agents
        std::vector<std::shared_ptr<BaseTool>>{}, // tools
        "default" // primary_agent_key
        agent->llm,
        agents,
        "default" // primary_agent_key
    );

    while (true) {
        if (agent.current_step == agent.max_steps) {
            std::cout << "Automatically paused after " << agent.current_step << " steps." << std::endl;
        if (agent->current_step == agent->max_steps) {
            std::cout << "Automatically paused after " << agent->current_step << " steps." << std::endl;
            std::cout << "Enter your prompt (enter an empty line to resume or 'exit' to quit): ";
            agent.reset(false);
        } else if (agent.state != AgentState::IDLE) {
            agent->reset(false);
        } else if (agent->state != AgentState::IDLE) {
            std::cout << "Enter your prompt (enter an empty line to retry or 'exit' to quit): ";
            agent.reset(false);
            agent->reset(false);
        } else {
            std::cout << "Enter your prompt (or 'exit' to quit): ";
        }

@@ -94,6 +90,6 @@ int main() {

        logger->info("Processing your request: " + prompt);
        auto result = flow->execute(prompt);
        logger->info("🌟 " + agent.name + "'s summary: " + result);
        logger->info("🌟 " + agent->name + "'s summary: " + result);
    }
}
@@ -169,6 +169,8 @@ int main(int argc, char** argv) {

        agent->reset();

        session_manager->clear_result(session_id);

        std::thread([agent, session_manager, prompt, session_id]() {
            try {
                session_sink->set_session_id(session_id);

@@ -234,7 +236,7 @@ int main(int argc, char** argv) {
            {"max_steps", agent->max_steps},
            {"prompt_tokens", agent->get_prompt_tokens()},
            {"completion_tokens", agent->get_completion_tokens()},
            {"logs_buffer", session_sink->get_buffer(session_id)},
            {"log_buffer", session_sink->get_buffer(session_id)},
            {"result", result}
        };
flow/base.h
@@ -17,19 +17,18 @@ const std::map<FlowType, std::string> FLOW_TYPE_MAP = {
// Base class for execution flows supporting multiple agents
struct BaseFlow {
    std::map<std::string, std::shared_ptr<BaseAgent>> agents;
    std::vector<std::shared_ptr<BaseTool>> tools;
    std::string primary_agent_key;

    BaseFlow(const std::map<std::string, std::shared_ptr<BaseAgent>>& agents = {}, const std::vector<std::shared_ptr<BaseTool>>& tools = {}, const std::string& primary_agent_key = "")
    : agents(agents), tools(tools), primary_agent_key(primary_agent_key) {
    BaseFlow(const std::map<std::string, std::shared_ptr<BaseAgent>>& agents = {}, const std::string& primary_agent_key = "")
    : agents(agents), primary_agent_key(primary_agent_key) {
        // If primary agent not specified, use first agent
        if (primary_agent_key.empty() && !agents.empty()) {
            this->primary_agent_key = agents.begin()->first;
        }
    }

    BaseFlow(const std::shared_ptr<BaseAgent>& agent, const std::vector<std::shared_ptr<BaseTool>>& tools = {}, const std::string& primary_agent_key = "")
    : tools(tools), primary_agent_key(primary_agent_key) {
    BaseFlow(const std::shared_ptr<BaseAgent>& agent, const std::string& primary_agent_key = "")
    : primary_agent_key(primary_agent_key) {
        agents["default"] = agent;
        // If primary agent not specified, use first agent
        if (primary_agent_key.empty()) {

@@ -37,8 +36,8 @@ struct BaseFlow {
        }
    }

    BaseFlow(const std::vector<std::shared_ptr<BaseAgent>>& agents_list, const std::vector<std::shared_ptr<BaseTool>>& tools = {}, const std::string& primary_agent_key = "")
    : tools(tools), primary_agent_key(primary_agent_key) {
    BaseFlow(const std::vector<std::shared_ptr<BaseAgent>>& agents_list, const std::string& primary_agent_key = "")
    : primary_agent_key(primary_agent_key) {
        for (size_t i = 0; i < agents_list.size(); i++) {
            agents["agent_" + std::to_string(i)] = agents_list[i];
        }
@@ -10,13 +10,6 @@ std::shared_ptr<BaseAgent> PlanningFlow::get_executor(const std::string& step_ty
        return agents.at(step_type);
    }

    // Otherwise use the first available executor or fall back to primary agent
    for (const auto& key : executor_keys) {
        if (agents.find(key) != agents.end()) {
            return agents.at(key);
        }
    }

    // Fallback to primary agent
    return primary_agent();
}

@@ -72,7 +65,7 @@ std::string PlanningFlow::execute(const std::string& input) {
            result += "##" + step_info.value("type", "Step " + std::to_string(current_step_index)) + ":\n" + prefix_sum + "\n\n";
        }

        reset(true); // Clear short-termmemory and state for next plan
        reset(true); // Clear short-term memory and state for next plan

        return result;
    } catch (const std::exception& e) {

@@ -140,10 +133,15 @@ void PlanningFlow::_create_initial_plan(const std::string& request) {
    logger->warn("Creating default plan");

    // Create default plan using the ToolCollection
    auto title = request;
    if (title.size() > 50) {
        title = title.substr(0, validate_utf8(title.substr(0, 50))) + "...";
    }

    planning_tool->execute({
        {"command", "create"},
        {"plan_id", active_plan_id},
        {"title", request.substr(0, std::min(50, static_cast<int>(request.size()))) + (request.size() > 50 ? "..." : "")},
        {"title", title},
        {"steps", {"Analyze request", "Execute task", "Verify results"}}
    });
}
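The default-plan code above truncates the request with `validate_utf8` so that a multi-byte UTF-8 character is never cut in half when building the plan title. A standalone sketch of that idea (assuming, as the call site suggests, that `validate_utf8` returns the length of the longest valid UTF-8 prefix):
```cpp
#include <string>

// Length of the longest prefix of text that is valid UTF-8
// (a minimal stand-in for the repository's validate_utf8).
static size_t utf8_prefix_length(const std::string& text) {
    size_t i = 0;
    while (i < text.size()) {
        unsigned char c = static_cast<unsigned char>(text[i]);
        size_t len = (c < 0x80) ? 1
                   : ((c >> 5) == 0x6) ? 2
                   : ((c >> 4) == 0xE) ? 3
                   : ((c >> 3) == 0x1E) ? 4
                   : 0;
        if (len == 0 || i + len > text.size()) break;  // invalid lead byte or truncated sequence
        for (size_t j = 1; j < len; ++j) {
            if ((static_cast<unsigned char>(text[i + j]) & 0xC0) != 0x80) return i;  // bad continuation byte
        }
        i += len;
    }
    return i;
}

// Truncate to at most max_bytes bytes without splitting a UTF-8 sequence.
static std::string truncate_utf8(const std::string& text, size_t max_bytes) {
    if (text.size() <= max_bytes) return text;
    const std::string head = text.substr(0, max_bytes);
    return head.substr(0, utf8_prefix_length(head)) + "...";
}
```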
@@ -16,37 +16,20 @@ namespace humanus {
struct PlanningFlow : public BaseFlow {
    std::shared_ptr<LLM> llm;
    std::shared_ptr<PlanningTool> planning_tool;
    std::vector<std::string> executor_keys;
    std::string active_plan_id;
    int current_step_index = -1;
    int current_step_index;

    PlanningFlow(
        const std::shared_ptr<LLM>& llm = nullptr,
        const std::shared_ptr<PlanningTool>& planning_tool = nullptr,
        const std::vector<std::string>& executor_keys = {},
        const std::string& active_plan_id = "",
        const std::map<std::string, std::shared_ptr<BaseAgent>>& agents = {},
        const std::vector<std::shared_ptr<BaseTool>>& tools = {},
        const std::string& primary_agent_key = ""
    ) : BaseFlow(agents, tools, primary_agent_key),
        llm(llm),
        planning_tool(planning_tool),
        executor_keys(executor_keys),
        active_plan_id(active_plan_id) {
        const std::string& primary_agent_key = "default"
    ) : BaseFlow(agents, primary_agent_key),
        llm(llm) {
        if (!llm) {
            this->llm = LLM::get_instance("default");
        }
        if (!planning_tool) {
            this->planning_tool = std::make_shared<PlanningTool>();
        }
        if (active_plan_id.empty()) {
            this->active_plan_id = "plan_" + std::to_string(std::chrono::system_clock::now().time_since_epoch().count());
        }
        if (executor_keys.empty()) {
            for (const auto& [key, agent] : agents) {
                this->executor_keys.push_back(key);
            }
            this->llm = primary_agent()->llm;
        }
        planning_tool = std::make_shared<PlanningTool>();
        reset();
    }

    // Get an appropriate executor agent for the current step.
@@ -1,6 +1,7 @@
#ifndef HUMANUS_UTILS_H
#define HUMANUS_UTILS_H

#include "mcp_message.h"
#include <filesystem>
#include <iostream>

@@ -14,6 +15,8 @@

namespace humanus {

using json = mcp::json;

// Get project root directory
inline std::filesystem::path get_project_root() {
    return std::filesystem::path(__FILE__).parent_path().parent_path();

@@ -28,6 +31,9 @@ size_t validate_utf8(const std::string& text);

bool readline_utf8(std::string & line, bool multiline_input = false);

// Parse the content of a message to a string
std::string parse_json_content(const json& content);

} // namespace humanus

#endif
@@ -78,6 +78,12 @@ json LLM::format_messages(const std::vector<Message>& messages) {

    formatted_messages.erase(formatted_messages.begin() + j + 1, formatted_messages.end());

    if (!llm_config_->enable_vision) {
        for (auto& message : formatted_messages) {
            message["content"] = parse_json_content(message["content"]); // Images will be replaced by [image1], [image2], ...
        }
    }

    return formatted_messages;
}
@@ -67,4 +67,24 @@ bool readline_utf8(std::string & line, bool multiline_input) {
    return multiline_input;
}

// Parse the content of a message to a string
std::string parse_json_content(const json& content) {
    if (content.is_string()) {
        return content.get<std::string>();
    } else if (content.is_array()) {
        std::string result;
        int image_cnt = 0;
        for (const auto& item : content) {
            if (item["type"] == "text") {
                result += item["text"].get<std::string>();
            } else if (item["type"] == "image_url") {
                result += "[image" + std::to_string(++image_cnt) + "]";
            }
        }
        return result;
    } else {
        return content.dump(2);
    }
}

} // namespace humanus
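This is the helper behind the deepseek-chat fix in the commit message: when `enable_vision` is false, `LLM::format_messages` now flattens array-style message content into a plain string before the request is sent, so providers that reject content arrays still work. A small usage sketch (assuming `mcp::json` behaves like nlohmann/json, as its use above suggests; the flattening loop is inlined here so the sketch is standalone):
```cpp
#include <nlohmann/json.hpp>

#include <iostream>
#include <string>

using json = nlohmann::json;

// Same flattening rule as parse_json_content above: keep text, replace images with [imageN].
static std::string flatten_content(const json& content) {
    if (content.is_string()) return content.get<std::string>();
    if (content.is_array()) {
        std::string result;
        int image_cnt = 0;
        for (const auto& item : content) {
            if (item["type"] == "text") {
                result += item["text"].get<std::string>();
            } else if (item["type"] == "image_url") {
                result += "[image" + std::to_string(++image_cnt) + "]";
            }
        }
        return result;
    }
    return content.dump(2);
}

int main() {
    // OpenAI-style multimodal content: an array of text and image_url items.
    json content = json::array({
        {{"type", "text"}, {"text", "Describe this picture: "}},
        {{"type", "image_url"}, {"image_url", {{"url", "https://example.com/cat.png"}}}}
    });

    std::cout << flatten_content(content) << std::endl; // prints: Describe this picture: [image1]
    return 0;
}
```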
tool/base.h
@@ -3,6 +3,7 @@

#include "schema.h"
#include "config.h"
#include "utils.h"
#include "mcp_stdio_client.h"
#include "mcp_sse_client.h"
#include <string>

@@ -51,24 +52,6 @@ struct ToolResult {
        };
    }

    static std::string parse_json_content(const json& content) {
        if (content.is_string()) {
            return content.get<std::string>();
        } else if (content.is_array()) {
            std::string result;
            for (const auto& item : content) {
                if (item["type"] == "text") {
                    result += item["text"].get<std::string>();
                } else if (item["type"] == "image_url") {
                    result += "<image>" + item["image_url"]["url"].get<std::string>() + "</image>";
                }
            }
            return result;
        } else {
            return content.dump(2);
        }
    }

    std::string to_string(int max_length = -1) const {
        std::string result;
        if (!error.empty()) {