// humanus.cpp/tool/content_provider.h

#ifndef HUMANUS_TOOL_CONTENT_PROVIDER_H
#define HUMANUS_TOOL_CONTENT_PROVIDER_H

#include "base.h"
#include "utils.h"
#include <vector>
#include <map>

namespace humanus {

// Tool that stores content (text and image URLs) for later retrieval.
//
// `write` splits text into size-limited chunks, keeps image items intact and
// saves the resulting item list under a generated store id. `read` walks a
// stored list one item at a time using a cursor of the form `<store_id>:<index>`.
//
// All state lives in static members, so every ContentProvider instance shares
// the same store. The code performs no synchronization on that state.
struct ContentProvider : BaseTool {
    inline static const std::string name_ = "content_provider";
    inline static const std::string description_ = "Use this tool to save temporary content for later use. For example, you can save a large code file (like HTML) and read it by chunks later.";
    inline static const json parameters_ = json::parse(R"json(
        {
            "type": "object",
            "properties": {
                "operation": {
                    "type": "string",
                    "description": "The operation to perform: `write` to save content, `read` to retrieve content",
                    "enum": ["write", "read"]
                },
                "content": {
                    "type": "array",
                    "description": "The content to store. Required when operation is `write` (the `read` operation will return the same format). Format: [{'type': 'text', 'text': <content>}, {'type': 'image_url', 'image_url': {'url': <image_url>}}]",
                    "items": {
                        "type": "object",
                        "properties": {
                            "type": {
                                "type": "string",
                                "enum": ["text", "image_url"]
                            },
                            "text": {
                                "type": "string",
                                "description": "Text content. Required when type is `text`."
                            },
                            "image_url": {
                                "type": "object",
                                "description": "Image URL information. Required when type is `image_url`.",
                                "properties": {
                                    "url": {
                                        "type": "string",
                                        "description": "URL of the image"
                                    }
                                }
                            }
                        }
                    }
                },
                "cursor": {
                    "type": "string",
                    "description": "The cursor position for reading content. Required when operation is `read`. Use `start` for the beginning or the cursor returned from a previous read."
                },
                "max_chunk_size": {
                    "type": "integer",
                    "description": "Maximum size in characters for each text chunk. Default is 4000. Used by `write` operation.",
                    "default": 4000
                }
            },
            "required": ["operation"]
        }
    )json");

    // Stored item lists, keyed by store id ("content_<n>").
    inline static std::map<std::string, std::vector<json>> content_store_;
    // Number of distinct store ids; ids are reused (and overwritten) after this many writes.
    inline static size_t MAX_STORE_ID = 100;
    // Numeric suffix of the store id the next `write` will use.
    inline static size_t current_id_ = 0;

    ContentProvider() : BaseTool(name_, description_, parameters_) {}

    // Split `text` into `{"type": "text", "text": ...}` items.
    //
    // Sizes are counted in bytes of the std::string. Every chunk is at most
    // `max_chunk_size` bytes, with one exception: a single character that is
    // wider than the limit is emitted whole, so the loop always advances.
    // When a full-size chunk is followed by more text, the split is moved back
    // to just after the last whitespace/punctuation byte in the second half of
    // the chunk, if there is one.
    //
    // text:           text to split; an empty string yields no chunks.
    // max_chunk_size: chunk size limit; values below 1 are treated as 1.
    // Returns the chunks in order; their "text" fields concatenate back to `text`.
    std::vector<json> split_text_into_chunks(const std::string& text, int max_chunk_size) {
        std::vector<json> chunks;

        if (text.empty()) {
            return chunks;
        }

        // ASCII bytes after which a split reads naturally. They never occur inside
        // a multi-byte UTF-8 sequence, so cutting right after one cannot split a character.
        const auto is_break_char = [](char c) {
            return c == ' ' || c == '\n' || c == '.' || c == ',' ||
                   c == ';' || c == ':' || c == '!' || c == '?';
        };

        const size_t limit = max_chunk_size > 0 ? static_cast<size_t>(max_chunk_size) : 1;
        const size_t text_length = text.length();
        size_t offset = 0;

        while (offset < text_length) {
            // Largest window we are allowed to take from the current offset.
            const size_t raw_chunk_size = std::min(limit, text_length - offset);

            // NOTE(review): validate_utf8 is defined outside this file; it is used here as
            // "length of the prefix that does not end in a truncated UTF-8 character" — confirm.
            size_t chunk_size = validate_utf8(text.substr(offset, raw_chunk_size));

            if (chunk_size == 0) {
                // The window holds no complete character (limit smaller than the character).
                // Take the lead byte plus its continuation bytes (10xxxxxx) so that the
                // offset always moves forward instead of looping forever.
                chunk_size = 1;
                while (offset + chunk_size < text_length &&
                       (static_cast<unsigned char>(text[offset + chunk_size]) & 0xC0) == 0x80) {
                    ++chunk_size;
                }
            } else if (offset + chunk_size < text_length && chunk_size == raw_chunk_size) {
                // More text follows and the window was not shortened for UTF-8 reasons:
                // look backwards for a nicer split point, but never give up more than
                // half of the chunk.
                const size_t min_pos = offset + chunk_size / 2;
                // Start at the last byte inside the window so the chunk can only shrink.
                size_t break_pos = offset + chunk_size - 1;
                while (break_pos > min_pos && !is_break_char(text[break_pos])) {
                    --break_pos;
                }

                if (break_pos > min_pos) {
                    // Keep the delimiter at the end of this chunk.
                    chunk_size = break_pos + 1 - offset;
                }
            }

            json chunk;
            chunk["type"] = "text";
            chunk["text"] = text.substr(offset, chunk_size);
            chunks.push_back(chunk);

            offset += chunk_size;
        }

        return chunks;
    }

    // Handle the `write` operation.
    //
    // Reads `content` (required array) and `max_chunk_size` (optional, default 4000)
    // from `args`. Consecutive text items are joined (each followed by "\n\n") and
    // split into chunks; image items are stored unchanged and act as separators
    // between text runs. The result is saved under "content_<current_id_>".
    //
    // Returns a result holding `store_id` and `total_items`, or a ToolError when
    // the arguments are malformed.
    ToolResult handle_write(const json& args) {
        const int max_chunk_size = args.value("max_chunk_size", 4000);

        if (max_chunk_size <= 0) {
            return ToolError("`max_chunk_size` must be a positive integer");
        }

        if (!args.contains("content") || !args["content"].is_array()) {
            return ToolError("`content` is required and must be an array");
        }

        std::vector<json> processed_content;
        std::string text_content;

        // Chunk the text gathered so far and append it to the output.
        const auto flush_text = [&]() {
            if (text_content.empty()) {
                return;
            }
            const auto chunks = split_text_into_chunks(text_content, max_chunk_size);
            processed_content.insert(processed_content.end(), chunks.begin(), chunks.end());
            text_content.clear();
        };

        for (const auto& item : args["content"]) {
            if (!item.contains("type")) {
                return ToolError("Each content item must have a `type` field");
            }

            const std::string type = item["type"].get<std::string>();

            if (type == "text") {
                if (!item.contains("text") || !item["text"].is_string()) {
                    return ToolError("Text items must have a `text` field with string value");
                }

                // Adjacent text items are merged and chunked together.
                text_content += item["text"].get<std::string>() + "\n\n";
            } else if (type == "image_url") {
                // Emit pending text first so that item order is preserved.
                flush_text();

                if (!item.contains("image_url") || !item["image_url"].is_object() ||
                    !item["image_url"].contains("url") || !item["image_url"]["url"].is_string()) {
                    return ToolError("Image items must have an `image_url` field with a `url` property");
                }

                // Images are kept as a single item.
                processed_content.push_back(item);
            } else {
                return ToolError("Unsupported content type: " + type);
            }
        }

        flush_text();

        // Store ids cycle through MAX_STORE_ID values, so an old entry may be replaced.
        const std::string store_id = "content_" + std::to_string(current_id_);

        if (content_store_.find(store_id) != content_store_.end()) {
            logger->warn("Store ID `" + store_id + "` already exists, it will be overwritten");
        }

        current_id_ = (current_id_ + 1) % MAX_STORE_ID;

        content_store_[store_id] = processed_content;

        json result;
        result["store_id"] = store_id;
        result["total_items"] = processed_content.size();

        return ToolResult(result);
    }

    // Handle the `read` operation.
    //
    // `args["cursor"]` (required string) selects what is returned:
    //   "start"              -> list of stores with their item counts
    //   "select_store"       -> error asking for a `content_X:Y` cursor
    //   "<store_id>:<index>" -> the item at that index, plus `next_cursor`
    //                           and `remaining_items`
    //   "end"                -> end-of-content message
    // Anything else, an unknown store id, or an unusable index yields a ToolError.
    ToolResult handle_read(const json& args) {
        if (!args.contains("cursor") || !args["cursor"].is_string()) {
            return ToolError("`cursor` is required for read operations");
        }

        const std::string cursor = args["cursor"];

        if (cursor == "start") {
            // List every available store.
            json available_stores = json::array();
            for (const auto& [id, content] : content_store_) {
                json store_info;
                store_info["store_id"] = id;
                store_info["total_items"] = content.size();
                available_stores.push_back(store_info);
            }

            if (available_stores.empty()) {
                return ToolResult("No content available. Use `write` operation to store content first.");
            }

            json result;
            result["available_stores"] = available_stores;
            result["next_cursor"] = "select_store";

            return ToolResult(result);
        }

        if (cursor == "select_store") {
            // The caller still has to pick a store.
            return ToolError("Please provide a store_id as cursor in format `content_X:Y`");
        }

        const size_t delimiter_pos = cursor.find(':');
        if (delimiter_pos != std::string::npos) { // content_X:Y
            const std::string store_id = cursor.substr(0, delimiter_pos);

            // std::stoul throws on non-numeric or oversized input; report that as a
            // cursor problem instead of leaking the bare exception text.
            size_t index = 0;
            try {
                index = std::stoul(cursor.substr(delimiter_pos + 1));
            } catch (const std::exception&) {
                return ToolError("Invalid cursor format: the part after `:` must be a non-negative integer index");
            }

            const auto store_it = content_store_.find(store_id);
            if (store_it == content_store_.end()) {
                return ToolError("Store ID `" + store_id + "` not found");
            }

            const std::vector<json>& items = store_it->second;
            if (index >= items.size()) {
                return ToolError("Index out of range");
            }

            // Copy the stored item so the navigation fields do not end up in the store.
            json result = items[index];

            if (index + 1 < items.size()) {
                result["next_cursor"] = store_id + ":" + std::to_string(index + 1);
                result["remaining_items"] = items.size() - index - 1;
            } else {
                result["next_cursor"] = "end";
                result["remaining_items"] = 0;
            }

            return ToolResult(result);
        }

        if (cursor == "end") {
            return ToolResult("You have reached the end of the content.");
        }

        return ToolError("Invalid cursor format");
    }

    // Entry point: dispatch on `args["operation"]` ("write" or "read").
    // Any std::exception raised while handling the request is converted into a
    // ToolError carrying the exception message.
    ToolResult execute(const json& args) override {
        try {
            if (!args.contains("operation")) {
                return ToolError("`operation` is required");
            }

            const std::string operation = args["operation"];

            if (operation == "write") {
                return handle_write(args);
            } else if (operation == "read") {
                return handle_read(args);
            } else {
                return ToolError("Unknown operation `" + operation + "`. Please use `write` or `read`");
            }
        } catch (const std::exception& e) {
            return ToolError(std::string(e.what()));
        }
    }
};

} // namespace humanus

#endif // HUMANUS_TOOL_CONTENT_PROVIDER_H
add tokenizer (lack of test in use) and content_provider (just implementation, no use) 2025-04-06 16:32:51 +08:00			`#ifndef HUMANUS_TOOL_CONTENT_PROVIDER_H`
			`#define HUMANUS_TOOL_CONTENT_PROVIDER_H`

			`#include "base.h"`
			`#include "utils.h"`
			`#include <vector>`
			`#include <map>`

			`namespace humanus {`

			`struct ContentProvider : BaseTool {`
			`inline static const std::string name_ = "content_provider";`
			`inline static const std::string description_ = "Use this tool to save temporary content for later use. For example, you can save a large code file (like HTML) and read it by chunks later.";`
			`inline static const json parameters_ = json::parse(R"json(`
			`{`
			`"type": "object",`
			`"properties": {`
			`"operation": {`
			`"type": "string",`
			"description": "The operation to perform: `write` to save content, `read` to retrieve content",
			`"enum": ["write", "read"]`
			`},`
			`"content": {`
			`"type": "array",`
add image_loader and fix some bugs of image_url in tool results 2025-04-10 00:10:05 +08:00			"description": "The content to store. Required when operation is `write` (the `read` operation will return the same format). Format: [{'type': 'text', 'text': <content>}, {'type': 'image_url', 'image_url': {'url': <image_url>}}]",
add tokenizer (lack of test in use) and content_provider (just implementation, no use) 2025-04-06 16:32:51 +08:00			`"items": {`
			`"type": "object",`
			`"properties": {`
			`"type": {`
			`"type": "string",`
add image_loader and fix some bugs of image_url in tool results 2025-04-10 00:10:05 +08:00			`"enum": ["text", "image_url"]`
add tokenizer (lack of test in use) and content_provider (just implementation, no use) 2025-04-06 16:32:51 +08:00			`},`
			`"text": {`
			`"type": "string",`
			"description": "Text content. Required when type is `text`."
			`},`
			`"image_url": {`
			`"type": "object",`
add image_loader and fix some bugs of image_url in tool results 2025-04-10 00:10:05 +08:00			"description": "Image URL information. Required when type is `image_url`.",
add tokenizer (lack of test in use) and content_provider (just implementation, no use) 2025-04-06 16:32:51 +08:00			`"properties": {`
			`"url": {`
			`"type": "string",`
			`"description": "URL of the image"`
			`}`
			`}`
			`}`
			`}`
			`}`
			`},`
			`"cursor": {`
			`"type": "string",`
			"description": "The cursor position for reading content. Required when operation is `read`. Use `start` for the beginning or the cursor returned from a previous read."
			`},`
			`"max_chunk_size": {`
			`"type": "integer",`
add image_loader and fix some bugs of image_url in tool results 2025-04-10 00:10:05 +08:00			"description": "Maximum size in characters for each text chunk. Default is 4000. Used by `write` operation.",
add tokenizer (lack of test in use) and content_provider (just implementation, no use) 2025-04-06 16:32:51 +08:00			`"default": 4000`
			`}`
			`},`
			`"required": ["operation"]`
			`}`
			`)json");`

			`inline static std::map<std::string, std::vector<json>> content_store_;`
			`inline static size_t MAX_STORE_ID = 100;`
			`inline static size_t current_id_ = 0;`

			`ContentProvider() : BaseTool(name_, description_, parameters_) {}`

			`// 将文本分割成合适大小的块`
			`std::vector<json> split_text_into_chunks(const std::string& text, int max_chunk_size) {`
			`std::vector<json> chunks;`

			`// 如果文本为空，返回空数组`
			`if (text.empty()) {`
			`return chunks;`
			`}`

			`size_t text_length = text.length();`
			`size_t offset = 0;`

			`while (offset < text_length) {`
			`// 首先确定最大可能的块大小`
			`size_t raw_chunk_size = std::min(static_cast<size_t>(max_chunk_size), text_length - offset);`

			`// 使用 validate_utf8 确保不会截断 UTF-8 字符`
			`std::string potential_chunk = text.substr(offset, raw_chunk_size);`
			`size_t valid_utf8_length = validate_utf8(potential_chunk);`

			`// 调整为有效的 UTF-8 字符边界`
			`size_t chunk_size = valid_utf8_length;`

			`// 如果不是在文本的结尾，并且我们没有因为 UTF-8 截断而减小块大小，`
			`// 尝试在空格、换行或标点处分割，以获得更自然的分隔点`
			`if (offset + chunk_size < text_length && chunk_size == raw_chunk_size) {`
			`size_t break_pos = offset + chunk_size;`

			`// 向后寻找一个合适的分割点`
			`size_t min_pos = offset + valid_utf8_length / 2; // 不要搜索太远，至少保留一半的有效内容`
			`while (break_pos > min_pos &&`
			`text[break_pos] != ' ' &&`
			`text[break_pos] != '\n' &&`
			`text[break_pos] != '.' &&`
			`text[break_pos] != ',' &&`
			`text[break_pos] != ';' &&`
			`text[break_pos] != ':' &&`
			`text[break_pos] != '!' &&`
			`text[break_pos] != '?') {`
			`break_pos--;`
			`}`

			`// 如果找到了合适的分割点且不是原始位置`
			`if (break_pos > min_pos) {`
add image_loader and fix some bugs of image_url in tool results 2025-04-10 00:10:05 +08:00			`break_pos++; // Include the last character`
add tokenizer (lack of test in use) and content_provider (just implementation, no use) 2025-04-06 16:32:51 +08:00			`std::string new_chunk = text.substr(offset, break_pos - offset);`
add image_loader and fix some bugs of image_url in tool results 2025-04-10 00:10:05 +08:00			`size_t new_valid_length = validate_utf8(new_chunk); // Validate the new chunk`
			`chunk_size = break_pos - offset;`
add tokenizer (lack of test in use) and content_provider (just implementation, no use) 2025-04-06 16:32:51 +08:00			`}`
			`}`

add image_loader and fix some bugs of image_url in tool results 2025-04-10 00:10:05 +08:00			`// Create a text chunk`
add tokenizer (lack of test in use) and content_provider (just implementation, no use) 2025-04-06 16:32:51 +08:00			`json chunk;`
			`chunk["type"] = "text";`
			`chunk["text"] = text.substr(offset, chunk_size);`
			`chunks.push_back(chunk);`

			`offset += chunk_size;`
			`}`

			`return chunks;`
			`}`

			`// 处理写入操作`
			`ToolResult handle_write(const json& args) {`
			`int max_chunk_size = args.value("max_chunk_size", 4000);`

			`if (!args.contains("content") \|\| !args["content"].is_array()) {`
			return ToolError("`content` is required and must be an array");
			`}`

			`std::vector<json> processed_content;`
add image_loader and fix some bugs of image_url in tool results 2025-04-10 00:10:05 +08:00
			`std::string text_content;`
add tokenizer (lack of test in use) and content_provider (just implementation, no use) 2025-04-06 16:32:51 +08:00
			`// 处理内容，分割大型文本`
			`for (const auto& item : args["content"]) {`
			`if (!item.contains("type")) {`
			return ToolError("Each content item must have a `type` field");
			`}`

add image_loader and fix some bugs of image_url in tool results 2025-04-10 00:10:05 +08:00			`std::string type = item["type"].get<std::string>();`
add tokenizer (lack of test in use) and content_provider (just implementation, no use) 2025-04-06 16:32:51 +08:00
			`if (type == "text") {`
			`if (!item.contains("text") \|\| !item["text"].is_string()) {`
			return ToolError("Text items must have a `text` field with string value");
			`}`

add image_loader and fix some bugs of image_url in tool results 2025-04-10 00:10:05 +08:00			`text_content += item["text"].get<std::string>() + "\n\n"; // Handle them together`
			`} else if (type == "image_url") {`
			`if (!text_content.empty()) {`
			`auto chunks = split_text_into_chunks(text_content, max_chunk_size);`
			`processed_content.insert(processed_content.end(), chunks.begin(), chunks.end());`
			`text_content.clear();`
			`}`

add tokenizer (lack of test in use) and content_provider (just implementation, no use) 2025-04-06 16:32:51 +08:00			`if (!item.contains("image_url") \|\| !item["image_url"].is_object() \|\|`
			`!item["image_url"].contains("url") \|\| !item["image_url"]["url"].is_string()) {`
			return ToolError("Image items must have an `image_url` field with a `url` property");
			`}`

			`// 图像保持为一个整体`
			`processed_content.push_back(item);`
			`} else {`
			`return ToolError("Unsupported content type: " + type);`
			`}`
			`}`
add image_loader and fix some bugs of image_url in tool results 2025-04-10 00:10:05 +08:00
			`if (!text_content.empty()) {`
			`auto chunks = split_text_into_chunks(text_content, max_chunk_size);`
			`processed_content.insert(processed_content.end(), chunks.begin(), chunks.end());`
			`text_content.clear();`
			`}`
add tokenizer (lack of test in use) and content_provider (just implementation, no use) 2025-04-06 16:32:51 +08:00
			`// 生成一个唯一的存储ID`
			`std::string store_id = "content_" + std::to_string(current_id_);`
add image_loader and fix some bugs of image_url in tool results 2025-04-10 00:10:05 +08:00
			`if (content_store_.find(store_id) != content_store_.end()) {`
			logger->warn("Store ID `" + store_id + "` already exists, it will be overwritten");
			`}`

add tokenizer (lack of test in use) and content_provider (just implementation, no use) 2025-04-06 16:32:51 +08:00			`current_id_ = (current_id_ + 1) % MAX_STORE_ID;`

			`// 存储处理后的内容`
			`content_store_[store_id] = processed_content;`

			`// 返回存储ID和内容项数`
			`json result;`
			`result["store_id"] = store_id;`
			`result["total_items"] = processed_content.size();`

			`return ToolResult(result);`
			`}`

			`// 处理读取操作`
			`ToolResult handle_read(const json& args) {`
			`if (!args.contains("cursor") \|\| !args["cursor"].is_string()) {`
			return ToolError("`cursor` is required for read operations");
			`}`

			`std::string cursor = args["cursor"];`

			`if (cursor == "start") {`
			`// 列出所有可用的存储ID`
			`json available_stores = json::array();`
			`for (const auto& [id, content] : content_store_) {`
			`json store_info;`
			`store_info["store_id"] = id;`
			`store_info["total_items"] = content.size();`
			`available_stores.push_back(store_info);`
			`}`

			`if (available_stores.empty()) {`
			return ToolResult("No content available. Use `write` operation to store content first.");
			`}`

			`json result;`
			`result["available_stores"] = available_stores;`
			`result["next_cursor"] = "select_store";`

			`return ToolResult(result);`
			`} else if (cursor == "select_store") {`
			`// 用户需要选择一个存储ID`
add image_loader and fix some bugs of image_url in tool results 2025-04-10 00:10:05 +08:00			return ToolError("Please provide a store_id as cursor in format `content_X:Y`");
			`} else if (cursor.find(":") != std::string::npos) { // content_X:Y`
add tokenizer (lack of test in use) and content_provider (just implementation, no use) 2025-04-06 16:32:51 +08:00			`// 用户正在浏览特定存储内的内容`
			`size_t delimiter_pos = cursor.find(":");`
			`std::string store_id = cursor.substr(0, delimiter_pos);`
			`size_t index = std::stoul(cursor.substr(delimiter_pos + 1));`

			`if (content_store_.find(store_id) == content_store_.end()) {`
			return ToolError("Store ID `" + store_id + "` not found");
			`}`

			`if (index >= content_store_[store_id].size()) {`
			`return ToolError("Index out of range");`
			`}`

			`// 返回请求的内容项`
			`json result = content_store_[store_id][index];`

			`// 添加导航信息`
			`if (index + 1 < content_store_[store_id].size()) {`
			`result["next_cursor"] = store_id + ":" + std::to_string(index + 1);`
			`result["remaining_items"] = content_store_[store_id].size() - index - 1;`
			`} else {`
			`result["next_cursor"] = "end";`
			`result["remaining_items"] = 0;`
			`}`

			`return ToolResult(result);`
			`} else if (cursor == "end") {`
			`return ToolResult("You have reached the end of the content.");`
			`} else {`
			`return ToolError("Invalid cursor format");`
			`}`
			`}`

			`ToolResult execute(const json& args) override {`
			`try {`
			`if (!args.contains("operation")) {`
			return ToolError("`operation` is required");
			`}`

			`std::string operation = args["operation"];`

			`if (operation == "write") {`
			`return handle_write(args);`
			`} else if (operation == "read") {`
			`return handle_read(args);`
			`} else {`
			return ToolError("Unknown operation `" + operation + "`. Please use `write` or `read`");
			`}`
			`} catch (const std::exception& e) {`
			`return ToolError(std::string(e.what()));`
			`}`
			`}`
			`};`

			`} // namespace humanus`

			`#endif // HUMANUS_TOOL_CONTENT_PROVIDER_H`