// humanus.cpp/tool/content_provider.h
//
// 302 lines
// 12 KiB
// C++

#ifndef HUMANUS_TOOL_CONTENT_PROVIDER_H
#define HUMANUS_TOOL_CONTENT_PROVIDER_H
#include "base.h"
#include "utils.h"
#include <algorithm>
#include <cstddef>
#include <map>
#include <optional>
#include <string>
#include <utility>
#include <vector>
namespace humanus {
/**
 * @brief Tool that stores content for later retrieval and hands it back one item at a time.
 *
 * `write` validates the supplied content items, splits every text item into
 * chunks of at most `max_chunk_size` bytes and files the result under a
 * generated store id of the form `content_<n>`. `read` walks a store through
 * string cursors (see handle_read()).
 *
 * All state lives in static members, so it is shared by every instance of the
 * tool. Nothing in this struct synchronizes access to that state.
 */
struct ContentProvider : BaseTool {
    inline static const std::string name_ = "content_provider";
    inline static const std::string description_ = "Use this tool to save temporary content for later use. For example, you can save a large code file (like HTML) and read it by chunks later.";
    inline static const json parameters_ = json::parse(R"json(
{
"type": "object",
"properties": {
"operation": {
"type": "string",
"description": "The operation to perform: `write` to save content, `read` to retrieve content",
"enum": ["write", "read"]
},
"content": {
"type": "array",
"description": "The content to store. Required when operation is `write`. Format: [{`type`: `text`, `text`: `content`}, {`type`: `image`, `image_url`: {`url`: `image_url`}}]",
"items": {
"type": "object",
"properties": {
"type": {
"type": "string",
"enum": ["text", "image"]
},
"text": {
"type": "string",
"description": "Text content. Required when type is `text`."
},
"image_url": {
"type": "object",
"description": "Image URL information. Required when type is `image`.",
"properties": {
"url": {
"type": "string",
"description": "URL of the image"
}
}
}
}
}
},
"cursor": {
"type": "string",
"description": "The cursor position for reading content. Required when operation is `read`. Use `start` for the beginning or the cursor returned from a previous read."
},
"max_chunk_size": {
"type": "integer",
"description": "Maximum size in characters for each text chunk. Default is 4000.",
"default": 4000
}
},
"required": ["operation"]
}
)json");

    /// Stored items, keyed by store id (`content_<n>`).
    inline static std::map<std::string, std::vector<json>> content_store_;
    /// Number of distinct store ids; ids wrap around once this many writes have happened,
    /// so a later write overwrites the content filed under a reused id.
    inline static size_t MAX_STORE_ID = 100;
    /// Numeric suffix that the next `write` will use for its store id.
    inline static size_t current_id_ = 0;
    /// Chunk size used when the caller supplies none (matches the schema default).
    static constexpr int DEFAULT_MAX_CHUNK_SIZE = 4000;

    ContentProvider() : BaseTool(name_, description_, parameters_) {}

    /**
     * @brief Split text into `{"type": "text", "text": ...}` items of bounded size.
     *
     * Concatenating the `text` fields of the returned items reproduces the input.
     * Sizes are measured in bytes of the `std::string`. A chunk is at most
     * `max_chunk_size` bytes, except when that limit is smaller than the single
     * UTF-8 sequence at the cut position, in which case the whole sequence is
     * emitted so the loop always advances.
     *
     * Relies on `validate_utf8` (declared outside this file); it is assumed to
     * return the length of the longest prefix that does not end in a truncated
     * UTF-8 sequence -- TODO confirm against utils.h.
     *
     * @param text            Text to split; an empty string yields an empty vector.
     * @param max_chunk_size  Upper bound per chunk; non-positive values fall back
     *                        to DEFAULT_MAX_CHUNK_SIZE.
     * @return The chunks in input order.
     */
    std::vector<json> split_text_into_chunks(const std::string& text, int max_chunk_size) {
        std::vector<json> chunks;
        if (text.empty()) {
            return chunks;
        }

        // A non-positive limit used to either hang (0) or disable chunking (negative
        // value wrapped to a huge size_t); use the documented default instead.
        const size_t limit = static_cast<size_t>(
            max_chunk_size > 0 ? max_chunk_size : DEFAULT_MAX_CHUNK_SIZE);
        const size_t text_length = text.length();

        size_t offset = 0;
        while (offset < text_length) {
            // Largest slice the limit allows, then trimmed so a multi-byte
            // character is not cut in half.
            const size_t raw_chunk_size = std::min(limit, text_length - offset);
            std::string potential_chunk = text.substr(offset, raw_chunk_size);
            size_t chunk_size = std::min(validate_utf8(potential_chunk), raw_chunk_size);

            if (chunk_size == 0) {
                // The slice holds nothing but an incomplete character. Emit the whole
                // sequence (or what is left of the text) so that offset advances.
                const size_t sequence_length =
                    utf8_sequence_length(static_cast<unsigned char>(text[offset]));
                chunk_size = std::min(sequence_length, text_length - offset);
            } else if (offset + chunk_size < text_length && chunk_size == raw_chunk_size) {
                // Mid-text and not already shortened for UTF-8: prefer a cut right after
                // whitespace or punctuation. If the next character is whitespace the
                // chunk already ends on a word boundary and is left alone.
                const char following = text[offset + chunk_size];
                if (following != ' ' && following != '\n') {
                    // Never give up more than half of the chunk for a nicer break.
                    const size_t min_pos = offset + chunk_size / 2;
                    // Start at the last byte inside the chunk so the delimiter becomes
                    // the chunk's final byte and the limit is never exceeded.
                    size_t break_pos = offset + chunk_size - 1;
                    while (break_pos > min_pos && !is_break_char(text[break_pos])) {
                        --break_pos;
                    }
                    if (break_pos > min_pos) {
                        const size_t candidate_size = break_pos + 1 - offset;
                        std::string candidate = text.substr(offset, candidate_size);
                        // Only take the new cut if it leaves no truncated character.
                        if (validate_utf8(candidate) == candidate.size()) {
                            chunk_size = candidate_size;
                        }
                    }
                }
            }

            json chunk;
            chunk["type"] = "text";
            chunk["text"] = text.substr(offset, chunk_size);
            chunks.push_back(std::move(chunk));
            offset += chunk_size;
        }
        return chunks;
    }

    /**
     * @brief Handle `operation == "write"`: validate, chunk and store the content.
     *
     * Text items are replaced by their chunks; image items are stored unchanged.
     *
     * @param args  Tool arguments; reads `content` (required array) and
     *              `max_chunk_size` (optional, defaults to DEFAULT_MAX_CHUNK_SIZE).
     * @return On success a JSON object with `store_id` and `total_items`;
     *         otherwise a ToolError describing the first invalid field.
     */
    ToolResult handle_write(const json& args) {
        const int max_chunk_size = args.value("max_chunk_size", DEFAULT_MAX_CHUNK_SIZE);
        if (max_chunk_size <= 0) {
            return ToolError("`max_chunk_size` must be a positive integer");
        }
        if (!args.contains("content") || !args["content"].is_array()) {
            return ToolError("`content` is required and must be an array");
        }

        std::vector<json> processed_content;
        for (const auto& item : args["content"]) {
            // contains() is false for non-object items, so those are rejected here too.
            if (!item.contains("type") || !item["type"].is_string()) {
                return ToolError("Each content item must have a `type` field");
            }
            const std::string type = item["type"].get<std::string>();

            if (type == "text") {
                if (!item.contains("text") || !item["text"].is_string()) {
                    return ToolError("Text items must have a `text` field with string value");
                }
                const std::string text = item["text"].get<std::string>();
                for (auto& chunk : split_text_into_chunks(text, max_chunk_size)) {
                    processed_content.push_back(std::move(chunk));
                }
            } else if (type == "image") {
                if (!item.contains("image_url") || !item["image_url"].is_object() ||
                    !item["image_url"].contains("url") || !item["image_url"]["url"].is_string()) {
                    return ToolError("Image items must have an `image_url` field with a `url` property");
                }
                // Images are kept as a single item.
                processed_content.push_back(item);
            } else {
                return ToolError("Unsupported content type: " + type);
            }
        }

        // Ids cycle through [0, MAX_STORE_ID); reusing an id replaces its old content.
        const std::string store_id = "content_" + std::to_string(current_id_);
        current_id_ = (current_id_ + 1) % MAX_STORE_ID;

        json result;
        result["store_id"] = store_id;
        result["total_items"] = processed_content.size();
        content_store_[store_id] = std::move(processed_content);
        return ToolResult(result);
    }

    /**
     * @brief Handle `operation == "read"`: interpret the cursor and return one item.
     *
     * Cursor forms, checked in this order:
     *  - `start`            lists every store with its item count;
     *  - `select_store`     error asking for a `store_id:content_X` cursor;
     *  - `store_id:<id>`    first item of store `<id>`;
     *  - `<id>:<index>`     item `<index>` of store `<id>` (decimal digits only);
     *  - `end`              end-of-content message.
     * Returned items carry `next_cursor` and `remaining_items` for navigation.
     *
     * @param args  Tool arguments; reads `cursor` (required string).
     * @return The requested item or listing, or a ToolError for a missing or
     *         malformed cursor, an unknown or empty store, or an index out of range.
     */
    ToolResult handle_read(const json& args) {
        if (!args.contains("cursor") || !args["cursor"].is_string()) {
            return ToolError("`cursor` is required for read operations");
        }
        const std::string cursor = args["cursor"].get<std::string>();
        const std::string store_prefix = "store_id:";

        if (cursor == "start") {
            json available_stores = json::array();
            for (const auto& [id, content] : content_store_) {
                json store_info;
                store_info["store_id"] = id;
                store_info["total_items"] = content.size();
                available_stores.push_back(store_info);
            }
            if (available_stores.empty()) {
                return ToolResult("No content available. Use `write` operation to store content first.");
            }
            json result;
            result["available_stores"] = available_stores;
            result["next_cursor"] = "select_store";
            return ToolResult(result);
        }

        if (cursor == "select_store") {
            return ToolError("Please provide a store_id as cursor in format `store_id:content_X`");
        }

        if (cursor.compare(0, store_prefix.size(), store_prefix) == 0) {
            // Selecting a store is the same as reading its first item.
            return read_item(cursor.substr(store_prefix.size()), 0);
        }

        const size_t delimiter_pos = cursor.find(':');
        if (delimiter_pos != std::string::npos) {
            const std::optional<size_t> index = parse_index(cursor.substr(delimiter_pos + 1));
            if (!index) {
                return ToolError("Invalid cursor format");
            }
            return read_item(cursor.substr(0, delimiter_pos), *index);
        }

        if (cursor == "end") {
            return ToolResult("You have reached the end of the content.");
        }
        return ToolError("Invalid cursor format");
    }

    /**
     * @brief Tool entry point: dispatch on `operation` to handle_write() / handle_read().
     *
     * @param args  Tool arguments; `operation` must be the string `write` or `read`.
     * @return The handler's result. Any std::exception raised while handling the
     *         request is converted into a ToolError carrying its `what()` text.
     */
    ToolResult execute(const json& args) override {
        try {
            if (!args.contains("operation") || !args["operation"].is_string()) {
                return ToolError("`operation` is required");
            }
            const std::string operation = args["operation"].get<std::string>();
            if (operation == "write") {
                return handle_write(args);
            }
            if (operation == "read") {
                return handle_read(args);
            }
            return ToolError("Unknown operation `" + operation + "`. Please use `write` or `read`");
        } catch (const std::exception& e) {
            return ToolError(std::string(e.what()));
        }
    }

private:
    /// True for the characters after which a chunk may be cut.
    static bool is_break_char(char c) {
        switch (c) {
            case ' ':
            case '\n':
            case '.':
            case ',':
            case ';':
            case ':':
            case '!':
            case '?':
                return true;
            default:
                return false;
        }
    }

    /// Byte length of the UTF-8 sequence announced by a lead byte; 1 for ASCII and
    /// for bytes that are not a valid lead byte.
    static size_t utf8_sequence_length(unsigned char lead) {
        if (lead < 0x80) {
            return 1;
        }
        if ((lead & 0xE0) == 0xC0) {
            return 2;
        }
        if ((lead & 0xF0) == 0xE0) {
            return 3;
        }
        if ((lead & 0xF8) == 0xF0) {
            return 4;
        }
        return 1;
    }

    /// Parse a non-empty, all-decimal-digit string into an index; std::nullopt on any
    /// other input (sign, whitespace, trailing characters) or on overflow.
    static std::optional<size_t> parse_index(const std::string& digits) {
        if (digits.empty()) {
            return std::nullopt;
        }
        const size_t max_value = static_cast<size_t>(-1);
        size_t value = 0;
        for (const char c : digits) {
            if (c < '0' || c > '9') {
                return std::nullopt;
            }
            const size_t digit = static_cast<size_t>(c - '0');
            if (value > (max_value - digit) / 10) {
                return std::nullopt;
            }
            value = value * 10 + digit;
        }
        return value;
    }

    /// Return item `index` of store `store_id`, annotated with `next_cursor` and
    /// `remaining_items`; a ToolError if the store is unknown, empty, or too short.
    ToolResult read_item(const std::string& store_id, size_t index) {
        const auto entry = content_store_.find(store_id);
        if (entry == content_store_.end()) {
            return ToolError("Store ID `" + store_id + "` not found");
        }
        const std::vector<json>& items = entry->second;
        if (items.empty()) {
            // A write with no content (or only empty text) creates an empty store.
            return ToolError("Store ID `" + store_id + "` is empty");
        }
        if (index >= items.size()) {
            return ToolError("Index out of range");
        }

        json result = items[index];
        if (index + 1 < items.size()) {
            result["next_cursor"] = store_id + ":" + std::to_string(index + 1);
            result["remaining_items"] = items.size() - index - 1;
        } else {
            result["next_cursor"] = "end";
            result["remaining_items"] = 0;
        }
        return ToolResult(result);
    }
};
} // namespace humanus
#endif // HUMANUS_TOOL_CONTENT_PROVIDER_H