2025-04-06 16:32:51 +08:00
# ifndef HUMANUS_TOOL_CONTENT_PROVIDER_H
# define HUMANUS_TOOL_CONTENT_PROVIDER_H
# include "base.h"
# include "utils.h"
# include <vector>
# include <map>
namespace humanus {
struct ContentProvider : BaseTool {
inline static const std : : string name_ = " content_provider " ;
inline static const std : : string description_ = " Use this tool to save temporary content for later use. For example, you can save a large code file (like HTML) and read it by chunks later. " ;
inline static const json parameters_ = json : : parse ( R " json(
{
" type " : " object " ,
" properties " : {
" operation " : {
" type " : " string " ,
" description " : " The operation to perform: `write` to save content, `read` to retrieve content " ,
" enum " : [ " write " , " read " ]
} ,
" content " : {
" type " : " array " ,
2025-04-10 00:10:05 +08:00
" description " : " The content to store. Required when operation is `write` (the `read` operation will return the same format). Format: [{'type': 'text', 'text': <content>}, {'type': 'image_url', 'image_url': {'url': <image_url>}}] " ,
2025-04-06 16:32:51 +08:00
" items " : {
" type " : " object " ,
" properties " : {
" type " : {
" type " : " string " ,
2025-04-10 00:10:05 +08:00
" enum " : [ " text " , " image_url " ]
2025-04-06 16:32:51 +08:00
} ,
" text " : {
" type " : " string " ,
" description " : " Text content. Required when type is `text`. "
} ,
" image_url " : {
" type " : " object " ,
2025-04-10 00:10:05 +08:00
" description " : " Image URL information. Required when type is `image_url`. " ,
2025-04-06 16:32:51 +08:00
" properties " : {
" url " : {
" type " : " string " ,
" description " : " URL of the image "
}
}
}
}
}
} ,
" cursor " : {
" type " : " string " ,
" description " : " The cursor position for reading content. Required when operation is `read`. Use `start` for the beginning or the cursor returned from a previous read. "
} ,
" max_chunk_size " : {
" type " : " integer " ,
2025-04-10 00:10:05 +08:00
" description " : " Maximum size in characters for each text chunk. Default is 4000. Used by `write` operation. " ,
2025-04-06 16:32:51 +08:00
" default " : 4000
}
} ,
" required " : [ " operation " ]
}
) json " );
inline static std : : map < std : : string , std : : vector < json > > content_store_ ;
inline static size_t MAX_STORE_ID = 100 ;
inline static size_t current_id_ = 0 ;
ContentProvider ( ) : BaseTool ( name_ , description_ , parameters_ ) { }
// 将文本分割成合适大小的块
std : : vector < json > split_text_into_chunks ( const std : : string & text , int max_chunk_size ) {
std : : vector < json > chunks ;
// 如果文本为空,返回空数组
if ( text . empty ( ) ) {
return chunks ;
}
size_t text_length = text . length ( ) ;
size_t offset = 0 ;
while ( offset < text_length ) {
// 首先确定最大可能的块大小
size_t raw_chunk_size = std : : min ( static_cast < size_t > ( max_chunk_size ) , text_length - offset ) ;
// 使用 validate_utf8 确保不会截断 UTF-8 字符
std : : string potential_chunk = text . substr ( offset , raw_chunk_size ) ;
size_t valid_utf8_length = validate_utf8 ( potential_chunk ) ;
// 调整为有效的 UTF-8 字符边界
size_t chunk_size = valid_utf8_length ;
// 如果不是在文本的结尾,并且我们没有因为 UTF-8 截断而减小块大小,
// 尝试在空格、换行或标点处分割,以获得更自然的分隔点
if ( offset + chunk_size < text_length & & chunk_size = = raw_chunk_size ) {
size_t break_pos = offset + chunk_size ;
// 向后寻找一个合适的分割点
size_t min_pos = offset + valid_utf8_length / 2 ; // 不要搜索太远,至少保留一半的有效内容
while ( break_pos > min_pos & &
text [ break_pos ] ! = ' ' & &
text [ break_pos ] ! = ' \n ' & &
text [ break_pos ] ! = ' . ' & &
text [ break_pos ] ! = ' , ' & &
text [ break_pos ] ! = ' ; ' & &
text [ break_pos ] ! = ' : ' & &
text [ break_pos ] ! = ' ! ' & &
text [ break_pos ] ! = ' ? ' ) {
break_pos - - ;
}
// 如果找到了合适的分割点且不是原始位置
if ( break_pos > min_pos ) {
2025-04-10 00:10:05 +08:00
break_pos + + ; // Include the last character
2025-04-06 16:32:51 +08:00
std : : string new_chunk = text . substr ( offset , break_pos - offset ) ;
2025-04-10 00:10:05 +08:00
size_t new_valid_length = validate_utf8 ( new_chunk ) ; // Validate the new chunk
chunk_size = break_pos - offset ;
2025-04-06 16:32:51 +08:00
}
}
2025-04-10 00:10:05 +08:00
// Create a text chunk
2025-04-06 16:32:51 +08:00
json chunk ;
chunk [ " type " ] = " text " ;
chunk [ " text " ] = text . substr ( offset , chunk_size ) ;
chunks . push_back ( chunk ) ;
offset + = chunk_size ;
}
return chunks ;
}
// 处理写入操作
ToolResult handle_write ( const json & args ) {
int max_chunk_size = args . value ( " max_chunk_size " , 4000 ) ;
if ( ! args . contains ( " content " ) | | ! args [ " content " ] . is_array ( ) ) {
return ToolError ( " `content` is required and must be an array " ) ;
}
std : : vector < json > processed_content ;
2025-04-10 00:10:05 +08:00
std : : string text_content ;
2025-04-06 16:32:51 +08:00
// 处理内容,分割大型文本
for ( const auto & item : args [ " content " ] ) {
if ( ! item . contains ( " type " ) ) {
return ToolError ( " Each content item must have a `type` field " ) ;
}
2025-04-10 00:10:05 +08:00
std : : string type = item [ " type " ] . get < std : : string > ( ) ;
2025-04-06 16:32:51 +08:00
if ( type = = " text " ) {
if ( ! item . contains ( " text " ) | | ! item [ " text " ] . is_string ( ) ) {
return ToolError ( " Text items must have a `text` field with string value " ) ;
}
2025-04-10 00:10:05 +08:00
text_content + = item [ " text " ] . get < std : : string > ( ) + " \n \n " ; // Handle them together
} else if ( type = = " image_url " ) {
if ( ! text_content . empty ( ) ) {
auto chunks = split_text_into_chunks ( text_content , max_chunk_size ) ;
processed_content . insert ( processed_content . end ( ) , chunks . begin ( ) , chunks . end ( ) ) ;
text_content . clear ( ) ;
}
2025-04-06 16:32:51 +08:00
if ( ! item . contains ( " image_url " ) | | ! item [ " image_url " ] . is_object ( ) | |
! item [ " image_url " ] . contains ( " url " ) | | ! item [ " image_url " ] [ " url " ] . is_string ( ) ) {
return ToolError ( " Image items must have an `image_url` field with a `url` property " ) ;
}
// 图像保持为一个整体
processed_content . push_back ( item ) ;
} else {
return ToolError ( " Unsupported content type: " + type ) ;
}
}
2025-04-10 00:10:05 +08:00
if ( ! text_content . empty ( ) ) {
auto chunks = split_text_into_chunks ( text_content , max_chunk_size ) ;
processed_content . insert ( processed_content . end ( ) , chunks . begin ( ) , chunks . end ( ) ) ;
text_content . clear ( ) ;
}
2025-04-06 16:32:51 +08:00
// 生成一个唯一的存储ID
std : : string store_id = " content_ " + std : : to_string ( current_id_ ) ;
2025-04-10 00:10:05 +08:00
if ( content_store_ . find ( store_id ) ! = content_store_ . end ( ) ) {
logger - > warn ( " Store ID ` " + store_id + " ` already exists, it will be overwritten " ) ;
}
2025-04-06 16:32:51 +08:00
current_id_ = ( current_id_ + 1 ) % MAX_STORE_ID ;
// 存储处理后的内容
content_store_ [ store_id ] = processed_content ;
// 返回存储ID和内容项数
json result ;
result [ " store_id " ] = store_id ;
result [ " total_items " ] = processed_content . size ( ) ;
return ToolResult ( result ) ;
}
// 处理读取操作
ToolResult handle_read ( const json & args ) {
if ( ! args . contains ( " cursor " ) | | ! args [ " cursor " ] . is_string ( ) ) {
return ToolError ( " `cursor` is required for read operations " ) ;
}
std : : string cursor = args [ " cursor " ] ;
if ( cursor = = " start " ) {
// 列出所有可用的存储ID
json available_stores = json : : array ( ) ;
for ( const auto & [ id , content ] : content_store_ ) {
json store_info ;
store_info [ " store_id " ] = id ;
store_info [ " total_items " ] = content . size ( ) ;
available_stores . push_back ( store_info ) ;
}
if ( available_stores . empty ( ) ) {
return ToolResult ( " No content available. Use `write` operation to store content first. " ) ;
}
json result ;
result [ " available_stores " ] = available_stores ;
result [ " next_cursor " ] = " select_store " ;
return ToolResult ( result ) ;
} else if ( cursor = = " select_store " ) {
// 用户需要选择一个存储ID
2025-04-10 00:10:05 +08:00
return ToolError ( " Please provide a store_id as cursor in format `content_X:Y` " ) ;
} else if ( cursor . find ( " : " ) ! = std : : string : : npos ) { // content_X:Y
2025-04-06 16:32:51 +08:00
// 用户正在浏览特定存储内的内容
size_t delimiter_pos = cursor . find ( " : " ) ;
std : : string store_id = cursor . substr ( 0 , delimiter_pos ) ;
size_t index = std : : stoul ( cursor . substr ( delimiter_pos + 1 ) ) ;
if ( content_store_ . find ( store_id ) = = content_store_ . end ( ) ) {
return ToolError ( " Store ID ` " + store_id + " ` not found " ) ;
}
if ( index > = content_store_ [ store_id ] . size ( ) ) {
return ToolError ( " Index out of range " ) ;
}
// 返回请求的内容项
json result = content_store_ [ store_id ] [ index ] ;
// 添加导航信息
if ( index + 1 < content_store_ [ store_id ] . size ( ) ) {
result [ " next_cursor " ] = store_id + " : " + std : : to_string ( index + 1 ) ;
result [ " remaining_items " ] = content_store_ [ store_id ] . size ( ) - index - 1 ;
} else {
result [ " next_cursor " ] = " end " ;
result [ " remaining_items " ] = 0 ;
}
return ToolResult ( result ) ;
} else if ( cursor = = " end " ) {
return ToolResult ( " You have reached the end of the content. " ) ;
} else {
return ToolError ( " Invalid cursor format " ) ;
}
}
ToolResult execute ( const json & args ) override {
try {
if ( ! args . contains ( " operation " ) ) {
return ToolError ( " `operation` is required " ) ;
}
std : : string operation = args [ " operation " ] ;
if ( operation = = " write " ) {
return handle_write ( args ) ;
} else if ( operation = = " read " ) {
return handle_read ( args ) ;
} else {
return ToolError ( " Unknown operation ` " + operation + " `. Please use `write` or `read` " ) ;
}
} catch ( const std : : exception & e ) {
return ToolError ( std : : string ( e . what ( ) ) ) ;
}
}
} ;
} // namespace humanus
# endif // HUMANUS_TOOL_CONTENT_PROVIDER_H