#ifndef HUMANUS_TOKENIZER_BPE_H
#define HUMANUS_TOKENIZER_BPE_H

#include "base.h"
#include "../mcp/common/base64.hpp"

#include <fstream>
#include <functional>
#include <limits>
#include <memory>
#include <queue>
#include <sstream>
#include <stdexcept>
#include <string>
#include <unordered_map>
#include <vector>

namespace humanus {

/**
 * @brief BPE (Byte Pair Encoding) Tokenizer Implementation
 *
 * Uses a tiktoken format vocabulary and merge rules file.
 * Uses a priority queue for efficient BPE merging.
 */
class BPETokenizer : public BaseTokenizer {
private:
    // Helper structure for hashing pairs
    struct PairHash {
        template <typename T1, typename T2>
        std::size_t operator()(const std::pair<T1, T2>& pair) const {
            return std::hash<T1>()(pair.first) ^ std::hash<T2>()(pair.second);
        }
    };

    // UTF-8 bytes to token ID mapping
    std::unordered_map<std::string, size_t> encoder;

    // Token ID to UTF-8 bytes mapping
    std::unordered_map<size_t, std::string> decoder;

    // Merge priority mapping; a lower rank means a higher priority
    std::unordered_map<std::pair<std::string, std::string>, size_t, PairHash> merge_ranks;

    // Decode a base64-encoded token from the tiktoken vocabulary file
    std::string base64_decode(const std::string& encoded) const {
        if (encoded.empty()) return "";
        return base64::decode(encoded);
    }

    // Comparator for the merge priority queue; a lower rank means a higher priority
    struct MergeComparator {
        const std::unordered_map<std::pair<std::string, std::string>, size_t, PairHash>& ranks;

        MergeComparator(const std::unordered_map<std::pair<std::string, std::string>, size_t, PairHash>& r)
            : ranks(r) {}

        bool operator()(const std::pair<std::pair<std::string, std::string>, size_t>& a,
                        const std::pair<std::pair<std::string, std::string>, size_t>& b) const {
            // First compare by merge_ranks; pairs without a rank get the maximum value
            size_t rank_a = ranks.count(a.first) ? ranks.at(a.first) : std::numeric_limits<size_t>::max();
            size_t rank_b = ranks.count(b.first) ? ranks.at(b.first) : std::numeric_limits<size_t>::max();
            // std::priority_queue is a max-heap, so reverse the comparison
            // (the smaller rank ends up on top)
            return rank_a > rank_b;
        }
    };

public:
    /**
     * @brief Construct a BPE tokenizer from a tiktoken format file
     * @param tokenizer_path path to the tiktoken format vocabulary file
     *
     * File format: each line contains a base64 encoded token and its corresponding token ID.
     * Example: "IQ== 0", where "IQ==" is the base64 encoded token and 0 is the corresponding ID.
     */
    BPETokenizer(const std::string& tokenizer_path) {
        std::ifstream file(tokenizer_path);
        if (!file.is_open()) {
            throw std::runtime_error("Failed to open tokenizer file: " + tokenizer_path);
        }

        std::string line;
        while (std::getline(file, line)) {
            std::istringstream iss(line);
            std::string token_base64;
            size_t rank;
            if (iss >> token_base64 >> rank) {
                std::string token = base64_decode(token_base64);
                encoder[token] = rank;
                decoder[rank] = token;
            }
        }

        build_merge_ranks();
    }

    /**
     * @brief Build the merge priority mapping
     *
     * Uses the tokens in the vocabulary to infer possible merge rules.
     * For every token longer than one byte, try all possible splits; if both
     * parts are also in the vocabulary, assume the split describes a valid merge rule.
     */
    void build_merge_ranks() {
        for (const auto& [token, id] : encoder) {
            if (token.length() <= 1) continue;

            // Try all possible split points
            for (size_t i = 1; i < token.length(); ++i) {
                std::string first = token.substr(0, i);
                std::string second = token.substr(i);

                // If both parts are in the vocabulary, assume this is a valid merge rule
                if (encoder.count(first) && encoder.count(second)) {
                    // Use the merged token's ID as the priority - a smaller ID means a higher priority
                    merge_ranks[{first, second}] = id;
                }
            }
        }
    }

    /**
     * @brief Set the merge priorities
     * @param ranks new merge priority mapping
     */
    void set_merge_ranks(const std::unordered_map<std::pair<std::string, std::string>, size_t, PairHash>& ranks) {
        merge_ranks = ranks;
    }
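    // Illustrative example of the inferred merge ranks (hypothetical vocabulary
    // entries, not taken from any real tiktoken file): if the encoder contains
    // "h" -> 71, "e" -> 68 and "he" -> 258, then build_merge_ranks() records
    // merge_ranks[{"h", "e"}] = 258, and encode() may later merge adjacent "h"
    // and "e" tokens into "he". Because these merges are only inferred from the
    // vocabulary, set_merge_ranks() can install an exact merge table when one
    // is available.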
    /**
     * @brief Encode text using BPE
     * @param text text to encode
     * @return encoded token IDs
     *
     * This method uses the BPE algorithm to encode the input text:
     * 1. First decompose the text into single-byte tokens
     * 2. Use a priority queue to merge adjacent tokens based on merge_ranks
     * 3. Convert the final tokens to their corresponding IDs
     */
    std::vector<size_t> encode(const std::string& text) const override {
        if (text.empty()) {
            return {};
        }

        // Decompose the text into single-byte tokens
        std::vector<std::string> tokens;
        for (unsigned char c : text) {
            tokens.push_back(std::string(1, c));
        }

        // Use a priority queue to execute BPE merging
        while (tokens.size() > 1) {
            // Build a priority queue to select the highest priority merge pair
            using MergePair = std::pair<std::pair<std::string, std::string>, size_t>;
            MergeComparator comparator(merge_ranks);
            std::priority_queue<MergePair, std::vector<MergePair>, MergeComparator> merge_candidates(comparator);

            // Find all possible merge pairs
            for (size_t i = 0; i < tokens.size() - 1; ++i) {
                std::pair<std::string, std::string> pair = {tokens[i], tokens[i + 1]};
                if (merge_ranks.count(pair)) {
                    merge_candidates.push({pair, i});
                }
            }

            // If there are no merge pairs, exit the loop
            if (merge_candidates.empty()) {
                break;
            }

            // Execute the highest priority merge (the top of the priority queue)
            auto top_merge = merge_candidates.top();
            auto pair = top_merge.first;    // The token pair to merge
            size_t pos = top_merge.second;  // The position to merge at

            // Merge the two tokens
            std::string merged_token = pair.first + pair.second;
            tokens[pos] = merged_token;
            tokens.erase(tokens.begin() + pos + 1);
        }

        // Convert tokens to IDs
        std::vector<size_t> ids;
        ids.reserve(tokens.size());

        for (const auto& token : tokens) {
            auto it = encoder.find(token);
            if (it != encoder.end()) {
                ids.push_back(it->second);
            }
            // Unknown tokens are skipped
        }

        return ids;
    }

    /**
     * @brief Decode BPE tokens
     * @param tokens token IDs to decode
     * @return decoded text
     *
     * This method converts encoded token IDs back to the original text by
     * concatenating the token strings corresponding to each ID.
     */
    std::string decode(const std::vector<size_t>& tokens) const override {
        std::string result;
        result.reserve(tokens.size() * 2);

        for (size_t id : tokens) {
            auto it = decoder.find(id);
            if (it != decoder.end()) {
                result += it->second;
            }
            // Unknown IDs are skipped
        }

        return result;
    }

    /**
     * @brief Create a BPE tokenizer from a tiktoken format vocabulary file
     * @param file_path path to the tiktoken format file
     * @return shared pointer to the created BPE tokenizer
     */
    static std::shared_ptr<BPETokenizer> load_from_tiktoken(const std::string& file_path) {
        return std::make_shared<BPETokenizer>(file_path);
    }
};

} // namespace humanus

#endif // HUMANUS_TOKENIZER_BPE_H
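// Usage sketch (comment only; "tokenizer.tiktoken" is a hypothetical file name,
// any tiktoken-format vocabulary file works):
//
//   #include "bpe.h"
//
//   auto tokenizer = humanus::BPETokenizer::load_from_tiktoken("tokenizer.tiktoken");
//   std::vector<size_t> ids = tokenizer->encode("hello world");
//   std::string text = tokenizer->decode(ids);  // recovers "hello world" when every token is in the vocabulary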