Skip to main content
Module

x/clip_bpe/mod.js

A JavaScript port of OpenAI's CLIP byte-pair-encoding tokenizer
Latest
File
import { Html5Entities as htmlEntities } from "https://deno.land/x/html_entities@v1.0/mod.js";import bpeVocabData from "./bpe_simple_vocab_16e6.mjs";// import ftfy from "https://deno.land/x/ftfy_pyodide@v0.1.1/mod.js";

function ord(c) { return c.charCodeAt(0);}function range(start, stop, step=1) { if(stop === undefined) { stop = start; start = 0; }
if((step > 0 && start >= stop) || (step < 0 && start <= stop)) { return []; }
const result = []; for(let i = start; step > 0 ? i < stop : i > stop; i += step) { result.push(i); }
return result;}


function bytesToUnicode() { let bs = [ ...range(ord("!"), ord("~") + 1), ...range(ord("¡"), ord("¬") + 1), ...range(ord("®"), ord("ÿ") + 1), ]; let cs = bs.slice(0); let n = 0; for(let b of range(2**8)) { if(!bs.includes(b)) { bs.push(b); cs.push(2**8 + n); n += 1; } } cs = cs.map(n => String.fromCharCode(n)); return Object.fromEntries(bs.map((v, i) => [v, cs[i]]));}
function getPairs(word) { let pairs = []; let prevChar = word[0]; for(let char of word.slice(1)) { pairs.push([prevChar, char]); prevChar = char; } return pairs;}
function basicClean(text) { // text = ftfy.fix_text(text); text = htmlEntities.decode(htmlEntities.decode(text)); return text.trim();}
function whitespaceClean(text) { return text.replace(/\s+/g, " ").trim();}

export default class { constructor() { this.byteEncoder = bytesToUnicode(); this.byteDecoder = Object.fromEntries(Object.entries(this.byteEncoder).map(([k,v]) => [v,k])); let merges = bpeVocabData.text.split("\n"); merges = merges.slice(1, 49152-256-2+1); merges = merges.map(merge => merge.split(" ")); // There was a bug related to the ordering of Python's .values() output. I'm lazy do I've just copy-pasted the Python output: let vocab = ['!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '=', '>', '?', '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', '\\', ']', '^', '_', '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{', '|', '}', '~', '¡', '¢', '£', '¤', '¥', '¦', '§', '¨', '©', 'ª', '«', '¬', '®', '¯', '°', '±', '²', '³', '´', 'µ', '¶', '·', '¸', '¹', 'º', '»', '¼', '½', '¾', '¿', 'À', 'Á', 'Â', 'Ã', 'Ä', 'Å', 'Æ', 'Ç', 'È', 'É', 'Ê', 'Ë', 'Ì', 'Í', 'Î', 'Ï', 'Ð', 'Ñ', 'Ò', 'Ó', 'Ô', 'Õ', 'Ö', '×', 'Ø', 'Ù', 'Ú', 'Û', 'Ü', 'Ý', 'Þ', 'ß', 'à', 'á', 'â', 'ã', 'ä', 'å', 'æ', 'ç', 'è', 'é', 'ê', 'ë', 'ì', 'í', 'î', 'ï', 'ð', 'ñ', 'ò', 'ó', 'ô', 'õ', 'ö', '÷', 'ø', 'ù', 'ú', 'û', 'ü', 'ý', 'þ', 'ÿ', 'Ā', 'ā', 'Ă', 'ă', 'Ą', 'ą', 'Ć', 'ć', 'Ĉ', 'ĉ', 'Ċ', 'ċ', 'Č', 'č', 'Ď', 'ď', 'Đ', 'đ', 'Ē', 'ē', 'Ĕ', 'ĕ', 'Ė', 'ė', 'Ę', 'ę', 'Ě', 'ě', 'Ĝ', 'ĝ', 'Ğ', 'ğ', 'Ġ', 'ġ', 'Ģ', 'ģ', 'Ĥ', 'ĥ', 'Ħ', 'ħ', 'Ĩ', 'ĩ', 'Ī', 'ī', 'Ĭ', 'ĭ', 'Į', 'į', 'İ', 'ı', 'IJ', 'ij', 'Ĵ', 'ĵ', 'Ķ', 'ķ', 'ĸ', 'Ĺ', 'ĺ', 'Ļ', 'ļ', 'Ľ', 'ľ', 'Ŀ', 'ŀ', 'Ł', 'ł', 'Ń']; vocab = [...vocab, ...vocab.map(v => v+'</w>')]; for(let merge of merges) { vocab.push(merge.join("")); } vocab.push('<|startoftext|>', '<|endoftext|>'); this.encoder = Object.fromEntries(vocab.map((v,i) => [v,i])); this.decoder = Object.fromEntries(Object.entries(this.encoder).map(([k,v]) => [v,k])); this.bpeRanks = Object.fromEntries(merges.map((v,i) => [v.join("·😎·"),i])); // ·😎· because js doesn't yet have tuples this.cache = {'<|startoftext|>': '<|startoftext|>', '<|endoftext|>': '<|endoftext|>'}; this.pat = /<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+/gui; }
bpe(token) { if(this.cache[token] !== undefined) { return this.cache[token]; }
let word = [...token.slice(0, -1), token.slice(-1)+'</w>']; let pairs = getPairs(word);
if(pairs.length === 0) { return token+'</w>'; }
while(1) {
let bigram = null; let minRank = Infinity; for(let p of pairs) { let r = this.bpeRanks[p.join("·😎·")]; if(r === undefined) continue; if(r < minRank) { minRank = r; bigram = p; } }
if(bigram === null) { break; }
let [first, second] = bigram; let newWord = []; let i = 0; while(i < word.length) {
let j = word.indexOf(first, i);
if(j === -1) { newWord.push(...word.slice(i)); break; }
newWord.push(...word.slice(i, j)); i = j;
if(word[i] === first && i < word.length-1 && word[i+1] === second) { newWord.push(first+second); i += 2; } else { newWord.push(word[i]); i += 1; } } word = newWord; if(word.length === 1) { break; } else { pairs = getPairs(word); } } word = word.join(" "); this.cache[token] = word; return word; }
encode(text) { let bpeTokens = [] text = whitespaceClean(basicClean(text)).toLowerCase(); for(let token of [...text.matchAll(this.pat)].map(m => m[0])) { token = [...token].map(b => this.byteEncoder[b.charCodeAt(0)]).join(""); bpeTokens.push(...this.bpe(token).split(' ').map(bpe_token => this.encoder[bpe_token])); } return bpeTokens; } // adds start and end token, and adds padding 0's and ensures it's 77 tokens long encodeForCLIP(text) { let tokens = this.encode(text); tokens.unshift(49406); // start token tokens = tokens.slice(0, 76); tokens.push(49407); // end token while(tokens.length < 77) tokens.push(0); return tokens; }
decode(tokens) { let text = tokens.map(token => this.decoder[token]).join(""); text = [...text].map(c => this.byteDecoder[c]).map(v => String.fromCharCode(v)).join("").replaceAll('</w>', ' '); return text; }}