/mod.js | clip_bpe@v0.0.6

A JavaScript port of OpenAI's CLIP byte-pair-encoding tokenizer
Popular
Latest
File
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180
import { Html5Entities as htmlEntities } from "https://deno.land/x/html_entities@v1.0/mod.js";import bpeVocabData from "./bpe_simple_vocab_16e6.mjs";// import ftfy from "https://deno.land/x/ftfy_pyodide@v0.1.1/mod.js";

/**
 * Returns the UTF-16 code unit of the first character of `c`.
 *
 * @param {string} c - A non-empty string.
 * @returns {number} The value of `c.charCodeAt(0)`.
 */
function ord(c) {
  return c.charCodeAt(0);
}

/**
 * Builds an array of numbers from `start` (inclusive) to `stop` (exclusive),
 * advancing by `step`. When called with a single argument, that argument is
 * treated as `stop` and counting begins at 0.
 *
 * @param {number} start - First value, or the exclusive upper bound when `stop` is omitted.
 * @param {number} [stop] - Exclusive end of the sequence.
 * @param {number} [step=1] - Increment; may be negative to count down.
 * @returns {number[]} The generated sequence (empty when the bounds are already crossed).
 * @throws {RangeError} If `step` is 0, which could otherwise loop forever.
 */
function range(start, stop, step = 1) {
  if (step === 0) {
    throw new RangeError("range() step must not be zero");
  }

  // Single-argument form: range(n) counts 0..n-1.
  const first = stop === undefined ? 0 : start;
  const limit = stop === undefined ? start : stop;

  const result = [];
  for (let i = first; step > 0 ? i < limit : i > limit; i += step) {
    result.push(i);
  }
  return result;
}


/**
 * Builds the byte -> character lookup table used before BPE merging.
 *
 * Byte values inside the three ranges "!".."~", "¡".."¬" and "®".."ÿ" map to
 * the character with the same code. Every other byte value in 0..255 is
 * assigned, in ascending order, the next unused character starting at
 * code 256.
 *
 * @returns {Object<number, string>} Object keyed by byte value (0..255) whose
 *   values are single-character strings.
 */
function bytesToUnicode() {
  const selfMapped = [
    ...range(ord("!"), ord("~") + 1),
    ...range(ord("¡"), ord("¬") + 1),
    ...range(ord("®"), ord("ÿ") + 1),
  ];
  const selfMappedSet = new Set(selfMapped);

  const table = {};
  for (const byte of selfMapped) {
    table[byte] = String.fromCharCode(byte);
  }

  // Remaining bytes get stand-in characters at 256, 257, ... in byte order.
  let shifted = 0;
  for (let byte = 0; byte < 2 ** 8; byte += 1) {
    if (selfMappedSet.has(byte)) {
      continue;
    }
    table[byte] = String.fromCharCode(2 ** 8 + shifted);
    shifted += 1;
  }
  return table;
}
/**
 * Lists every adjacent pair of symbols in `word`, in order.
 *
 * @param {string[]|string} word - Sequence of symbols.
 * @returns {Array<[string, string]>} `[previous, current]` for each position
 *   after the first; empty when `word` has fewer than two symbols.
 */
function getPairs(word) {
  const followers = [...word.slice(1)];
  return followers.map((symbol, idx) => [
    idx === 0 ? word[0] : followers[idx - 1],
    symbol,
  ]);
}
/**
 * Decodes HTML entities in `text` twice (so doubly-escaped entities are also
 * resolved) and trims surrounding whitespace.
 *
 * @param {string} text - Raw input text.
 * @returns {string} The entity-decoded, trimmed text.
 */
function basicClean(text) {
  // The ftfy pass is disabled in this port (its import is commented out):
  // text = ftfy.fix_text(text);
  const decodedOnce = htmlEntities.decode(text);
  const decodedTwice = htmlEntities.decode(decodedOnce);
  return decodedTwice.trim();
}
/**
 * Collapses every run of whitespace in `text` to a single space and trims
 * the ends.
 *
 * @param {string} text - Input text.
 * @returns {string} Text with normalized whitespace.
 */
function whitespaceClean(text) {
  const pieces = text.split(/\s+/);
  const collapsed = pieces.join(" ");
  return collapsed.trim();
}

/**
 * CLIP byte-pair-encoding tokenizer.
 *
 * Text is cleaned, lower-cased, split with `this.pat`, converted to UTF-8
 * bytes, mapped byte-by-byte to stand-in characters (`this.byteEncoder`),
 * merged with the BPE merge table and finally looked up in `this.encoder`.
 */
export default class {
  /**
   * Builds the byte tables, the vocabulary (token string <-> id) and the
   * merge-rank table from the bundled vocabulary data.
   */
  constructor() {
    this.byteEncoder = bytesToUnicode();
    this.byteDecoder = Object.fromEntries(
      Object.entries(this.byteEncoder).map(([k, v]) => [v, k]),
    );

    // Drop the first line of the merges file and keep the next
    // 49152-256-2 entries; each entry is a space-separated pair.
    let merges = bpeVocabData.text.split("\n");
    merges = merges.slice(1, 49152 - 256 - 2 + 1);
    merges = merges.map((merge) => merge.split(" "));

    // The base vocabulary must follow the ordering of Python's
    // bytes_to_unicode().values(). Deriving it from `this.byteEncoder` would
    // give a different order (JS sorts integer-like keys), so the Python
    // output is embedded verbatim.
    let vocab = [
      '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/',
      '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '=', '>', '?', '@',
      'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
      'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
      '[', '\\', ']', '^', '_', '`',
      'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
      'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
      '{', '|', '}', '~',
      '¡', '¢', '£', '¤', '¥', '¦', '§', '¨', '©', 'ª', '«', '¬',
      '®', '¯', '°', '±', '²', '³', '´', 'µ', '¶', '·', '¸', '¹', 'º', '»', '¼', '½', '¾', '¿',
      'À', 'Á', 'Â', 'Ã', 'Ä', 'Å', 'Æ', 'Ç', 'È', 'É', 'Ê', 'Ë', 'Ì', 'Í', 'Î', 'Ï',
      'Ð', 'Ñ', 'Ò', 'Ó', 'Ô', 'Õ', 'Ö', '×', 'Ø', 'Ù', 'Ú', 'Û', 'Ü', 'Ý', 'Þ', 'ß',
      'à', 'á', 'â', 'ã', 'ä', 'å', 'æ', 'ç', 'è', 'é', 'ê', 'ë', 'ì', 'í', 'î', 'ï',
      'ð', 'ñ', 'ò', 'ó', 'ô', 'õ', 'ö', '÷', 'ø', 'ù', 'ú', 'û', 'ü', 'ý', 'þ', 'ÿ',
      'Ā', 'ā', 'Ă', 'ă', 'Ą', 'ą', 'Ć', 'ć', 'Ĉ', 'ĉ', 'Ċ', 'ċ', 'Č', 'č', 'Ď', 'ď',
      'Đ', 'đ', 'Ē', 'ē', 'Ĕ', 'ĕ', 'Ė', 'ė', 'Ę', 'ę', 'Ě', 'ě', 'Ĝ', 'ĝ', 'Ğ', 'ğ',
      'Ġ', 'ġ', 'Ģ', 'ģ', 'Ĥ', 'ĥ', 'Ħ', 'ħ', 'Ĩ', 'ĩ', 'Ī', 'ī', 'Ĭ', 'ĭ', 'Į', 'į',
      'İ', 'ı', 'Ĳ', 'ĳ', 'Ĵ', 'ĵ', 'Ķ', 'ķ', 'ĸ', 'Ĺ', 'ĺ', 'Ļ', 'ļ', 'Ľ', 'ľ', 'Ŀ',
      'ŀ', 'Ł', 'ł', 'Ń',
    ];
    vocab = [...vocab, ...vocab.map((v) => v + '</w>')];
    for (const merge of merges) {
      vocab.push(merge.join(""));
    }
    vocab.push('<|startoftext|>', '<|endoftext|>');

    this.encoder = Object.fromEntries(vocab.map((v, i) => [v, i]));
    this.decoder = Object.fromEntries(
      Object.entries(this.encoder).map(([k, v]) => [v, k]),
    );
    // Pairs are keyed by joining both halves with "·😎·" because JS has no
    // tuple type usable as an object key.
    this.bpeRanks = Object.fromEntries(merges.map((v, i) => [v.join("·😎·"), i]));
    // Null-prototype object: a token such as "constructor" must not resolve
    // to an inherited Object.prototype member and be mistaken for a cache hit.
    this.cache = Object.assign(Object.create(null), {
      '<|startoftext|>': '<|startoftext|>',
      '<|endoftext|>': '<|endoftext|>',
    });
    this.pat = /<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+/gui;
  }

  /**
   * Applies the BPE merges to one byte-encoded token.
   *
   * @param {string} token - A token whose characters are byteEncoder outputs.
   * @returns {string} The merged symbols joined by single spaces; the last
   *   symbol carries the `</w>` end-of-word marker. Results are memoized in
   *   `this.cache`.
   */
  bpe(token) {
    const cached = this.cache[token];
    if (cached !== undefined) {
      return cached;
    }

    let word = [...token.slice(0, -1), token.slice(-1) + '</w>'];
    let pairs = getPairs(word);
    if (pairs.length === 0) {
      // Single-symbol token: nothing to merge (this path is not cached).
      return token + '</w>';
    }

    while (true) {
      // Pick the adjacent pair with the lowest (earliest-learned) merge rank.
      let bigram = null;
      let minRank = Infinity;
      for (const pair of pairs) {
        const rank = this.bpeRanks[pair.join("·😎·")];
        if (rank === undefined) {
          continue;
        }
        if (rank < minRank) {
          minRank = rank;
          bigram = pair;
        }
      }
      if (bigram === null) {
        break;
      }

      // Rebuild the word, fusing each occurrence of `first` followed by `second`.
      const [first, second] = bigram;
      const newWord = [];
      let i = 0;
      while (i < word.length) {
        const j = word.indexOf(first, i);
        if (j === -1) {
          newWord.push(...word.slice(i));
          break;
        }
        newWord.push(...word.slice(i, j));
        i = j;
        if (word[i] === first && i < word.length - 1 && word[i + 1] === second) {
          newWord.push(first + second);
          i += 2;
        } else {
          newWord.push(word[i]);
          i += 1;
        }
      }
      word = newWord;
      if (word.length === 1) {
        break;
      }
      pairs = getPairs(word);
    }

    const merged = word.join(" ");
    this.cache[token] = merged;
    return merged;
  }

  /**
   * Tokenizes `text` into vocabulary ids (no start/end tokens, no padding).
   *
   * @param {string} text - Input text.
   * @returns {number[]} Token ids; a BPE piece missing from `this.encoder`
   *   yields `undefined` at that position.
   */
  encode(text) {
    const bpeTokens = [];
    const utf8 = new TextEncoder();
    const cleaned = whitespaceClean(basicClean(text)).toLowerCase();
    for (const match of cleaned.matchAll(this.pat)) {
      // Map each UTF-8 byte (not each UTF-16 code unit) through the byte
      // table, so characters outside Latin-1 are kept instead of dropped.
      const token = Array.from(utf8.encode(match[0]), (b) => this.byteEncoder[b]).join("");
      bpeTokens.push(
        ...this.bpe(token).split(' ').map((bpeToken) => this.encoder[bpeToken]),
      );
    }
    return bpeTokens;
  }

  /**
   * Encodes `text` as a fixed-length sequence of 77 ids: start token 49406,
   * the encoded text (truncated so that at most 76 ids precede the end
   * token), end token 49407, then zero padding.
   *
   * @param {string} text - Input text.
   * @returns {number[]} Array of exactly 77 token ids.
   */
  encodeForCLIP(text) {
    let tokens = this.encode(text);
    tokens.unshift(49406); // start token
    tokens = tokens.slice(0, 76);
    tokens.push(49407); // end token
    while (tokens.length < 77) {
      tokens.push(0);
    }
    return tokens;
  }

  /**
   * Converts token ids back into text.
   *
   * @param {number[]} tokens - Token ids.
   * @returns {string} Decoded text; each `</w>` marker becomes a space. Ids
   *   missing from `this.decoder` contribute nothing.
   */
  decode(tokens) {
    const symbols = tokens.map((token) => this.decoder[token]).join("");
    // Undo the byte -> stand-in character mapping, then read the bytes as UTF-8.
    const bytes = Uint8Array.from([...symbols], (c) => Number(this.byteDecoder[c]));
    return new TextDecoder("utf-8", { ignoreBOM: true })
      .decode(bytes)
      .replaceAll('</w>', ' ');
  }
}