Skip to main content
Deno 2 is finally here 🎉
Learn more

Khmer Tokenizer

A fast Khmer text tokenizer that ensures all characters are included in the process.

Web demo

import { tokenize } from 'khmertokenizer';

tokenize("ភាសាខ្មែរ៑្ 123 ABC")
// => ["ភា","សា","ខ្មែ","រ","៑","្"," ","1","2","3"," ","A","B","C"]

Iterator

import { tokenizeAsIterator } from 'khmertokenizer';

for (const c of tokenizeAsIterator("ភាសាខ្មែរ៑្ 123 ABC")) {
  console.log(c);
}

Grapheme Validation

import { tokenize, isInvalidKhmerGrapheme } from 'khmertokenizer';

const input = "ភាសាខ្មែរ៑្ 123 ABC ្ ៗាា"
const output = tokenize(input)
  .filter(c => !isInvalidKhmerGrapheme(c)) // remove invalid graphemes
  .join("")

//=> "ភាសាខ្មែរ៑្ 123 ABC ្ ៗ"