v0.0.5
A fast Khmer text tokenizer that ensures all characters are included in the process.
Repository
Current version released
2 years ago
Versions
Khmer Tokenizer
A fast Khmer text tokenizer that ensures all characters are included in the process.
import { tokenize } from 'khmertokenizer';
tokenize("ααΆααΆαααααα‘α’ 123 ABC")
// => ["ααΆ","ααΆ","αααα","α","α‘","α’"," ","1","2","3"," ","A","B","C"]
Iterator
import { tokenizeAsIterator } from 'khmertokenizer';
for (const c of tokenizeAsIterator("ααΆααΆαααααα‘α’ 123 ABC")) {
console.log(c);
}
Grapheme Validation
import { tokenize, isInvalidKhmerGrapheme } from 'khmertokenizer';
const input = "ααΆααΆαααααα‘α’ 123 ABC α’ ααΆαΆ"
const output = tokenize(input)
.filter(c => !isInvalidKhmerGrapheme(c)) // remove invalid graphemes
.join("")
//=> "ααΆααΆαααααα‘α’ 123 ABC α’ α"