Repository
Current version released
a year ago
decancer
A tiny package that removes common confusables from strings.
- Its core is written in Rust and utilizes a form of Binary Search to ensure speed!
- It stores its huge collection of confusables in a customized binary file instead of a huge JSON or text file to optimize its bundle size!
- It supports curing 4,800 different confusables into cured-lowercased-strings, including but not limited to:
- Accented characters
- Byte order mark
- Control characters
- Most homoglyphs
- Several foreign characters, including but not limited to Arabic, Cyrillic, Greek, and Japanese
- Several emojis
- Whitespace characters
- Zalgo text
- And it’s supported in the following languages:
- Rust
- JavaScript (Node.js/Deno/Bun/Browser)
- C/C++
- Python (unofficial)
Installation
Rust
In your Cargo.toml
:
decancer = "1.5.3"
Node.js
In your shell:
$ npm install decancer
In your code:
const decancer = require('decancer')
Deno
In your code:
import decancer from 'npm:decancer'
Bun
In your shell:
$ bun install decancer
In your code:
const decancer = require('decancer')
Browser
In your code:
<script type="module">
import init from 'https://cdn.jsdelivr.net/gh/null8626/decancer@v1.5.3/bindings/wasm/bin/decancer.min.js'
const decancer = await init()
</script>
C/C++
Download precompiled binaries
- Download for 64-bit Windows MSVC (Windows 7+)
- Download for 32-bit Windows MSVC (Windows 7+)
- Download for ARM64 Windows MSVC
- Download for 64-bit macOS (10.7+, Lion+)
- Download for ARM64 macOS (11.0+, Big Sur+)
- Download for 64-bit Linux (kernel 3.2+, glibc 2.17+)
- Download for 64-bit Linux with MUSL
- Download for ARM64 Linux (kernel 4.1, glibc 2.17+)
- Download for ARM64 Linux with MUSL
- Download for ARMv7 Linux, hardfloat (kernel 3.2, glibc 2.17)
- Download for 64-bit FreeBSD
Building from source
Prerequisites:
$ git clone https://github.com/null8626/decancer.git --depth 1
$ cd decancer/bindings/native
$ cargo build --release
And the binary files should be generated in the target/release
directory.
Examples
JavaScript
const cured = decancer('vEⓡ𝔂 𝔽𝕌Ňℕy ţ乇𝕏𝓣')
// `cured` is a CuredString object that wraps the cured string.
// For comparisons, prefer the methods provided by the CuredString class.
if (cured.contains('funny')) {
console.log('found the funny')
}
// all three checks must hold for the message to be printed
if (
cured.equals('very funny text') &&
cured.startsWith('very') &&
cured.endsWith('text')
) {
console.log('it works!')
}
console.log(cured.toString()) // 'very funny text'
Rust
extern crate decancer;
/// Cures an obfuscated string and checks the result through `CuredString`'s comparison helpers.
fn main() {
    // `cure` returns a `decancer::CuredString`, a struct wrapping the cured text.
    let cleaned = decancer::cure("vEⓡ𝔂 𝔽𝕌Ňℕy ţ乇𝕏𝓣");

    // When comparing, prefer what `decancer::CuredString` itself provides.
    assert_eq!(cleaned, "very funny text");
    assert!(cleaned.starts_with("very"));
    assert!(cleaned.contains("funny"));
    assert!(cleaned.ends_with("text"));

    // Consume the struct and take ownership of the `String` it holds.
    let _inner_text = cleaned.into_str();
}
Web app example
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8" />
<title>Decancerer!!! (tm)</title>
<style>
/* large text for the input area, slightly smaller for the button */
textarea {
font-size: 30px;
}
#cure {
font-size: 20px;
padding: 5px 30px;
}
</style>
</head>
<body>
<!-- A textarea plus a button that replaces the textarea's content with its cured form. -->
<h3>Input cancerous text here:</h3>
<textarea rows="10" cols="30"></textarea>
<br />
<button id="cure" onclick="cure()">cure!</button>
<script type="module">
import init from 'https://cdn.jsdelivr.net/gh/null8626/decancer@v1.5.3/bindings/wasm/bin/decancer.min.js'
// init() must be awaited before decancer can be called
const decancer = await init()
// attached to window because names declared in a module script are not
// visible to the button's inline onclick handler
window.cure = function () {
const textarea = document.querySelector('textarea')
// nothing to cure: tell the user and leave the textarea untouched
if (!textarea.value.length) {
return alert("There's no text!!!")
}
// overwrite the input with its cured form
textarea.value = decancer(textarea.value).toString()
}
</script>
</body>
</html>
C/C++ UTF-8 example
#include <decancer.h>
#include <string.h>
#include <stdlib.h>
#include <stdio.h>
// global variable for assertion purposes only
decancer_cured_t cured;
// our quick assert function
// Minimal assertion helper for this example.
//
// Does nothing when `expr` is true. Otherwise prints `message` to stderr,
// frees the global `cured` string, and terminates the process with status 1.
static void assert(const bool expr, const char *message)
{
  if (!expr)
  {
    fprintf(stderr, "assertion failed (%s)\n", message);

    // Only `cured` needs releasing here: the raw UTF-8 pointer obtained in
    // main() is a local variable and this example never frees it separately.
    decancer_free(cured);
    exit(1);
  }
}
// Cures a UTF-8 byte string, verifies it through the comparison functions,
// then checks the raw cured bytes one by one.
int main(void) {
// utf-8 bytes for "vEⓡ𝔂 𝔽𝕌Ňℕy ţ乇𝕏𝓣"
uint8_t string[] = {0x76, 0xef, 0xbc, 0xa5, 0xe2, 0x93, 0xa1, 0xf0, 0x9d, 0x94, 0x82, 0x20, 0xf0, 0x9d,
0x94, 0xbd, 0xf0, 0x9d, 0x95, 0x8c, 0xc5, 0x87, 0xe2, 0x84, 0x95, 0xef, 0xbd, 0x99,
0x20, 0xc5, 0xa3, 0xe4, 0xb9, 0x87, 0xf0, 0x9d, 0x95, 0x8f, 0xf0, 0x9d, 0x93, 0xa3};
// cure string (the second argument is the input size in bytes)
cured = decancer_cure(string, sizeof(string));
// comparisons (each takes the text to compare against and its size in bytes)
assert(decancer_equals(cured, (uint8_t *)("very funny text"), 15), "equals");
assert(decancer_starts_with(cured, (uint8_t *)("very"), 4), "starts_with");
assert(decancer_ends_with(cured, (uint8_t *)("text"), 4), "ends_with");
assert(decancer_contains(cured, (uint8_t *)("funny"), 5), "contains");
// coerce output as a raw UTF-8 pointer and retrieve its size (in bytes)
size_t output_size;
const uint8_t *output_raw = decancer_raw(cured, &output_size);
// assert raw cured utf-8 size to be 15 bytes (size of "very funny text")
assert(output_size == 15, "raw output size");
// utf-8 bytes for "very funny text"
const uint8_t expected_raw[] = {0x76, 0x65, 0x72, 0x79, 0x20, 0x66, 0x75, 0x6e,
0x6e, 0x79, 0x20, 0x74, 0x65, 0x78, 0x74};
// 35 characters of fixed text + at most 2 index digits (i < 15) + NUL terminator = 38
char assert_message[38];
for (uint32_t i = 0; i < sizeof(expected_raw); i++)
{
sprintf(assert_message, "mismatched utf-8 contents at index %u", i);
assert(output_raw[i] == expected_raw[i], assert_message);
}
// free cured string (required)
decancer_free(cured);
return 0;
}
C/C++ UTF-16 example
#include <decancer.h>
#include <string.h>
#include <stdlib.h>
#include <stdio.h>
// global variable for assertion purposes only
decancer_cured_t cured;
wdecancer_raw_cured_t output_raw = NULL;
// our quick assert function
// Bail-out helper for this example: returns immediately when `expr` holds;
// otherwise prints `message` to stderr, frees everything allocated so far,
// and exits with status 1.
static void assert(const bool expr, const char *message)
{
  if (expr)
  {
    return;
  }

  fprintf(stderr, "assertion failed (%s)\n", message);

  // the raw UTF-16 handle stays NULL until main() has requested it
  if (output_raw != NULL)
  {
    wdecancer_raw_free(output_raw);
    output_raw = NULL;
  }

  decancer_free(cured);
  exit(1);
}
// Cures a UTF-16 string, verifies it through the wide comparison functions,
// then checks the raw cured code units one by one.
int main(void) {
// UTF-16 code units for "vEⓡ𝔂 𝔽𝕌Ňℕy ţ乇𝕏𝓣"
wchar_t string[] = {0x0076, 0xff25, 0x24e1, 0xd835, 0xdd02, 0x0020, 0xd835, 0xdd3d, 0xd835, 0xdd4c,
0x0147, 0x2115, 0xff59, 0x0020, 0x0163, 0x4e47, 0xd835, 0xdd4f, 0xd835, 0xdce3};
// cure string (the second argument is the number of wchar_t elements, not bytes)
cured = wdecancer_cure(string, sizeof(string) / sizeof(wchar_t));
// comparisons (each takes the text to compare against and its length in wchar_t elements)
assert(wdecancer_equals(cured, L"very funny text", 15), "wide equals");
assert(wdecancer_starts_with(cured, L"very", 4), "wide starts_with");
assert(wdecancer_ends_with(cured, L"text", 4), "wide ends_with");
assert(wdecancer_contains(cured, L"funny", 5), "wide contains");
// coerce output as a raw UTF-16 pointer and retrieve its length (in CHARACTERS)
size_t output_length;
// stored in the global so that the assert helper can free it on failure
output_raw = wdecancer_raw(cured, &output_length);
const wchar_t *output_raw_ptr = wdecancer_raw_ptr(output_raw);
// assert raw cured utf-16 length to be 15 characters (length of "very funny text", NOT in bytes)
assert(output_length == 15, "wide raw output length");
// UTF-16 code units for "very funny text"
const wchar_t expected_raw[] = {0x76, 0x65, 0x72, 0x79, 0x20, 0x66, 0x75, 0x6e,
0x6e, 0x79, 0x20, 0x74, 0x65, 0x78, 0x74};
// 36 characters of fixed text + at most 2 index digits (i < 15) + NUL terminator = 39
char assert_message[39];
for (uint32_t i = 0; i < sizeof(expected_raw) / sizeof(wchar_t); i++)
{
sprintf(assert_message, "mismatched utf-16 contents at index %u", i);
assert(output_raw_ptr[i] == expected_raw[i], assert_message);
}
// free raw cured UTF-16 string (required)
wdecancer_raw_free(output_raw);
// free cured string (required)
decancer_free(cured);
return 0;
}
Contributing
If you are new to contributing, please read CONTRIBUTING.md
before getting started!