// /cli/text_encoding.rs | deno@v1.21.3
//
// Deno: a modern runtime for JavaScript and TypeScript.
// Copyright 2018-2022 the Deno authors. All rights reserved. MIT license.
use encoding_rs::*;use std::{  borrow::Cow,  io::{Error, ErrorKind},};
/// The Unicode byte order mark (U+FEFF) as a `char`.
pub const BOM_CHAR: char = '\u{FEFF}';
/// Attempts to detect the character encoding of the provided bytes.
///
/// Supports UTF-8, UTF-16 Little Endian and UTF-16 Big Endian. Detection only
/// inspects the first two bytes for a UTF-16 byte order mark; input that
/// starts with neither mark (including empty input) is reported as UTF-8.
///
/// Returns one of the labels `"utf-16le"`, `"utf-16be"` or `"utf-8"`.
pub fn detect_charset(bytes: &'_ [u8]) -> &'static str {
  const UTF16_LE_BOM: &[u8] = b"\xFF\xFE";
  const UTF16_BE_BOM: &[u8] = b"\xFE\xFF";

  if bytes.starts_with(UTF16_LE_BOM) {
    "utf-16le"
  } else if bytes.starts_with(UTF16_BE_BOM) {
    "utf-16be"
  } else {
    // Assume everything else is utf-8
    "utf-8"
  }
}
/// Attempts to convert the provided bytes to a UTF-8 string.////// Supports all encodings supported by the encoding_rs crate, which includes/// all encodings specified in the WHATWG Encoding Standard, and only those/// encodings (see: <https://encoding.spec.whatwg.org/>).pub fn convert_to_utf8<'a>(  bytes: &'a [u8],  charset: &'_ str,) -> Result<Cow<'a, str>, Error> {  match Encoding::for_label(charset.as_bytes()) {    Some(encoding) => encoding      .decode_without_bom_handling_and_without_replacement(bytes)      .ok_or_else(|| ErrorKind::InvalidData.into()),    None => Err(Error::new(      ErrorKind::InvalidInput,      format!("Unsupported charset: {}", charset),    )),  }}
/// Strips the byte order mark from the provided text if it exists.pub fn strip_bom(text: &str) -> &str {  if text.starts_with(BOM_CHAR) {    &text[BOM_CHAR.len_utf8()..]  } else {    text  }}
#[cfg(test)]
mod tests {
  use super::*;

  /// Asserts that `detect_charset` reports `expected_charset` for
  /// `test_data`, comparing the two labels case-insensitively.
  fn test_detection(test_data: &[u8], expected_charset: &str) {
    let expected = expected_charset.to_lowercase();
    let detected = detect_charset(test_data).to_lowercase();
    assert_eq!(expected, detected);
  }

  #[test]
  fn test_detection_utf8_no_bom() {
    let text = "Hello UTF-8 it is \u{23F0} for Deno!";
    test_detection(text.as_bytes(), "utf-8");
  }

  #[test]
  fn test_detection_utf16_little_endian() {
    let test_data: &[u8] = b"\xFF\xFEHello UTF-16LE";
    test_detection(test_data, "utf-16le");
  }

  #[test]
  fn test_detection_utf16_big_endian() {
    let test_data: &[u8] = b"\xFE\xFFHello UTF-16BE";
    test_detection(test_data, "utf-16be");
  }

  #[test]
  fn test_decoding_unsupported_charset() {
    // UTF-32 is not a label the decoder knows, so the lookup itself fails.
    let test_data: &[u8] = &[];
    let err = convert_to_utf8(test_data, "utf-32le").expect_err("Err expected");
    assert_eq!(err.kind(), ErrorKind::InvalidInput);
  }

  #[test]
  fn test_decoding_invalid_utf8() {
    // These bytes are not valid UTF-8, so decoding must be rejected.
    let test_data: &[u8] = b"\xFE\xFE\xFF\xFF";
    let err = convert_to_utf8(test_data, "utf-8").expect_err("Err expected");
    assert_eq!(err.kind(), ErrorKind::InvalidData);
  }
}