// stream_audio_fingerprint@v2.0.2 - x/stream_audio_fingerprint/codegen_landmark.ts
// Audio landmark fingerprinting in JavaScript
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at http://mozilla.org/MPL/2.0/.
// Copyright (c) 2018 Alexandre Storelli
// Online implementation of the landmark audio fingerprinting algorithm,
// inspired by D. Ellis (2009), "Robust Landmark-Based Audio Fingerprinting"
// http://labrosa.ee.columbia.edu/matlab/fingerprint/
// which is itself inspired by the Wang 2003 paper.
// This module exports Codegen, a class whose process() method consumes raw audio chunks.
// By default, the chunks passed to process() must carry a signal with the following properties:
// - single channel
// - 16bit PCM
// - 22050 Hz sampling rate
//
// Each call to process() returns an object of the form
// { tcodes: [time stamps], hcodes: [fingerprints] }
import FFT from './lib/fft.ts'
/**
 * Fully resolved configuration of the landmark fingerprinter.
 * Every field is mandatory; callers normally supply a partial set of values
 * and let `buildOptions` fill in the rest.
 */
interface CodegenOptions {
  /** When true, progress messages are written with `console.log`. */
  verbose: boolean
  /** Sampling rate of the input signal, in Hz. */
  samplingRate: number
  /** Bytes per sample (2 for 16 bit PCM). */
  bps: number
  /** Maximum number of local maxima kept for each spectrum. */
  mnlm: number
  /** Maximum number of hashes each peak can lead to. */
  mppp: number
  /** Size of the FFT window, in samples. The spectra have nfft / 2 points. */
  nfft: number
  /** Number of samples between the starts of two consecutive frames. */
  step: number
  /** Duration of one step, in seconds. */
  dt: number
  /** Window coefficients multiplied with the samples of each frame. */
  hwin: number[]
  /** Value added to every threshold entry after each frame (log domain). */
  maskDecayLog: number
  /** First frequency bin (inclusive) considered when looking for peaks. */
  ifMin: number
  /** Last frequency bin (exclusive) considered when looking for peaks. */
  ifMax: number
  /** Maximum distance, in frequency bins, between two paired peaks. */
  windowDf: number
  /** Maximum distance, in frames, between two paired peaks. */
  windowDt: number
  /** Number of frames during which a peak can still be pruned. */
  pruningDt: number
  /** Decay scale of the masks on the frequency axis, in bins. */
  maskDf: number
  /**
   * Log-domain masks: `eww[i][j]` is added to the log magnitude of a peak at
   * bin `i` to obtain its contribution to the threshold at bin `j`.
   */
  eww: number[][]
}
/**
 * Options accepted from the caller. Every field is optional; a missing field
 * is replaced by its default value in `buildOptions`.
 */
interface CodegenUserOpts {
  /** When true, progress messages are written with `console.log`. */
  verbose?: boolean
  /** Sampling rate of the input signal, in Hz. */
  samplingRate?: number
  /** Bytes per sample (2 for 16 bit PCM). */
  bps?: number
  /** Maximum number of local maxima kept for each spectrum. */
  mnlm?: number
  /** Maximum number of hashes each peak can lead to. */
  mppp?: number
  /** Size of the FFT window, in samples. */
  nfft?: number
  /** Number of samples between the starts of two consecutive frames. */
  step?: number
  /** Duration of one step, in seconds. */
  dt?: number
  /** Window coefficients multiplied with the samples of each frame. */
  hwin?: number[]
  /** Value added to every threshold entry after each frame (log domain). */
  maskDecayLog?: number
  /** First frequency bin (inclusive) considered when looking for peaks. */
  ifMin?: number
  /** Last frequency bin (exclusive) considered when looking for peaks. */
  ifMax?: number
  /** Maximum distance, in frequency bins, between two paired peaks. */
  windowDf?: number
  /** Maximum distance, in frames, between two paired peaks. */
  windowDt?: number
  /** Number of frames during which a peak can still be pruned. */
  pruningDt?: number
  /** Decay scale of the masks on the frequency axis, in bins. */
  maskDf?: number
  /** Precomputed log-domain masks, indexed as `eww[peakBin][bin]`. */
  eww?: number[][]
}
/**
 * Resolves user supplied options into a complete configuration.
 *
 * Each missing (null or undefined) field is replaced by its default value.
 * Defaults that derive from other options (`step`, `dt`, `hwin`, `ifMax`,
 * `eww`) are computed from the already resolved values, so overriding `nfft`
 * or `samplingRate` also changes them.
 *
 * @param options - Partial configuration provided by the caller.
 * @returns The configuration with every field populated.
 */
const buildOptions = (options: CodegenUserOpts): CodegenOptions => {
  const verbose = options.verbose ?? false

  // sampling rate in Hz. If you change this, you must adapt windowDt and pruningDt below to match your needs.
  // set the Nyquist frequency, samplingRate / 2,
  // so as to match the max frequencies you want to get landmark fingerprints.
  const samplingRate = options.samplingRate ?? 22050

  // bytes per sample, 2 for 16 bit PCM. If you change this, you must change the 16 bit reads in the code.
  const bps = options.bps ?? 2

  // maximum number of local maxima for each spectrum. useful to tune the amount of fingerprints at output
  const mnlm = options.mnlm ?? 5

  // maximum of hashes each peak can lead to. useful to tune the amount of fingerprints at output
  const mppp = options.mppp ?? 3

  // size of the FFT window. As we use real signals, the spectra will have nfft / 2 points.
  // Increasing it will give more spectral precision, less temporal precision.
  // It may be good or bad depending on the sounds you want to match,
  // and on whether your input is deformed by EQ or noise.
  const nfft = options.nfft ?? 512
  const halfNfft = nfft / 2

  // 50 % overlap.
  // if samplingRate is 22050 Hz, this leads to a frame rate
  // fs = (samplingRate / step) = 86/s, or dt = 1 / fs = 11.61 ms.
  // It's not really useful to change the overlap ratio.
  const step = options.step ?? halfNfft
  const dt = options.dt ?? (1 / (samplingRate / step))

  // default window applied to each frame: 0.5 * (1 - cos(2 * pi * i / (nfft - 1)))
  const hwin = options.hwin ??
    new Array<number>(nfft).fill(0).map((_unused, i) => (
      0.5 * (1 - Math.cos(((2 * Math.PI) * i) / (nfft - 1)))
    ))

  // threshold decay factor between frames.
  const maskDecayLog = options.maskDecayLog ?? Math.log(0.995)

  // frequency window to generate landmark pairs, in units of DF = samplingRate / nfft.
  // Values between 0 and nfft / 2.
  // you can increase ifMin to avoid having fingerprints for low frequencies
  const ifMin = options.ifMin ?? 0
  // you don't really want to decrease ifMax, better reduce samplingRate instead for faster computation.
  const ifMax = options.ifMax ?? halfNfft

  // we set this to avoid getting fingerprints linking very different frequencies.
  // useful to reduce the amount of fingerprints. this can be maxed at nfft / 2 if you wish.
  const windowDf = options.windowDf ?? 60

  // time window to generate landmark pairs. time in units of dt (see definition above)
  // a little more than 1 sec.
  const windowDt = options.windowDt ?? 96

  // about 250 ms, window to remove previous peaks that are superseded by later ones.
  // tune the pruningDt value to match the effects of maskDecayLog.
  // also, pruningDt controls the latency of the pipeline. higher pruningDt = higher latency
  const pruningDt = options.pruningDt ?? 24

  // prepare the values of exponential masks.
  // mask decay scale in DF units on the frequency axis.
  const maskDf = options.maskDf ?? 3

  // a gaussian mask is a polynomial when working on the log-spectrum: log(exp(x)) = x.
  // maskDf is multiplied by Math.sqrt(i + 3) to have wider masks at higher frequencies.
  // see the visualization out-thr.png for better insight of what is happening
  const eww = options.eww ??
    new Array<number>(halfNfft).fill(0).map((_unusedRow, i) => (
      new Array<number>(halfNfft).fill(0).map((_unusedCol, j) => (
        -0.5 * Math.pow((j - i) / maskDf / Math.sqrt(i + 3), 2)
      ))
    ))

  return {
    verbose,
    samplingRate,
    bps,
    mnlm,
    mppp,
    nfft,
    step,
    dt,
    hwin,
    maskDecayLog,
    ifMin,
    ifMax,
    windowDf,
    windowDt,
    pruningDt,
    maskDf,
    eww
  }
}
/**
 * Local maxima retained for one analysis frame.
 * `i` and `v` are parallel arrays: `v[k]` is the spectrum value found at
 * frequency bin `i[k]`. A pruned peak has both entries set to
 * `Number.NEGATIVE_INFINITY`.
 */
interface Mark {
  /** Frame index, computed as the rounded ratio of the sample position to the step. */
  t: number
  /** Frequency bins of the retained peaks. */
  i: number[]
  /** Spectrum values of the retained peaks. */
  v: number[]
}
/**
 * Fingerprints produced by one call to `Codegen.process`.
 * `tcodes` and `hcodes` are parallel arrays: entry `k` of each describes the
 * same fingerprint.
 */
export interface CodegenBuffer {
  /** Frame index of the anchor peak of each fingerprint. */
  tcodes: number[]
  /**
   * Hash of each fingerprint, packed as
   * `f2 + (nfft / 2) * (f1 + (nfft / 2) * deltaFrames)` where `f1` is the bin
   * of the anchor peak and `f2` the bin of the paired, earlier peak.
   */
  hcodes: number[]
}
/**
 * Streaming landmark fingerprinter.
 *
 * Feed raw audio with `process(chunk)`; samples are read as little-endian
 * signed 16 bit integers. Bytes that do not yet fill a complete analysis
 * frame are kept and prepended to the next chunk, so chunks may have any size.
 */
class Codegen {
  /** Resolved configuration. */
  options: CodegenOptions
  /** Bytes received that are still needed by upcoming frames. */
  buffer: Uint8Array
  /** Number of bytes already dropped from the front of `buffer`. */
  bufferDelta: number
  /** Absolute index (in samples) of the first sample of the next frame. */
  stepIndex: number
  /** Peaks of the most recent frames, oldest first. */
  marks: Mark[]
  /** Log-domain detection threshold, one entry per frequency bin. */
  threshold: number[]
  /** FFT engine sized for `options.nfft`. */
  fft: FFT

  /**
   * @param options - Optional overrides of the default configuration.
   */
  constructor (options?: CodegenUserOpts) {
    this.options = buildOptions(options ?? {})

    this.buffer = new Uint8Array(0)
    this.bufferDelta = 0

    this.stepIndex = 0
    this.marks = []
    this.threshold = new Array<number>(this.options.nfft).fill(-3)

    this.fft = new FFT(this.options.nfft)
  }

  /**
   * Consumes a chunk of PCM bytes and returns the fingerprints that became
   * final while processing it.
   *
   * @param chunk - Raw audio bytes; may end in the middle of a frame.
   * @returns Parallel arrays of time stamps and hashes (possibly empty).
   */
  process (chunk: Uint8Array): CodegenBuffer {
    const {
      verbose,
      bps,
      mnlm,
      mppp,
      nfft,
      step,
      hwin,
      maskDecayLog,
      ifMin,
      ifMax,
      windowDf,
      windowDt,
      pruningDt,
      eww
    } = this.options

    if (verbose) {
      const t = Math.round(this.stepIndex / step).toString()
      const received = chunk.length.toString()
      console.log(`t=${t} received ${received} bytes`)
    }

    const tcodes: number[] = []
    const hcodes: number[] = []

    // append the new bytes to whatever was left over from the previous call
    const concatedBuffer = new Uint8Array(this.buffer.length + chunk.length)
    concatedBuffer.set(this.buffer, 0)
    concatedBuffer.set(chunk, this.buffer.length)
    this.buffer = concatedBuffer

    const bufferView = new DataView(concatedBuffer.buffer)

    // loop invariants
    const halfNfft = nfft / 2
    const fullScale = Math.pow(2, 8 * bps - 1)

    // buffer.length + bufferDelta is the total number of bytes received so far
    while ((this.stepIndex + nfft) * bps < this.buffer.length + this.bufferDelta) {
      const data: number[] = new Array<number>(nfft) // window data
      const image: number[] = new Array<number>(nfft).fill(0)

      // fill the data, windowed (hwin) and scaled
      for (let i = 0; i < nfft; i += 1) {
        const readInt = bufferView.getInt16((this.stepIndex + i) * bps - this.bufferDelta, true)
        data[i] = (hwin[i] * readInt) / fullScale
      }
      this.stepIndex += step

      this.fft.forward(data, image) // compute FFT
      const spectrum = this.fft.spectrum

      // log-normal surface
      for (let i = ifMin; i < ifMax; i += 1) {
        // the lower part of the spectrum is damped,
        // the higher part is boosted, leading to a better peaks detection.
        spectrum[i] = Math.abs(spectrum[i]) * Math.sqrt(i + 16)
      }

      // positive values of the difference between log spectrum and threshold
      const diff: number[] = new Array<number>(halfNfft)
      for (let i = ifMin; i < ifMax; i += 1) {
        diff[i] = Math.max(Math.log(Math.max(1e-6, spectrum[i])) - this.threshold[i], 0)
      }

      // find at most mnlm local maxima in the spectrum at this timestamp.
      // both lists are kept sorted by decreasing spectrum value.
      const iLocMax: number[] = new Array<number>(mnlm).fill(NaN)
      const vLocMax: number[] = new Array<number>(mnlm).fill(Number.NEGATIVE_INFINITY)
      for (let i = ifMin + 1; i < ifMax - 1; i += 1) {
        const value = spectrum[i]
        if (
          diff[i] > diff[i - 1] &&
          diff[i] > diff[i + 1] &&
          value > vLocMax[mnlm - 1]
        ) { // if local maximum big enough
          // walk up the table to the first slot whose predecessor is not smaller
          let slot = mnlm - 1
          while (slot >= 1 && value > vLocMax[slot - 1]) slot -= 1
          // offset the bottom values, dropping the smallest one
          for (let k = mnlm - 1; k >= slot + 1; k -= 1) {
            iLocMax[k] = iLocMax[k - 1]
            vLocMax[k] = vLocMax[k - 1]
          }
          iLocMax[slot] = i
          vLocMax[slot] = value
        }
      }

      // now that we have the mnlm highest local maxima of the spectrum,
      // update the local maximum threshold so that only major peaks are taken into account.
      for (let i = 0; i < mnlm; i += 1) {
        if (vLocMax[i] > Number.NEGATIVE_INFINITY) {
          const peakBin = iLocMax[i]
          const logPeak = Math.log(spectrum[peakBin])
          const mask = eww[peakBin]
          for (let j = ifMin; j < ifMax; j += 1) {
            this.threshold[j] = Math.max(this.threshold[j], logPeak + mask[j])
          }
        } else {
          // fewer than mnlm maxima were found: remove the unused trailing slots.
          vLocMax.splice(i, mnlm - i)
          iLocMax.splice(i, mnlm - i)
          break
        }
      }

      // array that stores local maxima for each time step
      this.marks.push({ t: Math.round(this.stepIndex / step), i: iLocMax, v: vLocMax })

      // remove previous (in time) maxima that would be too close and/or too low.
      const nm = this.marks.length
      const t0 = nm - pruningDt - 1
      for (let i = nm - 1; i >= Math.max(t0 + 1, 0); i -= 1) {
        const mark = this.marks[i]
        // the threshold has decayed by this much since the frame was recorded
        const decay = maskDecayLog * (nm - 1 - i)
        for (let j = 0; j < mark.v.length; j += 1) {
          if (
            mark.i[j] !== 0 &&
            Math.log(mark.v[j]) < this.threshold[mark.i[j]] + decay
          ) {
            // -Infinity marks the peak as pruned; it can no longer match the pairing test below
            mark.v[j] = Number.NEGATIVE_INFINITY
            mark.i[j] = Number.NEGATIVE_INFINITY
          }
        }
      }

      // generate hashes for peaks that can no longer be pruned. stepIndex:{f1:f2:deltaindex}
      let nFingersTotal = 0
      if (t0 >= 0) {
        const m = this.marks[t0]

        for (let i = 0; i < m.i.length; i += 1) {
          let nFingers = 0
          let canBreak = false

          // pair the anchor peak with peaks of the same and earlier frames
          for (let j = t0; j >= Math.max(0, t0 - windowDt); j -= 1) {
            if (canBreak) break
            const m2 = this.marks[j]

            for (let k = 0; k < m2.i.length; k += 1) {
              if (canBreak) break
              if (m2.i[k] !== m.i[i] && Math.abs(m2.i[k] - m.i[i]) < windowDf) {
                tcodes.push(m.t)
                hcodes.push(m2.i[k] + halfNfft * (m.i[i] + halfNfft * (t0 - j)))
                nFingers += 1
                nFingersTotal += 1
                // stop once this anchor has produced its quota of hashes
                if (nFingers >= mppp) canBreak = true
              }
            }
          }
        }
      }
      if (nFingersTotal > 0 && verbose) {
        console.log(`t=${Math.round(this.stepIndex / step)} generated ${nFingersTotal} fingerprints`)
      }

      // forget frames that are too old to be paired with future anchors
      this.marks.splice(0, t0 + 1 - windowDt)

      // decrease the threshold for the next iteration
      for (let j = 0; j < this.threshold.length; j += 1) {
        this.threshold[j] += maskDecayLog
      }
    }

    // drop the bytes located before the next frame: they will never be read again.
    // the amount is derived from the read position (not a fixed tail size) so that
    // no unread sample is lost whatever nfft is, and clamped so that
    // buffer.length + bufferDelta stays equal to the total number of bytes received.
    const consumed = Math.min(
      this.buffer.length,
      Math.max(0, this.stepIndex * bps - this.bufferDelta)
    )
    if (consumed > 0) {
      this.bufferDelta += consumed
      this.buffer = this.buffer.slice(consumed)
    }

    return { tcodes, hcodes }
  }
}
export default Codegen