Java实现Shazam声音识别算法的实例代码

2021-05-30 17:20llhhzz1989 Java教程

Shazam算法采用傅里叶变换将时域信号转换为频域信号，并获得音频指纹，最后匹配指纹契合度来识别音频。这篇文章给大家介绍Java实现Shazam声音识别算法的实例代码，需要的朋友参考下吧

Shazam算法采用傅里叶变换将时域信号转换为频域信号，并获得音频指纹，最后匹配指纹契合度来识别音频。

1、AudioSystem获取音频

奈奎斯特-香农采样定理告诉我们，为了能捕获人类能听到的声音频率，采样率必须至少是人类听觉范围内最高频率的两倍。人类能听到的声音频率范围大约在20Hz到20000Hz之间，所以在录制音频的时候采样率大多是44100Hz。这是大多数标准MPEG-1的采样率。44100这个值最初来源于索尼，因为它可以允许音频在修改过的视频设备上以25帧（PAL）或者30帧（NTSC）每秒进行录制，而且也覆盖了专业录音设备的20000Hz带宽。所以当你在选择录音的频率时，选择44100Hz就好了。

定义音频格式：

				?

									// Sampling rate in Hz; first argument of the audioformat built by getformat().
									public static float samplerate = 44100;

									// Size of one sample in bits; second argument of the audioformat built by getformat().
									public static int samplesizeinbits = 16;

									// Number of audio channels; third argument of the audioformat built by getformat().
									public static int channels = 2; // two channels (stereo)

									public static boolean signed = true; // indicates whether the data is signed or unsigned

									public static boolean bigendian = true; // indicates whether the audio data is stored in big-endian or little-endian order

									public audioformat getformat() {

									  return new audioformat(samplerate, samplesizeinbits, channels, signed,

									      bigendian);

									}

调用麦克风获取音频，保存到out中

				?

									// Buffer for the captured audio bytes; presumably filled by
									// fileanalysis.getdatatoout — confirm against that class.
									public static bytearrayoutputstream out = new bytearrayoutputstream();
									   try {
									     audioformat format = smartauto.getformat(); // fill audioformat with the settings
									     dataline.info info = new dataline.info(targetdataline.class, format);
									     // remember when recording started; the value is handed to checktime below
									     starttime = new date().gettime();
									     system.out.println(starttime);
									     // request a capture line matching the format, then open and start it
									     smartauto.line = (targetdataline) audiosystem.getline(info);
									     smartauto.line.open(format);
									     smartauto.line.start();
									     new fileanalysis().getdatatoout("");
									     // loop until smartauto.running becomes false
									     // NOTE(review): presumably checktime or another thread clears the flag — confirm
									     while (smartauto.running) {
									       checktime(starttime);
									     }
									   } catch (throwable e) {
									     e.printstacktrace();
									   } finally {
									     // Release the capture line on every path: previously stop()/close() were
									     // skipped whenever getline, open or the polling loop threw, leaving the
									     // line open. The null check covers a failure before the line was obtained.
									     if (smartauto.line != null) {
									       smartauto.line.stop();
									       smartauto.line.close();
									     }
									   }

获取到的out数据需要通过傅里叶变换，从时域信号转换为频域信号。

傅里叶变换

				?

									public complex[] fft(complex[] x) {

									    int n = x.length;

									    // 因为exp(-2i*n*pi)=1，n=1时递归原点

									    if (n == 1){

									      return x;

									    }

									    // 如果信号数为奇数，使用dft计算

									    if (n % 2 != 0) {

									      return dft(x);

									    }

									    // 提取下标为偶数的原始信号值进行递归fft计算

									    complex[] even = new complex[n / 2];

									    for (int k = 0; k < n / 2; k++) {

									      even[k] = x[2 * k];

									    }

									    complex[] evenvalue = fft(even);

									    // 提取下标为奇数的原始信号值进行fft计算

									    // 节约内存

									    complex[] odd = even;

									    for (int k = 0; k < n / 2; k++) {

									      odd[k] = x[2 * k + 1];

									    }

									    complex[] oddvalue = fft(odd);

									    // 偶数+奇数

									    complex[] result = new complex[n];

									    for (int k = 0; k < n / 2; k++) {

									      // 使用欧拉公式e^(-i*2pi*k/n) = cos(-2pi*k/n) + i*sin(-2pi*k/n)

									      double p = -2 * k * math.pi / n;

									      complex m = new complex(math.cos(p), math.sin(p));

									      result[k] = evenvalue[k].add(m.multiply(oddvalue[k]));

									      // exp(-2*(k+n/2)*pi/n) 相当于 -exp(-2*k*pi/n)，其中exp(-n*pi)=-1(欧拉公式);

									      result[k + n / 2] = evenvalue[k].subtract(m.multiply(oddvalue[k]));

									    }

									    return result;

									  }

计算out的频域值

				?

									private void setfftresult(){

									   byte audio[] = smartauto.out.tobytearray();

									   final int totalsize = audio.length;

									   system.out.println("totalsize = " + totalsize);

									   int chenksize = 4;

									   int amountpossible = totalsize/chenksize;

									   //when turning into frequency domain we'll need complex numbers: 

									   smartauto.results = new complex[amountpossible][];

									   dftoperate dfaoperate = new dftoperate();

									   //for all the chunks: 

									   for(int times = 0;times < amountpossible; times++) {

									     complex[] complex = new complex[chenksize];

									     for(int i = 0;i < chenksize;i++) {

									       //put the time domain data into a complex number with imaginary part as 0: 

									       complex[i] = new complex(audio[(times*chenksize)+i], 0);

									     }

									     //perform fft analysis on the chunk: 

									     smartauto.results[times] = dfaoperate.fft(complex);

									   }

									   system.out.println("results = " + smartauto.results.tostring());

									 }