/*
* SACD Decoder plugin
* Copyright (c) 2011-2012 Maxim V.Anisiutkin <maxim.anisiutkin@gmail.com>
* Optimization update Damien Plisson <damien78@audirvana.com>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/

#include <math.h>
#include <memory.h>
#include "dsdpcm_converter_double.h"

#ifdef __APPLE__
#include <Accelerate/Accelerate.h>
#endif

// Configure the table-lookup DSD FIR stage and (re)allocate its per-channel history.
//
//   fir_ctables  lookup tables indexed as [table][dsd_byte]; the pointer is stored, not copied
//   fir_length   filter length in taps; kept internally as CTABLES(fir_length)
//                (presumably the number of 8-tap tables, i.e. the history length in bytes)
//   channels     number of channels to allocate history for
//   decimation   decimation expressed in DSD bits; stored divided by 8, which is the
//                number of input bytes run() consumes per channel per output sample
//
// Buffers from an earlier init() are released first. Each history buffer holds two
// copies of the window (2 * fir_size bytes) because run() mirrors every byte at
// fir_index and fir_index + fir_length, which keeps a contiguous window available
// at any fir_index. Allocation failure is not checked.
void dsdpcm_fir_d::init(ctable_d* fir_ctables, int fir_length, int channels, int decimation) {
	// Release anything a previous configuration left behind.
	free();

	this->fir_ctables = fir_ctables;
	this->fir_order   = fir_length - 1;
	this->fir_length  = CTABLES(fir_length);
	this->fir_size    = this->fir_length;
	this->channels    = channels;
	this->decimation  = decimation / 8;
	fir_index = 0;

	const int history_bytes = 2 * this->fir_size * sizeof(uint8_t);
	for (int c = 0; c < this->channels; ++c) {
#ifdef _WIN32
		uint8_t* history = static_cast<uint8_t*>(_aligned_malloc(history_bytes, 64));
#else
		uint8_t* history = static_cast<uint8_t*>(valloc(history_bytes));
#endif
		// 0xAA is an alternating bit pattern (1010...), used as the initial history.
		memset(history, 0xAA, history_bytes);
		this->fir_buffer[c] = history;
	}
}

// Release every allocated history buffer and reset its slot to NULL.
// All DSDPCM_MAX_CHANNELS slots are visited, so this is safe to call repeatedly
// and regardless of how many channels were configured.
void dsdpcm_fir_d::free() {
	for (int slot = 0; slot < DSDPCM_MAX_CHANNELS; ++slot) {
		if (!fir_buffer[slot]) {
			continue;	// nothing allocated for this channel
		}
#ifdef _WIN32
		_aligned_free(fir_buffer[slot]);
#else
		::free(fir_buffer[slot]);	// global free(), not this member function
#endif
		fir_buffer[slot] = NULL;
	}
}

// Returns half the filter order, divided by 8 and by the stored (byte-based)
// decimation. Presumably this is the stage's group delay expressed in output
// samples -- confirm against the filter design.
float dsdpcm_fir_d::get_delay() {
	const float half_order = static_cast<float>(fir_order) / 2;
	return half_order / 8 / decimation;
}

// Filter and decimate interleaved DSD bytes into per-channel PCM samples.
//
//   dsd_data     input bytes, interleaved by channel (one byte per channel in turn)
//   pcm_data     output, written as pcm_data[channel][sample]
//   dsd_samples  number of input bytes across all channels
//
// Returns the number of output samples produced per channel, which is
// dsd_samples / decimation / channels (integer division; leftover bytes are ignored).
// The history position (fir_index) persists between calls.
int dsdpcm_fir_d::run(uint8_t* dsd_data, dsdpcm_pcmtemp_d pcm_data, int dsd_samples) {
	int pcm_samples = dsd_samples / decimation / channels;
	for (int sample = 0; sample < pcm_samples; sample++) {
		// Shift in `decimation` new bytes per channel.
		for (int i = 0; i < decimation; i++) {
			for (int ch = 0; ch < channels; ch++) {
				const uint8_t dsd_byte = *(dsd_data++);
				// Mirror the byte one window length ahead so that a contiguous
				// fir_length window is readable from any fir_index.
				fir_buffer[ch][fir_index] = dsd_byte;
				fir_buffer[ch][fir_index + fir_length] = dsd_byte;
			}
			// Single modification of fir_index; the former `fir_index = (++fir_index) % ...`
			// wrote it twice in one expression, which is undefined before C++11.
			fir_index = (fir_index + 1) % fir_length;
		}
		// One table lookup per history byte replaces eight multiply-accumulates.
		for (int ch = 0; ch < channels; ch++) {
			const uint8_t* window = fir_buffer[ch] + fir_index;
			double acc = 0;
			for (int j = 0; j < fir_length; j++) {
				acc += fir_ctables[j][window[j]];
			}
			pcm_data[ch][sample] = acc;
		}
	}
	return pcm_samples;
}

// Configure the PCM-to-PCM decimating FIR stage and (re)allocate its per-channel history.
//
//   fir_coefs    filter coefficients, fir_length entries; the pointer is stored, not copied
//   fir_length   number of taps
//   channels     number of channels to allocate history for
//   decimation   input samples consumed per output sample
//
// Buffers from an earlier init() are released first. Each history buffer holds
// 2 * fir_size doubles, zero-filled, because run() mirrors every sample at
// fir_index and fir_index + fir_length. Allocation failure is not checked.
void pcmpcm_fir_d::init(double* fir_coefs, int fir_length, int channels, int decimation) {
	// Release anything a previous configuration left behind.
	free();

	this->fir_coefs  = fir_coefs;
	this->fir_order  = fir_length - 1;
	this->fir_length = fir_length;
	this->fir_size   = this->fir_length;
	this->channels   = channels;
	this->decimation = decimation;
	fir_index = 0;

	const int history_bytes = 2 * this->fir_size * sizeof(double);
	for (int c = 0; c < this->channels; ++c) {
#ifdef _WIN32
		double* history = static_cast<double*>(_aligned_malloc(history_bytes, 64));
#else
		double* history = static_cast<double*>(valloc(history_bytes));
#endif
		memset(history, 0, history_bytes);
		this->fir_buffer[c] = history;
	}
}

// Release every allocated history buffer and reset its slot to NULL.
// All DSDPCM_MAX_CHANNELS slots are visited, so repeated calls are harmless.
void pcmpcm_fir_d::free() {
	for (int slot = 0; slot < DSDPCM_MAX_CHANNELS; ++slot) {
		if (!fir_buffer[slot]) {
			continue;	// nothing allocated for this channel
		}
#ifdef _WIN32
		_aligned_free(fir_buffer[slot]);
#else
		::free(fir_buffer[slot]);	// global free(), not this member function
#endif
		fir_buffer[slot] = NULL;
	}
}

// Returns half the filter order divided by the decimation factor.
// Presumably the stage's group delay in output samples -- confirm against the filter design.
float pcmpcm_fir_d::get_delay() {
	const float half_order = static_cast<float>(fir_order) / 2;
	return half_order / decimation;
}

// Filter and decimate per-channel PCM samples.
//
//   pcm_data     input, read as pcm_data[channel][sample]
//   out_data     output, written as out_data[channel][sample]
//   pcm_samples  number of input samples per channel
//
// Returns the number of output samples per channel, pcm_samples / decimation
// (integer division; leftover input samples are ignored).
// The history position (fir_index) persists between calls.
int pcmpcm_fir_d::run(dsdpcm_pcmtemp_d pcm_data, dsdpcm_pcmtemp_d out_data, int pcm_samples) {
	int out_samples = pcm_samples / decimation;
	for (int sample = 0; sample < out_samples; sample++) {
		// Shift in `decimation` new input samples per channel.
		for (int i = 0; i < decimation; i++) {
			for (int ch = 0; ch < channels; ch++) {
				const double in_sample = pcm_data[ch][sample * decimation + i];
				// Mirror the sample one window length ahead so that a contiguous
				// fir_length window is readable from any fir_index.
				fir_buffer[ch][fir_index] = in_sample;
				fir_buffer[ch][fir_index + fir_length] = in_sample;
			}
			// Single modification of fir_index; the former `fir_index = (++fir_index) % ...`
			// wrote it twice in one expression, which is undefined before C++11.
			fir_index = (fir_index + 1) % fir_length;
		}
		for (int ch = 0; ch < channels; ch++) {
#ifdef __APPLE__
			// Accelerate dot product of the coefficients with the current window.
			vDSP_dotprD(fir_coefs, 1, &fir_buffer[ch][fir_index], 1, &out_data[ch][sample], fir_length);
#else
			const double* window = &fir_buffer[ch][fir_index];
			double acc = 0;
			for (int j = 0; j < fir_length; j++) {
				acc += fir_coefs[j] * window[j];
			}
			out_data[ch][sample] = acc;
#endif
		}
	}
	return out_samples;
}

// Storage for the converter's class-wide filter tables. They are filled by
// preinit() and handed to the filter stages by init().
// DSD first-stage lookup tables, indexed [table][dsd_byte] (256 entries per table):
double dsdpcm_converter_d::dsd_fir1_8_ctables[CTABLES(DSDFIR1_8_LENGTH)][256];
double dsdpcm_converter_d::dsd_fir1_16_ctables[CTABLES(DSDFIR1_16_LENGTH)][256];
double dsdpcm_converter_d::dsd_fir1_64_ctables[CTABLES(DSDFIR1_64_LENGTH)][256];
// Scaled coefficients for the PCM-to-PCM stages:
double dsdpcm_converter_d::pcm_fir2_2_coefs[PCMFIR2_2_LENGTH];
double dsdpcm_converter_d::pcm_fir3_2_coefs[PCMFIR3_2_LENGTH];

// Forwards conv_type to the dsdpcm_conv_impl_t base constructor and does nothing else;
// the filter stages are configured later by init().
dsdpcm_converter_d::dsdpcm_converter_d(conv_type_t conv_type) : dsdpcm_conv_impl_t(conv_type) {
}

// Empty destructor body: nothing is released explicitly here.
dsdpcm_converter_d::~dsdpcm_converter_d() {
}

// Select the conversion mode for the given rates and configure the filter chain.
//
//   channels        number of audio channels
//   dsd_samplerate  input rate; DSDxFs64 and DSDxFs128 are accepted
//   pcm_samplerate  output rate; DSDxFs1, DSDxFs2, DSDxFs4 and DSDxFs8 are accepted
//
// Returns 0 on success, -1 for an unsupported dsd_samplerate or converter type,
// and -2 for an unsupported pcm_samplerate. On an error return the channel/rate
// members have already been overwritten but no filter has been reconfigured.
//
// NOTE(review): the first-call guard around preinit() is a plain static bool with
// no locking visible here.
int dsdpcm_converter_d::init(int channels, int dsd_samplerate, int pcm_samplerate) {
	// Build the shared coefficient tables the first time any converter is initialised.
	static bool tables_built = false;
	if (!tables_built) {
		preinit();
		tables_built = true;
	}

	this->channels = channels;
	this->dsd_samplerate = dsd_samplerate;
	this->pcm_samplerate = pcm_samplerate;

	// Resolve the (input rate, output rate) pair to a conversion mode.
	bool is_dsd64;
	switch (dsd_samplerate) {
	case DSDxFs64:
		is_dsd64 = true;
		break;
	case DSDxFs128:
		is_dsd64 = false;
		break;
	default:
		return -1;
	}
	switch (pcm_samplerate) {
	case DSDxFs1:
		conv_mode = is_dsd64 ? DSD64_44100 : DSD128_44100;
		break;
	case DSDxFs2:
		conv_mode = is_dsd64 ? DSD64_88200 : DSD128_88200;
		break;
	case DSDxFs4:
		conv_mode = is_dsd64 ? DSD64_176400 : DSD128_176400;
		break;
	case DSDxFs8:
		conv_mode = is_dsd64 ? DSD64_352800 : DSD128_352800;
		break;
	default:
		return -2;
	}

	float dB_gain_adjust;
	switch (conv_type) {
	case DSDPCM_CONV_MULTISTAGE_DOUBLE:
		dB_gain_adjust = 18.0f;
		// First stage: DSD -> PCM at 1/8 or 1/16 of the DSD rate.
		switch (conv_mode) {
		case DSD64_88200:
		case DSD64_176400:
		case DSD64_352800:
			dsd_fir1.init(dsd_fir1_8_ctables, DSDFIR1_8_LENGTH, channels, 8);
			break;
		case DSD64_44100:
		case DSD128_44100:
		case DSD128_88200:
		case DSD128_176400:
		case DSD128_352800:
			dsd_fir1.init(dsd_fir1_16_ctables, DSDFIR1_16_LENGTH, channels, 16);
			break;
		}
		// Remaining stages: zero to three further halvings, plus the accumulated delay.
		switch (conv_mode) {
		case DSD64_352800:
		case DSD128_352800:
			delay = dsd_fir1.get_delay();
			break;
		case DSD64_176400:
		case DSD128_176400:
			pcm_fir3.init(pcm_fir3_2_coefs, PCMFIR3_2_LENGTH, channels, 2);
			delay = dsd_fir1.get_delay() / pcm_fir3.get_decimation() + pcm_fir3.get_delay();
			break;
		case DSD64_44100:
		case DSD64_88200:
		case DSD128_88200:
			pcm_fir2a.init(pcm_fir2_2_coefs, PCMFIR2_2_LENGTH, channels, 2);
			pcm_fir3.init(pcm_fir3_2_coefs, PCMFIR3_2_LENGTH, channels, 2);
			delay = (dsd_fir1.get_delay() / pcm_fir2a.get_decimation() + pcm_fir2a.get_delay()) / pcm_fir3.get_decimation() + pcm_fir3.get_delay();
			break;
		case DSD128_44100:
			pcm_fir2a.init(pcm_fir2_2_coefs, PCMFIR2_2_LENGTH, channels, 2);
			pcm_fir2b.init(pcm_fir2_2_coefs, PCMFIR2_2_LENGTH, channels, 2);
			pcm_fir3.init(pcm_fir3_2_coefs, PCMFIR3_2_LENGTH, channels, 2);
			delay = ((dsd_fir1.get_delay() / pcm_fir2a.get_decimation() + pcm_fir2a.get_delay()) / pcm_fir2b.get_decimation() + pcm_fir2b.get_delay()) / pcm_fir3.get_decimation() + pcm_fir3.get_delay();
			break;
		}
		break;
	case DSDPCM_CONV_DIRECT_DOUBLE:
		dB_gain_adjust = 0.0f;
		// First stage: always the 64-table filter; only the decimation differs.
		switch (conv_mode) {
		case DSD64_352800:
			dsd_fir1.init(dsd_fir1_64_ctables, DSDFIR1_64_LENGTH, channels, 8);
			break;
		case DSD64_176400:
		case DSD128_352800:
			dsd_fir1.init(dsd_fir1_64_ctables, DSDFIR1_64_LENGTH, channels, 16);
			break;
		case DSD64_44100:
		case DSD64_88200:
		case DSD128_176400:
			dsd_fir1.init(dsd_fir1_64_ctables, DSDFIR1_64_LENGTH, channels, 32);
			break;
		case DSD128_44100:
		case DSD128_88200:
			dsd_fir1.init(dsd_fir1_64_ctables, DSDFIR1_64_LENGTH, channels, 64);
			break;
		}
		// Only the 44.1 kHz outputs need one extra halving stage.
		switch (conv_mode) {
		case DSD64_44100:
		case DSD128_44100:
			pcm_fir3.init(pcm_fir3_2_coefs, PCMFIR3_2_LENGTH, channels, 2);
			delay = dsd_fir1.get_delay() / pcm_fir3.get_decimation() + pcm_fir3.get_delay();
			break;
		case DSD64_88200:
		case DSD64_176400:
		case DSD64_352800:
		case DSD128_88200:
		case DSD128_176400:
		case DSD128_352800:
			delay = dsd_fir1.get_delay();
			break;
		}
		break;
	default:
		return -1;
	}

	// Base gain from the per-type dB adjustment; set_gain() scales relative to gain0.
	gain0 = pow(10.0f, dB_gain_adjust / 20.0f);
	gain = (double)gain0;
	conv_called = false;
	return 0;
}

// Returns the chain delay computed by init(), reduced by one.
float dsdpcm_converter_d::get_delay() {
	return delay - 1.0f;
}

// Reports whether a conversion has completed since the last successful init():
// init() clears the flag and convert_internal() sets it after running the filters.
bool dsdpcm_converter_d::is_convert_called() {
	return conv_called;
}

// Convert DSD bytes to interleaved 32-bit integer PCM.
//
//   dsd_data     interleaved DSD input bytes
//   pcm_data     output, interleaved as frame * channels + channel
//   dsd_samples  number of input bytes across all channels
//
// Returns the number of values written (frames * channels). Each filtered value is
// truncated to int32_t as-is; unlike the float and double overloads, no gain is applied.
// NOTE(review): confirm that the missing gain/scaling here is intentional.
int dsdpcm_converter_d::convert(uint8_t* dsd_data, int32_t* pcm_data, int dsd_samples) {
	const int frames = convert_internal(dsd_data, pcm_tempo, dsd_samples);
	int32_t* out = pcm_data;
	for (int frame = 0; frame < frames; ++frame) {
		for (int ch = 0; ch < channels; ++ch) {
			*out++ = static_cast<int32_t>(pcm_tempo[ch][frame]);
		}
	}
	return frames * channels;
}

// Convert DSD bytes to interleaved single-precision PCM, applying the current gain.
//
//   dsd_data     interleaved DSD input bytes
//   pcm_data     output, interleaved as frame * channels + channel
//   dsd_samples  number of input bytes across all channels
//
// Returns the number of values written (frames * channels).
// On Apple builds the gain is applied in place to pcm_tempo before narrowing;
// the portable path leaves pcm_tempo unscaled.
int dsdpcm_converter_d::convert(uint8_t* dsd_data, float* pcm_data, int dsd_samples) {
	const int frames = convert_internal(dsd_data, pcm_tempo, dsd_samples);

#ifdef __APPLE__
	for (int ch = 0; ch < channels; ch++) {
		// Scale the channel, then convert double -> float with an output stride of `channels`.
		vDSP_vsmulD(pcm_tempo[ch], 1, &gain, pcm_tempo[ch], 1, frames);
		vDSP_vdpsp(pcm_tempo[ch], 1, pcm_data + ch, channels, frames);
	}
#else
	float* out = pcm_data;
	for (int frame = 0; frame < frames; ++frame) {
		for (int ch = 0; ch < channels; ++ch) {
			*out++ = static_cast<float>(gain * pcm_tempo[ch][frame]);
		}
	}
#endif
	return frames * channels;
}

// Convert DSD bytes to interleaved double-precision PCM, applying the current gain.
//
//   dsd_data     interleaved DSD input bytes
//   pcm_data     output, interleaved as frame * channels + channel
//   dsd_samples  number of input bytes across all channels
//
// Returns the number of values written (frames * channels).
int dsdpcm_converter_d::convert(uint8_t* dsd_data, double* pcm_data, int dsd_samples) {
	const int frames = convert_internal(dsd_data, pcm_tempo, dsd_samples);

#ifdef __APPLE__
	if (channels == 2) {
		// Stereo fast path: treat the two channels as a split-complex vector,
		// interleave it into the output, then scale the interleaved buffer.
		DSPDoubleSplitComplex input = { pcm_tempo[0], pcm_tempo[1] };

		vDSP_ztocD(&input, 1, (DSPDoubleComplex*)pcm_data, 2, frames);
		vDSP_vsmulD(pcm_data, 1, &gain, pcm_data, 1, frames * 2);
		return frames * channels;
	}
#endif
	double* out = pcm_data;
	for (int frame = 0; frame < frames; ++frame) {
		for (int ch = 0; ch < channels; ++ch) {
			*out++ = static_cast<double>(gain * pcm_tempo[ch][frame]);
		}
	}
	return frames * channels;
}

// Set the output gain to the base gain chosen by init() (gain0) scaled by dB_gain decibels.
// The product is formed in a single expression so its arithmetic type stays unchanged.
void dsdpcm_converter_d::set_gain(float dB_gain) {
	gain = static_cast<double>(gain0 * pow(10.0f, dB_gain / 20.0f));
}

// Build the per-byte lookup tables for a DSD FIR filter.
//
//   fir_coefs    integer filter coefficients, fir_length entries (read only)
//   fir_length   number of taps
//   fir_ctables  output, CTABLES(fir_length) tables of 256 entries each
//
// Table t covers taps t*8 .. t*8+7 (fewer in the last table when fir_length is not
// a multiple of 8). Entry [t][b] is the sum over those taps of +coef or -coef,
// depending on whether the matching bit of b is 1 or 0, with bits taken MSB first,
// coefficients taken in reverse order, and every term scaled by NORM_D.
// Returns the number of tables written.
int dsdpcm_converter_d::set_ctables(int32_t* fir_coefs, int fir_length, ctable_d* fir_ctables) {
	const int table_count = CTABLES(fir_length);
	for (int table = 0; table < table_count; ++table) {
		const int first_tap = table * 8;
		const int remaining = fir_length - first_tap;
		// Number of taps this table covers, clamped to [0, 8].
		const int taps = remaining > 8 ? 8 : (remaining < 0 ? 0 : remaining);
		for (int pattern = 0; pattern < 256; ++pattern) {
			double sum = (double)0;
			for (int bit = 0; bit < taps; ++bit) {
				// Bit set -> +1, bit clear -> -1.
				const int sign = ((pattern >> (7 - bit)) & 1) * 2 - 1;
				sum += (double)sign * (double)fir_coefs[fir_length - 1 - (first_tap + bit)] * NORM_D;
			}
			fir_ctables[table][pattern] = sum;
		}
	}
	return table_count;
}

// Convert integer filter coefficients to doubles: the order is reversed and
// every value is scaled by NORM_D.
//
//   int_coefs   source coefficients, fir_length entries
//   fir_length  number of coefficients
//   real_coefs  destination, fir_length entries
void dsdpcm_converter_d::set_coefs(const int32_t* int_coefs, int fir_length, double* real_coefs) {
	for (int dst = 0, src = fir_length - 1; dst < fir_length; ++dst, --src) {
		real_coefs[dst] = (double)int_coefs[src] * NORM_D;
	}
}

// Fill all class-wide filter tables from the integer coefficient sets.
// init() calls this once, guarded by a function-local static flag.
void dsdpcm_converter_d::preinit() {
	// Per-byte lookup tables for the three DSD first-stage filters.
	set_ctables((int32_t*)DSDFIR1_8_COEFS, DSDFIR1_8_LENGTH, (ctable_d*)dsd_fir1_8_ctables);
	set_ctables((int32_t*)DSDFIR1_16_COEFS, DSDFIR1_16_LENGTH, (ctable_d*)dsd_fir1_16_ctables);
	set_ctables((int32_t*)DSDFIR1_64_COEFS, DSDFIR1_64_LENGTH, (ctable_d*)dsd_fir1_64_ctables);
	// Scaled, reversed coefficients for the PCM-to-PCM stages.
	set_coefs(PCMFIR2_2_COEFS, PCMFIR2_2_LENGTH, pcm_fir2_2_coefs);
	set_coefs(PCMFIR3_2_COEFS, PCMFIR3_2_LENGTH, pcm_fir3_2_coefs);
}

// Run the filter chain selected by conv_type / conv_mode.
//
//   dsd_data     interleaved DSD input bytes
//   pcm_data     destination of the final stage, written as pcm_data[channel][sample]
//   dsd_samples  number of input bytes across all channels
//
// Intermediate stages pass data through pcm_temp1 / pcm_temp2 / pcm_temp3.
// Returns the per-channel sample count reported by the last stage, or -1 when
// conv_type or conv_mode is not a supported value. conv_called is set only
// after a chain has actually run.
//
// An unrecognised conv_mode used to fall through every inner case and return an
// uninitialised pcm_samples; it now returns -1, matching the unsupported-type path.
int dsdpcm_converter_d::convert_internal(uint8_t* dsd_data, dsdpcm_pcmtemp_d pcm_data, int dsd_samples) {
	int pcm_samples = 0;
	switch (conv_type) {
	case DSDPCM_CONV_MULTISTAGE_DOUBLE:
		switch (conv_mode) {
		case DSD64_352800:
		case DSD128_352800:
			// First stage only.
			pcm_samples = dsd_fir1.run(dsd_data, pcm_data, dsd_samples);
			break;
		case DSD64_176400:
		case DSD128_176400:
			// First stage plus one halving stage.
			pcm_samples = dsd_fir1.run(dsd_data, pcm_temp1, dsd_samples);
			pcm_samples = pcm_fir3.run(pcm_temp1, pcm_data, pcm_samples);
			break;
		case DSD64_44100:
		case DSD64_88200:
		case DSD128_88200:
			// First stage plus two halving stages.
			pcm_samples = dsd_fir1.run(dsd_data, pcm_temp1, dsd_samples);
			pcm_samples = pcm_fir2a.run(pcm_temp1, pcm_temp2, pcm_samples);
			pcm_samples = pcm_fir3.run(pcm_temp2, pcm_data, pcm_samples);
			break;
		case DSD128_44100:
			// First stage plus three halving stages.
			pcm_samples = dsd_fir1.run(dsd_data, pcm_temp1, dsd_samples);
			pcm_samples = pcm_fir2a.run(pcm_temp1, pcm_temp2, pcm_samples);
			pcm_samples = pcm_fir2b.run(pcm_temp2, pcm_temp3, pcm_samples);
			pcm_samples = pcm_fir3.run(pcm_temp3, pcm_data, pcm_samples);
			break;
		default:
			return -1;
		}
		break;
	case DSDPCM_CONV_DIRECT_DOUBLE:
		switch (conv_mode) {
		case DSD64_44100:
		case DSD128_44100:
			// First stage plus one halving stage.
			pcm_samples = dsd_fir1.run(dsd_data, pcm_temp1, dsd_samples);
			pcm_samples = pcm_fir3.run(pcm_temp1, pcm_data, pcm_samples);
			break;
		case DSD64_88200:
		case DSD64_176400:
		case DSD64_352800:
		case DSD128_88200:
		case DSD128_176400:
		case DSD128_352800:
			// First stage only.
			pcm_samples = dsd_fir1.run(dsd_data, pcm_data, dsd_samples);
			break;
		default:
			return -1;
		}
		break;
	default:
		return -1;
	}
	conv_called = true;
	return pcm_samples;
}

