From 15f559d816f83e107380b7d7592e63be519ef32f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jan=20Hamal=20Dvo=C5=99=C3=A1k?= <mordae@anilinux.org>
Date: Sat, 3 Aug 2024 09:48:00 +0200
Subject: [PATCH] Optimize decimation performance

---
 src/main.c | 95 +++++++++++++++++++++++++++++++-----------------------
 1 file changed, 55 insertions(+), 40 deletions(-)

diff --git a/src/main.c b/src/main.c
index b35603b..b25a228 100644
--- a/src/main.c
+++ b/src/main.c
@@ -24,7 +24,7 @@
 #define VREG_VOLTAGE VREG_VOLTAGE_1_20
 #define CLK_SYS_HZ (288 * MHZ)
 
-#define INIT_SAMPLE_RATE 100000
+#define INIT_SAMPLE_RATE 200000
 #define INIT_FREQ 94600000
 #define INIT_GAIN 127
 
@@ -64,20 +64,19 @@ static uint32_t nco_addr = (uint32_t)lo_phase;
 #define DECIMATE 16
 #define RX_BITS_DEPTH 8
 #define RX_WORDS (1 << (RX_BITS_DEPTH - 2))
+#define RX_STRIDE (2 * DECIMATE)
 
-static_assert(RX_WORDS >= 4 * DECIMATE, "RX_WORDS >= 4 * DECIMATE");
+static_assert(RX_WORDS >= 2 * RX_STRIDE, "RX_WORDS >= 2 * RX_STRIDE");
 
 static uint32_t rx_cos[RX_WORDS] __attribute__((__aligned__(1 << RX_BITS_DEPTH)));
 
-static const uint32_t *rx_start = rx_cos;
-static const uint32_t *rx_end = rx_cos + RX_WORDS - 1;
-
 #define NUM_GAINS 29
 static int gains[NUM_GAINS] = { 0,   9,	  14,  27,  37,	 77,  87,  125, 144, 157,
 				166, 197, 207, 229, 254, 280, 297, 328, 338, 364,
 				372, 386, 402, 421, 434, 439, 445, 480, 496 };
 static int sample_rate = INIT_SAMPLE_RATE;
-static int dc_level = CLK_SYS_HZ / INIT_SAMPLE_RATE / 2;
+static int max_amplitude = CLK_SYS_HZ / INIT_SAMPLE_RATE / 2;
+static int max_amplitude_mul = 65536 / (CLK_SYS_HZ / INIT_SAMPLE_RATE / 2);
 static int gain = INIT_GAIN;
 static int frequency = INIT_FREQ;
 
@@ -456,97 +455,116 @@ struct IQ {
 	int I, Q;
 };
 
-inline static int get_next_sample()
+inline static const uint32_t *next_stride()
 {
-	static const uint32_t *tail = rx_cos;
+	static int tail = 0;
 
-	const uint32_t *head = (const uint32_t *)dma_hw->ch[dma_ch_in_cos].write_addr;
+	int head, delta;
 
-	while (head == tail) {
-		asm volatile("nop; nop; nop; nop");
-		head = (const uint32_t *)dma_hw->ch[dma_ch_in_cos].write_addr;
-	}
+loop:
+	head = (dma_hw->ch[dma_ch_in_cos].write_addr >> 2) & (RX_WORDS - 1);
+	delta = head - tail;
 
-	int value = -(*tail++);
-	value *= 2;
-	value -= *tail++;
+	if (delta < 0)
+		delta += RX_WORDS;
 
-	if (tail > rx_end)
-		tail = rx_start;
+	if (delta < RX_STRIDE)
+		goto loop;
 
-	return gain * value - dc_level;
+	const uint32_t *stride = rx_cos + tail;
+
+	tail = (tail + RX_STRIDE) & (RX_WORDS - 1);
+
+	return stride;
+}
+
+inline static int nextQ(const uint32_t **stride)
+{
+	int x2 = *(*stride)++;
+	int x1 = *(*stride)++;
+
+	return (x2 << 1) + x1 + max_amplitude;
 }
 
 inline static struct IQ next_sample()
 {
 	int I = 0, Q = 0;
 
-	int x15 = get_next_sample();
+	const uint32_t *stride = next_stride();
+
+	int x15 = nextQ(&stride);
 	I += 93 * x15;
 	Q += 39 * x15;
 
-	int x14 = get_next_sample();
+	int x14 = nextQ(&stride);
 	I += 71 * x14;
 	Q += 71 * x14;
 
-	int x13 = get_next_sample();
+	int x13 = nextQ(&stride);
 	I += 39 * x13;
 	Q += 93 * x13;
 
-	int x12 = get_next_sample();
+	int x12 = nextQ(&stride);
 	I += 0 * x12;
 	Q += 101 * x12;
 
-	int x11 = get_next_sample();
+	int x11 = nextQ(&stride);
 	I += -39 * x11;
 	Q += 93 * x11;
 
-	int x10 = get_next_sample();
+	int x10 = nextQ(&stride);
 	I += -71 * x10;
 	Q += 71 * x10;
 
-	int x09 = get_next_sample();
+	int x09 = nextQ(&stride);
 	I += -93 * x09;
 	Q += 39 * x09;
 
-	int x08 = get_next_sample();
+	int x08 = nextQ(&stride);
 	I += -101 * x08;
 	Q += 0 * x08;
 
-	int x07 = get_next_sample();
+	int x07 = nextQ(&stride);
 	I += -93 * x07;
 	Q += -39 * x07;
 
-	int x06 = get_next_sample();
+	int x06 = nextQ(&stride);
 	I += -71 * x06;
 	Q += -71 * x06;
 
-	int x05 = get_next_sample();
+	int x05 = nextQ(&stride);
 	I += -39 * x05;
 	Q += -93 * x05;
 
-	int x04 = get_next_sample();
+	int x04 = nextQ(&stride);
 	I += 0 * x04;
 	Q += -101 * x04;
 
-	int x03 = get_next_sample();
+	int x03 = nextQ(&stride);
 	I += 39 * x03;
 	Q += -93 * x03;
 
-	int x02 = get_next_sample();
+	int x02 = nextQ(&stride);
 	I += 71 * x02;
 	Q += -71 * x02;
 
-	int x01 = get_next_sample();
+	int x01 = nextQ(&stride);
 	I += 93 * x01;
 	Q += -39 * x01;
 
-	int x00 = get_next_sample();
+	int x00 = nextQ(&stride);
 	I += 101 * x00;
 	Q += 0 * x00;
 
+	I *= gain;
 	I /= 1024;
+	I *= max_amplitude_mul;
+	I /= (1 << 16);
+
+	Q *= gain;
 	Q /= 1024;
+	Q *= max_amplitude_mul;
+	Q /= (1 << 16);
 
 	return (struct IQ){ I, Q };
 }
@@ -568,8 +586,6 @@ static void rf_rx(void)
 			int64_t I = IQ.I;
 			int64_t Q = IQ.Q;
 
-			I /= dc_level;
-
 			if (I > 127)
 				I = 127;
 			else if (I < -128)
@@ -577,8 +593,6 @@ static void rf_rx(void)
 
 			*blockptr++ = (uint8_t)I + 128;
 
-			Q /= dc_level;
-
 			if (Q > 127)
 				Q = 127;
 			else if (Q < -128)
@@ -602,7 +616,8 @@ static void run_command(uint8_t cmd, uint32_t arg)
 	} else if (0x02 == cmd) {
 		/* Set the rate at which IQ sample pairs are sent */
 		sample_rate = arg;
-		dc_level = CLK_SYS_HZ / sample_rate / 2;
+		max_amplitude = CLK_SYS_HZ / sample_rate / 2;
+		max_amplitude_mul = 65536 / max_amplitude;
 		dma_timer_set_fraction(dma_t_samp, 1, CLK_SYS_HZ / (sample_rate * DECIMATE));
 		rx_lo_init(frequency + sample_rate);
 	} else if (0x04 == cmd) {