diff --git a/input_tdm.cpp b/input_tdm.cpp
index 77c57272..e693fdd6 100644
--- a/input_tdm.cpp
+++ b/input_tdm.cpp
@@ -31,17 +31,15 @@
 #include "utility/imxrt_hw.h"
 
 DMAMEM __attribute__((aligned(32)))
-static uint32_t tdm_rx_buffer[AUDIO_BLOCK_SAMPLES*16];
-audio_block_t * AudioInputTDM::block_incoming[16] = {
-	nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
-	nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr
-};
+static uint32_t tdm_rx_buffer[AUDIO_BLOCK_SAMPLES*TDM_IN_N_AUDIO_BLOCKS];
+audio_block_t * AudioInputTDM::block_incoming[TDM_IN_N_AUDIO_BLOCKS];
 bool AudioInputTDM::update_responsibility = false;
 DMAChannel AudioInputTDM::dma(false);
 
-
 void AudioInputTDM::begin(void)
 {
+	for (int i = 0; i < TDM_IN_N_AUDIO_BLOCKS; i++)
+		block_incoming[i] = nullptr;
 	dma.begin(true); // Allocate the DMA channel first
 
 	// TODO: should we set & clear the I2S_RCSR_SR bit here?
@@ -68,7 +66,22 @@ void AudioInputTDM::begin(void)
 	dma.attachInterrupt(isr);
 #elif defined(__IMXRT1062__)
 	CORE_PIN8_CONFIG  = 3;  //RX_DATA0
-	IOMUXC_SAI1_RX_DATA0_SELECT_INPUT = 2;
+	IOMUXC_SAI1_RX_DATA0_SELECT_INPUT = 2; // TEENSY P8, GPIO_B1_00_ALT3  as rx 0
+        // When using combine mode, you *must* use the data pins in order.
+	switch (AUDIO_N_SAI1_RX_DATAPINS) {
+        case 4: 
+            CORE_PIN32_CONFIG  = 3;  //RX_DATA3
+            IOMUXC_SAI1_RX_DATA3_SELECT_INPUT = 1; // TEENSY P32, GPIO_B0_12_ALT3 as rx 1
+        case 3:
+            CORE_PIN9_CONFIG  = 3;  //RX_DATA2
+            IOMUXC_SAI1_RX_DATA2_SELECT_INPUT = 1; // TEENSY P9, GPIO_B0_11_ALT3 as rx 2
+        case 2:
+            CORE_PIN6_CONFIG  = 3;  //RX_DATA1
+            IOMUXC_SAI1_RX_DATA1_SELECT_INPUT = 1; // TEENSY P6, GPIO_B0_10_ALT3 as rx 1
+            break;
+        default:
+            break;
+	}
 	dma.TCD->SADDR = &I2S1_RDR0;
 	dma.TCD->SOFF = 0;
 	dma.TCD->ATTR = DMA_TCD_ATTR_SSIZE(2) | DMA_TCD_ATTR_DSIZE(2);
@@ -89,25 +102,57 @@ void AudioInputTDM::begin(void)
 #endif	
 }
 
-// TODO: needs optimization...
-static void memcpy_tdm_rx(uint32_t *dest1, uint32_t *dest2, const uint32_t *src)
-{
-	uint32_t i, in1, in2;
-
-	for (i=0; i < AUDIO_BLOCK_SAMPLES/2; i++) {
-		in1 = *src;
-		in2 = *(src+8);
-		src += 16;
-		*dest1++ = (in1 >> 16) | (in2 & 0xFFFF0000);
-		*dest2++ = (in1 << 16) | (in2 & 0x0000FFFF);
-	}
+// Since we're grabbing data out of the fifo at 32-bits, but the words
+// are 16-bits and we're sharing fifos in a round-robbin manner, with
+//
+// 1 FIFO (pin) active, the word/channel order for the first frame is
+// C00 C02 C04 C06 C08 C10 C12 C14
+// C01 C03 C05 C07 C09 C11 C13 C15
+//
+// 2 FIFO (pin) active, the word/channel order for the first frame is
+// C00 C16 C02 C18 C04 C20 C06 C22 C08 C24 C10 C26 C12 C28 C14 C30 
+// C01 C17 C03 C19 C05 C21 C07 C23 C09 C25 C11 C27 C13 C29 C15 C31 
+//
+// 3 FIFO (pin) active, the word/channel order for the first frame is
+// C00 C16 C32 C02 C18 C34 C04 C20 C36 C06 C22 C38 C08 C24 C40 C10 C26 C42 C12 C28 C44 C14 C30 C46
+// C01 C17 C33 C03 C19 C35 C05 C21 C37 C07 C23 C39 C09 C25 C41 C11 C27 C43 C13 C29 C45 C15 C31 C47
+//
+// 4 FIFO (pin) active, the word/channel order for the first frame is
+// C00 C16 C32 C48|C02 C18 C34 C50|C04 C20 C36 C52|C06 C22 C38 C54|C08 C24 C40 C56|C10 C26 C42 C58|C12 C28 C44 C60|C14 C30 C46 C62
+// C01 C17 C33 C49|C03 C19 C35 C51|C05 C21 C37 C53|C07 C23 C39 C55|C09 C25 C41 C57|C11 C27 C43 C59|C13 C29 C45 C61|C15 C31 C47 C63 
+//  0   1   2   3   4   5   6   7   8   9   10  11  12  13  14  15  16  17  18  19  20  21  22  23  24  25  26  27  28  29  30  31 <--- word index
+// where each column is a single 32-bit word popped from the fifo
+static void deinterleave(audio_block_t *block_incoming[TDM_IN_N_AUDIO_BLOCKS], const uint32_t *src) {
+    int l = 0;
+    for (int i=0; i < TDM_IN_N_AUDIO_BLOCKS/2; i++) {
+        // ix = 0,  2,  4,  6,          ... 1 pin
+        // ix = 0, 16,  2, 18,  3, 20, ... 2 pins
+        // ix = 0, 16, 32,  2, 18, 33 ... 3 pins
+        // ix = 0, 16, 32, 48,  2, 18 ... 4 pins
+
+        //  i = 0,  1,  2,  3,  4,  5,
+        // ix = 0, 16, 32,  2, 18, 33 ... 3 pins
+        //    = (0*16 1*16 2*16)+i
+        int ix = (l*16) + i/AUDIO_N_SAI1_RX_DATAPINS*2;
+        l = (l + 1) % AUDIO_N_SAI1_RX_DATAPINS;
+        uint32_t *dest1 = (uint32_t *)(block_incoming[ix]->data);
+        uint32_t *dest2 = (uint32_t *)(block_incoming[ix+1]->data);
+        const uint32_t *psrc = src;
+        for (int j=0; j < AUDIO_BLOCK_SAMPLES/2; j++) {
+            uint32_t in1 = *psrc;
+            uint32_t in2 = *(psrc + TDM_IN_N_AUDIO_BLOCKS * AUDIO_N_SAI1_RX_DATAPINS / 2);
+            psrc += TDM_IN_N_AUDIO_BLOCKS;
+            *dest1++ = ((in1 >> 16) & 0x0000FFFF) | ((in2 <<  0) & 0xFFFF0000);
+            *dest2++ = ((in1 >>  0) & 0x0000FFFF) | ((in2 << 16) & 0xFFFF0000);
+        }
+        src++;
+    }
 }
 
 void AudioInputTDM::isr(void)
 {
 	uint32_t daddr;
 	const uint32_t *src;
-	unsigned int i;
 
 	daddr = (uint32_t)(dma.TCD->DADDR);
 	dma.clearInterrupt();
@@ -115,7 +160,7 @@ void AudioInputTDM::isr(void)
 	if (daddr < (uint32_t)tdm_rx_buffer + sizeof(tdm_rx_buffer) / 2) {
 		// DMA is receiving to the first half of the buffer
 		// need to remove data from the second half
-		src = &tdm_rx_buffer[AUDIO_BLOCK_SAMPLES*8];
+		src = &tdm_rx_buffer[AUDIO_BLOCK_SAMPLES * TDM_IN_N_AUDIO_BLOCKS / 2];
 	} else {
 		// DMA is receiving to the second half of the buffer
 		// need to remove data from the first half
@@ -125,12 +170,7 @@ void AudioInputTDM::isr(void)
 		#if IMXRT_CACHE_ENABLED >=1
 		arm_dcache_delete((void*)src, sizeof(tdm_rx_buffer) / 2);
 		#endif
-		for (i=0; i < 16; i += 2) {
-			uint32_t *dest1 = (uint32_t *)(block_incoming[i]->data);
-			uint32_t *dest2 = (uint32_t *)(block_incoming[i+1]->data);
-			memcpy_tdm_rx(dest1, dest2, src);
-			src++;
-		}
+                deinterleave(block_incoming, src);
 	}
 	if (update_responsibility) update_all();
 }
@@ -138,12 +178,11 @@ void AudioInputTDM::isr(void)
 
 void AudioInputTDM::update(void)
 {
-	unsigned int i, j;
-	audio_block_t *new_block[16];
-	audio_block_t *out_block[16];
-
-	// allocate 16 new blocks.  If any fails, allocate none
-	for (i=0; i < 16; i++) {
+    unsigned int i, j;
+	audio_block_t *new_block[TDM_IN_N_AUDIO_BLOCKS];
+	audio_block_t *out_block[TDM_IN_N_AUDIO_BLOCKS];
+	// allocate TDM_IN_N_AUDIO_BLOCKS new blocks.  If any fails, allocate none
+	for (i=0; i < TDM_IN_N_AUDIO_BLOCKS; i++) {
 		new_block[i] = allocate();
 		if (new_block[i] == nullptr) {
 			for (j=0; j < i; j++) {
@@ -158,8 +197,8 @@ void AudioInputTDM::update(void)
 	memcpy(block_incoming, new_block, sizeof(block_incoming));
 	__enable_irq();
 	if (out_block[0] != nullptr) {		
-		// if we got 1 block, all 16 are filled
-		for (i=0; i < 16; i++) {
+		// if we got 1 block, all TDM_IN_N_AUDIO_BLOCKS are filled
+		for (i=0; i < TDM_IN_N_AUDIO_BLOCKS; i++) {
 			transmit(out_block[i], i);
 			release(out_block[i]);
 		}
diff --git a/input_tdm.h b/input_tdm.h
index 45ba0515..f0c8cbd2 100644
--- a/input_tdm.h
+++ b/input_tdm.h
@@ -30,6 +30,9 @@
 #include "Arduino.h"
 #include "AudioStream.h"
 #include "DMAChannel.h"
+#include "AudioRate.h"
+
+#define TDM_IN_N_AUDIO_BLOCKS (TDM_CHANNELS_PER_PIN*AUDIO_N_SAI1_RX_DATAPINS)
 
 class AudioInputTDM : public AudioStream
 {
@@ -37,12 +40,12 @@ class AudioInputTDM : public AudioStream
 	AudioInputTDM(void) : AudioStream(0, NULL) { begin(); }
 	virtual void update(void);
 	void begin(void);
-protected:	
+protected:
 	static bool update_responsibility;
 	static DMAChannel dma;
 	static void isr(void);
 private:
-	static audio_block_t *block_incoming[16];
+	static audio_block_t *block_incoming[TDM_IN_N_AUDIO_BLOCKS];
 };
 
 #endif
diff --git a/input_tdm2.cpp b/input_tdm2.cpp
index ae3ebdc3..fe8d2fcc 100644
--- a/input_tdm2.cpp
+++ b/input_tdm2.cpp
@@ -79,8 +79,8 @@ static void memcpy_tdm_rx(uint32_t *dest1, uint32_t *dest2, const uint32_t *src)
 		in1 = *src;
 		in2 = *(src+8);
 		src += 16;
-		*dest1++ = (in1 >> 16) | (in2 & 0xFFFF0000);
-		*dest2++ = (in1 << 16) | (in2 & 0x0000FFFF);
+		*dest1++ = ((in1 >> 16) & 0x0000FFFF) | ((in2 <<  0) & 0xFFFF0000);
+		*dest2++ = ((in1 >>  0) & 0x0000FFFF) | ((in2 << 16) & 0xFFFF0000);
 	}
 }
 
diff --git a/output_tdm.cpp b/output_tdm.cpp
index 8f3a6158..5d1210a1 100644
--- a/output_tdm.cpp
+++ b/output_tdm.cpp
@@ -23,7 +23,6 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  * THE SOFTWARE.
  */
-
 #include <Arduino.h>
 
 #if !defined(KINETISL)
@@ -319,20 +318,30 @@ void AudioOutputTDM::config_tdm(void)
 	I2S1_TCR1 = I2S_TCR1_RFW(4);
 	I2S1_TCR2 = I2S_TCR2_SYNC(tsync) | I2S_TCR2_BCP | I2S_TCR2_MSEL(1)
 		| I2S_TCR2_BCD | I2S_TCR2_DIV(0);
-	I2S1_TCR3 = I2S_TCR3_TCE;
+
+    // enable all TX data pins.  Channels must be used in order, so
+    // enabled chanels can (0b0001, 0b0011, 0b0111, or 0b1111) only.  This
+    // limitation is because of the fifo combine mode limitation.
+    I2S1_TCR3 = ((1 << AUDIO_N_SAI1_TX_DATAPINS)-1) << 16;
+
 	I2S1_TCR4 = I2S_TCR4_FRSZ(7) | I2S_TCR4_SYWD(0) | I2S_TCR4_MF
 		| I2S_TCR4_FSE | I2S_TCR4_FSD;
+    I2S1_TCR4 |= (AUDIO_N_SAI1_TX_DATAPINS > 1) ? I2S_TCR4_FCOMB_ENABLED_ON_WRITES : 0;
 	I2S1_TCR5 = I2S_TCR5_WNW(31) | I2S_TCR5_W0W(31) | I2S_TCR5_FBT(31);
 
 	I2S1_RMR = 0;
 	I2S1_RCR1 = I2S_RCR1_RFW(4);
 	I2S1_RCR2 = I2S_RCR2_SYNC(rsync) | I2S_TCR2_BCP | I2S_RCR2_MSEL(1)
 		| I2S_RCR2_BCD | I2S_RCR2_DIV(0);
-	I2S1_RCR3 = I2S_RCR3_RCE;
+        // enable all RX data pins.  Channels must be used in order, so
+        // enabled chanels can (0b0001, 0b0011, 0b0111, or 0b1111) only.  This
+        // limitation is because of the fifo combine mode limitation.
+        I2S1_RCR3 = ( (1 << AUDIO_N_SAI1_RX_DATAPINS) - 1 ) << 16;
+
 	I2S1_RCR4 = I2S_RCR4_FRSZ(7) | I2S_RCR4_SYWD(0) | I2S_RCR4_MF
 		| I2S_RCR4_FSE | I2S_RCR4_FSD;
+        I2S1_RCR4 |= (AUDIO_N_SAI1_RX_DATAPINS > 1) ? I2S_RCR4_FCOMB_ENABLED_ON_READS : 0;
 	I2S1_RCR5 = I2S_RCR5_WNW(31) | I2S_RCR5_W0W(31) | I2S_RCR5_FBT(31);
-
 	CORE_PIN23_CONFIG = 3;  //1:MCLK
 	CORE_PIN21_CONFIG = 3;  //1:RX_BCLK
 	CORE_PIN20_CONFIG = 3;  //1:RX_SYNC
diff --git a/output_tdm.h b/output_tdm.h
index 35982bf4..354b8326 100644
--- a/output_tdm.h
+++ b/output_tdm.h
@@ -30,6 +30,7 @@
 #include "Arduino.h"
 #include "AudioStream.h"
 #include "DMAChannel.h"
+#include "AudioRate.h"
 
 class AudioOutputTDM : public AudioStream
 {