diff --git a/examples/io_test.rs b/examples/io_test.rs
new file mode 100644
index 0000000..adbaf31
--- /dev/null
+++ b/examples/io_test.rs
@@ -0,0 +1,247 @@
+use rand::{thread_rng, Rng};
+use std::error::Error;
+use std::sync::{Arc, Mutex};
+use std::time::{Duration, Instant};
+use std::{env, process, thread};
+use vroom::memory::*;
+use vroom::{NvmeDevice, QUEUE_LENGTH};
+
+#[allow(unused_variables, unused_mut)]
+pub fn main() -> Result<(), Box<dyn Error>> {
+    let mut args = env::args();
+    args.next();
+
+    let pci_addr = match args.next() {
+        Some(arg) => arg,
+        None => {
+            eprintln!("Usage: cargo run --example io_test <pci bus id> <duration in seconds>");
+            process::exit(1);
+        }
+    };
+
+    let duration = match args.next() {
+        Some(secs) => Some(Duration::from_secs(secs.parse().expect(
+            "Usage: cargo run --example io_test <pci bus id> <duration in seconds>",
+        ))),
+        None => None,
+    };
+
+    let mut nvme = vroom::init(&pci_addr)?;
+
+    let nvme = qd_n(nvme, 1, 0, false, 128, duration)?;
+    let _ = qd_n(nvme, 1, 0, false, 256, duration)?;
+
+    // let _ = qd1(nvme, 0, false, true, duration)?;
+
+    Ok(())
+}
+
+fn qd1(
+    mut nvme: NvmeDevice,
+    n: u64,
+    write: bool,
+    random: bool,
+    time: Option<Duration>,
+) -> Result<NvmeDevice, Box<dyn Error>> {
+    let mut buffer: Dma<u8> = Dma::allocate(HUGE_PAGE_SIZE, true)?;
+
+    let blocks = 8;
+    let bytes = 512 * blocks;
+    let ns_blocks = nvme.namespaces.get(&1).unwrap().blocks / blocks - 1; // - blocks - 1;
+
+    let mut rng = thread_rng();
+    let seq = if random {
+        (0..n)
+            .map(|_| rng.gen_range(0..ns_blocks as u64))
+            .collect::<Vec<u64>>()
+    } else {
+        (0..n).map(|i| (i * 8) % ns_blocks).collect::<Vec<u64>>()
+    };
+
+    let rand_block = &(0..bytes).map(|_| rand::random::<u8>()).collect::<Vec<_>>()[..];
+    buffer[..rand_block.len()].copy_from_slice(rand_block);
+
+    let mut total = Duration::ZERO;
+
+    if let Some(time) = time {
+        let mut ios = 0;
+        let mut lba = 0;
+        while total < time {
+            lba = if random {
+                rng.gen_range(0..ns_blocks)
+            } else {
+                (lba + 1) % ns_blocks
+            };
+
+            let before = Instant::now();
+            if write {
+                nvme.write(&buffer.slice(0..bytes as usize), lba * blocks)?;
+            } else {
+                nvme.read(&buffer.slice(0..bytes as usize), lba * blocks)?;
+            }
+            let elapsed = before.elapsed();
+            total += elapsed;
+            ios += 1;
+        }
+        println!(
+            "IOs: {ios}, total {} iops: {:?}",
+            if write { "write" } else { "read" },
+            ios as f64 / total.as_secs_f64()
+        );
+    } else {
+        for lba in seq {
+            let before = Instant::now();
+            if write {
+                nvme.write(&buffer.slice(0..bytes as usize), lba * blocks)?;
+            } else {
+                nvme.read(&buffer.slice(0..bytes as usize), lba * blocks)?;
+            }
+            total += before.elapsed();
+        }
+        println!(
+            "n: {n}, total {} iops: {:?}",
+            if write { "write" } else { "read" },
+            n as f64 / total.as_secs_f64()
+        );
+    }
+    Ok(nvme)
+}
+
+#[allow(unused)]
+fn qd_n(
+    nvme: NvmeDevice,
+    n_threads: u64,
+    n: u64,
+    write: bool,
+    batch_size: usize,
+    time: Option<Duration>,
+) -> Result<NvmeDevice, Box<dyn Error>> {
+    let blocks = 8;
+    let ns_blocks = nvme.namespaces.get(&1).unwrap().blocks / blocks;
+
+    let nvme = Arc::new(Mutex::new(nvme));
+    let mut threads = Vec::new();
+
+    for i in 0..n_threads {
+        let nvme = Arc::clone(&nvme);
+        let range = (0, ns_blocks);
+
+        let handle = thread::spawn(move || -> (u64, f64) {
+            let mut rng = rand::thread_rng();
+            let bytes = 512 * blocks as usize;
+            let mut total = std::time::Duration::ZERO;
+            let mut buffer: Dma<u8> = Dma::allocate(HUGE_PAGE_SIZE, true).unwrap();
+
+            let mut qpair = nvme
+                .lock()
+                .unwrap()
+                .create_io_queue_pair(QUEUE_LENGTH)
+                .unwrap();
+
+            let rand_block = &(0..(32 * bytes))
+                .map(|_| rand::random::<u8>())
+                .collect::<Vec<_>>()[..];
+            buffer[0..32 * bytes].copy_from_slice(rand_block);
+
+            let mut ctr = 0;
+            if let Some(time) = time {
+                let mut ios = 0;
+                while total < time {
+                    let lba = rng.gen_range(range.0..range.1);
+                    let before = Instant::now();
+                    while let Some(_) = qpair.quick_poll() {
+                        ctr -= 1;
+                        ios += 1;
+                    }
+                    if ctr == batch_size {
+                        qpair.complete_io(1);
+                        ctr -= 1;
+                        ios += 1;
+                    }
+                    qpair.submit_io(
+                        &buffer.slice((ctr * bytes)..(ctr + 1) * bytes),
+                        lba * blocks,
+                        write,
+                    );
+                    total += before.elapsed();
+                    ctr += 1;
+                }
+
+                if ctr != 0 {
+                    let before = Instant::now();
+                    qpair.complete_io(ctr);
+                    total += before.elapsed();
+                }
+                ios += ctr as u64;
+                assert!(qpair.sub_queue.is_empty());
+                nvme.lock().unwrap().delete_io_queue_pair(qpair).unwrap();
+
+                (ios, ios as f64 / total.as_secs_f64())
+            } else {
+                let seq = &(0..n)
+                    .map(|_| rng.gen_range(range.0..range.1))
+                    .collect::<Vec<u64>>()[..];
+                for &lba in seq {
+                    let before = Instant::now();
+                    while let Some(_) = qpair.quick_poll() {
+                        ctr -= 1;
+                    }
+                    if ctr == 32 {
+                        qpair.complete_io(1);
+                        ctr -= 1;
+                    }
+                    qpair.submit_io(
+                        &buffer.slice((ctr * bytes)..(ctr + 1) * bytes),
+                        lba * blocks,
+                        write,
+                    );
+                    total += before.elapsed();
+                    ctr += 1;
+                }
+                if ctr != 0 {
+                    let before = Instant::now();
+                    qpair.complete_io(ctr);
+                    total += before.elapsed();
+                }
+                assert!(qpair.sub_queue.is_empty());
+                nvme.lock().unwrap().delete_io_queue_pair(qpair).unwrap();
+                (n, n as f64 / total.as_secs_f64())
+            }
+        });
+        threads.push(handle);
+    }
+
+    let total = threads
+        .into_iter()
+        .fold((0, 0.), |acc, thread| {
+            let res = thread
+                .join()
+                .expect("The thread creation or execution failed!");
+            (acc.0 + res.0, acc.1 + res.1)
+        });
+    println!(
+        "n: {}, total {} iops: {:?}",
+        total.0,
+        if write { "write" } else { "read" },
+        total.1
+    );
+
+    match Arc::try_unwrap(nvme) {
+        Ok(mutex) => match mutex.into_inner() {
+            Ok(t) => Ok(t),
+            Err(e) => Err(e.into()),
+        },
+        Err(_) => Err("Arc::try_unwrap failed, not the last reference.".into()),
+    }
+}
+
+fn fill_ns(nvme: &mut NvmeDevice) {
+    let buffer: Dma<u8> = Dma::allocate(HUGE_PAGE_SIZE, true).unwrap();
+    let max_lba = nvme.namespaces.get(&1).unwrap().blocks - buffer.size as u64 / 512 - 1;
+    let blocks = buffer.size as u64 / 512;
+    let mut lba = 0;
+    while lba < max_lba - 512 {
+        nvme.write(&buffer, lba).unwrap();
+        lba += blocks;
+    }
+}
diff --git a/rust-toolchain b/rust-toolchain
new file mode 100644
index 0000000..bf867e0
--- /dev/null
+++ b/rust-toolchain
@@ -0,0 +1 @@
+nightly
diff --git a/src/cmd.rs b/src/cmd.rs
index bf766ab..6aa2e6a 100644
--- a/src/cmd.rs
+++ b/src/cmd.rs
@@ -75,6 +75,24 @@ impl NvmeCommand {
         }
     }
 
+    pub fn delete_io_submission_queue(c_id: u16, q_id: u16) -> Self {
+        Self {
+            opcode: 0,
+            c_id,
+            cdw10: q_id as u32,
+            ..Default::default()
+        }
+    }
+
+    pub fn delete_io_completion_queue(c_id: u16, q_id: u16) -> Self {
+        Self {
+            opcode: 4,
+            c_id,
+            cdw10: q_id as u32,
+            ..Default::default()
+        }
+    }
+
     pub fn identify_namespace(c_id: u16, ptr: usize, ns_id: u32) -> Self {
         Self {
             opcode: 6,
@@ -129,7 +147,6 @@ impl NvmeCommand {
         }
     }
 
-    #[allow(unused_variables)]
     pub fn get_features(c_id: u16, ptr: usize, fid: u8) -> Self {
         Self {
             opcode: 0xA,
@@ -151,7 +168,7 @@ impl NvmeCommand {
             cdw10: lba as u32,
             cdw11: (lba >> 32) as u32,
             cdw12: blocks_1 as u32,
-            cdw13: 0, // TODO?
+            cdw13: 0,
             cdw14: 0,
             cdw15: 0,
         }
@@ -196,6 +213,34 @@ impl NvmeCommand {
         }
     }
 
+    pub(crate) fn async_event_req(c_id: u16) -> Self {
+        Self {
+            opcode: 0xC,
+            flags: 0,
+            c_id,
+            ns_id: 0,
+            _rsvd: 0,
+            md_ptr: 0,
+            d_ptr: [0, 0],
+            cdw10: 0,
+            cdw11: 0,
+            cdw12: 0,
+            cdw13: 0,
+            cdw14: 0,
+            cdw15: 0,
+        }
+    }
+
+    pub(crate) fn get_log_page(c_id: u16, numd: u32, ptr0: u64, ptr1: u64, lid: u8, lpid: u16) -> Self {
+        Self {
+            c_id,
+            d_ptr: [ptr0, ptr1],
+            cdw10: (numd << 16) | lid as u32,
+            cdw11: ((lpid as u32) << 16) | numd >> 16,
+            ..Self::default()
+        }
+    }
+
     // not supported by samsung
     pub fn write_zeroes(c_id: u16, ns_id: u32, slba: u64, nlb: u16, deac: bool) -> Self {
         Self {
diff --git a/src/lib.rs b/src/lib.rs
index 015041c..56736bb 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -16,31 +16,6 @@ use pci::*;
 pub use queues::QUEUE_LENGTH;
 use std::error::Error;
 
-// TODO: remove in place of std::hint::spin_loop
-#[cfg(target_arch = "aarch64")]
-#[inline(always)]
-pub(crate) fn pause() {
-    unsafe {
-        std::arch::aarch64::__yield();
-    }
-}
-
-#[cfg(target_arch = "x86")]
-#[inline(always)]
-pub(crate) fn pause() {
-    unsafe {
-        std::arch::x86::_mm_pause();
-    }
-}
-
-#[cfg(target_arch = "x86_64")]
-#[inline(always)]
-pub(crate) fn pause() {
-    unsafe {
-        std::arch::x86_64::_mm_pause();
-    }
-}
-
 pub fn init(pci_addr: &str) -> Result<NvmeDevice, Box<dyn Error>> {
     let mut vendor_file = pci_open_resource_ro(pci_addr, "vendor").expect("wrong pci address");
     let mut device_file = pci_open_resource_ro(pci_addr, "device").expect("wrong pci address");
diff --git a/src/nvme.rs b/src/nvme.rs
index 61f0bf8..e706575 100644
--- a/src/nvme.rs
+++ b/src/nvme.rs
@@ -367,7 +367,6 @@ impl NvmeDevice {
     }
 
     // 1 to 1 Submission/Completion Queue Mapping
-    // TODO: return qpair instead?
     pub fn create_io_queue_pair(&mut self, len: usize) -> Result<NvmeQueuePair, Box<dyn Error>> {
         let q_id = self.q_id;
         println!("Requesting i/o queue pair with id {q_id}");
@@ -403,6 +402,23 @@ impl NvmeDevice {
         })
     }
 
+    pub fn delete_io_queue_pair(&mut self, qpair: NvmeQueuePair) -> Result<(), Box<dyn Error>> {
+        println!("Deleting i/o queue pair with id {}", qpair.id);
+        self.submit_and_complete_admin(|c_id, _| {
+            NvmeCommand::delete_io_submission_queue(
+                c_id,
+                qpair.id,
+            )
+        })?;
+        self.submit_and_complete_admin(|c_id, _| {
+            NvmeCommand::delete_io_completion_queue(
+                c_id,
+                qpair.id,
+            )
+        })?;
+        Ok(())
+    }
+
     pub fn identify_namespace_list(&mut self, base: u32) -> Vec<u32> {
         self.submit_and_complete_admin(|c_id, addr| {
             NvmeCommand::identify_namespace_list(c_id, addr, base)
@@ -452,13 +468,10 @@
         namespace
     }
 
-    // pass prp list?
     pub fn write(&mut self, data: &impl DmaSlice, mut lba: u64) -> Result<(), Box<dyn Error>> {
-        let ns = *self.namespaces.get(&1).unwrap();
-
         for chunk in data.chunks(2 * 4096) {
-            let blocks = (chunk.slice.len() as u64 + ns.block_size - 1) / ns.block_size;
-            self.namespace_io(&ns, blocks, lba, chunk.phys_addr as u64, true)?;
+            let blocks = (chunk.slice.len() as u64 + 512 - 1) / 512;
+            self.namespace_io(1, blocks, lba, chunk.phys_addr as u64, true)?;
             lba += blocks;
         }
 
@@ -466,11 +479,10 @@
     }
 
     pub fn read(&mut self, dest: &impl DmaSlice, mut lba: u64) -> Result<(), Box<dyn Error>> {
-        let ns = *self.namespaces.get(&1).unwrap();
-
+        // let ns = *self.namespaces.get(&1).unwrap();
         for chunk in dest.chunks(2 * 4096) {
-            let blocks = (chunk.slice.len() as u64 + ns.block_size - 1) / ns.block_size;
-            self.namespace_io(&ns, blocks, lba, chunk.phys_addr as u64, false)?;
+            let blocks = (chunk.slice.len() as u64 + 512 - 1) / 512;
+            self.namespace_io(1, blocks, lba, chunk.phys_addr as u64, false)?;
             lba += blocks;
         }
         Ok(())
@@ -478,12 +490,10 @@
     pub fn write_copied(&mut self, data: &[u8], mut lba: u64) -> Result<(), Box<dyn Error>> {
         let ns = *self.namespaces.get(&1).unwrap();
-
-        // for chunk in data.chunks(128 * 4096) {
-        for chunk in data.chunks(2 * 4096) {
+        for chunk in data.chunks(128 * 4096) {
             self.buffer[..chunk.len()].copy_from_slice(chunk);
             let blocks = (chunk.len() as u64 + ns.block_size - 1) / ns.block_size;
-            self.namespace_io(&ns, blocks, lba, self.buffer.phys as u64, true)?;
+            self.namespace_io(1, blocks, lba, self.buffer.phys as u64, true)?;
             lba += blocks;
         }
 
@@ -497,19 +507,16 @@
         mut lba: u64,
     ) -> Result<(), Box<dyn Error>> {
         let ns = *self.namespaces.get(&ns_id).unwrap();
-
-        // for chunk in dest.chunks_mut(128 * 4096) {
-        for chunk in dest.chunks_mut(2 * 4096) {
+        for chunk in dest.chunks_mut(128 * 4096) {
             let blocks = (chunk.len() as u64 + ns.block_size - 1) / ns.block_size;
-            self.namespace_io(&ns, blocks, lba, self.buffer.phys as u64, false)?;
-
+            self.namespace_io(1, blocks, lba, self.buffer.phys as u64, false)?;
             lba += blocks;
             chunk.copy_from_slice(&self.buffer[..chunk.len()]);
         }
         Ok(())
     }
 
-    pub fn submit_io(
+    fn submit_io(
         &mut self,
         ns: &NvmeNamespace,
         addr: u64,
@@ -527,8 +534,8 @@
         } else if bytes <= 8192 {
             addr + 4096 // self.page_size
         } else {
-            // TODOo: idk if correct
-            let offset = (addr - self.prp_list.phys as u64) / 8;
+            // idk if this works
+            let offset = (addr - self.buffer.phys as u64) / 8;
             self.prp_list.phys as u64 + offset
         };
 
@@ -554,7 +561,7 @@
         self.io_sq.submit_checked(entry)
     }
 
-    pub fn complete_io(&mut self, step: u64) -> Option<u16> {
+    fn complete_io(&mut self, step: u64) -> Option<u16> {
         let q_id = 1;
 
         let (tail, c_entry, _) = self.io_cq.complete_n(step as usize);
@@ -654,9 +661,10 @@
         Ok(())
     }
 
-    pub fn namespace_io(
+    #[inline(always)]
+    fn namespace_io(
         &mut self,
-        ns: &NvmeNamespace,
+        ns_id: u32,
         blocks: u64,
         lba: u64,
         addr: u64,
@@ -667,22 +675,20 @@
 
         let q_id = 1;
 
-        let bytes = blocks * ns.block_size;
+        let bytes = blocks * 512;
         let ptr1 = if bytes <= 4096 {
             0
         } else if bytes <= 8192 {
             // self.buffer.phys as u64 + 4096 // self.page_size
             addr + 4096 // self.page_size
         } else {
-            // self.prp_list.phys as u64
-            eprintln!("tough luck");
-            addr + 4096
+            self.prp_list.phys as u64
         };
 
         let entry = if write {
             NvmeCommand::io_write(
                 self.io_sq.tail as u16,
-                ns.id,
+                ns_id,
                 lba,
                 blocks as u16 - 1,
                 addr,
@@ -691,7 +697,7 @@
         } else {
             NvmeCommand::io_read(
                 self.io_sq.tail as u16,
-                ns.id,
+                ns_id,
                 lba,
                 blocks as u16 - 1,
                 addr,
@@ -707,7 +713,7 @@
         Ok(())
     }
 
-    pub fn submit_and_complete_admin<F: FnOnce(u16, usize) -> NvmeCommand>(
+    fn submit_and_complete_admin<F: FnOnce(u16, usize) -> NvmeCommand>(
        &mut self,
        cmd_init: F,
    ) -> Result<NvmeCompletion, Box<dyn Error>> {
diff --git a/src/queues.rs b/src/queues.rs
index 722757f..6d45cd6 100644
--- a/src/queues.rs
+++ b/src/queues.rs
@@ -115,6 +115,7 @@ impl NvmeCompQueue {
     }
 
     ///
+    #[inline(always)]
     pub fn complete_n(&mut self, commands: usize) -> (usize, NvmeCompletion, usize) {
         let prev = self.head;
         self.head += commands - 1;
@@ -127,6 +128,7 @@ impl NvmeCompQueue {
         (head, entry, prev)
     }
 
+    #[inline(always)]
     pub fn complete_spin(&mut self) -> (usize, NvmeCompletion, usize) {
         loop {
             if let Some(val) = self.complete() {
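
Usage sketch (illustrative, not part of the patch): the queue-pair lifecycle that examples/io_test.rs drives per thread boils down to the calls below. It assumes only the signatures visible in this diff (vroom::init, NvmeDevice::create_io_queue_pair / delete_io_queue_pair, NvmeQueuePair::submit_io / complete_io, Dma::allocate, and the buffer.slice() helper used throughout the example); the function name and the single 4 KiB read at LBA 0 are hypothetical choices for the example.

use std::error::Error;
use vroom::memory::*;
use vroom::QUEUE_LENGTH;

fn queue_pair_roundtrip(pci_addr: &str) -> Result<(), Box<dyn Error>> {
    // Sketch only: call sequence mirrors examples/io_test.rs above.
    // Bring up the controller and allocate a hugepage-backed DMA buffer.
    let mut nvme = vroom::init(pci_addr)?;
    let buffer: Dma<u8> = Dma::allocate(HUGE_PAGE_SIZE, true)?;

    // Create a dedicated I/O submission/completion queue pair.
    let mut qpair = nvme.create_io_queue_pair(QUEUE_LENGTH)?;

    // Submit one 4 KiB read of LBA 0 and wait for its completion.
    qpair.submit_io(&buffer.slice(0..4096), 0, false);
    qpair.complete_io(1);

    // Tear the queue pair down again; this issues the new
    // delete-submission-queue and delete-completion-queue admin commands.
    nvme.delete_io_queue_pair(qpair)?;
    Ok(())
}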
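
Worked example for the new crate-internal get_log_page helper (illustrative, not part of the patch): it packs CDW10 = NUMDL << 16 | LID and CDW11 = LPID << 16 | NUMDU, matching the NVMe Get Log Page layout in which NUMD is the 0-based dword count split across the two dwords. The SMART/Health log id 0x02, the 4 KiB transfer size, and the function name below are assumptions made for the example.

fn log_page_dwords() -> (u32, u32) {
    // Reading 4 KiB of the SMART / Health Information log (LID 0x02).
    let numd: u32 = 4096 / 4 - 1;            // 0-based dword count = 1023 = 0x3FF
    let (lid, lpid): (u32, u32) = (0x02, 0);
    let cdw10 = (numd << 16) | lid;          // NUMDL in bits 31:16, LID in bits 7:0 -> 0x03FF_0002
    let cdw11 = (lpid << 16) | (numd >> 16); // LPID in bits 31:16, NUMDU in bits 15:0 -> 0
    (cdw10, cdw11)
}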