vercel · lukesandberg · Jun 8, 2025 · Jun 9, 2025 · Jun 10, 2025 · Jun 11, 2025
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/crates/napi/src/lib.rs b/crates/napi/src/lib.rs
@@ -73,7 +73,7 @@

    use tokio::runtime::Builder;
    use turbo_tasks::panic_hooks::handle_panic;
    use turbo_tasks_malloc::TurboMalloc;

    let prev_hook = take_hook();
    set_hook(Box::new(move |info| {
@@ -83,9 +83,6 @@
 
     let rt = Builder::new_multi_thread()
         .enable_all()
-        .on_thread_stop(|| {
-            TurboMalloc::thread_stop();
-        })
         .disable_lifo_slot()
         .build()
         .unwrap();

diff --git a/crates/next-build-test/src/main.rs b/crates/next-build-test/src/main.rs
@@ -73,7 +73,6 @@ fn main() {
             tokio::runtime::Builder::new_multi_thread()
                 .enable_all()
                 .on_thread_stop(|| {
-                    TurboMalloc::thread_stop();
                     tracing::debug!("threads stopped");
                 })
                 .build()
@@ -127,8 +126,8 @@ fn main() {
                         noop_backing_storage(),
                     ));
                     let result = main_inner(&tt, strat, factor, limit, files).await;
-                    let memory = TurboMalloc::memory_usage();
-                    tracing::info!("memory usage: {} MiB", memory / 1024 / 1024);
+                    let memory = TurboMalloc::global_allocation_counters();
+                    tracing::info!("memory usage: {}", memory);
                     let start = Instant::now();
                     drop(tt);
                     tracing::info!("drop {:?}", start.elapsed());

diff --git a/turbopack/crates/turbo-tasks-malloc/Cargo.toml b/turbopack/crates/turbo-tasks-malloc/Cargo.toml
@@ -20,3 +20,7 @@ mimalloc = { version = "0.1.42", features = [
 [features]
 custom_allocator = ["dep:mimalloc"]
 default = ["custom_allocator"]
+
+[dependencies]
+lazy_static = {workspace = true}
+rustc-hash = {workspace = true}
diff --git a/turbopack/crates/turbo-tasks-malloc/src/counter.rs b/turbopack/crates/turbo-tasks-malloc/src/counter.rs
@@ -1,176 +1,205 @@
 use std::{
     cell::UnsafeCell,
     ptr::NonNull,
-    sync::atomic::{AtomicUsize, Ordering},
+    sync::{
+        LazyLock,
+        atomic::{AtomicUsize, Ordering},
+    },
 };
 
 use crate::AllocationCounters;
 
-/// Tracks the current total amount of memory allocated through all the [ThreadLocalCounter]
-/// instances.  This is an overestimate as individual threads 'preallocate' a [TARGET_BUFFER] bytes
-/// to reduce the number of global synchronizations.  This means at any given time this might
-/// overcount by up to [MAX_BUFFER] bytes for each thread.
-static ALLOCATED: AtomicUsize = AtomicUsize::new(0);
-const KB: usize = 1024;
-/// When global counter is updates we will keep a thread-local buffer of this
-/// size.
-const TARGET_BUFFER: usize = 100 * KB;
-/// When the thread-local buffer would exceed this size, we will update the
-/// global counter.
-const MAX_BUFFER: usize = 200 * KB;
-
-#[derive(Default)]
-struct ThreadLocalCounter {
-    /// Thread-local buffer of allocated bytes that have been added to the
-    /// global counter desprite not being allocated yet. It is unsigned so that
-    /// means the global counter is always equal or greater than the real
-    /// value.
-    buffer: usize,
-    allocation_counters: AllocationCounters,
+/// Pool of shared counter slots, built lazily on first access.
+///
+/// Holds one slot per unit of available parallelism, or 128 slots when the parallelism cannot
+/// be determined.
+///
+/// NOTE(review): building this `Vec` allocates; if these counters are driven by the global
+/// allocator, the first access could re-enter this initializer — confirm that cannot happen.
+static GLOBAL: LazyLock<Vec<AtomicCounters>> = LazyLock::new(|| {
+    let slots = std::thread::available_parallelism().map_or(128, usize::from);
+    std::iter::repeat_with(AtomicCounters::new).take(slots).collect()
+});
+
+static INDEX: AtomicUsize = AtomicUsize::new(0);
+
+/// Returns the next `GLOBAL` slot in round-robin order, advancing the shared `INDEX` cursor.
+fn get_global_ref() -> &'static AtomicCounters {
+    // `%`, not `& (len - 1)`: a mask skips slots unless the slot count is a power of two.
+    let index = INDEX.fetch_add(1, Ordering::AcqRel) % GLOBAL.len();
+    &GLOBAL[index]
 }
 
-impl ThreadLocalCounter {
+#[repr(align(64))] // cache line aligned to reduce false sharing between adjacent entries
+struct AtomicCounters {
+    allocations: AtomicUsize,
+    deallocations: AtomicUsize,
+    allocation_count: AtomicUsize,
+    deallocation_count: AtomicUsize,
+}
+
+impl AtomicCounters {
     const fn new() -> Self {
         Self {
-            buffer: 0,
-            allocation_counters: AllocationCounters::new(),
+            allocations: AtomicUsize::new(0),
+            deallocations: AtomicUsize::new(0),
+            allocation_count: AtomicUsize::new(0),
+            deallocation_count: AtomicUsize::new(0),
         }
     }
-    fn add(&mut self, size: usize) {
-        self.allocation_counters.allocations += size;
-        self.allocation_counters.allocation_count += 1;
-        if self.buffer >= size {
-            self.buffer -= size;
-        } else {
-            let offset = size - self.buffer + TARGET_BUFFER;
-            self.buffer = TARGET_BUFFER;
-            ALLOCATED.fetch_add(offset, Ordering::Relaxed);
+}
+
+struct ThreadLocalCounter {
+    counters: AllocationCounters,
+    global: Option<&'static AtomicCounters>,
+}
+
+impl ThreadLocalCounter {
+    const fn new() -> Self {
+        Self {
+            global: None,
+            counters: AllocationCounters::new(),
         }
     }
 
-    fn remove(&mut self, size: usize) {
-        self.allocation_counters.deallocations += size;
-        self.allocation_counters.deallocation_count += 1;
-        self.buffer += size;
-        if self.buffer > MAX_BUFFER {
-            let offset = self.buffer - TARGET_BUFFER;
-            self.buffer = TARGET_BUFFER;
-            ALLOCATED.fetch_sub(offset, Ordering::Relaxed);
-        }
+    fn add(&mut self, size: usize, global: &AtomicCounters) {
+        self.counters.allocations += size;
+        self.counters.allocation_count += 1;
+
+        global.allocations.fetch_add(size, Ordering::Relaxed);
+        global.allocation_count.fetch_add(1, Ordering::Relaxed);
     }
 
-    fn update(&mut self, old_size: usize, new_size: usize) {
-        self.allocation_counters.deallocations += old_size;
-        self.allocation_counters.deallocation_count += 1;
-        self.allocation_counters.allocations += new_size;
-        self.allocation_counters.allocation_count += 1;
-        match old_size.cmp(&new_size) {
-            std::cmp::Ordering::Equal => {}
-            std::cmp::Ordering::Less => {
-                let size = new_size - old_size;
-                if self.buffer >= size {
-                    self.buffer -= size;
-                } else {
-                    let offset = size - self.buffer + TARGET_BUFFER;
-                    self.buffer = TARGET_BUFFER;
-                    ALLOCATED.fetch_add(offset, Ordering::Relaxed);
-                }
-            }
-            std::cmp::Ordering::Greater => {
-                let size = old_size - new_size;
-                self.buffer += size;
-                if self.buffer > MAX_BUFFER {
-                    let offset = self.buffer - TARGET_BUFFER;
-                    self.buffer = TARGET_BUFFER;
-                    ALLOCATED.fetch_sub(offset, Ordering::Relaxed);
-                }
-            }
-        }
+    fn remove(&mut self, size: usize, global: &AtomicCounters) {
+        self.counters.deallocations += size;
+        self.counters.deallocation_count += 1;
+        global.deallocations.fetch_add(size, Ordering::Relaxed);
+        global.deallocation_count.fetch_add(1, Ordering::Relaxed);
     }
 
-    fn unload(&mut self) {
-        if self.buffer > 0 {
-            ALLOCATED.fetch_sub(self.buffer, Ordering::Relaxed);
-            self.buffer = 0;
-        }
-        self.allocation_counters = AllocationCounters::default();
+    fn update(&mut self, old_size: usize, new_size: usize, global: &AtomicCounters) {
+        self.add(new_size, global);
+        self.remove(old_size, global);
     }
 }
 
 thread_local! {
   static LOCAL_COUNTER: UnsafeCell<ThreadLocalCounter> = const {UnsafeCell::new(ThreadLocalCounter::new())};
 }
 
-pub fn get() -> usize {
-    ALLOCATED.load(Ordering::Relaxed)
+/// Largest live-byte estimate observed by a `global_counters` call so far (a sampled peak).
+static MAX_ALLOCATED: AtomicUsize = AtomicUsize::new(0);
+
+/// Sums every slot of `GLOBAL` into one [`AllocationCounters`] snapshot. The slots are read
+/// one by one while other threads keep counting, so the totals are only an estimate.
+pub fn global_counters() -> AllocationCounters {
+    let mut counters = AllocationCounters::new();
+    for global in GLOBAL.iter() {
+        counters.allocation_count += global.allocation_count.load(Ordering::Acquire);
+        counters.deallocation_count += global.deallocation_count.load(Ordering::Acquire);
+        counters.allocations += global.allocations.load(Ordering::Acquire);
+        counters.deallocations += global.deallocations.load(Ordering::Acquire);
+    }
+    // Updates landing between the individual loads can push the deallocation total past the
+    // allocation total; saturate so the peak is never fed a wrapped (or panicking) difference.
+    let live = counters.allocations.saturating_sub(counters.deallocations);
+    MAX_ALLOCATED.fetch_max(live, Ordering::AcqRel);
+    counters
 }
 
 pub fn allocation_counters() -> AllocationCounters {
-    with_local_counter(|local| local.allocation_counters.clone())
+    with_local_counter(|local, _| local.counters.clone())
 }
 
+/// Resets the counters for the current thread.
+/// This is used to exclude some work from the metrics and as such should be used sparingly.
+/// NOTE: this does not exclude the allocations from the global metrics
 pub fn reset_allocation_counters(start: AllocationCounters) {
-    with_local_counter(|local| local.allocation_counters = start);
+    with_local_counter(|local, _| local.counters = start);
 }
 
-fn with_local_counter<T>(f: impl FnOnce(&mut ThreadLocalCounter) -> T) -> T {
+fn with_local_counter<T>(f: impl FnOnce(&mut ThreadLocalCounter, &AtomicCounters) -> T) -> T {
     LOCAL_COUNTER.with(|local| {
         let ptr = local.get();
-        // SAFETY: This is a thread local.
+        // SAFETY: This is a thread local, and the functions we pass do not recursively access the
+        // threadlocal
         let mut local = unsafe { NonNull::new_unchecked(ptr) };
-        f(unsafe { local.as_mut() })
+        let local = unsafe { local.as_mut() };
+        let global = *local.global.get_or_insert_with(get_global_ref);
+        f(local, global)
     })
 }
 
-/// Adds some `size` to the global counter in a thread-local buffered way.
+/// Records an allocation of `size` bytes in this thread's counters and in its global slot.
 pub fn add(size: usize) {
-    with_local_counter(|local| local.add(size));
+    with_local_counter(|local, global| local.add(size, global));
 }
 
-/// Removes some `size` to the global counter in a thread-local buffered way.
+/// Records a deallocation of `size` bytes in this thread's counters and in its global slot.
 pub fn remove(size: usize) {
-    with_local_counter(|local| local.remove(size));
+    with_local_counter(|local, global| local.remove(size, global));
 }
 
-/// Adds some `size` to the global counter in a thread-local buffered way.
+/// Records a resize as one allocation of `new_size` bytes and one deallocation of `old_size`.
 pub fn update(old_size: usize, new_size: usize) {
-    with_local_counter(|local| local.update(old_size, new_size));
-}
-
-/// Flushes the thread-local buffer to the global counter. This should be called
-/// e. g. when a thread is stopped or goes to sleep for a long time.
-pub fn flush() {
-    with_local_counter(|local| local.unload());
+    with_local_counter(|local, global| local.update(old_size, new_size, global));
 }
 
 #[cfg(test)]
 mod tests {
+    use std::thread;
+
     use super::*;
 
     #[test]
-    fn counting() {
-        let mut expected = get();
-        add(100);
-        // Initial change should fill up the buffer
-        expected += TARGET_BUFFER + 100;
-        assert_eq!(get(), expected);
-        add(100);
-        // Further changes should use the buffer
-        assert_eq!(get(), expected);
-        add(MAX_BUFFER);
-        // Large changes should require more buffer space
-        expected += 100 + MAX_BUFFER;
-        assert_eq!(get(), expected);
-        remove(100);
-        // Small changes should use the buffer
-        // buffer size is now TARGET_BUFFER + 100
-        assert_eq!(get(), expected);
-        remove(MAX_BUFFER);
-        // The buffer should not grow over MAX_BUFFER
-        // buffer size would be TARGET_BUFFER + 100 + MAX_BUFFER
-        // but it will be reduce to TARGET_BUFFER
-        // this means the global counter should reduce by 100 + MAX_BUFFER
-        expected -= MAX_BUFFER + 100;
-        assert_eq!(get(), expected);
+    fn test_counters() {
+        let mut t1 = ThreadLocalCounter::new();
+        let mut t2 = ThreadLocalCounter::new();
+
+        assert_eq!(0, global_counters().allocations);
+
+        assert_eq!(0, global_counters().allocations);
+
+        t1.add(100, &GLOBAL[0]);
+        t2.add(300, &GLOBAL[0]);
+        assert_eq!(400, global_counters().allocations);
+
+        t2.remove(300, &GLOBAL[0]);
+        t1.remove(100, &GLOBAL[0]);
+        assert_eq!(
+            AllocationCounters {
+                allocations: 400,
+                allocation_count: 2,
+                deallocations: 400,
+                deallocation_count: 2,
+                ..Default::default()
+            },
+            global_counters()
+        );
+        assert_eq!(400, MAX_ALLOCATED.load(Ordering::Acquire));
+    }
+
+    #[test]
+    fn test_multithreaded() {
+        const N_THREADS: usize = 1;
+        let barrier = std::sync::Barrier::new(N_THREADS + 1);
+        thread::scope(|s| {
+            for _ in 0..N_THREADS {
+                s.spawn(|| {
+                    add(1);
+                    barrier.wait();
+                    barrier.wait();
+                    remove(1);
+                });
+            }
+            // Wait for all threads to reach the first barrier
+            barrier.wait();
+            let global = global_counters();
+            assert_eq!(global.allocations, N_THREADS);
+            barrier.wait(); // release all the threads
+        });
+        let global = global_counters();
+        assert_eq!(global.allocations, N_THREADS);
+        assert_eq!(global.deallocations, N_THREADS);
     }
 }