diff --git a/array.c b/array.c index fbb712c7262624..71286d2d8d0669 100644 --- a/array.c +++ b/array.c @@ -6875,7 +6875,7 @@ static const rb_data_type_t ary_sample_memo_type = { .function = { .dfree = (RUBY_DATA_FUNC)st_free_table, }, - .flags = RUBY_TYPED_WB_PROTECTED | RUBY_TYPED_FREE_IMMEDIATELY + .flags = RUBY_TYPED_WB_PROTECTED | RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_CONCURRENT_FREE_SAFE }; static VALUE diff --git a/ast.c b/ast.c index 5357aa38a5ae09..3321ae069f77a3 100644 --- a/ast.c +++ b/ast.c @@ -45,7 +45,7 @@ static const rb_data_type_t rb_node_type = { "AST/node", {node_gc_mark, RUBY_TYPED_DEFAULT_FREE, node_memsize,}, 0, 0, - RUBY_TYPED_FREE_IMMEDIATELY, + RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_CONCURRENT_FREE_SAFE, }; struct ASTLocationData { @@ -70,7 +70,7 @@ static const rb_data_type_t rb_location_type = { "AST/location", {location_gc_mark, RUBY_TYPED_DEFAULT_FREE, location_memsize,}, 0, 0, - RUBY_TYPED_FREE_IMMEDIATELY, + RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_CONCURRENT_FREE_SAFE, }; diff --git a/box.c b/box.c index fba494f7ad2e1c..88be74a0a9bfb1 100644 --- a/box.c +++ b/box.c @@ -300,7 +300,7 @@ static const rb_data_type_t rb_box_data_type = { box_entry_memsize, rb_box_gc_update_references, }, - 0, 0, RUBY_TYPED_FREE_IMMEDIATELY // TODO: enable RUBY_TYPED_WB_PROTECTED when inserting write barriers + 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_CONCURRENT_FREE_SAFE // TODO: enable RUBY_TYPED_WB_PROTECTED when inserting write barriers }; static const rb_data_type_t rb_root_box_data_type = { @@ -311,7 +311,7 @@ static const rb_data_type_t rb_root_box_data_type = { box_entry_memsize, rb_box_gc_update_references, }, - &rb_box_data_type, 0, RUBY_TYPED_FREE_IMMEDIATELY // TODO: enable RUBY_TYPED_WB_PROTECTED when inserting write barriers + &rb_box_data_type, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_CONCURRENT_FREE_SAFE // TODO: enable RUBY_TYPED_WB_PROTECTED when inserting write barriers }; VALUE @@ -755,7 +755,7 @@ box_ext_cleanup_free(void 
*p) static const rb_data_type_t box_ext_cleanup_type = { "box_ext_cleanup", {box_ext_cleanup_mark, box_ext_cleanup_free}, - .flags = RUBY_TYPED_FREE_IMMEDIATELY, + .flags = RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_CONCURRENT_FREE_SAFE, }; void diff --git a/compile.c b/compile.c index 100ab126ed152e..bad52f6620310a 100644 --- a/compile.c +++ b/compile.c @@ -12312,7 +12312,7 @@ static const rb_data_type_t labels_wrapper_type = { .dmark = (RUBY_DATA_FUNC)rb_mark_set, .dfree = (RUBY_DATA_FUNC)st_free_table, }, - .flags = RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED, + .flags = RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED | RUBY_TYPED_CONCURRENT_FREE_SAFE, }; void @@ -12573,7 +12573,7 @@ static const rb_data_type_t pinned_list_type = { RUBY_DEFAULT_FREE, NULL, // No external memory to report, }, - 0, 0, RUBY_TYPED_WB_PROTECTED | RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_EMBEDDABLE + 0, 0, RUBY_TYPED_WB_PROTECTED | RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_EMBEDDABLE | RUBY_TYPED_CONCURRENT_FREE_SAFE }; static VALUE @@ -14724,7 +14724,7 @@ ibf_dump_memsize(const void *ptr) static const rb_data_type_t ibf_dump_type = { "ibf_dump", {ibf_dump_mark, ibf_dump_free, ibf_dump_memsize,}, - 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_EMBEDDABLE + 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_EMBEDDABLE | RUBY_TYPED_CONCURRENT_FREE_SAFE }; static void @@ -14961,7 +14961,7 @@ ibf_loader_memsize(const void *ptr) static const rb_data_type_t ibf_load_type = { "ibf_loader", {ibf_loader_mark, ibf_loader_free, ibf_loader_memsize,}, - 0, 0, RUBY_TYPED_WB_PROTECTED | RUBY_TYPED_FREE_IMMEDIATELY + 0, 0, RUBY_TYPED_WB_PROTECTED | RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_CONCURRENT_FREE_SAFE }; const rb_iseq_t * diff --git a/concurrent_set.c b/concurrent_set.c index c8b0c73881a85d..227f1b53f9f02b 100644 --- a/concurrent_set.c +++ b/concurrent_set.c @@ -4,14 +4,24 @@ #include "ruby/atomic.h" #include "vm_sync.h" -#define CONCURRENT_SET_CONTINUATION_BIT ((VALUE)1 << 
(sizeof(VALUE) * CHAR_BIT - 1)) -#define CONCURRENT_SET_HASH_MASK (~CONCURRENT_SET_CONTINUATION_BIT) +// insertion probes have gone past this slot +#define CONCURRENT_SET_CONTINUATION_BIT ((VALUE)0x2) +#define CONCURRENT_SET_KEY_MASK (~CONCURRENT_SET_CONTINUATION_BIT) +// This slot's hash can be reclaimed if and only if the key is EMPTY and it doesn't have a continuation bit. If the key is something +// else, this bit on the hash has no meaning and is ignored. +#define CONCURRENT_SET_HASH_RECLAIMABLE_BIT ((VALUE)1 << (sizeof(VALUE) * CHAR_BIT - 1)) +#define CONCURRENT_SET_HASH_MASK (~CONCURRENT_SET_HASH_RECLAIMABLE_BIT) + +#define CONCURRENT_SET_DEBUG 0 +#define CONCURRENT_SET_DEBUG_STATS 0 +#define CONCURRENT_SET_DEBUG_DUPLICATES 0 +#define CONCURRENT_SET_DEBUG_BAD_HASH_FN 0 enum concurrent_set_special_values { - CONCURRENT_SET_EMPTY, - CONCURRENT_SET_DELETED, - CONCURRENT_SET_MOVED, - CONCURRENT_SET_SPECIAL_VALUE_COUNT + CONCURRENT_SET_EMPTY = 0, + CONCURRENT_SET_TOMBSTONE = 1, + CONCURRENT_SET_MOVED = 5, // continuation bit is 0x02, so 0x05 doesn't have bits in conflict with it + CONCURRENT_SET_SPECIAL_VALUE_COUNT = 6 }; struct concurrent_set_entry { @@ -22,38 +32,53 @@ struct concurrent_set_entry { struct concurrent_set { rb_atomic_t size; unsigned int capacity; - unsigned int deleted_entries; + rb_atomic_t deleted_entries; const struct rb_concurrent_set_funcs *funcs; struct concurrent_set_entry *entries; + int key_type; +#if CONCURRENT_SET_DEBUG_STATS + rb_atomic_t find_count; + rb_atomic_t find_probe_total; + rb_atomic_t find_probe_max; + rb_atomic_t insert_count; + rb_atomic_t insert_probe_total; + rb_atomic_t insert_probe_max; +#endif }; -static void -concurrent_set_mark_continuation(struct concurrent_set_entry *entry, VALUE curr_hash_and_flags) +static bool +concurrent_set_mark_continuation(struct concurrent_set_entry *entry, VALUE raw_key) { - if (curr_hash_and_flags & CONCURRENT_SET_CONTINUATION_BIT) return; - - RUBY_ASSERT((curr_hash_and_flags & 
CONCURRENT_SET_HASH_MASK) != 0); + if (raw_key & CONCURRENT_SET_CONTINUATION_BIT) return true; - VALUE new_hash = curr_hash_and_flags | CONCURRENT_SET_CONTINUATION_BIT; - VALUE prev_hash = rbimpl_atomic_value_cas(&entry->hash, curr_hash_and_flags, new_hash, RBIMPL_ATOMIC_RELEASE, RBIMPL_ATOMIC_RELAXED); + VALUE new_key = raw_key | CONCURRENT_SET_CONTINUATION_BIT; // NOTE: raw_key can be CONCURRENT_SET_EMPTY + VALUE prev_key = rbimpl_atomic_value_cas(&entry->key, raw_key, new_key, RBIMPL_ATOMIC_RELEASE, RBIMPL_ATOMIC_ACQUIRE); - // At the moment we only expect to be racing concurrently against another - // thread also setting the continuation bit. - // In the future if deletion is concurrent this will need adjusting - RUBY_ASSERT(prev_hash == curr_hash_and_flags || prev_hash == new_hash); - (void)prev_hash; + if (prev_key == raw_key || prev_key == new_key) { + return true; + } + else if ((prev_key & CONCURRENT_SET_KEY_MASK) == CONCURRENT_SET_TOMBSTONE) { + return true; + } + else { + // * key could have been made EMPTY, and anything could have happened to this slot since then. Need to retry. 
+ // * key could have been moved during resize + return false; + } } static VALUE concurrent_set_hash(const struct concurrent_set *set, VALUE key) { VALUE hash = set->funcs->hash(key); +#if CONCURRENT_SET_DEBUG_BAD_HASH_FN + hash = hash % 1024; + if (hash == 0) hash = 1; +#endif hash &= CONCURRENT_SET_HASH_MASK; - if (hash == 0) { - hash ^= CONCURRENT_SET_HASH_MASK; - } + if (hash == 0) hash = ~(VALUE)0 & CONCURRENT_SET_HASH_MASK; RUBY_ASSERT(hash != 0); - RUBY_ASSERT(!(hash & CONCURRENT_SET_CONTINUATION_BIT)); + RUBY_ASSERT(!(hash & CONCURRENT_SET_HASH_RECLAIMABLE_BIT)); return hash; } @@ -91,20 +116,31 @@ static const rb_data_type_t concurrent_set_type = { .dsize = concurrent_set_size, }, /* Hack: NOT WB_PROTECTED on purpose (see above) */ - .flags = RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_EMBEDDABLE + /* NOTE: don't make embedded due to compaction */ + .flags = RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_CONCURRENT_FREE_SAFE }; VALUE -rb_concurrent_set_new(const struct rb_concurrent_set_funcs *funcs, int capacity) +rb_concurrent_set_new(const struct rb_concurrent_set_funcs *funcs, int capacity, int key_type) { struct concurrent_set *set; VALUE obj = TypedData_Make_Struct(0, struct concurrent_set, &concurrent_set_type, set); set->funcs = funcs; set->entries = ZALLOC_N(struct concurrent_set_entry, capacity); set->capacity = capacity; + (void)key_type; +#if CONCURRENT_SET_DEBUG + set->key_type = key_type; +#endif return obj; } +void * +rb_concurrent_set_get_data(VALUE set_obj) +{ + return RTYPEDDATA_GET_DATA(set_obj); +} + rb_atomic_t rb_concurrent_set_size(VALUE set_obj) { @@ -113,6 +149,50 @@ rb_concurrent_set_size(VALUE set_obj) return RUBY_ATOMIC_LOAD(set->size); } +unsigned int +rb_concurrent_set_capacity(VALUE set_obj) +{ + struct concurrent_set *set = RTYPEDDATA_GET_DATA(set_obj); + + return set->capacity; +} + +void +rb_concurrent_set_probe_stats(VALUE set_obj, + rb_atomic_t *find_count, rb_atomic_t *find_probe_total, rb_atomic_t *find_probe_max, + 
rb_atomic_t *insert_count, rb_atomic_t *insert_probe_total, rb_atomic_t *insert_probe_max) +{ +#if CONCURRENT_SET_DEBUG_STATS + struct concurrent_set *set = RTYPEDDATA_GET_DATA(set_obj); + *find_count = RUBY_ATOMIC_LOAD(set->find_count); + *find_probe_total = RUBY_ATOMIC_LOAD(set->find_probe_total); + *find_probe_max = RUBY_ATOMIC_LOAD(set->find_probe_max); + *insert_count = RUBY_ATOMIC_LOAD(set->insert_count); + *insert_probe_total = RUBY_ATOMIC_LOAD(set->insert_probe_total); + *insert_probe_max = RUBY_ATOMIC_LOAD(set->insert_probe_max); +#else + *find_count = 0; + *find_probe_total = 0; + *find_probe_max = 0; + *insert_count = 0; + *insert_probe_total = 0; + *insert_probe_max = 0; +#endif +} + +#if CONCURRENT_SET_DEBUG_STATS +static void +concurrent_set_atomic_max(rb_atomic_t *target, rb_atomic_t val) +{ + rb_atomic_t cur = RUBY_ATOMIC_LOAD(*target); + while (val > cur) { + rb_atomic_t prev = rbimpl_atomic_cas(target, cur, val, RBIMPL_ATOMIC_RELAXED, RBIMPL_ATOMIC_RELAXED); + if (prev == cur) break; + cur = prev; + } +} +#endif + struct concurrent_set_probe { int idx; int d; @@ -138,67 +218,59 @@ concurrent_set_probe_next(struct concurrent_set_probe *probe) } static void -concurrent_set_try_resize_without_locking(VALUE old_set_obj, VALUE *set_obj_ptr) +concurrent_set_try_resize_locked(VALUE old_set_obj, VALUE *set_obj_ptr, VALUE new_set_obj, int old_capacity) { - // Check if another thread has already resized. - if (rbimpl_atomic_value_load(set_obj_ptr, RBIMPL_ATOMIC_ACQUIRE) != old_set_obj) { - return; - } - struct concurrent_set *old_set = RTYPEDDATA_GET_DATA(old_set_obj); - - // This may overcount by up to the number of threads concurrently attempting to insert - // GC may also happen between now and the set being rebuilt - int expected_size = rbimpl_atomic_load(&old_set->size, RBIMPL_ATOMIC_RELAXED) - old_set->deleted_entries; - - // NOTE: new capacity must make sense with load factor, don't change one without checking the other. 
struct concurrent_set_entry *old_entries = old_set->entries; - int old_capacity = old_set->capacity; - int new_capacity = old_capacity * 2; - if (new_capacity > expected_size * 8) { - new_capacity = old_capacity / 2; - } - else if (new_capacity > expected_size * 4) { - new_capacity = old_capacity; - } - - // May cause GC and therefore deletes, so must happen first. - VALUE new_set_obj = rb_concurrent_set_new(old_set->funcs, new_capacity); struct concurrent_set *new_set = RTYPEDDATA_GET_DATA(new_set_obj); for (int i = 0; i < old_capacity; i++) { struct concurrent_set_entry *old_entry = &old_entries[i]; - VALUE key = rbimpl_atomic_value_exchange(&old_entry->key, CONCURRENT_SET_MOVED, RBIMPL_ATOMIC_ACQUIRE); - RUBY_ASSERT(key != CONCURRENT_SET_MOVED); + VALUE prev_key_raw = rbimpl_atomic_value_exchange(&old_entry->key, CONCURRENT_SET_MOVED, RBIMPL_ATOMIC_ACQUIRE); + VALUE prev_key = prev_key_raw & CONCURRENT_SET_KEY_MASK; + RUBY_ASSERT(prev_key != CONCURRENT_SET_MOVED); + + if (prev_key < CONCURRENT_SET_SPECIAL_VALUE_COUNT) continue; - if (key < CONCURRENT_SET_SPECIAL_VALUE_COUNT) continue; - if (!RB_SPECIAL_CONST_P(key) && rb_objspace_garbage_object_p(key)) continue; + if (!RB_SPECIAL_CONST_P(prev_key) && rb_objspace_garbage_object_p(prev_key)) continue; - VALUE hash = rbimpl_atomic_value_load(&old_entry->hash, RBIMPL_ATOMIC_RELAXED) & CONCURRENT_SET_HASH_MASK; - RUBY_ASSERT(hash != 0); - RUBY_ASSERT(hash == concurrent_set_hash(old_set, key)); +#if CONCURRENT_SET_DEBUG + if (new_set->key_type == T_STRING) { + RUBY_ASSERT(BUILTIN_TYPE(prev_key) == T_STRING); + RUBY_ASSERT(FL_TEST(prev_key, RSTRING_FSTR)); + } + else { + RUBY_ASSERT(STATIC_SYM_P(prev_key)); + } +#endif + + VALUE hash = rbimpl_atomic_value_load(&old_entry->hash, RBIMPL_ATOMIC_ACQUIRE) & CONCURRENT_SET_HASH_MASK; + if (hash == 0) continue; + RUBY_ASSERT(concurrent_set_hash(old_set, prev_key) == hash); // Insert key into new_set. 
struct concurrent_set_probe probe; int idx = concurrent_set_probe_start(&probe, new_set, hash); + int start_idx = idx; while (true) { struct concurrent_set_entry *entry = &new_set->entries[idx]; - if (entry->hash == CONCURRENT_SET_EMPTY) { + if (entry->hash == 0) { RUBY_ASSERT(entry->key == CONCURRENT_SET_EMPTY); new_set->size++; RUBY_ASSERT(new_set->size <= new_set->capacity / 2); - entry->key = key; + entry->key = prev_key; // no continuation bit entry->hash = hash; break; } RUBY_ASSERT(entry->key >= CONCURRENT_SET_SPECIAL_VALUE_COUNT); - entry->hash |= CONCURRENT_SET_CONTINUATION_BIT; + entry->key |= CONCURRENT_SET_CONTINUATION_BIT; idx = concurrent_set_probe_next(&probe); + RUBY_ASSERT(idx != start_idx); } } @@ -207,12 +279,101 @@ concurrent_set_try_resize_without_locking(VALUE old_set_obj, VALUE *set_obj_ptr) RB_GC_GUARD(old_set_obj); } +// FIXME: cross-platform initializer. Also, we don't need rwlock anymore, just normal mutex will do +static pthread_rwlock_t resize_lock = PTHREAD_RWLOCK_INITIALIZER; +static pthread_t resize_lock_owner; +static unsigned int resize_lock_lvl; + +static inline void +resize_lock_wrlock(bool allow_reentry) +{ + if (allow_reentry && pthread_self() == resize_lock_owner) { + // Already held by this thread. 
+ } + else { + int r; + if ((r = pthread_rwlock_wrlock(&resize_lock))) { + rb_bug_errno("pthread_rwlock_wrlock", r); + } + resize_lock_owner = pthread_self(); + } + resize_lock_lvl++; +} + +static inline void +resize_lock_wrunlock(void) +{ + RUBY_ASSERT(resize_lock_lvl > 0); + resize_lock_lvl--; + if (resize_lock_lvl == 0) { + resize_lock_owner = 0; + int r; + if ((r = pthread_rwlock_unlock(&resize_lock))) { + rb_bug_errno("pthread_rwlock_unlock", r); + } + } +} + +static inline bool +resize_lock_rdlock(void) +{ + if (resize_lock_owner == pthread_self()) { // we have the write lock, don't take it + return false; + } + int r; + if ((r = pthread_rwlock_rdlock(&resize_lock))) { + rb_bug_errno("pthread_rwlock_rdlock", r); + } + return true; +} + +static inline void +resize_lock_rdunlock(void) +{ + int r; + if ((r = pthread_rwlock_unlock(&resize_lock))) { + rb_bug_errno("pthread_rwlock_unlock", r); + } +} + static void concurrent_set_try_resize(VALUE old_set_obj, VALUE *set_obj_ptr) { - RB_VM_LOCKING() { - concurrent_set_try_resize_without_locking(old_set_obj, set_obj_ptr); + unsigned int lev; + RB_VM_LOCK_ENTER_LEV(&lev); + { + // Check if another thread has already resized. + if (rbimpl_atomic_value_load(set_obj_ptr, RBIMPL_ATOMIC_ACQUIRE) != old_set_obj) { + RB_VM_LOCK_LEAVE_LEV(&lev); + return; + } + struct concurrent_set *old_set = RTYPEDDATA_GET_DATA(old_set_obj); + + // This may overcount by up to the number of threads concurrently attempting to insert + // GC may also happen between now and the set being rebuilt + int expected_size = rbimpl_atomic_load(&old_set->size, RBIMPL_ATOMIC_RELAXED) - old_set->deleted_entries; + + // NOTE: new capacity must make sense with load factor, don't change one without checking the other. 
+ int old_capacity = old_set->capacity; + int new_capacity = old_capacity * 2; + if (new_capacity > expected_size * 8) { + new_capacity = old_capacity / 2; + } + else if (new_capacity > expected_size * 4) { + new_capacity = old_capacity; + } + + // May cause GC and therefore deletes, so must happen first. + VALUE new_set_obj = rb_concurrent_set_new(old_set->funcs, new_capacity, old_set->key_type); + /*fprintf(stderr, "concurrent set resize from %d to %d\n", old_capacity, new_capacity);*/ + // deletes from sweep thread must not happen during resize and sweep thread can't take VM lock so it takes the resize lock + resize_lock_wrlock(true); + { + concurrent_set_try_resize_locked(old_set_obj, set_obj_ptr, new_set_obj, old_capacity); + } + resize_lock_wrunlock(); } + RB_VM_LOCK_LEAVE_LEV(&lev); } VALUE @@ -242,29 +403,39 @@ rb_concurrent_set_find(VALUE *set_obj_ptr, VALUE key) while (true) { struct concurrent_set_entry *entry = &set->entries[idx]; - VALUE curr_hash_and_flags = rbimpl_atomic_value_load(&entry->hash, RBIMPL_ATOMIC_ACQUIRE); - VALUE curr_hash = curr_hash_and_flags & CONCURRENT_SET_HASH_MASK; - bool continuation = curr_hash_and_flags & CONCURRENT_SET_CONTINUATION_BIT; - - if (curr_hash_and_flags == CONCURRENT_SET_EMPTY) { + VALUE curr_hash = rbimpl_atomic_value_load(&entry->hash, RBIMPL_ATOMIC_ACQUIRE) & CONCURRENT_SET_HASH_MASK; + + if (curr_hash == 0) { +#if CONCURRENT_SET_DEBUG_STATS + rbimpl_atomic_fetch_add(&set->find_count, 1, RBIMPL_ATOMIC_RELAXED); + rbimpl_atomic_fetch_add(&set->find_probe_total, probe.d, RBIMPL_ATOMIC_RELAXED); + concurrent_set_atomic_max(&set->find_probe_max, probe.d); +#endif return 0; } + VALUE raw_key = rbimpl_atomic_value_load(&entry->key, RBIMPL_ATOMIC_ACQUIRE); + VALUE curr_key = raw_key & CONCURRENT_SET_KEY_MASK; + bool continuation = raw_key & CONCURRENT_SET_CONTINUATION_BIT; + if (curr_hash != hash) { if (!continuation) { +#if CONCURRENT_SET_DEBUG_STATS + rbimpl_atomic_fetch_add(&set->find_count, 1, 
RBIMPL_ATOMIC_RELAXED); + rbimpl_atomic_fetch_add(&set->find_probe_total, probe.d, RBIMPL_ATOMIC_RELAXED); + concurrent_set_atomic_max(&set->find_probe_max, probe.d); +#endif return 0; } idx = concurrent_set_probe_next(&probe); continue; } - VALUE curr_key = rbimpl_atomic_value_load(&entry->key, RBIMPL_ATOMIC_ACQUIRE); - switch (curr_key) { case CONCURRENT_SET_EMPTY: - // In-progress insert: hash written but key not yet + // In-progress insert: hash written but key not yet. break; - case CONCURRENT_SET_DELETED: + case CONCURRENT_SET_TOMBSTONE: break; case CONCURRENT_SET_MOVED: // Wait @@ -280,11 +451,21 @@ rb_concurrent_set_find(VALUE *set_obj_ptr, VALUE key) if (set->funcs->cmp(key, curr_key)) { // We've found a match. +#if CONCURRENT_SET_DEBUG_STATS + rbimpl_atomic_fetch_add(&set->find_count, 1, RBIMPL_ATOMIC_RELAXED); + rbimpl_atomic_fetch_add(&set->find_probe_total, probe.d, RBIMPL_ATOMIC_RELAXED); + concurrent_set_atomic_max(&set->find_probe_max, probe.d); +#endif RB_GC_GUARD(set_obj); return curr_key; } if (!continuation) { +#if CONCURRENT_SET_DEBUG_STATS + rbimpl_atomic_fetch_add(&set->find_count, 1, RBIMPL_ATOMIC_RELAXED); + rbimpl_atomic_fetch_add(&set->find_probe_total, probe.d, RBIMPL_ATOMIC_RELAXED); + concurrent_set_atomic_max(&set->find_probe_max, probe.d); +#endif return 0; } @@ -312,7 +493,7 @@ rb_concurrent_set_find_or_insert(VALUE *set_obj_ptr, VALUE key, void *data) RUBY_ASSERT(set_obj); struct concurrent_set *set = RTYPEDDATA_GET_DATA(set_obj); - key = set->funcs->create(key, data); + key = set->funcs->create(key, data); // this can join GC (takes VM Lock) VALUE hash = concurrent_set_hash(set, key); struct concurrent_set_probe probe; @@ -333,33 +514,40 @@ rb_concurrent_set_find_or_insert(VALUE *set_obj_ptr, VALUE key, void *data) while (true) { struct concurrent_set_entry *entry = &set->entries[idx]; - VALUE curr_hash_and_flags = rbimpl_atomic_value_load(&entry->hash, RBIMPL_ATOMIC_ACQUIRE); - VALUE curr_hash = curr_hash_and_flags & 
CONCURRENT_SET_HASH_MASK; - bool continuation = curr_hash_and_flags & CONCURRENT_SET_CONTINUATION_BIT; - - if (curr_hash_and_flags == CONCURRENT_SET_EMPTY) { + bool can_continue_probing; + VALUE raw_hash = rbimpl_atomic_value_load(&entry->hash, RBIMPL_ATOMIC_ACQUIRE); + VALUE curr_hash = raw_hash & CONCURRENT_SET_HASH_MASK; + if (raw_hash == 0) { // Reserve this slot for our hash value - curr_hash_and_flags = rbimpl_atomic_value_cas(&entry->hash, CONCURRENT_SET_EMPTY, hash, RBIMPL_ATOMIC_RELEASE, RBIMPL_ATOMIC_RELAXED); - if (curr_hash_and_flags != CONCURRENT_SET_EMPTY) { + raw_hash = rbimpl_atomic_value_cas(&entry->hash, 0, hash, RBIMPL_ATOMIC_RELEASE, RBIMPL_ATOMIC_RELAXED); + if (raw_hash != 0) { // Lost race, retry same slot to check winner's hash continue; } - - // CAS succeeded, so these are the values stored - curr_hash_and_flags = hash; + raw_hash = hash; curr_hash = hash; - // Fall through to try to claim key } - if (curr_hash != hash) { - goto probe_next; - } - - VALUE curr_key = rbimpl_atomic_value_load(&entry->key, RBIMPL_ATOMIC_ACQUIRE); + VALUE raw_key = rbimpl_atomic_value_load(&entry->key, RBIMPL_ATOMIC_ACQUIRE); + VALUE curr_key = raw_key & CONCURRENT_SET_KEY_MASK; + bool continuation = raw_key & CONCURRENT_SET_CONTINUATION_BIT; switch (curr_key) { case CONCURRENT_SET_EMPTY: { + if ((raw_hash & CONCURRENT_SET_HASH_RECLAIMABLE_BIT) && !continuation) { + // Reclaim this reclaimable slot by clearing the reclaimable bit + VALUE prev_hash = rbimpl_atomic_value_cas(&entry->hash, raw_hash, hash, RBIMPL_ATOMIC_RELEASE, RBIMPL_ATOMIC_RELAXED); + if (prev_hash != raw_hash) { + // Lost race, retry same slot + continue; + } + curr_hash = hash; + raw_hash = hash; + } + if (curr_hash != hash) { + goto probe_next; + } rb_atomic_t prev_size = rbimpl_atomic_fetch_add(&set->size, 1, RBIMPL_ATOMIC_RELAXED); // Load_factor reached at 75% full. ex: prev_size: 32, capacity: 64, load_factor: 50%. 
@@ -370,9 +558,38 @@ rb_concurrent_set_find_or_insert(VALUE *set_obj_ptr, VALUE key, void *data) goto retry; } - VALUE prev_key = rbimpl_atomic_value_cas(&entry->key, CONCURRENT_SET_EMPTY, key, RBIMPL_ATOMIC_RELEASE, RBIMPL_ATOMIC_RELAXED); - if (prev_key == CONCURRENT_SET_EMPTY) { - RUBY_ASSERT(rb_concurrent_set_find(set_obj_ptr, key) == key); + VALUE prev_raw_key = rbimpl_atomic_value_cas(&entry->key, raw_key, key | (continuation ? CONCURRENT_SET_CONTINUATION_BIT : 0), RBIMPL_ATOMIC_RELEASE, RBIMPL_ATOMIC_RELAXED); + if (prev_raw_key == raw_key) { +#if CONCURRENT_SET_DEBUG_STATS + rbimpl_atomic_fetch_add(&set->insert_count, 1, RBIMPL_ATOMIC_RELAXED); + rbimpl_atomic_fetch_add(&set->insert_probe_total, probe.d, RBIMPL_ATOMIC_RELAXED); + concurrent_set_atomic_max(&set->insert_probe_max, probe.d); +#endif +#if CONCURRENT_SET_DEBUG_DUPLICATES + { + // Probe further to verify no duplicate of our key exists + struct concurrent_set_probe dup_probe = probe; + int dup_idx = concurrent_set_probe_next(&dup_probe); + int dup_idx_start = dup_idx; + while (true) { + struct concurrent_set_entry *dup_entry = &set->entries[dup_idx]; + VALUE dup_raw_key = rbimpl_atomic_value_load(&dup_entry->key, RBIMPL_ATOMIC_ACQUIRE); + VALUE dup_key = dup_raw_key & CONCURRENT_SET_KEY_MASK; + + if (dup_key == CONCURRENT_SET_EMPTY) break; + if (dup_key == CONCURRENT_SET_MOVED) break; + + if (dup_key >= CONCURRENT_SET_SPECIAL_VALUE_COUNT && dup_key == key) { + rb_bug("concurrent_set_find_or_insert: duplicate key %p found at index %d after inserting at index %d", + (void *)key, dup_idx, idx); + } + int next_dup_idx = concurrent_set_probe_next(&dup_probe); + if (dup_idx < dup_idx_start && next_dup_idx >= dup_idx_start) break; + if (next_dup_idx == dup_idx_start) break; + dup_idx = next_dup_idx; + } + } +#endif RB_GC_GUARD(set_obj); return key; } @@ -380,31 +597,45 @@ rb_concurrent_set_find_or_insert(VALUE *set_obj_ptr, VALUE key, void *data) // Entry was not inserted. 
rbimpl_atomic_sub(&set->size, 1, RBIMPL_ATOMIC_RELAXED); - // Another thread won the race, try again at the same location. + // * Another thread with the same hash could have won the race, try again at the same location, we might find it. + // * A resize could also be underway, and `prev_raw_key` could be CONCURRENT_SET_MOVED. + // * The continuation bit could also have been set on the key just now, in which case we'll retry continue; } } - case CONCURRENT_SET_DELETED: + case CONCURRENT_SET_TOMBSTONE: break; case CONCURRENT_SET_MOVED: // Wait RB_VM_LOCKING(); goto retry; default: - // We're never GC during our search + if (curr_hash != hash) { + goto probe_next; + } // If the continuation bit wasn't set at the start of our search, - // any concurrent find with the same hash value would also look at + // any concurrent find_or_insert with the same hash value would also look at // this location and try to swap curr_key if (UNLIKELY(!RB_SPECIAL_CONST_P(curr_key) && rb_objspace_garbage_object_p(curr_key))) { if (continuation) { goto probe_next; } - rbimpl_atomic_value_cas(&entry->key, curr_key, CONCURRENT_SET_EMPTY, RBIMPL_ATOMIC_RELEASE, RBIMPL_ATOMIC_RELAXED); - continue; + { + VALUE prev = rbimpl_atomic_value_cas(&entry->key, raw_key, CONCURRENT_SET_EMPTY, RBIMPL_ATOMIC_RELEASE, RBIMPL_ATOMIC_RELAXED); + if (prev == raw_key) { + rbimpl_atomic_sub(&set->size, 1, RBIMPL_ATOMIC_RELAXED); + } + } + continue; // try to reclaim same slot, because the hash is the same and it's now EMPTY } if (set->funcs->cmp(key, curr_key)) { // We've found a live match. 
+#if CONCURRENT_SET_DEBUG_STATS + rbimpl_atomic_fetch_add(&set->insert_count, 1, RBIMPL_ATOMIC_RELAXED); + rbimpl_atomic_fetch_add(&set->insert_probe_total, probe.d, RBIMPL_ATOMIC_RELAXED); + concurrent_set_atomic_max(&set->insert_probe_max, probe.d); +#endif RB_GC_GUARD(set_obj); // We created key using set->funcs->create, but we didn't end @@ -418,8 +649,10 @@ rb_concurrent_set_find_or_insert(VALUE *set_obj_ptr, VALUE key, void *data) } probe_next: - RUBY_ASSERT(curr_hash_and_flags != CONCURRENT_SET_EMPTY); - concurrent_set_mark_continuation(entry, curr_hash_and_flags); + can_continue_probing = concurrent_set_mark_continuation(entry, raw_key); + if (!can_continue_probing) { + continue; + } idx = concurrent_set_probe_next(&probe); } } @@ -429,22 +662,21 @@ concurrent_set_delete_entry_locked(struct concurrent_set *set, struct concurrent { ASSERT_vm_locking_with_barrier(); - if (entry->hash & CONCURRENT_SET_CONTINUATION_BIT) { - entry->hash = CONCURRENT_SET_CONTINUATION_BIT; - entry->key = CONCURRENT_SET_DELETED; + if (entry->key & CONCURRENT_SET_CONTINUATION_BIT) { + entry->key = CONCURRENT_SET_TOMBSTONE | CONCURRENT_SET_CONTINUATION_BIT; set->deleted_entries++; } else { - entry->hash = CONCURRENT_SET_EMPTY; + entry->hash = 0; entry->key = CONCURRENT_SET_EMPTY; set->size--; } } -VALUE -rb_concurrent_set_delete_by_identity(VALUE set_obj, VALUE key) + +static VALUE +rb_concurrent_set_delete_by_identity_locked(VALUE set_obj, VALUE key) { - ASSERT_vm_locking_with_barrier(); struct concurrent_set *set = RTYPEDDATA_GET_DATA(set_obj); @@ -452,25 +684,70 @@ rb_concurrent_set_delete_by_identity(VALUE set_obj, VALUE key) struct concurrent_set_probe probe; int idx = concurrent_set_probe_start(&probe, set, hash); + bool hash_cleared = false; + VALUE prev_hash = 0; while (true) { struct concurrent_set_entry *entry = &set->entries[idx]; - VALUE curr_key = entry->key; + VALUE raw_key = rbimpl_atomic_value_load(&entry->key, RBIMPL_ATOMIC_ACQUIRE); + VALUE loaded_hash_raw = 
rbimpl_atomic_value_load(&entry->hash, RBIMPL_ATOMIC_ACQUIRE); + VALUE loaded_hash = loaded_hash_raw & CONCURRENT_SET_HASH_MASK; + bool continuation = raw_key & CONCURRENT_SET_CONTINUATION_BIT; + VALUE curr_key = raw_key & CONCURRENT_SET_KEY_MASK; switch (curr_key) { case CONCURRENT_SET_EMPTY: - // We didn't find our entry to delete. - return 0; - case CONCURRENT_SET_DELETED: + if (!continuation) { + return 0; + } + break; + case CONCURRENT_SET_TOMBSTONE: break; case CONCURRENT_SET_MOVED: rb_bug("rb_concurrent_set_delete_by_identity: moved entry"); break; default: if (key == curr_key) { - RUBY_ASSERT((entry->hash & CONCURRENT_SET_HASH_MASK) == hash); - concurrent_set_delete_entry_locked(set, entry); - return curr_key; + VALUE new_key; + RUBY_ASSERT(hash_cleared || loaded_hash == hash); + if (continuation) { + new_key = CONCURRENT_SET_TOMBSTONE | CONCURRENT_SET_CONTINUATION_BIT; + } + else { + new_key = CONCURRENT_SET_EMPTY; + } + + if (!hash_cleared) { + // Hashes only change here and they get reclaimed in find_or_insert + prev_hash = rbimpl_atomic_value_cas(&entry->hash, loaded_hash_raw, hash | CONCURRENT_SET_HASH_RECLAIMABLE_BIT, RBIMPL_ATOMIC_RELEASE, RBIMPL_ATOMIC_RELAXED); + RUBY_ASSERT(prev_hash == hash || prev_hash == (hash | CONCURRENT_SET_HASH_RECLAIMABLE_BIT)); + hash_cleared = true; + } + VALUE prev_key = rbimpl_atomic_value_cas(&entry->key, raw_key, new_key, RBIMPL_ATOMIC_RELEASE, RBIMPL_ATOMIC_ACQUIRE); + if (prev_key == raw_key) { + if (continuation) { + rbimpl_atomic_add(&set->deleted_entries, 1, RBIMPL_ATOMIC_RELAXED); + } + else { + rbimpl_atomic_sub(&set->size, 1, RBIMPL_ATOMIC_RELAXED); + } + return curr_key; + } + else if (!continuation && prev_key == (raw_key | CONCURRENT_SET_CONTINUATION_BIT)) { + continue; // try again, the continuation bit was just set on this key so we can tombstone it + } + else if ((prev_key & CONCURRENT_SET_KEY_MASK) == CONCURRENT_SET_EMPTY || (prev_key & CONCURRENT_SET_KEY_MASK) == CONCURRENT_SET_TOMBSTONE) { + return 
curr_key; // the key was deleted by another thread + } + else { + // the key was changed to EMPTY by being garbage during find_or_insert and then a new key was put at the same slot. It's okay + // that the hash was marked reclaimable above. + RUBY_ASSERT(prev_hash != 0); + return curr_key; + } + } + else if (!continuation) { + return 0; } break; } @@ -479,8 +756,41 @@ rb_concurrent_set_delete_by_identity(VALUE set_obj, VALUE key) } } -void -rb_concurrent_set_foreach_with_replace(VALUE set_obj, int (*callback)(VALUE *key, void *data), void *data) +// This can be called concurrently by a ruby GC thread and the sweep thread. +VALUE +rb_concurrent_set_delete_by_identity(VALUE *set_obj_ptr, VALUE key) +{ + VALUE result; + bool is_sweep_thread_p(void); + + VALUE set_obj = rbimpl_atomic_value_load(set_obj_ptr, RBIMPL_ATOMIC_ACQUIRE); + + if (is_sweep_thread_p()) { + while (1) { + bool lock_taken = resize_lock_rdlock(); + { + VALUE current_set_obj = rbimpl_atomic_value_load(set_obj_ptr, RBIMPL_ATOMIC_ACQUIRE); + if (current_set_obj != set_obj) { + set_obj = current_set_obj; + // retry - resize happened + } + else { + result = rb_concurrent_set_delete_by_identity_locked(set_obj, key); + if (lock_taken) resize_lock_rdunlock(); + break; + } + } + if (lock_taken) resize_lock_rdunlock(); + } + } + else { + result = rb_concurrent_set_delete_by_identity_locked(set_obj, key); + } + return result; +} + +static void +rb_concurrent_set_foreach_with_replace_locked(VALUE set_obj, int (*callback)(VALUE *key, void *data), void *data) { ASSERT_vm_locking_with_barrier(); @@ -488,26 +798,50 @@ rb_concurrent_set_foreach_with_replace(VALUE set_obj, int (*callback)(VALUE *key for (unsigned int i = 0; i < set->capacity; i++) { struct concurrent_set_entry *entry = &set->entries[i]; - VALUE key = entry->key; + VALUE raw_key = entry->key; + bool continuation = raw_key & CONCURRENT_SET_CONTINUATION_BIT; + VALUE key = raw_key & CONCURRENT_SET_KEY_MASK; switch (key) { case CONCURRENT_SET_EMPTY: - 
case CONCURRENT_SET_DELETED: + case CONCURRENT_SET_TOMBSTONE: continue; case CONCURRENT_SET_MOVED: rb_bug("rb_concurrent_set_foreach_with_replace: moved entry"); break; default: { - int ret = callback(&entry->key, data); + VALUE cb_key = key; + int ret = callback(&cb_key, data); switch (ret) { case ST_STOP: return; case ST_DELETE: concurrent_set_delete_entry_locked(set, entry); break; + case ST_CONTINUE: + if (cb_key != key) { + // Key was replaced by callback + entry->key = cb_key | (continuation ? CONCURRENT_SET_CONTINUATION_BIT : 0); + } + break; + case ST_REPLACE: + rb_bug("unexpected concurrent_set callback return value: ST_REPLACE"); } break; } } } } + +void +rb_concurrent_set_foreach_with_replace(VALUE set_obj, int (*callback)(VALUE *key, void *data), void *data) +{ + RB_VM_LOCKING() { + // Don't allow concurrent deletes from sweep thread during this time. Maybe we can loosen this restriction. + resize_lock_wrlock(true); + { + rb_concurrent_set_foreach_with_replace_locked(set_obj, callback, data); + } + resize_lock_wrunlock(); + } +} diff --git a/cont.c b/cont.c index e5239635081629..4ada1ba00cef36 100644 --- a/cont.c +++ b/cont.c @@ -298,6 +298,63 @@ rb_free_shared_fiber_pool(void) static ID fiber_initialize_keywords[3] = {0}; +// We don't use the VM lock to protect the shared fiber pool because the sweep +// thread needs to be able to free fibers and it can't take the VM lock. 
+rb_nativethread_lock_t fiber_lock; +#ifdef RUBY_THREAD_PTHREAD_H +pthread_t fiber_pool_lock_owner; +#endif + +MAYBE_UNUSED(static inline bool +fiber_pool_locked_p(bool fallback)) +{ +#ifdef RUBY_THREAD_PTHREAD_H + return pthread_self() == fiber_pool_lock_owner; +#else + return fallback; +#endif +} + +static inline void +ASSERT_fiber_pool_locked(void) +{ +#ifdef RUBY_THREAD_PTHREAD_H + VM_ASSERT(fiber_pool_locked_p(true)); +#endif +} + +static inline void +ASSERT_fiber_pool_unlocked(void) +{ +#ifdef RUBY_THREAD_PTHREAD_H + VM_ASSERT(!fiber_pool_locked_p(false)); +#endif +} + +static inline void +fiber_pool_lock(void) { + ASSERT_fiber_pool_unlocked(); + rb_native_mutex_lock(&fiber_lock); +#ifdef RUBY_THREAD_PTHREAD_H + fiber_pool_lock_owner = pthread_self(); +#endif +} + +static inline void +fiber_pool_unlock(void) { + ASSERT_fiber_pool_locked(); +#ifdef RUBY_THREAD_PTHREAD_H + fiber_pool_lock_owner = 0; +#endif + rb_native_mutex_unlock(&fiber_lock); +} + +void +fiber_pool_lock_reset(void) +{ + rb_native_mutex_initialize(&fiber_lock); +} + /* * FreeBSD require a first (i.e. addr) argument of mmap(2) is not NULL * if MAP_STACK is passed. 
@@ -394,6 +451,7 @@ fiber_pool_vacancy_reset(struct fiber_pool_vacancy * vacancy) inline static struct fiber_pool_vacancy * fiber_pool_vacancy_push(struct fiber_pool_vacancy * vacancy, struct fiber_pool_vacancy * head) { + ASSERT_fiber_pool_locked(); vacancy->next = head; #ifdef FIBER_POOL_ALLOCATION_FREE @@ -426,6 +484,7 @@ fiber_pool_vacancy_remove(struct fiber_pool_vacancy * vacancy) inline static struct fiber_pool_vacancy * fiber_pool_vacancy_pop(struct fiber_pool * pool) { + ASSERT_fiber_pool_locked(); struct fiber_pool_vacancy * vacancy = pool->vacancies; if (vacancy) { @@ -438,6 +497,7 @@ fiber_pool_vacancy_pop(struct fiber_pool * pool) inline static struct fiber_pool_vacancy * fiber_pool_vacancy_pop(struct fiber_pool * pool) { + ASSERT_fiber_pool_locked(); struct fiber_pool_vacancy * vacancy = pool->vacancies; if (vacancy) { @@ -525,117 +585,149 @@ fiber_pool_allocate_memory(size_t * count, size_t stride) // fiber_pool_initialize before the pool is shared across threads. // @sa fiber_pool_allocation_free static struct fiber_pool_allocation * -fiber_pool_expand(struct fiber_pool * fiber_pool, size_t count) +fiber_pool_expand(struct fiber_pool * fiber_pool, size_t count, bool needs_lock, bool unlock_before_raise, struct fiber_pool_vacancy **vacancy_out) { if (count == 0) { errno = EAGAIN; return NULL; } - STACK_GROW_DIR_DETECTION; + // Allocate metadata before mmap: ruby_xmalloc (RB_ALLOC) raises on failure and + // must not run after base is mapped, or the region would leak. + struct fiber_pool_allocation * allocation = RB_ALLOC(struct fiber_pool_allocation); + + if (needs_lock) fiber_pool_lock(); // no xmalloc allocations can occur with this lock held + { + STACK_GROW_DIR_DETECTION; - size_t size = fiber_pool->size; - size_t stride = size + RB_PAGE_SIZE; + size_t size = fiber_pool->size; + size_t stride = size + RB_PAGE_SIZE; - // If the maximum number of stacks is set, and we have reached it, return NULL. 
- if (fiber_pool->maximum_count > 0) { - if (fiber_pool->count >= fiber_pool->maximum_count) { - errno = EAGAIN; - return NULL; - } - size_t remaining = fiber_pool->maximum_count - fiber_pool->count; - if (count > remaining) { - count = remaining; + // If the maximum number of stacks is set, and we have reached it, return NULL. + if (fiber_pool->maximum_count > 0) { + if (fiber_pool->count >= fiber_pool->maximum_count) { + if (unlock_before_raise) fiber_pool_unlock(); + errno = EAGAIN; + return NULL; + } + size_t remaining = fiber_pool->maximum_count - fiber_pool->count; + if (count > remaining) { + count = remaining; + } } - } - // Allocate metadata before mmap: ruby_xmalloc (RB_ALLOC) raises on failure and - // must not run after base is mapped, or the region would leak. - struct fiber_pool_allocation * allocation = RB_ALLOC(struct fiber_pool_allocation); - // Allocate the memory required for the stacks: - void * base = fiber_pool_allocate_memory(&count, stride); + // Allocate the memory required for the stacks: + void * base = fiber_pool_allocate_memory(&count, stride); - if (base == NULL) { - if (!errno) errno = ENOMEM; - ruby_xfree(allocation); - return NULL; - } + if (base == NULL) { + int saved_errno = errno; + if (!saved_errno) saved_errno = ENOMEM; + if (unlock_before_raise) fiber_pool_unlock(); + ruby_xfree(allocation); + errno = saved_errno; + return NULL; + } - struct fiber_pool_vacancy * vacancies = fiber_pool->vacancies; + struct fiber_pool_vacancy * vacancies = fiber_pool->vacancies; - // Initialize fiber pool allocation: - allocation->base = base; - allocation->size = size; - allocation->stride = stride; - allocation->count = count; + // Initialize fiber pool allocation: + allocation->base = base; + allocation->size = size; + allocation->stride = stride; + allocation->count = count; #ifdef FIBER_POOL_ALLOCATION_FREE - allocation->used = 0; + allocation->used = 0; #endif - allocation->pool = fiber_pool; + allocation->pool = fiber_pool; - if 
(DEBUG_EXPAND) { - fprintf(stderr, "fiber_pool_expand(%"PRIuSIZE"): %p, %"PRIuSIZE"/%"PRIuSIZE" x [%"PRIuSIZE":%"PRIuSIZE"]\n", - count, (void*)fiber_pool, fiber_pool->used, fiber_pool->count, size, fiber_pool->vm_stack_size); - } + if (DEBUG_EXPAND) { + fprintf(stderr, "fiber_pool_expand(%"PRIuSIZE"): %p, %"PRIuSIZE"/%"PRIuSIZE" x [%"PRIuSIZE":%"PRIuSIZE"]\n", + count, (void*)fiber_pool, fiber_pool->used, fiber_pool->count, size, fiber_pool->vm_stack_size); + } - // Iterate over all stacks, initializing the vacancy list: - for (size_t i = 0; i < count; i += 1) { - void * base = (char*)allocation->base + (stride * i); - void * page = (char*)base + STACK_DIR_UPPER(size, 0); + // Iterate over all stacks, initializing the vacancy list: + for (size_t i = 0; i < count; i += 1) { + void * base = (char*)allocation->base + (stride * i); + void * page = (char*)base + STACK_DIR_UPPER(size, 0); #if defined(_WIN32) - DWORD old_protect; - - if (!VirtualProtect(page, RB_PAGE_SIZE, PAGE_READWRITE | PAGE_GUARD, &old_protect)) { - int error = rb_w32_map_errno(GetLastError()); - VirtualFree(allocation->base, 0, MEM_RELEASE); - ruby_xfree(allocation); - errno = error; - return NULL; - } + DWORD old_protect; + + if (!VirtualProtect(page, RB_PAGE_SIZE, PAGE_READWRITE | PAGE_GUARD, &old_protect)) { + int error = rb_w32_map_errno(GetLastError()); + if (unlock_before_raise) fiber_pool_unlock(); + VirtualFree(allocation->base, 0, MEM_RELEASE); + ruby_xfree(allocation); + errno = error; + return NULL; + } #elif defined(__wasi__) - // wasi-libc's mprotect emulation doesn't support PROT_NONE. - (void)page; + // wasi-libc's mprotect emulation doesn't support PROT_NONE. 
+ (void)page; #else - if (mprotect(page, RB_PAGE_SIZE, PROT_NONE) < 0) { - int error = errno; - if (!error) error = ENOMEM; - munmap(allocation->base, count*stride); - ruby_xfree(allocation); - errno = error; - return NULL; - } + if (mprotect(page, RB_PAGE_SIZE, PROT_NONE) < 0) { + int error = errno; + if (!error) error = ENOMEM; + if (unlock_before_raise) fiber_pool_unlock(); + munmap(allocation->base, count*stride); + ruby_xfree(allocation); + errno = error; + return NULL; + } #endif - vacancies = fiber_pool_vacancy_initialize( - fiber_pool, vacancies, - (char*)base + STACK_DIR_UPPER(0, RB_PAGE_SIZE), - size - ); + vacancies = fiber_pool_vacancy_initialize( + fiber_pool, vacancies, + (char*)base + STACK_DIR_UPPER(0, RB_PAGE_SIZE), + size + ); #ifdef FIBER_POOL_ALLOCATION_FREE - vacancies->stack.allocation = allocation; + vacancies->stack.allocation = allocation; #endif - } + } - // Insert the allocation into the head of the pool: - allocation->next = fiber_pool->allocations; + // Insert the allocation into the head of the pool: + allocation->next = fiber_pool->allocations; #ifdef FIBER_POOL_ALLOCATION_FREE - if (allocation->next) { - allocation->next->previous = allocation; - } + if (allocation->next) { + allocation->next->previous = allocation; + } - allocation->previous = NULL; + allocation->previous = NULL; #endif - fiber_pool->allocations = allocation; - fiber_pool->vacancies = vacancies; - fiber_pool->count += count; + fiber_pool->allocations = allocation; + fiber_pool->vacancies = vacancies; + fiber_pool->count += count; + + if (vacancy_out) { + *vacancy_out = fiber_pool_vacancy_pop(fiber_pool); + } + + if (needs_lock) fiber_pool_unlock(); + } return allocation; } +static struct fiber_pool_vacancy * +fiber_pool_expand_and_pop(struct fiber_pool * fiber_pool, size_t count, bool needs_lock, bool unlock_before_raise) +{ + RUBY_ASSERT(needs_lock || (!needs_lock && fiber_pool_locked_p(true))); + struct fiber_pool_vacancy *vacancy_out = NULL; + struct 
fiber_pool_allocation *allocation = fiber_pool_expand(fiber_pool, count, needs_lock, unlock_before_raise, &vacancy_out); + if (allocation) { + RUBY_ASSERT(vacancy_out); + return vacancy_out; + } + else { + return NULL; + } + +} + // Initialize the specified fiber pool with the given number of stacks. // @param vm_stack_size The size of the vm stack to allocate. static void @@ -654,7 +746,7 @@ fiber_pool_initialize(struct fiber_pool * fiber_pool, size_t size, size_t minimu fiber_pool->vm_stack_size = vm_stack_size; if (fiber_pool->minimum_count > 0) { - if (RB_UNLIKELY(!fiber_pool_expand(fiber_pool, fiber_pool->minimum_count))) { + if (RB_UNLIKELY(!fiber_pool_expand(fiber_pool, fiber_pool->minimum_count, true, true, NULL))) { rb_raise(rb_eFiberError, "can't allocate initial fiber stacks (%"PRIuSIZE" x %"PRIuSIZE" bytes): %s", fiber_pool->minimum_count, fiber_pool->size, strerror(errno)); } } @@ -709,6 +801,7 @@ fiber_pool_allocation_free(struct fiber_pool_allocation * allocation) static size_t fiber_pool_stack_expand_count(const struct fiber_pool *pool) { + ASSERT_fiber_pool_locked(); const size_t maximum_allocations = FIBER_POOL_MAXIMUM_ALLOCATIONS; const size_t minimum_count = FIBER_POOL_MINIMUM_COUNT; @@ -739,19 +832,24 @@ fiber_pool_stack_expand_count(const struct fiber_pool *pool) static struct fiber_pool_vacancy * fiber_pool_stack_acquire_expand(struct fiber_pool *fiber_pool) { + // fiber_pool_lock acquired size_t count = fiber_pool_stack_expand_count(fiber_pool); if (DEBUG_ACQUIRE) fprintf(stderr, "fiber_pool_stack_acquire: expanding fiber pool by %"PRIuSIZE" stacks\n", count); struct fiber_pool_vacancy *vacancy = NULL; - if (RB_LIKELY(fiber_pool_expand(fiber_pool, count))) { - return fiber_pool_vacancy_pop(fiber_pool); + if (RB_LIKELY((vacancy = fiber_pool_expand_and_pop(fiber_pool, count, false, true)))) { + return vacancy; } else { if (DEBUG_ACQUIRE) fprintf(stderr, "fiber_pool_stack_acquire: expand failed (%s), collecting garbage\n", strerror(errno)); - 
rb_gc(); + fiber_pool_unlock(); + { + rb_gc(); + } + fiber_pool_lock(); // After running GC, the vacancy list may have some stacks: vacancy = fiber_pool_vacancy_pop(fiber_pool); @@ -763,8 +861,8 @@ fiber_pool_stack_acquire_expand(struct fiber_pool *fiber_pool) count = fiber_pool_stack_expand_count(fiber_pool); // Try to expand the fiber pool again: - if (RB_LIKELY(fiber_pool_expand(fiber_pool, count))) { - return fiber_pool_vacancy_pop(fiber_pool); + if (RB_LIKELY((vacancy = fiber_pool_expand_and_pop(fiber_pool, false, true, count)))) { + return vacancy; } else { // Okay, we really failed to acquire a stack. Give up and return NULL with errno set: @@ -779,8 +877,7 @@ fiber_pool_stack_acquire(struct fiber_pool * fiber_pool) { struct fiber_pool_vacancy * vacancy; - unsigned int lev; - RB_VM_LOCK_ENTER_LEV(&lev); + fiber_pool_lock(); { // Fast path: try to acquire a stack from the vacancy list: vacancy = fiber_pool_vacancy_pop(fiber_pool); @@ -793,7 +890,7 @@ fiber_pool_stack_acquire(struct fiber_pool * fiber_pool) // If expansion failed, raise an error: if (RB_UNLIKELY(!vacancy)) { - RB_VM_LOCK_LEAVE_LEV(&lev); + fiber_pool_unlock(); rb_raise(rb_eFiberError, "can't allocate fiber stack: %s", strerror(errno)); } } @@ -811,10 +908,9 @@ fiber_pool_stack_acquire(struct fiber_pool * fiber_pool) #ifdef FIBER_POOL_ALLOCATION_FREE vacancy->stack.allocation->used += 1; #endif - fiber_pool_stack_reset(&vacancy->stack); } - RB_VM_LOCK_LEAVE_LEV(&lev); + fiber_pool_unlock(); return vacancy->stack; } @@ -880,10 +976,11 @@ fiber_pool_stack_free(struct fiber_pool_stack * stack) #endif } -// Release and return a stack to the vacancy list. +// Release and return a stack to the vacancy list. fiber_lock is acquired upon entry. 
static void fiber_pool_stack_release(struct fiber_pool_stack * stack) { + ASSERT_fiber_pool_locked(); struct fiber_pool * pool = stack->pool; struct fiber_pool_vacancy * vacancy = fiber_pool_vacancy_pointer(stack->base, stack->size); @@ -1031,17 +1128,6 @@ fiber_stack_release(rb_fiber_t * fiber) rb_ec_clear_vm_stack(ec); } -static void -fiber_stack_release_locked(rb_fiber_t *fiber) -{ - if (!ruby_vm_during_cleanup) { - // We can't try to acquire the VM lock here because MMTK calls free in its own native thread which has no ec. - // This assertion will fail on MMTK but we currently don't have CI for debug releases of MMTK, so we can assert for now. - ASSERT_vm_locking_with_barrier(); - } - fiber_stack_release(fiber); -} - static const char * fiber_status_name(enum fiber_status s) { @@ -1204,7 +1290,11 @@ cont_free(void *ptr) else { rb_fiber_t *fiber = (rb_fiber_t*)cont; coroutine_destroy(&fiber->context); - fiber_stack_release_locked(fiber); + fiber_pool_lock(); + { + fiber_stack_release(fiber); + } + fiber_pool_unlock(); } SIZED_FREE_N(cont->saved_vm_stack.ptr, cont->saved_vm_stack.size); @@ -1373,7 +1463,7 @@ cont_handle_weak_references(void *ptr) static const rb_data_type_t rb_cont_data_type = { "continuation", {cont_mark, cont_free, cont_memsize, cont_compact, cont_handle_weak_references}, - 0, 0, RUBY_TYPED_FREE_IMMEDIATELY + 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_CONCURRENT_FREE_SAFE }; static inline void @@ -2134,7 +2224,7 @@ fiber_handle_weak_references(void *ptr) static const rb_data_type_t rb_fiber_data_type = { "fiber", {fiber_mark, fiber_free, fiber_memsize, fiber_compact, fiber_handle_weak_references}, - 0, 0, RUBY_TYPED_FREE_IMMEDIATELY + 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_CONCURRENT_FREE_SAFE }; static VALUE @@ -2892,9 +2982,11 @@ fiber_switch(rb_fiber_t *fiber, int argc, const VALUE *argv, int kw_splat, rb_fi // We cannot free the stack until the pthread is joined: #ifndef COROUTINE_PTHREAD_CONTEXT if (FIBER_TERMINATED_P(fiber)) { 
- RB_VM_LOCKING() { + fiber_pool_lock(); + { fiber_stack_release(fiber); } + fiber_pool_unlock(); } #endif @@ -3540,7 +3632,7 @@ fiber_pool_memsize(const void *ptr) static const rb_data_type_t FiberPoolDataType = { "fiber_pool", {NULL, fiber_pool_free, fiber_pool_memsize,}, - 0, 0, RUBY_TYPED_FREE_IMMEDIATELY + 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_CONCURRENT_FREE_SAFE }; static VALUE @@ -3651,6 +3743,7 @@ Init_Cont(void) #endif SET_MACHINE_STACK_END(&th->ec->machine.stack_end); + rb_native_mutex_initialize(&fiber_lock); size_t minimum_count = shared_fiber_pool_minimum_count(); size_t maximum_count = shared_fiber_pool_maximum_count(); fiber_pool_initialize(&shared_fiber_pool, stack_size, minimum_count, maximum_count, vm_stack_size); diff --git a/darray.h b/darray.h index 31ab7d412aa441..08d79a45c27bd7 100644 --- a/darray.h +++ b/darray.h @@ -138,6 +138,21 @@ rb_darray_size(const void *ary) * Useful for TypedData objects. */ #define rb_darray_memsize(ary) (sizeof(*(ary)) + (rb_darray_size(ary) * sizeof((ary)->data[0]))) +/* Remove n items from the beginning of the array */ +#define rb_darray_shift_n(ary, n) rb_darray_shift_n_impl(ary, ary->data, n, sizeof((ary)->data[0])) + +static inline void +rb_darray_shift_n_impl(void *ary, void *data, size_t n, size_t type_sz) +{ + rb_darray_meta_t *meta = ary; + RUBY_ASSERT(meta->size >= n); + char *dst = (char*)data; + if (n > 0) { + memmove(dst, dst + n * type_sz, (meta->size - n) * type_sz); + meta->size -= n; + } +} + static inline void rb_darray_pop(void *ary, size_t count) { @@ -225,7 +240,9 @@ rb_darray_realloc_mul_add_without_gc(void *orig_ptr, size_t x, size_t y, size_t size_t size = rbimpl_size_add_or_raise(rbimpl_size_mul_or_raise(x, y), z); void *ptr = realloc(orig_ptr, size); - if (ptr == NULL) rb_bug("rb_darray_realloc_mul_add_without_gc: failed"); + if (ptr == NULL) { + rb_bug("rb_darray_realloc_mul_add_without_gc: failed"); + } return ptr; } diff --git a/dir.c b/dir.c index 
d67de8cf06c830..72496d0906dbc0 100644 --- a/dir.c +++ b/dir.c @@ -545,7 +545,7 @@ static const rb_data_type_t dir_data_type = { dir_free, NULL, // Nothing allocated externally, so don't need a memsize function }, - 0, NULL, RUBY_TYPED_WB_PROTECTED | RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_DECL_MARKING | RUBY_TYPED_EMBEDDABLE + 0, NULL, RUBY_TYPED_WB_PROTECTED | RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_DECL_MARKING | RUBY_TYPED_EMBEDDABLE | RUBY_TYPED_CONCURRENT_FREE_SAFE }; static VALUE dir_close(VALUE); diff --git a/encoding.c b/encoding.c index 8bb393b471ed54..04f5269d63f5ea 100644 --- a/encoding.c +++ b/encoding.c @@ -122,7 +122,7 @@ static int filesystem_encindex = ENCINDEX_ASCII_8BIT; static const rb_data_type_t encoding_data_type = { "encoding", {0, 0, 0,}, - 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED + 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED | RUBY_TYPED_CONCURRENT_FREE_SAFE }; #define is_encoding_type(obj) (RTYPEDDATA_TYPE(obj) == &encoding_data_type) diff --git a/enumerator.c b/enumerator.c index 81b71bd8b43b29..2f181918f08cb2 100644 --- a/enumerator.c +++ b/enumerator.c @@ -280,7 +280,7 @@ static const rb_data_type_t enumerator_data_type = { NULL, // Nothing allocated externally, so don't need a memsize function NULL, }, - 0, NULL, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED | RUBY_TYPED_DECL_MARKING | RUBY_TYPED_EMBEDDABLE + 0, NULL, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED | RUBY_TYPED_DECL_MARKING | RUBY_TYPED_EMBEDDABLE | RUBY_TYPED_CONCURRENT_FREE_SAFE }; static struct enumerator * @@ -311,7 +311,7 @@ static const rb_data_type_t proc_entry_data_type = { NULL, // Nothing allocated externally, so don't need a memsize function proc_entry_mark_and_move, }, - 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED | RUBY_TYPED_EMBEDDABLE + 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED | RUBY_TYPED_EMBEDDABLE | RUBY_TYPED_CONCURRENT_FREE_SAFE }; static struct proc_entry * @@ 
-1323,7 +1323,7 @@ static const rb_data_type_t yielder_data_type = { NULL, yielder_mark_and_move, }, - 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED | RUBY_TYPED_EMBEDDABLE + 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED | RUBY_TYPED_EMBEDDABLE | RUBY_TYPED_CONCURRENT_FREE_SAFE }; static struct yielder * @@ -1447,7 +1447,7 @@ static const rb_data_type_t generator_data_type = { NULL, generator_mark_and_move, }, - 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED | RUBY_TYPED_EMBEDDABLE + 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED | RUBY_TYPED_EMBEDDABLE | RUBY_TYPED_CONCURRENT_FREE_SAFE }; static struct generator * @@ -2978,7 +2978,7 @@ static const rb_data_type_t producer_data_type = { producer_memsize, producer_mark_and_move, }, - 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED | RUBY_TYPED_EMBEDDABLE + 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED | RUBY_TYPED_EMBEDDABLE | RUBY_TYPED_CONCURRENT_FREE_SAFE }; static struct producer * @@ -3196,7 +3196,7 @@ static const rb_data_type_t enum_chain_data_type = { enum_chain_memsize, enum_chain_mark_and_move, }, - 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED + 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED | RUBY_TYPED_CONCURRENT_FREE_SAFE }; static struct enum_chain * @@ -3511,7 +3511,7 @@ static const rb_data_type_t enum_product_data_type = { enum_product_memsize, enum_product_mark_and_move, }, - 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED + 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED | RUBY_TYPED_CONCURRENT_FREE_SAFE }; static struct enum_product * @@ -3849,7 +3849,7 @@ static const rb_data_type_t arith_seq_data_type = { NULL, }, .parent = &enumerator_data_type, - .flags = RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED | RUBY_TYPED_DECL_MARKING | RUBY_TYPED_EMBEDDABLE + .flags = RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED | RUBY_TYPED_DECL_MARKING | 
RUBY_TYPED_EMBEDDABLE | RUBY_TYPED_CONCURRENT_FREE_SAFE }; static VALUE diff --git a/error.c b/error.c index 52bd3629bf2d13..10162bd0e1b46b 100644 --- a/error.c +++ b/error.c @@ -1117,11 +1117,16 @@ rb_bug_without_die(const char *fmt, ...) va_end(args); } +bool is_sweep_thread_p(void); + void rb_bug(const char *fmt, ...) { va_list args; va_start(args, fmt); + if (is_sweep_thread_p()) { + fprintf(stderr, "rb_bug() called from sweep_thread!\n"); + } rb_bug_without_die_internal(fmt, args); va_end(args); die(); @@ -2530,7 +2535,7 @@ static const rb_data_type_t name_err_mesg_data_type = { NULL, // No external memory to report, name_err_mesg_mark_and_move, }, - 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED | RUBY_TYPED_EMBEDDABLE + 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED | RUBY_TYPED_EMBEDDABLE | RUBY_TYPED_CONCURRENT_FREE_SAFE }; /* :nodoc: */ diff --git a/ext/date/date_core.c b/ext/date/date_core.c index f37c1a54e5f53e..f85dc3083a61be 100644 --- a/ext/date/date_core.c +++ b/ext/date/date_core.c @@ -3222,7 +3222,7 @@ static const rb_data_type_t d_lite_type = { "Date", {d_lite_gc_mark, RUBY_TYPED_DEFAULT_FREE, d_lite_memsize,}, 0, 0, - RUBY_TYPED_FREE_IMMEDIATELY|RUBY_TYPED_WB_PROTECTED|RUBY_TYPED_FROZEN_SHAREABLE, + RUBY_TYPED_FREE_IMMEDIATELY|RUBY_TYPED_CONCURRENT_FREE_SAFE|RUBY_TYPED_WB_PROTECTED|RUBY_TYPED_FROZEN_SHAREABLE, }; inline static VALUE diff --git a/ext/digest/digest.c b/ext/digest/digest.c index bd8d3e815ffe6a..e54f0d7bda8e7c 100644 --- a/ext/digest/digest.c +++ b/ext/digest/digest.c @@ -619,7 +619,7 @@ static const rb_data_type_t digest_type = { "digest", {0, RUBY_TYPED_DEFAULT_FREE, 0,}, 0, 0, - (RUBY_TYPED_FREE_IMMEDIATELY|RUBY_TYPED_WB_PROTECTED), + (RUBY_TYPED_FREE_IMMEDIATELY|RUBY_TYPED_CONCURRENT_FREE_SAFE|RUBY_TYPED_WB_PROTECTED), }; static inline void diff --git a/ext/socket/raddrinfo.c b/ext/socket/raddrinfo.c index 6cdf5c6abc40e7..53a4e7f4564c11 100644 --- a/ext/socket/raddrinfo.c +++ b/ext/socket/raddrinfo.c @@ 
-1295,7 +1295,7 @@ addrinfo_memsize(const void *ptr) static const rb_data_type_t addrinfo_type = { "socket/addrinfo", {addrinfo_mark, addrinfo_free, addrinfo_memsize,}, - 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_FROZEN_SHAREABLE | RUBY_TYPED_WB_PROTECTED, + 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_CONCURRENT_FREE_SAFE | RUBY_TYPED_FROZEN_SHAREABLE | RUBY_TYPED_WB_PROTECTED, }; static VALUE diff --git a/ext/stringio/stringio.c b/ext/stringio/stringio.c index 09757a283eaf7c..fdb7f0e6550e14 100644 --- a/ext/stringio/stringio.c +++ b/ext/stringio/stringio.c @@ -119,7 +119,7 @@ static const rb_data_type_t strio_data_type = { strio_free, strio_memsize, }, - 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED + 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED // uses reference count, not concurrent free safe }; #define check_strio(self) ((struct StringIO*)rb_check_typeddata((self), &strio_data_type)) diff --git a/file.c b/file.c index e40f67ec73817a..c90a499e6603ff 100644 --- a/file.c +++ b/file.c @@ -535,7 +535,7 @@ static const rb_data_type_t stat_data_type = { RUBY_TYPED_DEFAULT_FREE, NULL, // No external memory to report }, - 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED | RUBY_TYPED_EMBEDDABLE + 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED | RUBY_TYPED_EMBEDDABLE | RUBY_TYPED_CONCURRENT_FREE_SAFE }; struct rb_stat { diff --git a/gc.c b/gc.c index d6d517d6a44c9e..817037553b67f9 100644 --- a/gc.c +++ b/gc.c @@ -151,9 +151,18 @@ rb_gc_vm_unlock(unsigned int lev, const char *file, int line) rb_vm_lock_leave(&lev, file, line); } +bool +is_sweep_thread_p(void) +{ + rb_vm_t *vm = GET_VM(); + if (!vm) return false; + return vm->gc.sweep_thread == pthread_self(); +} + unsigned int rb_gc_cr_lock(const char *file, int line) { + GC_ASSERT(!is_sweep_thread_p()); unsigned int lev; rb_vm_lock_enter_cr(GET_RACTOR(), &lev, file, line); return lev; @@ -162,6 +171,7 @@ rb_gc_cr_lock(const char *file, int line) void 
rb_gc_cr_unlock(unsigned int lev, const char *file, int line) { + GC_ASSERT(!is_sweep_thread_p()); rb_vm_lock_leave_cr(GET_RACTOR(), &lev, file, line); } @@ -1347,7 +1357,7 @@ rb_gc_obj_needs_cleanup_p(VALUE obj) } shape_id_t shape_id = RBASIC_SHAPE_ID(obj); - if (id2ref_tbl && rb_shape_has_object_id(shape_id)) return true; + if (RUBY_ATOMIC_PTR_LOAD(id2ref_tbl) && rb_shape_has_object_id(shape_id)) return true; switch (flags & RUBY_T_MASK) { case T_OBJECT: @@ -1392,8 +1402,12 @@ rb_gc_obj_needs_cleanup_p(VALUE obj) case T_COMPLEX: return rb_shape_has_fields(shape_id); + case T_ZOMBIE: + RUBY_ASSERT(flags & FL_FREEZE); + return true; + default: - UNREACHABLE_RETURN(true); + rb_bug("bad object type in needs_cleanup_p: %lu", flags & RUBY_T_MASK); } } @@ -1410,6 +1424,7 @@ make_io_zombie(void *objspace, VALUE obj) rb_gc_impl_make_zombie(objspace, obj, io_fptr_finalize, fptr); } +// Returns whether or not we can add `obj` back to the page's freelist. static bool rb_data_free(void *objspace, VALUE obj) { @@ -1476,6 +1491,7 @@ classext_iclass_free(rb_classext_t *ext, bool is_prime, VALUE box_value, void *a rb_iclass_classext_free(args->klass, ext, is_prime); } +// Returns whether or not we can add `obj` back to the page's freelist. 
bool rb_gc_obj_free(void *objspace, VALUE obj) { @@ -1580,7 +1596,7 @@ rb_gc_obj_free(void *objspace, VALUE obj) } break; case T_DATA: - if (!rb_data_free(objspace, obj)) return false; + if (!RB_LIKELY(rb_data_free(objspace, obj))) return FALSE; break; case T_MATCH: { @@ -1665,12 +1681,19 @@ rb_gc_obj_free(void *objspace, VALUE obj) rb_imemo_free((VALUE)obj); break; + case T_ZOMBIE: + GC_ASSERT(FL_TEST(obj, FL_FREEZE)); + GC_ASSERT(!FL_TEST(obj, FL_FINALIZE)); + void rb_gc_impl_free_zombie(rb_objspace_t *, VALUE); + rb_gc_impl_free_zombie(objspace, obj); + return TRUE; default: rb_bug("gc_sweep(): unknown data type 0x%x(%p) 0x%"PRIxVALUE, BUILTIN_TYPE(obj), (void*)obj, RBASIC(obj)->flags); } if (FL_TEST_RAW(obj, FL_FINALIZE)) { + GC_ASSERT(BUILTIN_TYPE(obj) != T_ZOMBIE); rb_gc_impl_make_zombie(objspace, obj, 0, 0); return FALSE; } @@ -2057,12 +2080,78 @@ id2ref_tbl_memsize(const void *data) return rb_st_memsize(data); } +// TODO: platforms other than pthread +static rb_nativethread_lock_t id2ref_tbl_lock_ = PTHREAD_MUTEX_INITIALIZER; +#ifdef RUBY_THREAD_PTHREAD_H +static pthread_t id2ref_tbl_lock_owner; +#endif +static unsigned int id2ref_tbl_lock_lvl; + +static inline void +ASSERT_id2ref_tbl_locked(void) +{ +#ifdef RUBY_THREAD_PTHREAD_H + VM_ASSERT(pthread_self() == id2ref_tbl_lock_owner); +#endif +} + +static inline void +ASSERT_id2ref_tbl_unlocked(void) +{ +#ifdef RUBY_THREAD_PTHREAD_H + VM_ASSERT(pthread_self() != id2ref_tbl_lock_owner); +#endif +} + +static inline void +id2ref_tbl_lock(bool allow_reentry) +{ + if (allow_reentry && pthread_self() == id2ref_tbl_lock_owner) { + } else { + ASSERT_id2ref_tbl_unlocked(); + rb_native_mutex_lock(&id2ref_tbl_lock_); + id2ref_tbl_lock_owner = pthread_self(); + } + id2ref_tbl_lock_lvl++; +} + +static inline bool +id2ref_tbl_trylock(bool allow_reentry) +{ + if (allow_reentry && pthread_self() == id2ref_tbl_lock_owner) { + } else { + ASSERT_id2ref_tbl_unlocked(); + if (rb_native_mutex_trylock(&id2ref_tbl_lock_) == EBUSY) { 
+ return false; + } + id2ref_tbl_lock_owner = pthread_self(); + } + id2ref_tbl_lock_lvl++; + return true; +} + +static inline void +id2ref_tbl_unlock(void) +{ + ASSERT_id2ref_tbl_locked(); + GC_ASSERT(id2ref_tbl_lock_lvl > 0); + id2ref_tbl_lock_lvl--; + if (id2ref_tbl_lock_lvl == 0) { + id2ref_tbl_lock_owner = 0; + rb_native_mutex_unlock(&id2ref_tbl_lock_); + } +} + static void id2ref_tbl_free(void *data) { - id2ref_tbl = NULL; // clear global ref - st_table *table = (st_table *)data; - st_free_table(table); + id2ref_tbl_lock(true); + { + st_table *table = (st_table *)data; + st_free_table(table); + RUBY_ATOMIC_PTR_SET(id2ref_tbl, NULL); // clear global ref + } + id2ref_tbl_unlock(); } static const rb_data_type_t id2ref_tbl_type = { @@ -2074,6 +2163,8 @@ static const rb_data_type_t id2ref_tbl_type = { // dcompact function not required because the table is reference updated // in rb_gc_vm_weak_table_foreach }, + // Not marked concurrent free safe so that we can know that when we take the VM lock and check for + // the id2ref_tbl, it won't be deleted out from under us while the VM lock is held. 
.flags = RUBY_TYPED_WB_PROTECTED | RUBY_TYPED_FREE_IMMEDIATELY }; @@ -2088,8 +2179,14 @@ class_object_id(VALUE klass) if (existing_id) { id = existing_id; } - else if (RB_UNLIKELY(id2ref_tbl)) { - st_insert(id2ref_tbl, id, klass); + else { + if (RB_UNLIKELY(id2ref_tbl)) { + id2ref_tbl_lock(false); + { + st_insert(id2ref_tbl, id, klass); // needs VM lock for allocation + } + id2ref_tbl_unlock(); + } } RB_GC_VM_UNLOCK(lock_lev); } @@ -2135,9 +2232,13 @@ object_id0(VALUE obj) RUBY_ASSERT(RBASIC_SHAPE_ID(obj) == object_id_shape_id); RUBY_ASSERT(rb_shape_obj_has_id(obj)); - if (RB_UNLIKELY(id2ref_tbl)) { + if (RB_UNLIKELY(RUBY_ATOMIC_PTR_LOAD(id2ref_tbl))) { RB_VM_LOCKING() { - st_insert(id2ref_tbl, (st_data_t)id, (st_data_t)obj); + id2ref_tbl_lock(false); + { + st_insert(id2ref_tbl, (st_data_t)id, (st_data_t)obj); // needs VM lock for allocation + } + id2ref_tbl_unlock(); } } return id; @@ -2175,6 +2276,8 @@ build_id2ref_i(VALUE obj, void *data) { st_table *id2ref_tbl = (st_table *)data; + if (rb_objspace_garbage_object_p(obj)) return; + switch (BUILTIN_TYPE(obj)) { case T_CLASS: case T_MODULE: @@ -2208,8 +2311,8 @@ object_id_to_ref(void *objspace_ptr, VALUE object_id) unsigned int lev = RB_GC_VM_LOCK(); - if (!id2ref_tbl) { - rb_gc_vm_barrier(); // stop other ractors + if (!RUBY_ATOMIC_PTR_LOAD(id2ref_tbl)) { + rb_gc_vm_barrier(); // stop other ractors but sweep thread could still be running // GC Must not trigger while we build the table, otherwise if we end // up freeing an object that had an ID, we might try to delete it from @@ -2218,20 +2321,25 @@ object_id_to_ref(void *objspace_ptr, VALUE object_id) VALUE tmp_id2ref_value = TypedData_Wrap_Struct(0, &id2ref_tbl_type, tmp_id2ref_tbl); // build_id2ref_i will most certainly malloc, which could trigger GC and sweep - // objects we just added to the table. - // By calling rb_gc_disable() we also save having to handle potentially garbage objects. + // objects we just added to the table. 
The sweep thread could still be running so + // we need to handle garbage objects. bool gc_disabled = RTEST(rb_gc_disable()); { - id2ref_tbl = tmp_id2ref_tbl; id2ref_value = tmp_id2ref_value; - rb_gc_impl_each_object(objspace, build_id2ref_i, (void *)id2ref_tbl); + rb_gc_impl_each_object(objspace, build_id2ref_i, (void *)tmp_id2ref_tbl); + RUBY_ATOMIC_PTR_SET(id2ref_tbl, tmp_id2ref_tbl); } if (!gc_disabled) rb_gc_enable(); } VALUE obj; - bool found = st_lookup(id2ref_tbl, object_id, &obj) && !rb_gc_impl_garbage_object_p(objspace, obj); + bool found; + id2ref_tbl_lock(false); + { + found = st_lookup(id2ref_tbl, object_id, &obj) && !rb_gc_impl_garbage_object_p(objspace, obj); + } + id2ref_tbl_unlock(); RB_GC_VM_UNLOCK(lev); @@ -2247,11 +2355,11 @@ object_id_to_ref(void *objspace_ptr, VALUE object_id) } } -static inline void -obj_free_object_id(VALUE obj) +static VALUE +obj_get_object_id(VALUE obj) { VALUE obj_id = 0; - if (RB_UNLIKELY(id2ref_tbl)) { + if (RB_UNLIKELY(RUBY_ATOMIC_PTR_LOAD(id2ref_tbl))) { switch (BUILTIN_TYPE(obj)) { case T_CLASS: case T_MODULE: @@ -2259,11 +2367,11 @@ obj_free_object_id(VALUE obj) break; case T_IMEMO: if (!IMEMO_TYPE_P(obj, imemo_fields)) { - return; + break; } // fallthrough case T_OBJECT: - { + { shape_id_t shape_id = RBASIC_SHAPE_ID(obj); if (rb_shape_has_object_id(shape_id)) { obj_id = object_id_get(obj, shape_id); @@ -2271,31 +2379,77 @@ obj_free_object_id(VALUE obj) break; } default: + break; // For generic_fields, the T_IMEMO/fields is responsible for freeing the id. 
- return; } + } + return obj_id; +} + +static inline bool +obj_free_object_id(VALUE obj, bool in_user_gc_thread) +{ + if (RB_UNLIKELY(RUBY_ATOMIC_PTR_LOAD(id2ref_tbl))) { + VALUE obj_id = obj_get_object_id(obj); if (RB_UNLIKELY(obj_id)) { RUBY_ASSERT(FIXNUM_P(obj_id) || RB_TYPE_P(obj_id, T_BIGNUM)); + // If we're in the sweep thread, we must use trylock because GC could have been + // triggered by inserting into the id2ref_tbl, which means the GC thread holds the + // lock and we can't wait on it. + bool needs_id2ref_tbl_trylock = !in_user_gc_thread; + if (needs_id2ref_tbl_trylock) { + bool did_lock = id2ref_tbl_trylock(false); + if (!did_lock) return false; + } else { + id2ref_tbl_lock(true); + } if (!st_delete(id2ref_tbl, (st_data_t *)&obj_id, NULL)) { // The the object is a T_IMEMO/fields, then it's possible the actual object // has been garbage collected already. if (!RB_TYPE_P(obj, T_IMEMO)) { + id2ref_tbl_unlock(); rb_bug("Object ID seen, but not in _id2ref table: object_id=%llu object=%s", NUM2ULL(obj_id), rb_obj_info(obj)); } } + id2ref_tbl_unlock(); } } + return true; } -void +bool +rb_gc_obj_free_concurrency_safe_vm_weak_references(VALUE obj) +{ + bool result = obj_free_object_id(obj, false); + if (RB_UNLIKELY(rb_obj_gen_fields_p(obj))) { + bool freed_generic = rb_free_generic_ivar(obj); + if (!freed_generic) result = false; + } + switch (BUILTIN_TYPE(obj)) { + case T_STRING: + if (FL_TEST_RAW(obj, RSTRING_FSTR)) { + rb_gc_free_fstring(obj); + } + break; + case T_SYMBOL: + rb_gc_free_dsymbol(obj); + break; + default: + break; + } + return result; +} + +bool rb_gc_obj_free_vm_weak_references(VALUE obj) { ASSUME(!RB_SPECIAL_CONST_P(obj)); - obj_free_object_id(obj); - if (rb_obj_gen_fields_p(obj)) { + obj_free_object_id(obj, true); + + if (RB_UNLIKELY(rb_obj_gen_fields_p(obj))) { rb_free_generic_ivar(obj); } @@ -2323,6 +2477,7 @@ rb_gc_obj_free_vm_weak_references(VALUE obj) default: break; } + return true; } /* @@ -2649,7 +2804,14 @@ count_objects_i(VALUE 
obj, void *d) struct count_objects_data *data = (struct count_objects_data *)d; if (RBASIC(obj)->flags) { - data->counts[BUILTIN_TYPE(obj)]++; + // This will make sure the count is like the old behavior when we used to turn a zombie into + // T_NONE right after the finalizer and/or free function ran. + if (BUILTIN_TYPE(obj) == T_ZOMBIE && FL_TEST(obj, FL_FREEZE)) { + data->freed++; + } + else { + data->counts[BUILTIN_TYPE(obj)]++; + } } else { data->freed++; @@ -4185,6 +4347,7 @@ vm_weak_table_gen_fields_foreach(st_data_t key, st_data_t value, st_data_t data) if (key != new_key || value != new_value) { DURING_GC_COULD_MALLOC_REGION_START(); { + // We're STW, no need for gen_fields_tbl_lock st_insert(rb_generic_fields_tbl_get(), (st_data_t)new_key, new_value); } DURING_GC_COULD_MALLOC_REGION_END(); @@ -4255,7 +4418,7 @@ rb_gc_vm_weak_table_foreach(vm_table_foreach_callback_func callback, break; } case RB_GC_VM_ID2REF_TABLE: { - if (id2ref_tbl) { + if (id2ref_tbl) { // we're STW, no need for lock st_foreach_with_replace( id2ref_tbl, vm_weak_table_id2ref_foreach, @@ -4267,7 +4430,7 @@ rb_gc_vm_weak_table_foreach(vm_table_foreach_callback_func callback, } case RB_GC_VM_GENERIC_FIELDS_TABLE: { st_table *generic_fields_tbl = rb_generic_fields_tbl_get(); - if (generic_fields_tbl) { + if (generic_fields_tbl) { // we're STW, no need for lock st_foreach( generic_fields_tbl, vm_weak_table_gen_fields_foreach, @@ -4842,7 +5005,7 @@ rb_method_type_name(rb_method_type_t type) static void rb_raw_iseq_info(char *const buff, const size_t buff_size, const rb_iseq_t *iseq) { - if (buff_size > 0 && ISEQ_BODY(iseq) && ISEQ_BODY(iseq)->location.label && !RB_TYPE_P(ISEQ_BODY(iseq)->location.pathobj, T_MOVED)) { + if (buff_size > 0 && ISEQ_BODY(iseq) && ISEQ_BODY(iseq)->location.label && !rb_objspace_garbage_object_p(ISEQ_BODY(iseq)->location.pathobj)) { VALUE path = rb_iseq_path(iseq); int n = ISEQ_BODY(iseq)->location.first_lineno; snprintf(buff, buff_size, " %s@%s:%d", @@ -4873,7 
+5036,7 @@ str_len_no_raise(VALUE str) #define C(c, s) ((c) != 0 ? (s) : " ") static size_t -rb_raw_obj_info_common(char *const buff, const size_t buff_size, const VALUE obj) +rb_raw_obj_info_common(char *const buff, const size_t buff_size, const VALUE obj, bool *is_garbage_out) { size_t pos = 0; @@ -4916,6 +5079,10 @@ rb_raw_obj_info_common(char *const buff, const size_t buff_size, const VALUE obj else if (RBASIC(obj)->klass == 0) { APPEND_S("(temporary internal)"); } + else if (rb_objspace_garbage_object_p(RBASIC(obj)->klass)) { + APPEND_S("(garbage class)"); + *is_garbage_out = true; + } else if (RTEST(RBASIC(obj)->klass)) { VALUE class_path = rb_class_path_cached(RBASIC(obj)->klass); if (!NIL_P(class_path)) { @@ -5014,9 +5181,14 @@ rb_raw_obj_info_buitin_type(char *const buff, const size_t buff_size, const VALU } case T_ICLASS: { - VALUE class_path = rb_class_path_cached(RBASIC_CLASS(obj)); - if (!NIL_P(class_path)) { - APPEND_F("src:%s", RSTRING_PTR(class_path)); + if (rb_objspace_garbage_object_p(RBASIC_CLASS(obj))) { + APPEND_S("src: garbage"); + } + else { + VALUE class_path = rb_class_path_cached(RBASIC_CLASS(obj)); + if (!NIL_P(class_path)) { + APPEND_F("src:%s", RSTRING_PTR(class_path)); + } } break; } @@ -5157,8 +5329,11 @@ rb_asan_poisoned_object_p(VALUE obj) static void raw_obj_info(char *const buff, const size_t buff_size, VALUE obj) { - size_t pos = rb_raw_obj_info_common(buff, buff_size, obj); - pos = rb_raw_obj_info_buitin_type(buff, buff_size, obj, pos); + bool is_garbage = false; + size_t pos = rb_raw_obj_info_common(buff, buff_size, obj, &is_garbage); + if (!is_garbage) { + pos = rb_raw_obj_info_buitin_type(buff, buff_size, obj, pos); + } if (pos >= buff_size) {} // truncated } @@ -5173,11 +5348,9 @@ rb_raw_obj_info(char *const buff, const size_t buff_size, VALUE obj) else if (!rb_gc_impl_pointer_to_heap_p(objspace, (const void *)obj)) { snprintf(buff, buff_size, "out-of-heap:%p", (void *)obj); } -#if 0 // maybe no need to check it? 
- else if (0 && rb_gc_impl_garbage_object_p(objspace, obj)) { + else if (rb_gc_impl_garbage_object_p(objspace, obj)) { snprintf(buff, buff_size, "garbage:%p", (void *)obj); } -#endif else { asan_unpoisoning_object(obj) { raw_obj_info(buff, buff_size, obj); diff --git a/gc.rb b/gc.rb index 895a82b7343c01..01d798addb1596 100644 --- a/gc.rb +++ b/gc.rb @@ -147,7 +147,7 @@ def self.count # sweeping_time: 0, # heap_allocated_pages: 521, # heap_empty_pages: 0, - # heap_allocatable_bytes: 0, + # heap_allocatable_bytes: 0, # heap_available_slots: 539590, # heap_live_slots: 422243, # heap_free_slots: 117347, diff --git a/gc/default/default.c b/gc/default/default.c index 1b7d109ce69a99..40f8d4501d2068 100644 --- a/gc/default/default.c +++ b/gc/default/default.c @@ -25,8 +25,11 @@ #include "ruby/atomic.h" #include "ruby/debug.h" #include "ruby/thread.h" +#include "ruby/thread_native.h" #include "ruby/util.h" #include "ruby/vm.h" + +#include #include "ruby/internal/encoding/string.h" #include "ccan/list/list.h" #include "darray.h" @@ -112,6 +115,16 @@ #ifndef GC_HEAP_INIT_BYTES #define GC_HEAP_INIT_BYTES (2560 * 1024) #endif + +#define PSWEEP_DEBUG 0 +#if PSWEEP_DEBUG +#define psweep_debug(lvl, ...) if (lvl <= PSWEEP_DEBUG) fprintf(stderr, __VA_ARGS__) +#else +#define psweep_debug(...) 
(void)0 +#endif +#define PSWEEP_LOCK_STATS 0 +#define PSWEEP_COLLECT_TIMINGS 0 + #ifndef GC_HEAP_FREE_SLOTS #define GC_HEAP_FREE_SLOTS 4096 #endif @@ -165,8 +178,10 @@ #ifdef RB_THREAD_LOCAL_SPECIFIER #define USE_MALLOC_INCREASE_LOCAL 1 static RB_THREAD_LOCAL_SPECIFIER int malloc_increase_local; +static RB_THREAD_LOCAL_SPECIFIER struct heap_page *current_sweep_thread_page; #else #define USE_MALLOC_INCREASE_LOCAL 0 +static struct heap_page *current_sweep_thread_page; #endif #ifndef GC_CAN_COMPILE_COMPACTION @@ -464,16 +479,33 @@ typedef struct rb_heap_struct { /* Sweeping statistics */ size_t freed_slots; size_t empty_slots; +#if RUBY_DEBUG + size_t zombie_slots; // pre-existing zombies not ready yet to free +#endif struct heap_page *free_pages; struct ccan_list_head pages; - struct heap_page *sweeping_page; /* iterator for .pages */ + struct heap_page *sweeping_page; /* iterator for .pages. It always points to the next page to sweep. */ + struct heap_page *pre_sweeping_page; /* Background thread is currently sweeping this page */ + struct heap_page *swept_pages; /* pages claimed and swept by background thread */ + struct heap_page *latest_swept_page; // tail of `swept_pages` struct heap_page *compact_cursor; uintptr_t compact_cursor_index; struct heap_page *pooled_pages; size_t total_pages; /* total page count in a heap */ size_t total_slots; /* total slot count */ +#if RUBY_DEBUG + rb_atomic_t made_zombies; +#endif + rb_atomic_t foreground_sweep_steps; // incremented by ruby thread, checked by sweep thread + rb_atomic_t background_sweep_steps; // only incremented/checked by sweep thread + rb_nativethread_cond_t sweep_page_cond; // associated with global sweep lock + rb_nativethread_lock_t swept_pages_lock; + size_t pre_swept_slots_deferred; + bool is_finished_sweeping; + bool done_background_sweep; + bool skip_sweep_continue; // skip current sweep continue } rb_heap_t; enum { @@ -513,16 +545,27 @@ typedef struct rb_objspace { struct { unsigned int mode : 2; 
unsigned int immediate_sweep : 1; - unsigned int dont_gc : 1; unsigned int dont_incremental : 1; - unsigned int during_gc : 1; unsigned int during_compacting : 1; +#if RUBY_DEBUG + unsigned int was_compacting: 1; +#endif unsigned int during_reference_updating : 1; - unsigned int gc_stressful: 1; - unsigned int during_minor_gc : 1; unsigned int during_incremental_marking : 1; unsigned int measure_gc : 1; } flags; + // This can't be a bitfield because it's accessed in garbage_object_p() from the sweep thread + // while the ruby GC thread could be running and changing other bitfields. + bool during_lazy_sweeping; + // This one too, it's accessed in debug_free_check + bool during_minor_gc; + bool during_gc; + bool dont_gc; + bool gc_stressful; +#if RUBY_DEBUG + size_t will_be_swept_slots; + size_t have_swept_slots; +#endif rb_event_flag_t hook_events; @@ -530,6 +573,21 @@ typedef struct rb_objspace { size_t empty_pages_count; struct heap_page *empty_pages; + rb_nativethread_lock_t sweep_lock; + rb_nativethread_cond_t sweep_cond; + pthread_t sweep_thread; + bool sweep_thread_running; + bool sweep_thread_sweep_requested; + bool sweep_thread_sweep_exited; + bool sweep_thread_waiting_request; + bool sweep_thread_sweeping; + rb_atomic_t use_background_sweep_thread; + bool background_sweep_mode; + bool background_sweep_abort; + bool background_sweep_restart_heaps; + bool sweep_rest; + unsigned int heaps_done_background_sweep; + struct { rb_atomic_t finalizing; } atomic_flags; @@ -568,6 +626,11 @@ typedef struct rb_objspace { size_t minor_gc_count; size_t major_gc_count; + size_t major_gc_count_by_nofree; + size_t major_gc_count_by_oldgen; + size_t major_gc_count_by_shady; + size_t major_gc_count_by_force; + size_t major_gc_count_by_oldmalloc; size_t compact_count; size_t read_barrier_faults; #if RGENGC_PROFILE > 0 @@ -601,6 +664,16 @@ typedef struct rb_objspace { unsigned long long sweeping_time_ns; struct timespec sweeping_start_time; +#if PSWEEP_COLLECT_TIMINGS > 0 + /* 
Ruby thread sweep time tracking (always collected) */ + unsigned long long ruby_thread_sweep_cpu_time_ns; + unsigned long long ruby_thread_sweep_wall_time_ns; + struct timespec ruby_thread_sweep_cpu_start_time; + struct timespec ruby_thread_sweep_wall_start_time; +#endif + size_t pages_swept_by_sweep_thread; + size_t pages_swept_by_sweep_thread_had_deferred_free_objects; + /* Weak references */ size_t weak_references_count; } profile; @@ -779,11 +852,17 @@ struct heap_page { unsigned short free_slots; unsigned short final_slots; unsigned short pinned_slots; + unsigned short pre_freed_slots; + unsigned short pre_empty_slots; + unsigned short pre_deferred_free_slots; + unsigned short pre_final_slots; + unsigned short pre_zombie_slots; + size_t pre_freed_malloc_bytes; struct { - unsigned int before_sweep : 1; unsigned int has_remembered_objects : 1; unsigned int has_uncollectible_wb_unprotected_objects : 1; } flags; + rb_atomic_t before_sweep; // bool rb_heap_t *heap; @@ -804,6 +883,7 @@ struct heap_page { /* If set, the object is not movable */ bits_t pinned_bits[HEAP_PAGE_BITMAP_LIMIT]; bits_t age_bits[HEAP_PAGE_BITMAP_LIMIT * RVALUE_AGE_BIT_COUNT]; + bits_t deferred_free_bits[HEAP_PAGE_BITMAP_LIMIT]; }; /* @@ -859,6 +939,12 @@ slot_index_for_offset(size_t offset, uint32_t div_magic) return (size_t)(((uint64_t)offset * div_magic) >> 32); } +static inline unsigned +popcount_bits(bits_t x) +{ + return rb_popcount_intptr((uintptr_t)x); +} + #define SLOT_INDEX(page, p) slot_index_for_offset((uintptr_t)(p) - (page)->start, (page)->slot_div_magic) #define SLOT_BITMAP_INDEX(page, p) (SLOT_INDEX(page, p) / BITS_BITLENGTH) #define SLOT_BITMAP_OFFSET(page, p) (SLOT_INDEX(page, p) & (BITS_BITLENGTH - 1)) @@ -926,10 +1012,10 @@ RVALUE_AGE_SET(VALUE obj, int age) #define heap_pages_freeable_pages objspace->heap_pages.freeable_pages #define heap_pages_deferred_final objspace->heap_pages.deferred_final #define heaps objspace->heaps -#define during_gc objspace->flags.during_gc 
+#define during_gc objspace->during_gc #define finalizing objspace->atomic_flags.finalizing #define finalizer_table objspace->finalizer_table -#define ruby_gc_stressful objspace->flags.gc_stressful +#define ruby_gc_stressful objspace->gc_stressful #define ruby_gc_stress_mode objspace->gc_stress_mode #if GC_DEBUG_STRESS_TO_CLASS #define stress_to_class objspace->stress_to_class @@ -940,15 +1026,15 @@ RVALUE_AGE_SET(VALUE obj, int age) #endif #if 0 -#define dont_gc_on() (fprintf(stderr, "dont_gc_on@%s:%d\n", __FILE__, __LINE__), objspace->flags.dont_gc = 1) -#define dont_gc_off() (fprintf(stderr, "dont_gc_off@%s:%d\n", __FILE__, __LINE__), objspace->flags.dont_gc = 0) -#define dont_gc_set(b) (fprintf(stderr, "dont_gc_set(%d)@%s:%d\n", __FILE__, __LINE__), objspace->flags.dont_gc = (int)(b)) -#define dont_gc_val() (objspace->flags.dont_gc) +#define dont_gc_on() (fprintf(stderr, "dont_gc_on@%s:%d\n", __FILE__, __LINE__), objspace->dont_gc = 1) +#define dont_gc_off() (fprintf(stderr, "dont_gc_off@%s:%d\n", __FILE__, __LINE__), objspace->dont_gc = 0) +#define dont_gc_set(b) (fprintf(stderr, "dont_gc_set(%d)@%s:%d\n", __FILE__, __LINE__), objspace->dont_gc = (int)(b)) +#define dont_gc_val() (objspace->dont_gc) #else -#define dont_gc_on() (objspace->flags.dont_gc = 1) -#define dont_gc_off() (objspace->flags.dont_gc = 0) -#define dont_gc_set(b) (objspace->flags.dont_gc = (int)(b)) -#define dont_gc_val() (objspace->flags.dont_gc) +#define dont_gc_on() (objspace->dont_gc = 1) +#define dont_gc_off() (objspace->dont_gc = 0) +#define dont_gc_set(b) (objspace->dont_gc = (bool)(b)) +#define dont_gc_val() (objspace->dont_gc) #endif #define gc_config_full_mark_set(b) (objspace->gc_config.full_mark = (int)(b)) @@ -983,15 +1069,224 @@ gc_mode_verify(enum gc_mode mode) return mode; } -static inline bool +#if PSWEEP_LOCK_STATS > 0 +/* Lock contention statistics per callsite */ +#define MAX_LOCK_CALLSITES 100 + +typedef struct lock_callsite_stats { + const char *function; + int line; + 
size_t acquired_without_contention; + size_t contended; +} lock_callsite_stats_t; + +typedef struct lock_stats { + const char *name; + lock_callsite_stats_t callsites[MAX_LOCK_CALLSITES]; + int num_callsites; +} lock_stats_t; + +static lock_stats_t sweep_lock_stats = {"objspace->sweep_lock", {{0}}, 0}; +static lock_stats_t swept_pages_lock_stats = {"heap->swept_pages_lock", {{0}}, 0}; + + +static lock_callsite_stats_t* +find_or_create_callsite(lock_stats_t *stats, const char *function, int line) +{ + /* Find existing callsite */ + for (int i = 0; i < stats->num_callsites; i++) { + if (stats->callsites[i].function == function && stats->callsites[i].line == line) { + return &stats->callsites[i]; + } + } + + /* Create new callsite if space available */ + if (stats->num_callsites < MAX_LOCK_CALLSITES) { + lock_callsite_stats_t *callsite = &stats->callsites[stats->num_callsites++]; + callsite->function = function; + callsite->line = line; + callsite->acquired_without_contention = 0; + callsite->contended = 0; + return callsite; + } + + /* No space - return last callsite as overflow */ + return &stats->callsites[MAX_LOCK_CALLSITES - 1]; +} + +static void +instrumented_lock_acquire_impl(rb_nativethread_lock_t *lock, lock_stats_t *stats, const char *function, int line) +{ + lock_callsite_stats_t *callsite = find_or_create_callsite(stats, function, line); + + if (rb_native_mutex_trylock(lock) == 0) { + callsite->acquired_without_contention++; + } + else { + callsite->contended++; + rb_native_mutex_lock(lock); + } +} + +/* Macro to automatically pass function and line */ +#define instrumented_lock_acquire(lock, stats) \ + instrumented_lock_acquire_impl(lock, stats, __FUNCTION__, __LINE__) + +static void +print_lock_stats(void) +{ + fprintf(stderr, "\n=== Lock Contention Statistics by Callsite ===\n"); + fprintf(stderr, "%-40s %-30s %12s %12s %10s\n", "Lock Name", "Callsite", "Uncontended", "Contended", "Ratio"); + fprintf(stderr, "%-40s %-30s %12s %12s %10s\n", "---------", 
"--------", "-----------", "---------", "-----"); + + lock_stats_t *all_stats[] = {&sweep_lock_stats, &swept_pages_lock_stats}; + + for (int i = 0; i < 2; i++) { + lock_stats_t *stats = all_stats[i]; + + /* Sort callsites by total contentions (descending) */ + for (int j = 0; j < stats->num_callsites - 1; j++) { + for (int k = j + 1; k < stats->num_callsites; k++) { + if (stats->callsites[k].contended > stats->callsites[j].contended) { + lock_callsite_stats_t temp = stats->callsites[j]; + stats->callsites[j] = stats->callsites[k]; + stats->callsites[k] = temp; + } + } + } + + /* Print callsites for this lock */ + for (int j = 0; j < stats->num_callsites; j++) { + lock_callsite_stats_t *cs = &stats->callsites[j]; + size_t total = cs->acquired_without_contention + cs->contended; + if (total > 0) { + char callsite_buf[32]; + snprintf(callsite_buf, sizeof(callsite_buf), "%s:%d", cs->function, cs->line); + + double ratio = (double)cs->contended / total * 100.0; + fprintf(stderr, "%-40s %-30s %12zu %12zu %9.2f%%\n", + j == 0 ? 
stats->name : "", + callsite_buf, + cs->acquired_without_contention, + cs->contended, + ratio); + } + } + } + fprintf(stderr, "================================================\n\n"); +} +#endif /* PSWEEP_LOCK_STATS > 0 */ + +static pthread_t sweep_lock_owner = 0; + +static inline void +sweep_lock_lock_impl(rb_nativethread_lock_t *sweep_lock, const char *function, int line) +{ + GC_ASSERT(sweep_lock_owner != pthread_self()); +#if PSWEEP_LOCK_STATS > 0 + instrumented_lock_acquire_impl(sweep_lock, &sweep_lock_stats, function, line); +#else + rb_native_mutex_lock(sweep_lock); +#endif + GC_ASSERT(sweep_lock_owner == 0); +#if VM_CHECK_MODE > 0 + sweep_lock_owner = pthread_self(); +#endif +} + +#define sweep_lock_lock(sweep_lock) \ + sweep_lock_lock_impl(sweep_lock, __FUNCTION__, __LINE__) + +static inline void +sweep_lock_unlock(rb_nativethread_lock_t *sweep_lock) +{ +#if VM_CHECK_MODE > 0 + GC_ASSERT(sweep_lock_owner == pthread_self()); + sweep_lock_owner = 0; +#endif + rb_native_mutex_unlock(sweep_lock); +} + +static inline void +sweep_lock_set_locked(void) +{ + GC_ASSERT(sweep_lock_owner == 0); +#if VM_CHECK_MODE > 0 + sweep_lock_owner = pthread_self(); +#endif +} + +static inline void +sweep_lock_set_unlocked(void) +{ +#if VM_CHECK_MODE > 0 + GC_ASSERT(sweep_lock_owner == pthread_self()); + sweep_lock_owner = 0; +#endif +} + +// Returns true when the background sweep thread and Ruby thread have finished processing +// (background sweeping + ruby thread post-processing or deferred freeing) all pages for that heap. 
+static bool +heap_is_sweep_done(rb_objspace_t *objspace, rb_heap_t *heap) +{ + if (heap->is_finished_sweeping) { + psweep_debug(2, "[gc] heap_is_sweep_done: %d, heap:%p (%ld), heap->is_finished_sweeping\n", true, heap, heap - heaps); + return true; + } + if (!objspace->use_background_sweep_thread) { + bool done = heap->sweeping_page == NULL; + psweep_debug(2, "[gc] heap_is_sweep_done: %d, heap:%p (%ld), !use_background_thread\n", done, heap, heap - heaps); + return done; + } + + // We always dequeue the last page, never the sweep thread. This avoids locking in the common case. + // It should be synchronized, but it's a "benign race". + if (heap->sweeping_page) { + return false; + } + + bool done; + sweep_lock_lock(&objspace->sweep_lock); + if (heap->sweeping_page || heap->swept_pages) { + psweep_debug(2, "heap_is_sweep_done: %d, heap:%p (%ld), swept_pages:%d, sweeping_page:%p\n", false, heap, heap - heaps, heap->swept_pages != 0, heap->sweeping_page); + done = false; + } + else if (heap->pre_sweeping_page) { + sweep_lock_set_unlocked(); + // We need to wait because this is the final page for this heap, and the caller calls us + // like `while (!heap_is_sweep_done(heap)) { gc_sweep_step(heap) }` (we don't want to spin). + rb_native_cond_wait(&heap->sweep_page_cond, &objspace->sweep_lock); + sweep_lock_set_locked(); + GC_ASSERT(heap->swept_pages); + done = false; + } + else { + done = true; + } + sweep_lock_unlock(&objspace->sweep_lock); + return done; +} + +// Does the GC still have pages to sweep? If returns false, then the Ruby thread has fully +// processed all the pages in every heap. 
+static bool has_sweeping_pages(rb_objspace_t *objspace) { + rb_heap_t *heap_not_finished = NULL; for (int i = 0; i < HEAP_COUNT; i++) { - if ((&heaps[i])->sweeping_page) { - return TRUE; + rb_heap_t *heap = &heaps[i]; + if (!heap->is_finished_sweeping) { + if (heap_not_finished) { + return true; + } + else { + heap_not_finished = heap; + } } } - return FALSE; + if (!heap_not_finished) return false; // all done + return !heap_is_sweep_done(objspace, heap_not_finished); } static inline size_t @@ -1032,7 +1327,7 @@ total_final_slots_count(rb_objspace_t *objspace) size_t count = 0; for (int i = 0; i < HEAP_COUNT; i++) { rb_heap_t *heap = &heaps[i]; - count += heap->final_slots_count; + count += (size_t)RUBY_ATOMIC_VALUE_LOAD(heap->final_slots_count); } return count; } @@ -1043,12 +1338,12 @@ total_final_slots_count(rb_objspace_t *objspace) #define is_marking(objspace) (gc_mode(objspace) == gc_mode_marking) #define is_sweeping(objspace) (gc_mode(objspace) == gc_mode_sweeping) -#define is_full_marking(objspace) ((objspace)->flags.during_minor_gc == FALSE) +#define is_full_marking(objspace) ((objspace)->during_minor_gc == FALSE) #define is_incremental_marking(objspace) ((objspace)->flags.during_incremental_marking != FALSE) #define will_be_incremental_marking(objspace) ((objspace)->rgengc.need_major_gc != GPR_FLAG_NONE) #define GC_INCREMENTAL_SWEEP_SLOT_COUNT 2048 #define GC_INCREMENTAL_SWEEP_POOL_SLOT_COUNT 1024 -#define is_lazy_sweeping(objspace) (GC_ENABLE_LAZY_SWEEP && has_sweeping_pages(objspace)) +#define is_lazy_sweeping(objspace) ((objspace)->during_lazy_sweeping != FALSE) /* In lazy sweeping or the previous incremental marking finished and did not yield a free page. 
*/ #define needs_continue_sweeping(objspace, heap) \ ((heap)->free_pages == NULL && is_lazy_sweeping(objspace)) @@ -1081,6 +1376,8 @@ static int garbage_collect(rb_objspace_t *, unsigned int reason); static int gc_start(rb_objspace_t *objspace, unsigned int reason); static void gc_rest(rb_objspace_t *objspace); +static inline void atomic_sub_nounderflow(size_t *var, size_t sub); +static size_t malloc_increase_local_flush(rb_objspace_t *objspace); enum gc_enter_event { gc_enter_event_start, @@ -1093,7 +1390,7 @@ static inline void gc_enter(rb_objspace_t *objspace, enum gc_enter_event event, static inline void gc_exit(rb_objspace_t *objspace, enum gc_enter_event event, unsigned int *lock_lev); static void gc_marking_enter(rb_objspace_t *objspace); static void gc_marking_exit(rb_objspace_t *objspace); -static void gc_sweeping_enter(rb_objspace_t *objspace); +static void gc_sweeping_enter(rb_objspace_t *objspace, const char *from_fn); static void gc_sweeping_exit(rb_objspace_t *objspace); static bool gc_marks_continue(rb_objspace_t *objspace, rb_heap_t *heap); @@ -1251,6 +1548,15 @@ RVALUE_MARKED(rb_objspace_t *objspace, VALUE obj) return RVALUE_MARKED_BITMAP(obj) != 0; } +static inline int +RVALUE_MARKED_ATOMIC(rb_objspace_t *objspace, VALUE obj) +{ + bits_t *bits = GET_HEAP_MARK_BITS(obj); + struct heap_page *page = GET_HEAP_PAGE(obj); + bits_t word = rbimpl_atomic_value_load((VALUE*)&bits[SLOT_BITMAP_INDEX(page, obj)], RBIMPL_ATOMIC_ACQUIRE); + return (word & SLOT_BITMAP_BIT(page, obj)) != 0; +} + static inline int RVALUE_PINNED(rb_objspace_t *objspace, VALUE obj) { @@ -1299,6 +1605,10 @@ check_rvalue_consistency_force(rb_objspace_t *objspace, const VALUE obj, int ter { int err = 0; + + rb_execution_context_t *ec = rb_current_execution_context(false); + if (!ec) return 0; // sweep thread + int lev = RB_GC_VM_LOCK_NO_BARRIER(); { if (SPECIAL_CONST_P(obj)) { @@ -1338,7 +1648,7 @@ check_rvalue_consistency_force(rb_objspace_t *objspace, const VALUE obj, int ter 
fprintf(stderr, "check_rvalue_consistency: %s is T_NONE.\n", rb_obj_info(obj)); err++; } - if (BUILTIN_TYPE(obj) == T_ZOMBIE) { + if (BUILTIN_TYPE(obj) == T_ZOMBIE && !FL_TEST(obj, FL_FREEZE)) { fprintf(stderr, "check_rvalue_consistency: %s is T_ZOMBIE.\n", rb_obj_info(obj)); err++; } @@ -1590,6 +1900,12 @@ rb_gc_impl_get_measure_total_time(void *objspace_ptr) return objspace->flags.measure_gc; } +#define ZOMBIE_OBJ_KEPT_FLAGS (FL_FINALIZE) +// Zombie needs to be put back on the freelist later (during GC) and finalizer has ran +#define ZOMBIE_NEEDS_FREE_FLAG (FL_FREEZE) +#define ZOMBIE_NEEDS_FREE_P(zombie) (FL_TEST(zombie, ZOMBIE_NEEDS_FREE_FLAG)) +#define ZOMBIE_SET_NEEDS_FREE_FLAG(zombie) (FL_SET(zombie, ZOMBIE_NEEDS_FREE_FLAG)) + /* garbage objects will be collected soon. */ bool rb_gc_impl_garbage_object_p(void *objspace_ptr, VALUE ptr) @@ -1598,29 +1914,57 @@ rb_gc_impl_garbage_object_p(void *objspace_ptr, VALUE ptr) bool dead = false; - asan_unpoisoning_object(ptr) { - switch (BUILTIN_TYPE(ptr)) { - case T_NONE: - case T_MOVED: - case T_ZOMBIE: - dead = true; - break; - default: - break; + // Set to false/true by the ruby GC thread when entering/exiting GC, so shouldn't change throughout this call. 
+ rb_atomic_t use_sweep_thread = rbimpl_atomic_load(&objspace->use_background_sweep_thread, RBIMPL_ATOMIC_RELAXED); + + if (!use_sweep_thread) { + // It's not safe to read flags on an object if the sweep thread is running + asan_unpoisoning_object(ptr) { + switch (BUILTIN_TYPE(ptr)) { + case T_NONE: + case T_MOVED: + dead = true; + break; + case T_ZOMBIE: + dead = ZOMBIE_NEEDS_FREE_P(ptr); + break; + default: + break; + } } } if (dead) return true; - return is_lazy_sweeping(objspace) && GET_HEAP_PAGE(ptr)->flags.before_sweep && - !RVALUE_MARKED(objspace, ptr); + + struct heap_page *page = GET_HEAP_PAGE(ptr); + bool during_lazy_sweep = is_lazy_sweeping(objspace); + + if (!use_sweep_thread) { + // The ruby GC thread or a user thread called us + bool marked = RVALUE_MARKED(objspace, ptr); + return during_lazy_sweep && !marked && rbimpl_atomic_load(&page->before_sweep, RBIMPL_ATOMIC_RELAXED); + } + else if (during_lazy_sweep) { + // we're currently lazy sweeping with the sweep thread + bool marked = RVALUE_MARKED_ATOMIC(objspace, ptr); // load it atomically so it can't be re-ordered past the next atomic load + rb_atomic_t before_sweep = rbimpl_atomic_load(&page->before_sweep, RBIMPL_ATOMIC_ACQUIRE); + bool is_garbage = !marked && before_sweep; + if (is_garbage) return true; + if (marked && before_sweep) return false; + // already swept page, just check flags + return BUILTIN_TYPE(ptr) == T_NONE || BUILTIN_TYPE(ptr) == T_MOVED || (BUILTIN_TYPE(ptr) == T_ZOMBIE && ZOMBIE_NEEDS_FREE_P(ptr)); + } + else { + return BUILTIN_TYPE(ptr) == T_NONE || BUILTIN_TYPE(ptr) == T_MOVED || (BUILTIN_TYPE(ptr) == T_ZOMBIE && ZOMBIE_NEEDS_FREE_P(ptr)); + } } static void free_stack_chunks(mark_stack_t *); static void mark_stack_free_cache(mark_stack_t *); -static void heap_page_free(rb_objspace_t *objspace, struct heap_page *page); +static void heap_page_free(rb_objspace_t *objspace, struct heap_page *page, bool log); static inline void -heap_page_add_freeobj(rb_objspace_t *objspace, struct 
heap_page *page, VALUE obj) +heap_page_add_freeobj(rb_objspace_t *objspace, struct heap_page *page, VALUE obj, bool from_sweep_thread) { rb_asan_unpoison_object(obj, false); @@ -1632,8 +1976,10 @@ heap_page_add_freeobj(rb_objspace_t *objspace, struct heap_page *page, VALUE obj page->freelist = slot; asan_lock_freelist(page); - // Should have already been reset - GC_ASSERT(RVALUE_AGE_GET(obj) == 0); + if (!from_sweep_thread) { + // Should have already been reset + GC_ASSERT(RVALUE_AGE_GET(obj) == 0); + } if (RGENGC_CHECK_MODE && /* obj should belong to page */ @@ -1682,7 +2028,7 @@ heap_allocatable_bytes_expand(rb_objspace_t *objspace, } if (gc_params.growth_max_bytes > 0) { - size_t max_total_slots = total_slots + gc_params.growth_max_bytes / slot_size; + size_t max_total_slots = total_slots + (gc_params.growth_max_bytes / slot_size); if (target_total_slots > max_total_slots) target_total_slots = max_total_slots; } @@ -1693,13 +2039,16 @@ heap_allocatable_bytes_expand(rb_objspace_t *objspace, objspace->heap_pages.allocatable_bytes += extend_slot_count * slot_size; } +/* Add a `page` with some free slots to the beginning of `heap->free_pages` */ static inline void -heap_add_freepage(rb_heap_t *heap, struct heap_page *page) +heap_add_freepage(rb_heap_t *heap, struct heap_page *page, const char *from_func) { asan_unlock_freelist(page); GC_ASSERT(page->free_slots != 0); GC_ASSERT(page->freelist != NULL); + psweep_debug(1, "[gc] heap_add_freepage(heap:%p, page:%p) from %s\n", heap, page, from_func); + page->free_next = heap->free_pages; heap->free_pages = page; @@ -1726,7 +2075,10 @@ static void heap_unlink_page(rb_objspace_t *objspace, rb_heap_t *heap, struct heap_page *page) { ccan_list_del(&page->page_node); + GC_ASSERT(heap->total_pages > 0); heap->total_pages--; + GC_ASSERT(heap->total_slots >= page->total_slots); + GC_ASSERT(page->total_slots > 0); heap->total_slots -= page->total_slots; } @@ -1763,11 +2115,12 @@ heap_page_body_free(struct heap_page_body 
*page_body) } static void -heap_page_free(rb_objspace_t *objspace, struct heap_page *page) +heap_page_free(rb_objspace_t *objspace, struct heap_page *page, bool log) { objspace->heap_pages.freed_pages++; heap_page_body_free(page->body); free(page); + psweep_debug(1, "[gc] heap_page_free heap:%p page:%p\n", page->heap, page); } static void @@ -1783,7 +2136,7 @@ heap_pages_free_unused_pages(rb_objspace_t *objspace) struct heap_page *page = rb_darray_get(objspace->heap_pages.sorted, i); if (heap_page_in_global_empty_pages_pool(objspace, page) && heap_pages_freeable_pages > 0) { - heap_page_free(objspace, page); + heap_page_free(objspace, page, true); heap_pages_freeable_pages--; } else { @@ -1908,6 +2261,8 @@ heap_page_body_allocate(void) return page_body; } +/* Try to "resurrect" an empty page by removing it from the `objspace->empty_pages` list */ +/* NOTE: empty pages can go to any heap */ static struct heap_page * heap_page_resurrect(rb_objspace_t *objspace) { @@ -1920,6 +2275,7 @@ heap_page_resurrect(rb_objspace_t *objspace) objspace->empty_pages_count--; page = objspace->empty_pages; objspace->empty_pages = page->free_next; + page->freelist = NULL; } return page; @@ -1973,8 +2329,9 @@ heap_page_allocate(rb_objspace_t *objspace) return page; } +/* Add either an empty page (objspace->empty_pages) or a newly allocated page to a heap. 
Thread the freelist and set `heap->free_slots` */ static void -heap_add_page(rb_objspace_t *objspace, rb_heap_t *heap, struct heap_page *page) +heap_add_page(rb_objspace_t *objspace, rb_heap_t *heap, struct heap_page *page, bool sweep_lock_taken) { /* Adding to eden heap during incremental sweeping is forbidden */ GC_ASSERT(!heap->sweeping_page); @@ -1994,6 +2351,7 @@ heap_add_page(rb_objspace_t *objspace, rb_heap_t *heap, struct heap_page *page) page->slot_size = heap->slot_size; page->slot_div_magic = slot_div_magics[heap - heaps]; page->heap = heap; + page->free_next = NULL; memset(&page->wb_unprotected_bits[0], 0, HEAP_PAGE_BITMAP_SIZE); memset(&page->age_bits[0], 0, sizeof(page->age_bits)); @@ -2001,22 +2359,31 @@ heap_add_page(rb_objspace_t *objspace, rb_heap_t *heap, struct heap_page *page) asan_unlock_freelist(page); page->freelist = NULL; asan_unpoison_memory_region(page->body, HEAP_PAGE_SIZE, false); + int i = 0; for (VALUE p = (VALUE)start; p < start + (slot_count * heap->slot_size); p += heap->slot_size) { - heap_page_add_freeobj(objspace, page, p); + i++; + heap_page_add_freeobj(objspace, page, p, false); } + GC_ASSERT(i == slot_count); asan_lock_freelist(page); page->free_slots = slot_count; heap->total_allocated_pages++; - ccan_list_add_tail(&heap->pages, &page->page_node); + if (!sweep_lock_taken) sweep_lock_lock(&objspace->sweep_lock); + { + ccan_list_add_tail(&heap->pages, &page->page_node); + } + if (!sweep_lock_taken) sweep_lock_unlock(&objspace->sweep_lock); + heap->total_pages++; + GC_ASSERT(page->total_slots == page->free_slots); heap->total_slots += page->total_slots; } static int -heap_page_allocate_and_initialize(rb_objspace_t *objspace, rb_heap_t *heap) +heap_page_allocate_and_initialize(rb_objspace_t *objspace, rb_heap_t *heap, bool sweep_lock_taken) { gc_report(1, objspace, "heap_page_allocate_and_initialize: rb_darray_size(objspace->heap_pages.sorted): %"PRIdSIZE", " "allocatable_bytes: %"PRIdSIZE", heap->total_pages: %"PRIdSIZE"\n", 
@@ -2026,6 +2393,7 @@ heap_page_allocate_and_initialize(rb_objspace_t *objspace, rb_heap_t *heap) struct heap_page *page = heap_page_resurrect(objspace); if (page == NULL && objspace->heap_pages.allocatable_bytes > 0) { + psweep_debug(1, "[gc] heap_page_allocate_and_initialize: no empty pages, allocating page\n"); page = heap_page_allocate(objspace); allocated = true; @@ -2033,8 +2401,8 @@ heap_page_allocate_and_initialize(rb_objspace_t *objspace, rb_heap_t *heap) } if (page != NULL) { - heap_add_page(objspace, heap, page); - heap_add_freepage(heap, page); + heap_add_page(objspace, heap, page, sweep_lock_taken); + heap_add_freepage(heap, page, "allocate_and_initialize"); if (allocated) { size_t page_bytes = (size_t)page->total_slots * page->slot_size; @@ -2051,21 +2419,25 @@ heap_page_allocate_and_initialize(rb_objspace_t *objspace, rb_heap_t *heap) } static void -heap_page_allocate_and_initialize_force(rb_objspace_t *objspace, rb_heap_t *heap) +heap_page_allocate_and_initialize_force(rb_objspace_t *objspace, rb_heap_t *heap, bool sweep_lock_taken) { size_t prev_allocatable_bytes = objspace->heap_pages.allocatable_bytes; objspace->heap_pages.allocatable_bytes = HEAP_PAGE_SIZE; - heap_page_allocate_and_initialize(objspace, heap); + heap_page_allocate_and_initialize(objspace, heap, sweep_lock_taken); GC_ASSERT(heap->free_pages != NULL); objspace->heap_pages.allocatable_bytes = prev_allocatable_bytes; } +// Run incremental marking and/or sweeping, if in incremental marking or sweeping mode static void gc_continue(rb_objspace_t *objspace, rb_heap_t *heap) { unsigned int lock_lev; bool needs_gc = is_incremental_marking(objspace) || needs_continue_sweeping(objspace, heap); - if (!needs_gc) return; + if (!needs_gc) { + psweep_debug(1, "[gc] gc_continue: !needs_gc\n"); + return; + } gc_enter(objspace, gc_enter_event_continue, &lock_lev); // takes vm barrier, try to avoid @@ -2079,32 +2451,51 @@ gc_continue(rb_objspace_t *objspace, rb_heap_t *heap) if 
(needs_continue_sweeping(objspace, heap)) { gc_sweep_continue(objspace, heap); } + else { + psweep_debug(-1, "[gc] gc_continue: !needs_continue_sweeping (lazy_sweeping:%d)\n", is_lazy_sweeping(objspace)); + } gc_exit(objspace, gc_enter_event_continue, &lock_lev); } +void wait_for_background_sweeping_to_finish(rb_objspace_t *objspace, bool abort_current_background_sweep, bool exit_sweep_thread, const char *from_fn); + static void heap_prepare(rb_objspace_t *objspace, rb_heap_t *heap) { GC_ASSERT(heap->free_pages == NULL); - if (heap->total_slots < gc_params.heap_init_bytes / heap->slot_size && - heap->sweeping_page == NULL) { - heap_page_allocate_and_initialize_force(objspace, heap); + if (heap->is_finished_sweeping && heap->total_slots < (gc_params.heap_init_bytes / heap->slot_size)) { + heap_page_allocate_and_initialize_force(objspace, heap, false); GC_ASSERT(heap->free_pages != NULL); return; } + else { + sweep_lock_lock(&objspace->sweep_lock); + { + if (heap->total_slots < (gc_params.heap_init_bytes / heap->slot_size) && + heap->sweeping_page == NULL && heap->swept_pages == NULL && !heap->pre_sweeping_page) { + heap_page_allocate_and_initialize_force(objspace, heap, true); + GC_ASSERT(heap->free_pages != NULL); + sweep_lock_unlock(&objspace->sweep_lock); + return; + } + } + sweep_lock_unlock(&objspace->sweep_lock); + } /* Continue incremental marking or lazy sweeping, if in any of those steps. */ gc_continue(objspace, heap); if (heap->free_pages == NULL) { - heap_page_allocate_and_initialize(objspace, heap); + psweep_debug(1, "[gc] heap_prepare: heap->free_pages is NULL after gc_continue\n"); + heap_page_allocate_and_initialize(objspace, heap, false); } /* If we still don't have a free page and not allowed to create a new page, * we should start a new GC cycle. 
*/ if (heap->free_pages == NULL) { + psweep_debug(1, "[gc] heap_prepare: still no heap->free_pages even after try allocate!\n"); GC_ASSERT(objspace->empty_pages_count == 0); GC_ASSERT(objspace->heap_pages.allocatable_bytes == 0); @@ -2124,7 +2515,7 @@ heap_prepare(rb_objspace_t *objspace, rb_heap_t *heap) /* If we're not incremental marking (e.g. a minor GC) or finished * sweeping and still don't have a free page, then * gc_sweep_finish_heap should allow us to create a new page. */ - if (heap->free_pages == NULL && !heap_page_allocate_and_initialize(objspace, heap)) { + if (heap->free_pages == NULL && !heap_page_allocate_and_initialize(objspace, heap, false)) { if (gc_needs_major_flags == GPR_FLAG_NONE) { rb_bug("cannot create a new page after GC"); } @@ -2137,7 +2528,7 @@ heap_prepare(rb_objspace_t *objspace, rb_heap_t *heap) gc_continue(objspace, heap); if (heap->free_pages == NULL && - !heap_page_allocate_and_initialize(objspace, heap)) { + !heap_page_allocate_and_initialize(objspace, heap, false)) { rb_bug("cannot create a new page after major GC"); } } @@ -2171,6 +2562,7 @@ static inline VALUE newobj_init(VALUE klass, VALUE flags, int wb_protected, rb_objspace_t *objspace, VALUE obj) { GC_ASSERT(BUILTIN_TYPE(obj) == T_NONE); + GC_ASSERT(RVALUE_AGE_GET(obj) == 0); GC_ASSERT((flags & FL_WB_PROTECTED) == 0); RBASIC(obj)->flags = flags; *((VALUE *)&RBASIC(obj)->klass) = klass; @@ -2179,7 +2571,7 @@ newobj_init(VALUE klass, VALUE flags, int wb_protected, rb_objspace_t *objspace, #endif -#if RACTOR_CHECK_MODE +#if RACTOR_CHECK_MODE > 10 /* XXX(review): effectively disables rb_ractor_setup_belonging — confirm intentional before merge */ void rb_ractor_setup_belonging(VALUE obj); rb_ractor_setup_belonging(obj); #endif @@ -2281,6 +2673,7 @@ ractor_cache_allocate_slot(rb_objspace_t *objspace, rb_ractor_newobj_cache_t *ca } if (RB_LIKELY(p)) { + psweep_debug(2, "[gc] allocate slot: %p from heap:%p page:%p\n", p, &heaps[heap_idx], heap_cache->using_page); VALUE obj = (VALUE)p; rb_asan_unpoison_object(obj, true); heap_cache->freelist = p->next; @@ -2315,8 +2708,8 @@ 
heap_next_free_page(rb_objspace_t *objspace, rb_heap_t *heap) page = heap->free_pages; heap->free_pages = page->free_next; - - GC_ASSERT(page->free_slots != 0); + psweep_debug(1, "[gc] heap_next_free_page heap:%p free_pages:%p -> %p (free_slots:%d)\n", heap, page, heap->free_pages, page->free_slots); + GC_ASSERT(page->free_slots > 0); asan_unlock_freelist(page); @@ -2451,6 +2844,8 @@ newobj_alloc(rb_objspace_t *objspace, rb_ractor_newobj_cache_t *cache, size_t he ALWAYS_INLINE(static VALUE newobj_slowpath(VALUE klass, VALUE flags, rb_objspace_t *objspace, rb_ractor_newobj_cache_t *cache, int wb_protected, size_t heap_idx)); +static const char *type_name(int type, VALUE obj); + static inline VALUE newobj_slowpath(VALUE klass, VALUE flags, rb_objspace_t *objspace, rb_ractor_newobj_cache_t *cache, int wb_protected, size_t heap_idx) { @@ -2466,7 +2861,7 @@ newobj_slowpath(VALUE klass, VALUE flags, rb_objspace_t *objspace, rb_ractor_new if (rb_memerror_reentered()) { rb_memerror(); } - rb_bug("object allocation during garbage collection phase"); + rb_bug("object allocation during garbage collection phase for klass %s\n", type_name(flags & T_MASK, 0)); } if (ruby_gc_stressful) { @@ -2611,26 +3006,28 @@ rb_gc_impl_pointer_to_heap_p(void *objspace_ptr, const void *ptr) return is_pointer_to_heap(objspace_ptr, ptr); } -#define ZOMBIE_OBJ_KEPT_FLAGS (FL_FINALIZE) void rb_gc_impl_make_zombie(void *objspace_ptr, VALUE obj, void (*dfree)(void *), void *data) { rb_objspace_t *objspace = objspace_ptr; + struct heap_page *page = GET_HEAP_PAGE(obj); struct RZombie *zombie = RZOMBIE(obj); zombie->flags = T_ZOMBIE | (zombie->flags & ZOMBIE_OBJ_KEPT_FLAGS); zombie->dfree = dfree; zombie->data = data; - VALUE prev, next = heap_pages_deferred_final; + VALUE prev, next = (VALUE)RUBY_ATOMIC_PTR_LOAD(heap_pages_deferred_final); + GC_ASSERT(page == GET_HEAP_PAGE(zombie)); do { zombie->next = prev = next; next = RUBY_ATOMIC_VALUE_CAS(heap_pages_deferred_final, prev, obj); } while (next != 
prev); - - struct heap_page *page = GET_HEAP_PAGE(obj); - page->final_slots++; - page->heap->final_slots_count++; + page->final_slots++; // NOTE: not synchronized, but either background thread or user thread owns page during free +#if RUBY_DEBUG + RUBY_ATOMIC_INC(page->heap->made_zombies); +#endif + RUBY_ATOMIC_SIZE_INC(page->heap->final_slots_count); } typedef int each_obj_callback(void *, void *, size_t, void *); @@ -2754,6 +3151,7 @@ objspace_each_exec(bool protected, struct each_obj_data *each_obj_data) static void objspace_each_objects(rb_objspace_t *objspace, each_obj_callback *callback, void *data, bool protected) { + wait_for_background_sweeping_to_finish(objspace, true, false, "objspace_each_objects"); struct each_obj_data each_obj_data = { .objspace = objspace, .each_obj_callback = callback, @@ -2901,30 +3299,33 @@ run_final(rb_objspace_t *objspace, VALUE zombie, unsigned int lev) return lev; } +void +rb_gc_impl_free_zombie(rb_objspace_t *objspace, VALUE obj) +{ + GC_ASSERT(!is_sweep_thread_p()); + struct heap_page *page = GET_HEAP_PAGE(obj); + GC_ASSERT(RUBY_ATOMIC_VALUE_LOAD(page->heap->final_slots_count) > 0); + RUBY_ATOMIC_SIZE_DEC(page->heap->final_slots_count); + GC_ASSERT(page->final_slots > 0); + page->final_slots--; + RVALUE_AGE_SET_BITMAP(obj, 0); +} + static void finalize_list(rb_objspace_t *objspace, VALUE zombie) { while (zombie) { VALUE next_zombie; - struct heap_page *page; rb_asan_unpoison_object(zombie, false); next_zombie = RZOMBIE(zombie)->next; - page = GET_HEAP_PAGE(zombie); unsigned int lev = RB_GC_VM_LOCK(); lev = run_final(objspace, zombie, lev); { GC_ASSERT(BUILTIN_TYPE(zombie) == T_ZOMBIE); - GC_ASSERT(page->heap->final_slots_count > 0); - GC_ASSERT(page->final_slots > 0); - - page->heap->final_slots_count--; - page->final_slots--; - page->free_slots++; - RVALUE_AGE_SET_BITMAP(zombie, 0); - heap_page_add_freeobj(objspace, page, zombie); - page->heap->total_freed_objects++; + GC_ASSERT(!FL_TEST(zombie, FL_FINALIZE)); + 
ZOMBIE_SET_NEEDS_FREE_FLAG(zombie); } RB_GC_VM_UNLOCK(lev); @@ -2981,15 +3382,27 @@ gc_abort(void *objspace_ptr) objspace->flags.during_incremental_marking = FALSE; } +#if RUBY_DEBUG + sweep_lock_lock(&objspace->sweep_lock); + GC_ASSERT(!objspace->sweep_rest); + sweep_lock_unlock(&objspace->sweep_lock); +#endif + + wait_for_background_sweeping_to_finish(objspace, true, false, "gc_abort"); + if (is_lazy_sweeping(objspace)) { for (int i = 0; i < HEAP_COUNT; i++) { rb_heap_t *heap = &heaps[i]; - heap->sweeping_page = NULL; + heap->swept_pages = NULL; + + heap->pre_sweeping_page = NULL; + heap->is_finished_sweeping = false; + heap->background_sweep_steps = heap->foreground_sweep_steps; struct heap_page *page = NULL; ccan_list_for_each(&heap->pages, page, page_node) { - page->flags.before_sweep = false; + page->before_sweep = 0; } } } @@ -3043,6 +3456,8 @@ rb_gc_impl_shutdown_call_finalizer_i(st_data_t key, st_data_t val, st_data_t _da return ST_DELETE; } +void rb_gc_stop_background_threads(rb_objspace_t *objspace, const char *from_fn); + void rb_gc_impl_shutdown_call_finalizer(void *objspace_ptr) { @@ -3052,6 +3467,8 @@ rb_gc_impl_shutdown_call_finalizer(void *objspace_ptr) gc_verify_internal_consistency(objspace); #endif + wait_for_background_sweeping_to_finish(objspace, true, false, "shutdown_call_finalizer"); + /* prohibit incremental GC */ objspace->flags.dont_incremental = 1; @@ -3066,7 +3483,6 @@ rb_gc_impl_shutdown_call_finalizer(void *objspace_ptr) st_foreach(finalizer_table, rb_gc_impl_shutdown_call_finalizer_i, 0); } - /* run finalizers */ finalize_deferred(objspace); GC_ASSERT(heap_pages_deferred_final == 0); @@ -3481,8 +3897,11 @@ struct gc_sweep_context { int final_slots; int freed_slots; int empty_slots; + int zombie_slots; /* pre-existing zombies not yet ready to free */ }; +bool rb_gc_obj_needs_cleanup_p(VALUE obj); + static inline void gc_sweep_plane(rb_objspace_t *objspace, rb_heap_t *heap, uintptr_t p, bits_t bitset, struct gc_sweep_context *ctx) { 
@@ -3497,7 +3916,7 @@ gc_sweep_plane(rb_objspace_t *objspace, rb_heap_t *heap, uintptr_t p, bits_t bit if (bitset & 1) { switch (BUILTIN_TYPE(vp)) { case T_MOVED: - if (objspace->flags.during_compacting) { + if (RB_UNLIKELY(objspace->flags.during_compacting)) { /* The sweep cursor shouldn't have made it to any * T_MOVED slots while the compact flag is enabled. * The sweep cursor and compact cursor move in @@ -3507,23 +3926,31 @@ gc_sweep_plane(rb_objspace_t *objspace, rb_heap_t *heap, uintptr_t p, bits_t bit } gc_report(3, objspace, "page_sweep: %s is added to freelist\n", rb_obj_info(vp)); ctx->empty_slots++; - heap_page_add_freeobj(objspace, sweep_page, vp); + heap_page_add_freeobj(objspace, sweep_page, vp, false); break; case T_ZOMBIE: - /* already counted */ + if (ZOMBIE_NEEDS_FREE_P(vp)) { + goto free_object; + } + /* already counted as final slot */ + ctx->zombie_slots++; break; case T_NONE: ctx->empty_slots++; /* already freed */ break; default: + free_object: + psweep_debug(0, "[gc] gc_sweep_plane: heap:%p (%ld) freeing obj:%p (%s)\n", heap, heap - heaps, (void*)vp, rb_obj_info(vp)); #if RGENGC_CHECK_MODE if (!is_full_marking(objspace)) { - if (RVALUE_OLD_P(objspace, vp)) rb_bug("page_sweep: %p - old while minor GC.", (void *)p); - if (RVALUE_REMEMBERED(objspace, vp)) rb_bug("page_sweep: %p - remembered.", (void *)p); + if (RVALUE_OLD_P(objspace, vp)) rb_bug("page_sweep: %p - old while minor GC.", (void *)vp); + if (RVALUE_REMEMBERED(objspace, vp)) rb_bug("page_sweep: %p - remembered.", (void *)vp); } #endif + if (RVALUE_WB_UNPROTECTED(objspace, vp)) CLEAR_IN_BITMAP(GET_HEAP_WB_UNPROTECTED_BITS(vp), vp); + #if RGENGC_CHECK_MODE #define CHECK(x) if (x(objspace, vp) != FALSE) rb_bug("obj_free: " #x "(%s) != FALSE", rb_obj_info(vp)) CHECK(RVALUE_WB_UNPROTECTED); @@ -3538,21 +3965,23 @@ gc_sweep_plane(rb_objspace_t *objspace, rb_heap_t *heap, uintptr_t p, bits_t bit rb_gc_event_hook(vp, RUBY_INTERNAL_EVENT_FREEOBJ); } - 
(void)VALGRIND_MAKE_MEM_UNDEFINED((void*)p, slot_size); - heap_page_add_freeobj(objspace, sweep_page, vp); gc_report(3, objspace, "page_sweep: %s (fast path) added to freelist\n", rb_obj_info(vp)); + RVALUE_AGE_SET_BITMAP(vp, 0); + heap_page_add_freeobj(objspace, sweep_page, vp, false); + (void)VALGRIND_MAKE_MEM_UNDEFINED((void*)vp, slot_size); ctx->freed_slots++; } else { - gc_report(2, objspace, "page_sweep: free %p\n", (void *)p); + gc_report(2, objspace, "page_sweep: free %p\n", (void *)vp); rb_gc_event_hook(vp, RUBY_INTERNAL_EVENT_FREEOBJ); rb_gc_obj_free_vm_weak_references(vp); if (rb_gc_obj_free(objspace, vp)) { - (void)VALGRIND_MAKE_MEM_UNDEFINED((void*)p, slot_size); - heap_page_add_freeobj(objspace, sweep_page, vp); gc_report(3, objspace, "page_sweep: %s is added to freelist\n", rb_obj_info(vp)); + RVALUE_AGE_SET_BITMAP(vp, 0); + heap_page_add_freeobj(objspace, sweep_page, vp, false); + (void)VALGRIND_MAKE_MEM_UNDEFINED((void*)vp, slot_size); ctx->freed_slots++; } else { @@ -3562,11 +3991,139 @@ gc_sweep_plane(rb_objspace_t *objspace, rb_heap_t *heap, uintptr_t p, bits_t bit break; } } + else { + GC_ASSERT(RVALUE_MARKED(objspace, vp)); + } p += slot_size; bitset >>= 1; } while (bitset); } +void +wait_for_background_sweeping_to_finish(rb_objspace_t *objspace, bool abort_current_background_sweep, bool exit_sweep_thread, const char *from_fn) +{ + if (!objspace->sweep_thread) { + return; + } + sweep_lock_lock(&objspace->sweep_lock); + if (abort_current_background_sweep) { + objspace->background_sweep_abort = true; + objspace->background_sweep_restart_heaps = false; + objspace->sweep_thread_sweep_requested = false; + } + while (objspace->sweep_thread_running && objspace->sweep_thread_sweeping) { + psweep_debug(1, "[gc] Waiting for sweep thread to finish (abort_sweep:%d, from_fn:%s)\n", abort_current_background_sweep, from_fn); + rb_native_cond_signal(&objspace->sweep_cond); + sweep_lock_set_unlocked(); + rb_native_cond_wait(&objspace->sweep_cond, 
&objspace->sweep_lock); + sweep_lock_set_locked(); + } + if (exit_sweep_thread) { + GC_ASSERT(abort_current_background_sweep); + objspace->sweep_thread_running = false; + while (!objspace->sweep_thread_sweep_exited) { + rb_native_cond_signal(&objspace->sweep_cond); + sweep_lock_set_unlocked(); + rb_native_cond_wait(&objspace->sweep_cond, &objspace->sweep_lock); + sweep_lock_set_locked(); + } + pthread_join(objspace->sweep_thread, NULL); + psweep_debug(0, "Sweep thread joined from %s\n", from_fn); + GET_VM()->gc.sweep_thread = 0; + objspace->sweep_thread = 0; + } + else { + psweep_debug(0, "Waited for sweep thread to finish sweep from %s\n", from_fn); + } + objspace->background_sweep_abort = false; + objspace->background_sweep_mode = false; + sweep_lock_unlock(&objspace->sweep_lock); +} + +// Free the object in a Ruby thread. Return whether or not we put the slot back on the page's freelist. +static bool +deferred_free(rb_objspace_t *objspace, VALUE obj) +{ + ASSERT_vm_locking_with_barrier(); + bool result; +#ifdef PSWEEP_DEBUG + MAYBE_UNUSED(const char *obj_info) = rb_obj_info(obj); +#endif + bool freed_weakrefs = rb_gc_obj_free_vm_weak_references(obj); + (void)freed_weakrefs; + GC_ASSERT(freed_weakrefs); + if (rb_gc_obj_free(objspace, obj)) { + struct heap_page *page = GET_HEAP_PAGE(obj); + psweep_debug(1, "[gc] deferred free: page(%p) obj(%p) %s (success)\n", page, (void*)obj, obj_info); + RVALUE_AGE_SET_BITMAP(obj, 0); + heap_page_add_freeobj(objspace, page, obj, false); + (void)VALGRIND_MAKE_MEM_UNDEFINED((void*)obj, page->slot_size); + result = true; + } + else { +#if RUBY_DEBUG + if (!(BUILTIN_TYPE(obj) == T_ZOMBIE && !FL_TEST(obj, FL_FREEZE))) { + rb_bug("should be unfreeable zombie"); + } +#endif + result = false; + MAYBE_UNUSED(struct heap_page *page) = GET_HEAP_PAGE(obj); + psweep_debug(1, "[gc] deferred sweep: page(%p) obj(%p) %s (zombie)\n", page, (void*)obj, obj_info); + } + return result; +} + +// Clear bits for the page that was swept by the 
background thread. +static inline void +gc_post_sweep_page(rb_objspace_t *objspace, rb_heap_t *heap, struct heap_page *sweep_page) +{ + GC_ASSERT(sweep_page->heap == heap); + + bits_t *bits; + + gc_report(2, objspace, "post_page_sweep: start.\n"); + +#if RGENGC_CHECK_MODE + if (!objspace->flags.immediate_sweep) { + GC_ASSERT(RUBY_ATOMIC_LOAD(sweep_page->before_sweep)); + } +#endif + rbimpl_atomic_store(&sweep_page->before_sweep, 0, RBIMPL_ATOMIC_RELEASE); + + bits = sweep_page->mark_bits; + + int total_slots = sweep_page->total_slots; + int bitmap_plane_count = CEILDIV(total_slots, BITS_BITLENGTH); + + int out_of_range_bits = total_slots % BITS_BITLENGTH; + if (out_of_range_bits != 0) { + bits[bitmap_plane_count - 1] |= ~(((bits_t)1 << out_of_range_bits) - 1); + } + + // Clear wb_unprotected and age bits for all unmarked slots + { + bits_t *wb_unprotected_bits = sweep_page->wb_unprotected_bits; + bits_t *age_bits = sweep_page->age_bits; + for (int i = 0; i < bitmap_plane_count; i++) { + bits_t unmarked = ~bits[i]; + wb_unprotected_bits[i] &= ~unmarked; + age_bits[i * 2] &= ~unmarked; + age_bits[i * 2 + 1] &= ~unmarked; + } + } + + if (!heap->compact_cursor) { + gc_setup_mark_bits(sweep_page); + } + + if (RUBY_ATOMIC_PTR_LOAD(heap_pages_deferred_final) && !finalizing) { + gc_finalize_deferred_register(objspace); + } + + gc_report(2, objspace, "post_page_sweep: end.\n"); +} + +// Sweep a page by the Ruby thread (synchronous freeing). 
static inline void gc_sweep_page(rb_objspace_t *objspace, rb_heap_t *heap, struct gc_sweep_context *ctx) { @@ -3578,12 +4135,14 @@ gc_sweep_page(rb_objspace_t *objspace, rb_heap_t *heap, struct gc_sweep_context gc_report(2, objspace, "page_sweep: start.\n"); + psweep_debug(1, "[gc] gc_sweep_page: heap:%p (%ld) page:%p\n", heap, heap - heaps, sweep_page); + #if RGENGC_CHECK_MODE if (!objspace->flags.immediate_sweep) { - GC_ASSERT(sweep_page->flags.before_sweep == TRUE); + GC_ASSERT(RUBY_ATOMIC_LOAD(sweep_page->before_sweep)); } #endif - sweep_page->flags.before_sweep = FALSE; + rbimpl_atomic_store(&sweep_page->before_sweep, 0, RBIMPL_ATOMIC_RELEASE); sweep_page->free_slots = 0; p = (uintptr_t)sweep_page->start; @@ -3597,6 +4156,14 @@ gc_sweep_page(rb_objspace_t *objspace, rb_heap_t *heap, struct gc_sweep_context bits[bitmap_plane_count - 1] |= ~(((bits_t)1 << out_of_range_bits) - 1); } + for (int i = 0; i < bitmap_plane_count; i++) { + bitset = ~bits[i]; + if (bitset) { + gc_sweep_plane(objspace, heap, p, bitset, ctx); + } + p += BITS_BITLENGTH * slot_size; + } + // Clear wb_unprotected and age bits for all unmarked slots { bits_t *wb_unprotected_bits = sweep_page->wb_unprotected_bits; @@ -3609,13 +4176,26 @@ gc_sweep_page(rb_objspace_t *objspace, rb_heap_t *heap, struct gc_sweep_context } } - for (int i = 0; i < bitmap_plane_count; i++) { - bitset = ~bits[i]; - if (bitset) { - gc_sweep_plane(objspace, heap, p, bitset, ctx); +#if RGENGC_CHECK_MODE + { + /* Assert that all unmarked slots with live objects were either freed or made into zombies. 
*/ + int unmarked_slots = 0; + for (int i = 0; i < bitmap_plane_count; i++) { + bits_t unmarked = ~bits[i]; + unmarked_slots += (int)popcount_bits(unmarked); + } + + int freed_or_zombie = ctx->freed_slots + ctx->final_slots; + int unmarked_live = unmarked_slots - ctx->empty_slots - ctx->zombie_slots; + if (freed_or_zombie != unmarked_live) { + rb_bug("gc_sweep_page: unmarked live slot count mismatch: " + "unmarked_slots=%d - empty_slots=%d - zombie_slots=%d = %d unmarked live, " + "but freed_slots=%d + final_slots=%d = %d", + unmarked_slots, ctx->empty_slots, ctx->zombie_slots, unmarked_live, + ctx->freed_slots, ctx->final_slots, freed_or_zombie); } - p += BITS_BITLENGTH * slot_size; } +#endif if (!heap->compact_cursor) { gc_setup_mark_bits(sweep_page); @@ -3633,10 +4213,10 @@ gc_sweep_page(rb_objspace_t *objspace, rb_heap_t *heap, struct gc_sweep_context sweep_page->total_slots, ctx->freed_slots, ctx->empty_slots, ctx->final_slots); - sweep_page->free_slots += ctx->freed_slots + ctx->empty_slots; sweep_page->heap->total_freed_objects += ctx->freed_slots; + sweep_page->free_slots = ctx->freed_slots + ctx->empty_slots; - if (heap_pages_deferred_final && !finalizing) { + if (RUBY_ATOMIC_PTR_LOAD(heap_pages_deferred_final) && !finalizing) { gc_finalize_deferred_register(objspace); } @@ -3712,17 +4292,503 @@ heap_page_freelist_append(struct heap_page *page, struct free_slot *freelist) } } +static inline void +sweep_in_ruby_thread(rb_objspace_t *objspace, struct heap_page *page, VALUE obj) +{ + page->pre_deferred_free_slots += 1; + psweep_debug(1, "[sweep] register sweep later: page(%p), obj(%p) %s\n", (void*)page, (void*)obj, rb_obj_info(obj)); + GC_ASSERT(BUILTIN_TYPE(obj) != T_NONE); + MARK_IN_BITMAP(page->deferred_free_bits, obj); +} + +static inline bool +zombie_needs_deferred_free(VALUE zombie) +{ + return ZOMBIE_NEEDS_FREE_P(zombie); +} + +#if RGENGC_CHECK_MODE +static void +debug_free_check(rb_objspace_t *objspace, VALUE vp) +{ + if (!is_full_marking(objspace)) 
{ + if (RVALUE_OLD_P(objspace, vp)) rb_bug("page_sweep: %p - old while minor GC.", (void *)vp); + if (RVALUE_REMEMBERED(objspace, vp)) rb_bug("page_sweep: %p - remembered.", (void *)vp); + } +#define CHECK(x) if (x(objspace, vp) != FALSE) rb_bug("obj_free: " #x "(%s) != FALSE", rb_obj_info(vp)) + CHECK(RVALUE_MARKED); + CHECK(RVALUE_MARKING); + CHECK(RVALUE_UNCOLLECTIBLE); +#undef CHECK +} +#else +#define debug_free_check(...) (void)0 +#endif + +static inline void +gc_pre_sweep_plane(rb_objspace_t *objspace, rb_heap_t *heap, struct heap_page *page, uintptr_t p, bits_t bitset, short slot_size) +{ + unsigned short freed = 0; + unsigned short empties = 0; + unsigned short finals = 0; + unsigned short zombies = 0; + do { + VALUE vp = (VALUE)p; + GC_ASSERT(GET_HEAP_PAGE(vp) == page); + + rb_asan_unpoison_object(vp, false); + if (bitset & 1) { + GC_ASSERT(!RVALUE_MARKED(objspace, vp)); + switch (BUILTIN_TYPE(vp)) { + case T_MOVED: { + empties++; + heap_page_add_freeobj(objspace, page, vp, true); + (void)VALGRIND_MAKE_MEM_UNDEFINED((void*)vp, page->slot_size); + break; + } + case T_NONE: + empties++; // already in freelist + break; + case T_ZOMBIE: + if (zombie_needs_deferred_free(vp)) { + sweep_in_ruby_thread(objspace, page, vp); + } + else { + // already counted as final_slot when made into a zombie + zombies++; + } + break; + case T_DATA: { + debug_free_check(objspace, vp); + void *data = RTYPEDDATA_P(vp) ? RTYPEDDATA_GET_DATA(vp) : DATA_PTR(vp); + if (!data) { + goto free; + } + // NOTE: this repeats code found in `rb_data_free`. This is just for testing purposes. 
+ bool free_immediately = false; + void (*dfree)(void *); + if (RTYPEDDATA_P(vp)) { + free_immediately = (RTYPEDDATA_TYPE(vp)->flags & RUBY_TYPED_FREE_IMMEDIATELY) != 0 && (RTYPEDDATA_TYPE(vp)->flags & RUBY_TYPED_CONCURRENT_FREE_SAFE) != 0; + dfree = RTYPEDDATA_TYPE(vp)->function.dfree; + } + else { + dfree = RDATA(vp)->dfree; + } + if (!dfree || dfree == RUBY_DEFAULT_FREE || free_immediately) { + goto free; + } + else { + sweep_in_ruby_thread(objspace, page, vp); + break; + } + break; + } + case T_IMEMO: { + debug_free_check(objspace, vp); + switch (imemo_type(vp)) { + case imemo_callcache: + case imemo_constcache: + case imemo_cref: + case imemo_env: + case imemo_ifunc: + case imemo_memo: + case imemo_svar: + case imemo_throw_data: + case imemo_tmpbuf: + case imemo_fields: + goto free; + case imemo_callinfo: + case imemo_iseq: // calls rb_yjit_iseq_free which is not concurrency safe + case imemo_ment: + // blacklisted due to vm weak references + sweep_in_ruby_thread(objspace, page, vp); + break; + default: + rb_bug("Unknown imemo type: %d\n", imemo_type(vp)); + } + break; + } + case T_COMPLEX: + case T_RATIONAL: + case T_FLOAT: + case T_BIGNUM: + case T_OBJECT: + case T_STRING: + case T_SYMBOL: + case T_ARRAY: + case T_HASH: + case T_STRUCT: + case T_MATCH: + case T_REGEXP: + case T_FILE: { + debug_free_check(objspace, vp); + goto free; + } + case T_CLASS: + case T_MODULE: + case T_ICLASS: + debug_free_check(objspace, vp); + if (!rb_gc_obj_needs_cleanup_p(vp)) { + heap_page_add_freeobj(objspace, page, vp, true); + psweep_debug(2, "[sweep] freed: page(%p), obj(%p)\n", (void*)page, (void*)vp); + (void)VALGRIND_MAKE_MEM_UNDEFINED((void*)vp, page->slot_size); + freed++; + } + else { + sweep_in_ruby_thread(objspace, page, vp); + } + break; + free: { + debug_free_check(objspace, vp); + if (RB_LIKELY(rb_gc_obj_free_concurrency_safe_vm_weak_references(vp))) { + bool can_put_back_on_freelist = rb_gc_obj_free(objspace, vp); + if (can_put_back_on_freelist) { + 
heap_page_add_freeobj(objspace, page, vp, true); + freed++; + psweep_debug(2, "[sweep] freed: page(%p), obj(%p)\n", (void*)page, (void*)vp); + (void)VALGRIND_MAKE_MEM_UNDEFINED((void*)vp, page->slot_size); + } + else { + RUBY_ASSERT(BUILTIN_TYPE(vp) == T_ZOMBIE); + psweep_debug(2, "[sweep] zombie: page(%p), obj(%p)\n", (void*)page, (void*)vp); + finals++; + } + } + else { + GC_ASSERT(BUILTIN_TYPE(vp) != T_NONE); + sweep_in_ruby_thread(objspace, page, vp); + } + break; + } + default: + rb_bug("unexpected type: %d\n", BUILTIN_TYPE(vp)); + } + } + else { + GC_ASSERT(RVALUE_MARKED(objspace, vp)); + } + + p += slot_size; + bitset >>= 1; + } while (bitset); + + page->pre_freed_slots += freed; + page->pre_empty_slots += empties; + page->pre_final_slots += finals; + page->pre_zombie_slots += zombies; +} + +static void +gc_pre_sweep_page(rb_objspace_t *objspace, rb_heap_t *heap, struct heap_page *page) +{ + uintptr_t p = (uintptr_t)page->start; + bits_t *bits = page->mark_bits; + bits_t bitset; + short slot_size = page->slot_size; + int total_slots = page->total_slots; + psweep_debug(1, "[sweep] gc_pre_sweep_page(heap:%p page:%p) start\n", heap, page); + GC_ASSERT(page->heap == heap); + page->pre_deferred_free_slots = 0; + memset(page->deferred_free_bits, 0, sizeof(page->deferred_free_bits)); + page->pre_zombie_slots = 0; + page->pre_freed_malloc_bytes = 0; + current_sweep_thread_page = page; + + int bitmap_plane_count = CEILDIV(total_slots, BITS_BITLENGTH); + int out_of_range_bits = total_slots % BITS_BITLENGTH; + + if (out_of_range_bits != 0) { + bits[bitmap_plane_count - 1] |= ~(((bits_t)1 << out_of_range_bits) - 1); + } + + for (int i = 0; i < bitmap_plane_count; i++) { + bitset = ~bits[i]; + if (bitset) { + gc_pre_sweep_plane(objspace, heap, page, p, bitset, slot_size); + } + p += BITS_BITLENGTH * slot_size; + } + objspace->profile.pages_swept_by_sweep_thread++; + if (page->pre_deferred_free_slots > 0) { + 
objspace->profile.pages_swept_by_sweep_thread_had_deferred_free_objects++; + } + +#if RGENGC_CHECK_MODE + { + /* Assert that all unmarked slots with live objects were either freed, made into + * zombies, or deferred to the Ruby thread. */ + int unmarked_slots = 0; + for (int i = 0; i < bitmap_plane_count; i++) { + bits_t unmarked = ~bits[i]; + unmarked_slots += (int)popcount_bits(unmarked); + } + + int freed_or_zombie = page->pre_freed_slots + page->pre_final_slots + page->pre_deferred_free_slots; + int unmarked_live = unmarked_slots - page->pre_empty_slots - page->pre_zombie_slots; + if (freed_or_zombie != unmarked_live) { + rb_bug("gc_pre_sweep_page: unmarked live slot count mismatch: " + "unmarked_slots=%d - empty_slots=%d - zombie_slots=%d = %d unmarked live, " + "but freed_slots=%d + final_slots=%d + deferred_free_slots=%d = %d", + unmarked_slots, page->pre_empty_slots, page->pre_zombie_slots, unmarked_live, + page->pre_freed_slots, page->pre_final_slots, page->pre_deferred_free_slots, freed_or_zombie); + } + } +#endif + +#if USE_MALLOC_INCREASE_LOCAL + malloc_increase_local_flush(objspace); +#endif + current_sweep_thread_page = NULL; + + psweep_debug(1, "[sweep] gc_pre_sweep_page(heap:%p page:%p) done, deferred free:%d\n", heap, page, page->pre_deferred_free_slots); +} + +static inline bool +done_worker_incremental_sweep_steps_p(rb_objspace_t *objspace, rb_heap_t *heap) +{ + if (rbimpl_atomic_load(&heap->foreground_sweep_steps, RBIMPL_ATOMIC_ACQUIRE) != heap->background_sweep_steps) { + GC_ASSERT(ATOMIC_LOAD_RELAXED(heap->foreground_sweep_steps) > heap->background_sweep_steps); + return true; + } + return false; +} + +static bool +bitmap_is_all_zero(bits_t *bits, size_t count) +{ + for (size_t i = 0; i < count; i++) { + if (bits[i] != 0) return false; + } + return true; +} + +static void +move_to_empty_pages(rb_objspace_t *objspace, rb_heap_t *heap, struct heap_page *page) +{ + GC_ASSERT(bitmap_is_all_zero(page->mark_bits, HEAP_PAGE_BITMAP_LIMIT)); + 
GC_ASSERT(bitmap_is_all_zero(page->uncollectible_bits, HEAP_PAGE_BITMAP_LIMIT)); + GC_ASSERT(bitmap_is_all_zero(page->wb_unprotected_bits, HEAP_PAGE_BITMAP_LIMIT)); + GC_ASSERT(bitmap_is_all_zero(page->marking_bits, HEAP_PAGE_BITMAP_LIMIT)); + GC_ASSERT(bitmap_is_all_zero(page->remembered_bits, HEAP_PAGE_BITMAP_LIMIT)); + GC_ASSERT(bitmap_is_all_zero(page->deferred_free_bits, HEAP_PAGE_BITMAP_LIMIT)); + GC_ASSERT(bitmap_is_all_zero(page->age_bits, HEAP_PAGE_BITMAP_LIMIT * RVALUE_AGE_BIT_COUNT)); + // NOTE: pinned bits can still be set, but it's okay because they are cleared when compaction starts + + heap_unlink_page(objspace, heap, page); + + page->start = 0; + page->total_slots = 0; + page->slot_size = 0; + page->heap = NULL; + page->free_slots = 0; + + asan_unlock_freelist(page); + page->freelist = NULL; + asan_lock_freelist(page); + + asan_poison_memory_region(page->body, HEAP_PAGE_SIZE); + + objspace->empty_pages_count++; + page->free_next = objspace->empty_pages; + objspace->empty_pages = page; +} + +static void +clear_pre_sweep_fields(struct heap_page *page) +{ + page->pre_freed_slots = 0; + page->pre_deferred_free_slots = 0; + memset(page->deferred_free_bits, 0, sizeof(page->deferred_free_bits)); + page->pre_empty_slots = 0; + page->pre_final_slots = 0; + page->pre_zombie_slots = 0; + page->pre_freed_malloc_bytes = 0; +} + +// Perform incremental (lazy) sweep on a heap by the background sweep thread. 
+static void +gc_sweep_step_worker(rb_objspace_t *objspace, rb_heap_t *heap) +{ + // sweep_lock is acquired + // + // We're finished either when they are no pages left to pre-sweep, OR: + // 1) When we're not in `sweep_rest` or `background_mode`, if we've encountered a change in `heap->foreground_sweep_steps` + GC_ASSERT(heap->background_sweep_steps <= ATOMIC_LOAD_RELAXED(heap->foreground_sweep_steps)); + if (heap->done_background_sweep) { + psweep_debug(-2, "[sweep] gc_sweep_step_worker: heap:%p (%ld) - done (early return)\n", heap, heap - heaps); + return; + } + else if (heap->skip_sweep_continue) { + psweep_debug(-2, "[sweep] gc_sweep_step_worker: heap:%p (%ld) - skip_continue (early return)\n", heap, heap - heaps); + heap->skip_sweep_continue = false; + return; + } + while (1) { + struct heap_page *sweep_page = heap->sweeping_page; + if (!sweep_page) { + GC_ASSERT(!heap->done_background_sweep); + GC_ASSERT(objspace->heaps_done_background_sweep < HEAP_COUNT); + heap->done_background_sweep = true; + objspace->heaps_done_background_sweep++; + psweep_debug(-2, "[sweep] gc_sweep_step_worker: heap:%p (%ld) - !sweeping_page\n", heap, heap - heaps); + break; + } + sweep_page->free_next = NULL; + struct heap_page *next = ccan_list_next(&heap->pages, sweep_page, page_node); + + if (!next) { + GC_ASSERT(!heap->done_background_sweep); + GC_ASSERT(objspace->heaps_done_background_sweep < HEAP_COUNT); + heap->done_background_sweep = true; + objspace->heaps_done_background_sweep++; + psweep_debug(-2, "[sweep] gc_sweep_step_worker: heap:%p (%ld) - !next\n", heap, heap - heaps); + // Let Ruby thread deal with last page of the heap. 
+ break; + } + + heap->sweeping_page = next; + heap->pre_sweeping_page = sweep_page; + + sweep_lock_unlock(&objspace->sweep_lock); + + gc_pre_sweep_page(objspace, heap, sweep_page); + + sweep_lock_lock(&objspace->sweep_lock); + heap->pre_sweeping_page = NULL; + sweep_page->free_next = NULL; + + int pre_freed_slots = sweep_page->pre_freed_slots; + int pre_empty_slots = sweep_page->pre_empty_slots; + int free_slots = pre_freed_slots + pre_empty_slots; + +#if PSWEEP_LOCK_STATS > 0 + instrumented_lock_acquire(&heap->swept_pages_lock, &swept_pages_lock_stats); +#else + rb_native_mutex_lock(&heap->swept_pages_lock); +#endif + { + if (heap->swept_pages) { + // NOTE: heap->swept_pages needs to be in swept order for gc_sweep_step to work properly. + // TODO: Change to LIFO to get better shared memory cache benefits across threads (L2/L3) + struct heap_page *latest = heap->latest_swept_page; + GC_ASSERT(latest); + latest->free_next = sweep_page; + } + else { + heap->swept_pages = sweep_page; + } + heap->latest_swept_page = sweep_page; + } + rb_native_mutex_unlock(&heap->swept_pages_lock); + + psweep_debug(-2, "[sweep] gc_sweep_step_worker: heap:%p (%ld) - swept page:%p\n", heap, heap - heaps, sweep_page); + + if (!objspace->background_sweep_mode) { + if (!objspace->sweep_rest && done_worker_incremental_sweep_steps_p(objspace, heap)) { + rb_native_cond_broadcast(&heap->sweep_page_cond); + psweep_debug(-2, "[sweep] (fg) gc_sweep_step_worker: done incremental step heap:%p (%ld)\n", heap, heap - heaps); + heap->background_sweep_steps = ATOMIC_LOAD_RELAXED(heap->foreground_sweep_steps); + break; + } + } + else { + heap->pre_swept_slots_deferred += free_slots; + if (RB_UNLIKELY(objspace->background_sweep_abort)) { + psweep_debug(-2, "[sweep] (bg) gc_sweep_step_worker: break early heap:%p (%ld) (abort)\n", heap, heap - heaps); + break; + } + else if (objspace->background_sweep_restart_heaps) { + psweep_debug(-2, "[sweep] (bg) gc_sweep_step_worker: break early heap:%p (%ld) 
(restart)\n", heap, heap - heaps); + break; + } + } + // notify of newly swept page in case Ruby thread is waiting on us + rb_native_cond_broadcast(&heap->sweep_page_cond); + } + // sweep_lock is acquired +} + +static void * +gc_sweep_thread_func(void *ptr) +{ + rb_objspace_t *objspace = ptr; + + psweep_debug(1, "[sweep] sweep_thread start\n"); + sweep_lock_lock(&objspace->sweep_lock); + objspace->sweep_thread_sweep_exited = false; + + while (objspace->sweep_thread_running) { + while (!objspace->sweep_thread_sweep_requested && objspace->sweep_thread_running) { + psweep_debug(1, "[sweep] sweep_thread wait\n"); + objspace->sweep_thread_waiting_request = true; + sweep_lock_set_unlocked(); + rb_native_cond_wait(&objspace->sweep_cond, &objspace->sweep_lock); + sweep_lock_set_locked(); + objspace->sweep_thread_waiting_request = false; + psweep_debug(1, "[sweep] sweep_thread wake\n"); // requested or signalled to exit + } + if (!objspace->sweep_thread_running) { + break; + } + + objspace->sweep_thread_sweep_requested = false; + objspace->sweep_thread_sweeping = true; + + restart_heaps: + for (int i = 0; i < HEAP_COUNT; i++) { + rb_heap_t *heap = &heaps[i]; + if (RB_UNLIKELY(objspace->background_sweep_mode && objspace->background_sweep_abort)) { + psweep_debug(-2, "[sweep] abort: break before sweeping heap:%p (%d)\n", heap, i); + break; + } + if (objspace->background_sweep_mode && objspace->background_sweep_restart_heaps) { + objspace->background_sweep_restart_heaps = false; + psweep_debug(-2, "[sweep] restart heaps from 0 (at %d)\n", i); + goto restart_heaps; + } + psweep_debug(-2, "[sweep] sweep heap:%p (%d)\n", heap, i); + gc_sweep_step_worker(objspace, heap); + } + psweep_debug(1, "[sweep] /sweep_heaps\n"); + + objspace->sweep_thread_sweeping = false; + rb_native_cond_broadcast(&objspace->sweep_cond); + } + psweep_debug(-5, "[sweep] sweep_thread exit\n"); + objspace->sweep_thread_sweep_requested = false; + objspace->sweep_thread_sweep_exited = true; + 
rb_native_cond_broadcast(&objspace->sweep_cond); + sweep_lock_unlock(&objspace->sweep_lock); + + return NULL; +} + static void gc_sweep_start_heap(rb_objspace_t *objspace, rb_heap_t *heap) { + // Background thread is not sweeping right now heap->sweeping_page = ccan_list_top(&heap->pages, struct heap_page, page_node); heap->free_pages = NULL; + heap->swept_pages = NULL; heap->pooled_pages = NULL; - if (!objspace->flags.immediate_sweep) { - struct heap_page *page = NULL; + heap->latest_swept_page = NULL; + heap->pre_swept_slots_deferred = 0; +#if RUBY_DEBUG + heap->made_zombies = 0; +#endif + heap->pre_sweeping_page = NULL; + heap->background_sweep_steps = heap->foreground_sweep_steps; + heap->is_finished_sweeping = false; + heap->done_background_sweep = false; + heap->skip_sweep_continue = false; + + struct heap_page *page = NULL; + + if (!objspace->flags.immediate_sweep) { ccan_list_for_each(&heap->pages, page, page_node) { - page->flags.before_sweep = TRUE; + page->before_sweep = 1; + GC_ASSERT(page->pre_deferred_free_slots == 0); } } } @@ -3767,8 +4833,16 @@ static void gc_sweep_start(rb_objspace_t *objspace) { gc_mode_transition(objspace, gc_mode_sweeping); + objspace->during_lazy_sweeping = TRUE; objspace->rincgc.pooled_slots = 0; +// Background sweeping cannot be happening +#if VM_CHECK_MODE > 0 + sweep_lock_lock(&objspace->sweep_lock); + GC_ASSERT(!objspace->sweep_thread_sweeping && !objspace->sweep_thread_sweep_requested); + sweep_lock_unlock(&objspace->sweep_lock); +#endif + #if GC_CAN_COMPILE_COMPACTION if (objspace->flags.during_compacting) { gc_sort_heap_by_compare_func( @@ -3778,6 +4852,8 @@ gc_sweep_start(rb_objspace_t *objspace) } #endif + objspace->heaps_done_background_sweep = 0; + for (int i = 0; i < HEAP_COUNT; i++) { rb_heap_t *heap = &heaps[i]; gc_sweep_start_heap(objspace, heap); @@ -3791,6 +4867,22 @@ gc_sweep_start(rb_objspace_t *objspace) } rb_gc_ractor_newobj_cache_foreach(gc_ractor_newobj_cache_clear, NULL); + + psweep_debug(1, "[gc] 
gc_sweep_start\n"); + if (!objspace->flags.during_compacting && !(objspace->hook_events & RUBY_INTERNAL_EVENT_FREEOBJ)) { + rbimpl_atomic_store(&objspace->use_background_sweep_thread, true, RBIMPL_ATOMIC_RELEASE); + psweep_debug(-1, "[gc] gc_sweep_start: requesting sweep thread\n"); + sweep_lock_lock(&objspace->sweep_lock); + { + objspace->sweep_thread_sweep_requested = true; + rb_native_cond_broadcast(&objspace->sweep_cond); + } + sweep_lock_unlock(&objspace->sweep_lock); + } + else { + rbimpl_atomic_store(&objspace->use_background_sweep_thread, false, RBIMPL_ATOMIC_RELEASE); + psweep_debug(-1, "[gc] gc_sweep_start: not using background sweep thread\n"); + } } static void @@ -3802,6 +4894,20 @@ gc_sweep_finish_heap(rb_objspace_t *objspace, rb_heap_t *heap) size_t init_slots = gc_params.heap_init_bytes / heap->slot_size; size_t min_free_slots = (size_t)(MAX(total_slots, init_slots) * gc_params.heap_free_slots_min_ratio); + psweep_debug(-1, "[gc] gc_sweep_finish heap:%p (%ld)\n", heap, heap - heaps); + +#if RUBY_DEBUG + if (!objspace->flags.during_compacting) { + objspace->have_swept_slots += swept_slots; + objspace->have_swept_slots += heap->made_zombies; + objspace->will_be_swept_slots -= heap->zombie_slots; + } +#endif + + GC_ASSERT(heap->background_sweep_steps <= ATOMIC_LOAD_RELAXED(heap->foreground_sweep_steps)); + GC_ASSERT(!heap->is_finished_sweeping); + heap->is_finished_sweeping = true; + if (swept_slots < min_free_slots && /* The heap is a growth heap if it freed more slots than had empty slots. 
*/ ((heap->empty_slots == 0 && total_slots > 0) || heap->freed_slots > heap->empty_slots)) { @@ -3812,8 +4918,8 @@ gc_sweep_finish_heap(rb_objspace_t *objspace, rb_heap_t *heap) struct heap_page *resurrected_page; while (swept_slots < min_free_slots && (resurrected_page = heap_page_resurrect(objspace))) { - heap_add_page(objspace, heap, resurrected_page); - heap_add_freepage(heap, resurrected_page); + heap_add_page(objspace, heap, resurrected_page, false); + heap_add_freepage(heap, resurrected_page, "gc_sweep_finish_heap"); swept_slots += resurrected_page->free_slots; } @@ -3839,6 +4945,27 @@ static void gc_sweep_finish(rb_objspace_t *objspace) { gc_report(1, objspace, "gc_sweep_finish\n"); + psweep_debug(-1, "[gc] gc_sweep_finish\n"); + + rbimpl_atomic_store(&objspace->use_background_sweep_thread, false, RBIMPL_ATOMIC_RELEASE); + +#if RUBY_DEBUG + // When calling GC.start, if in the middle of a non-full mark it will be set as full mark in gc_rest() so the numbers + // will be off. + if (!objspace->flags.was_compacting && !objspace->sweep_rest && gc_config_full_mark_val) { + if (objspace->will_be_swept_slots != objspace->have_swept_slots) { + fprintf(stderr, "Expecting to free %lu slots, freed %lu slots (major:%d)\n", objspace->will_be_swept_slots, objspace->have_swept_slots, is_full_marking(objspace)); + for (int i = 0; i < HEAP_COUNT; i++) { + rb_heap_t *heap = &heaps[i]; + fprintf(stderr, "heap %ld zombies_created:%u freed_slots:%lu empty_slots:%lu zombie_slots:%lu, total_slots:%lu\n", + heap - heaps, heap->made_zombies, heap->freed_slots, heap->empty_slots, heap->zombie_slots, heap->total_slots); + } + + rb_bug("MISMATCH: marked_slots:%lu, pooled_slots:%lu, empty_pages:%lu", objspace->marked_slots, objspace->rincgc.pooled_slots, objspace->empty_pages_count); + } + } + objspace->flags.was_compacting = FALSE; +#endif gc_prof_set_heap_info(objspace); heap_pages_free_unused_pages(objspace); @@ -3846,8 +4973,23 @@ gc_sweep_finish(rb_objspace_t *objspace) for (int i 
= 0; i < HEAP_COUNT; i++) { rb_heap_t *heap = &heaps[i]; +#if RUBY_DEBUG + { + struct heap_page *page; + ccan_list_for_each(&heap->pages, page, page_node) { + if (RUBY_ATOMIC_LOAD(page->before_sweep)) { + rb_bug("gc_sweep_finish: page %p in heap %d still has before_sweep set", (void *)page, i); + } + } + } + heap->zombie_slots = 0; +#endif + heap->freed_slots = 0; heap->empty_slots = 0; + if (heap->background_sweep_steps < heap->foreground_sweep_steps) { + heap->background_sweep_steps = heap->foreground_sweep_steps; + } if (!will_be_incremental_marking(objspace)) { struct heap_page *end_page = heap->free_pages; @@ -3865,71 +5007,305 @@ gc_sweep_finish(rb_objspace_t *objspace) rb_gc_event_hook(0, RUBY_INTERNAL_EVENT_GC_END_SWEEP); gc_mode_transition(objspace, gc_mode_none); + objspace->during_lazy_sweeping = FALSE; #if RGENGC_CHECK_MODE >= 2 gc_verify_internal_consistency(objspace); #endif } +// Dequeue a page swept by the sweep thread. If `free_in_user_thread` is true, then +// dequeue an unswept page to be swept by the Ruby thread. It can also dequeue an unswept +// page if otherwise it would have to wait for the sweep thread. In that case, `dequeued_unswept_page` +// is set to true. +// +// It returns NULL when there are no more pages to sweep for the heap. 
+static struct heap_page * +gc_sweep_dequeue_page(rb_objspace_t *objspace, rb_heap_t *heap, bool free_in_user_thread, bool *dequeued_unswept_page) +{ + if (free_in_user_thread) { + GC_ASSERT(!objspace->use_background_sweep_thread); + if (heap->sweeping_page == NULL) { + psweep_debug(0, "[gc] gc_sweep_dequeue_page: NULL page (synchronous) from heap(%p) (%ld)\n", heap, heap - heaps); + return NULL; + } + else { + struct heap_page *cur = heap->sweeping_page; + psweep_debug(0, "[gc] gc_sweep_dequeue_page:%p (synchronous) from heap(%p %ld)\n", cur, heap, heap - heaps); + struct heap_page *next = ccan_list_next(&heap->pages, cur, page_node); + heap->sweeping_page = next; + return cur; + } + } + + struct heap_page *page = NULL; + + // Avoid taking the global sweep_lock if we can +#if PSWEEP_LOCK_STATS > 0 + instrumented_lock_acquire(&heap->swept_pages_lock, &swept_pages_lock_stats); +#else + rb_native_mutex_lock(&heap->swept_pages_lock); +#endif + { + if (heap->swept_pages) { + page = heap->swept_pages; + psweep_debug(0, "[gc] gc_sweep_dequeue_page: got page:%p from heap(%p)->swept_pages (swept_pages lock) (heap %ld)\n", page, heap, heap - heaps); + heap->swept_pages = page->free_next; + } + } + rb_native_mutex_unlock(&heap->swept_pages_lock); + if (page) return page; + + sweep_lock_lock(&objspace->sweep_lock); + { + GC_ASSERT(!objspace->background_sweep_mode); + retry_swept_pages: + if (heap->swept_pages) { // grab the earliest page that the sweep thread swept (ie: it dequeues in swept order) + page = heap->swept_pages; + psweep_debug(0, "[gc] gc_sweep_dequeue_page: got page:%p from heap(%p)->swept_pages (sweep_lock) (heap %ld)\n", page, heap, heap - heaps); + heap->swept_pages = page->free_next; + } + else if (!heap->sweeping_page) { // This heap is finished + while (heap->pre_sweeping_page) { + sweep_lock_set_unlocked(); + rb_native_cond_wait(&heap->sweep_page_cond, &objspace->sweep_lock); + sweep_lock_set_locked(); + goto retry_swept_pages; + } + psweep_debug(0, "[gc] 
gc_sweep_dequeue_page: got nil page from heap(%p) (heap %ld) end\n", heap, heap - heaps); + } + else { + *dequeued_unswept_page = true; + page = heap->sweeping_page; // this could be the last page + heap->sweeping_page = ccan_list_next(&heap->pages, page, page_node); + psweep_debug(0, "[gc] gc_sweep_dequeue_page: dequeued unswept page from heap(%p) (heap %ld)\n", heap, heap - heaps); + } + GC_ASSERT(!objspace->background_sweep_mode); + } + sweep_lock_unlock(&objspace->sweep_lock); + + return page; +} + +MAYBE_UNUSED(static int +freelist_size(struct free_slot *slot)) +{ + if (!slot) return 0; + int size = 0; + while (slot) { + size++; + slot = slot->next; + } + return size; +} + +static inline bool +is_last_heap(rb_objspace_t *objspace, rb_heap_t *heap) +{ + return heap - heaps == (HEAP_COUNT - 1); +} + +static void +gc_sweep_step_deferred_free(rb_objspace_t *objspace, rb_heap_t *heap, struct heap_page *sweep_page, unsigned short *freed_out, unsigned short *finals_out) +{ + unsigned short freed = 0; + unsigned short finals = 0; + uintptr_t p = (uintptr_t)sweep_page->start; + bits_t *deferred_bits = sweep_page->deferred_free_bits; + int total_slots = sweep_page->total_slots; + short slot_size = sweep_page->slot_size; + + int bitmap_plane_count = CEILDIV(total_slots, BITS_BITLENGTH); + int out_of_range_bits = total_slots % BITS_BITLENGTH; + bits_t bitset; + + if (out_of_range_bits != 0) { + deferred_bits[bitmap_plane_count - 1] &= (((bits_t)1 << out_of_range_bits) - 1); + } + + for (int i = 0; i < bitmap_plane_count; i++) { + bitset = deferred_bits[i]; + p = (uintptr_t)sweep_page->start + (i * BITS_BITLENGTH * slot_size); + while (bitset) { + if (bitset & 1) { + VALUE obj = (VALUE)p; + GC_ASSERT(GET_HEAP_PAGE(obj) == sweep_page); + GC_ASSERT(!RVALUE_MARKED(objspace, obj)); + if (deferred_free(objspace, obj)) { + freed++; + } + else { + finals++; + } + } + p += slot_size; + bitset >>= 1; + } + } + *freed_out = freed; + *finals_out = finals; +} + +// Perform incremental 
(lazy) sweep on a heap. static int gc_sweep_step(rb_objspace_t *objspace, rb_heap_t *heap) { - struct heap_page *sweep_page = heap->sweeping_page; - int swept_slots = 0; - int pooled_slots = 0; + size_t swept_slots = 0; + size_t pooled_slots = 0; + +#if RUBY_DEBUG + sweep_lock_lock(&objspace->sweep_lock); + GC_ASSERT(!objspace->background_sweep_mode); + sweep_lock_unlock(&objspace->sweep_lock); +#endif - if (sweep_page == NULL) return FALSE; + if (heap_is_sweep_done(objspace, heap)) { + psweep_debug(0, "[gc] gc_sweep_step: heap %p (%ld) is heap_is_sweep_done() early!\n", heap, heap - heaps); + GC_ASSERT(heap->sweeping_page == NULL); + GC_ASSERT(heap->is_finished_sweeping); + return heap->free_pages != NULL; + } #if GC_ENABLE_LAZY_SWEEP gc_prof_sweep_timer_start(objspace); #endif + psweep_debug(-2, "[gc] gc_sweep_step heap:%p (%ld) use_sweep_thread:%d\n", heap, heap - heaps, objspace->use_background_sweep_thread); + bool sweep_rest = objspace->sweep_rest; + bool use_sweep_thread = objspace->use_background_sweep_thread; + + while (1) { + bool free_in_user_thread_p = !use_sweep_thread; + bool dequeued_unswept_page = false; + // NOTE: pages we dequeue from the sweep thread need to be AFTER the list of heap->free_pages so we don't free from pages + // we've allocated from since sweep started. 
+ struct heap_page *sweep_page = gc_sweep_dequeue_page(objspace, heap, free_in_user_thread_p, &dequeued_unswept_page); + if (RB_UNLIKELY(!sweep_page)) { + psweep_debug(-2, "[gc] gc_sweep_step heap:%p (%ld) deq() = nil, break\n", heap, heap - heaps); + break; + } + if (dequeued_unswept_page) { + free_in_user_thread_p = true; + psweep_debug(-2, "[gc] gc_sweep_step heap:%p (%ld) deq unswept page\n", heap, heap - heaps); + } + else { + psweep_debug(-2, "[gc] gc_sweep_step heap:%p (%ld) deq preswept page\n", heap, heap - heaps); + } + GC_ASSERT(sweep_page->heap == heap); - do { RUBY_DEBUG_LOG("sweep_page:%p", (void *)sweep_page); struct gc_sweep_context ctx = { - .page = sweep_page, - .final_slots = 0, - .freed_slots = 0, - .empty_slots = 0, + .page = sweep_page }; - gc_sweep_page(objspace, heap, &ctx); - int free_slots = ctx.freed_slots + ctx.empty_slots; - heap->sweeping_page = ccan_list_next(&heap->pages, sweep_page, page_node); + if (free_in_user_thread_p) { + gc_sweep_page(objspace, heap, &ctx); + GC_ASSERT(sweep_page->pre_deferred_free_slots == 0); + } + else { + unsigned short deferred_free_freed = 0; + unsigned short deferred_free_final_slots = 0; + unsigned short deferred_to_free = sweep_page->pre_deferred_free_slots; + + psweep_debug(-2, "[gc] gc_sweep_step: (heap:%p %ld, page:%p) free_ruby_th: %d, deferred_to_free:%d, pre_freed:%d, pre_empty:%d\n", + heap, heap - heaps, sweep_page, free_in_user_thread_p, deferred_to_free, sweep_page->pre_freed_slots, sweep_page->pre_empty_slots); - if (free_slots == sweep_page->total_slots) { - /* There are no living objects, so move this page to the global empty pages. 
*/ - heap_unlink_page(objspace, heap, sweep_page); + if (deferred_to_free > 0) { + gc_sweep_step_deferred_free(objspace, heap, sweep_page, &deferred_free_freed, &deferred_free_final_slots); + } + GC_ASSERT(deferred_to_free == (deferred_free_freed + deferred_free_final_slots)); + + ctx.final_slots = sweep_page->pre_final_slots + deferred_free_final_slots; + ctx.freed_slots = sweep_page->pre_freed_slots + deferred_free_freed; + ctx.empty_slots = sweep_page->pre_empty_slots; + ctx.zombie_slots = sweep_page->pre_zombie_slots; + + gc_post_sweep_page(objspace, heap, sweep_page); // clear bits + } + + if (0) fprintf(stderr, "gc_sweep_page(%"PRIdSIZE"): total_slots: %d, freed_slots: %d, empty_slots: %d, final_slots: %d\n", + rb_gc_count(), + sweep_page->total_slots, + ctx.freed_slots, ctx.empty_slots, ctx.final_slots); +#if GC_PROFILE_MORE_DETAIL + if (gc_prof_enabled(objspace)) { + gc_profile_record *record = gc_prof_record(objspace); + record->removing_objects += ctx.final_slots + ctx.freed_slots; + record->empty_objects += ctx.empty_slots; + } +#endif + + int free_slots = ctx.freed_slots + ctx.empty_slots; + GC_ASSERT(sweep_page->total_slots > 0); + GC_ASSERT(sweep_page->total_slots >= free_slots); + + if (free_in_user_thread_p) { + GC_ASSERT(sweep_page->free_slots == free_slots); // gc_sweep_page() sets sweep_page->free slots + GC_ASSERT(sweep_page->heap->total_freed_objects >= (unsigned long)ctx.freed_slots); + } else { + sweep_page->free_slots = free_slots; + // NOTE: sweep_page->final slots have already been updated by make_zombie + GC_ASSERT(sweep_page->free_slots <= sweep_page->total_slots); + GC_ASSERT(sweep_page->final_slots <= sweep_page->total_slots); + sweep_page->heap->total_freed_objects += ctx.freed_slots; + + if (sweep_page->pre_freed_malloc_bytes > 0) { + atomic_sub_nounderflow(&malloc_increase, sweep_page->pre_freed_malloc_bytes); +#if RGENGC_ESTIMATE_OLDMALLOC + atomic_sub_nounderflow(&objspace->malloc_counters.oldmalloc_increase, 
sweep_page->pre_freed_malloc_bytes); +#endif + } + clear_pre_sweep_fields(sweep_page); + } - sweep_page->start = 0; - sweep_page->total_slots = 0; - sweep_page->slot_size = 0; - sweep_page->heap = NULL; - sweep_page->free_slots = 0; +#if RGENGC_CHECK_MODE + short freelist_len = 0; + asan_unlock_freelist(sweep_page); + struct free_slot *ptr = sweep_page->freelist; + while (ptr) { + freelist_len++; + rb_asan_unpoison_object((VALUE)ptr, false); + struct free_slot *next = ptr->next; + rb_asan_poison_object((VALUE)ptr); + ptr = next; + } + asan_lock_freelist(sweep_page); + if (freelist_len != sweep_page->free_slots) { + rb_bug("inconsistent freelist length: expected %d but was %d", sweep_page->free_slots, freelist_len); + } +#endif - asan_unlock_freelist(sweep_page); - sweep_page->freelist = NULL; - asan_lock_freelist(sweep_page); + psweep_debug(0, "[gc] gc_sweep_step: dequeued page(heap:%p %ld, page:%p) free_slots:%u,total_slots:%u\n", heap, heap - heaps, sweep_page, free_slots, sweep_page->total_slots); - asan_poison_memory_region(sweep_page->body, HEAP_PAGE_SIZE); +#if RUBY_DEBUG + heap->zombie_slots += ctx.zombie_slots; +#endif - objspace->empty_pages_count++; - sweep_page->free_next = objspace->empty_pages; - objspace->empty_pages = sweep_page; + if (free_slots == sweep_page->total_slots) { +#if RUBY_DEBUG + objspace->have_swept_slots += free_slots; +#endif + psweep_debug(0, "[gc] gc_sweep_step: adding to empty_pages:%p\n", sweep_page); + move_to_empty_pages(objspace, heap, sweep_page); } else if (free_slots > 0) { heap->freed_slots += ctx.freed_slots; heap->empty_slots += ctx.empty_slots; if (pooled_slots < GC_INCREMENTAL_SWEEP_POOL_SLOT_COUNT) { + psweep_debug(0, "[gc] gc_sweep_step: adding pooled_page:%p, pooled_slots:%d\n", sweep_page, pooled_slots); heap_add_poolpage(objspace, heap, sweep_page); pooled_slots += free_slots; } else { - heap_add_freepage(heap, sweep_page); + psweep_debug(0, "[gc] gc_sweep_step: adding freepage:%p, swept_slots:%d\n", sweep_page, 
swept_slots); + heap_add_freepage(heap, sweep_page, "gc_sweep_step"); swept_slots += free_slots; if (swept_slots > GC_INCREMENTAL_SWEEP_SLOT_COUNT) { + if (!sweep_rest && use_sweep_thread) { + rbimpl_atomic_inc(&heap->foreground_sweep_steps, RBIMPL_ATOMIC_RELEASE); // signal sweep thread to move on + } + psweep_debug(0, "[gc] gc_sweep_step got to SWEEP_SLOT_COUNT, break\n"); break; } } @@ -3937,13 +5313,14 @@ gc_sweep_step(rb_objspace_t *objspace, rb_heap_t *heap) else { sweep_page->free_next = NULL; } - } while ((sweep_page = heap->sweeping_page)); + } - if (!heap->sweeping_page) { + if (heap_is_sweep_done(objspace, heap)) { + psweep_debug(0, "[gc] gc_sweep_step heap:%p (%ld) sweep done\n", heap, heap - heaps); gc_sweep_finish_heap(objspace, heap); if (!has_sweeping_pages(objspace)) { - gc_sweep_finish(objspace); + gc_sweep_finish(objspace); // done, no more pages in any heap } } @@ -3951,36 +5328,121 @@ gc_sweep_step(rb_objspace_t *objspace, rb_heap_t *heap) gc_prof_sweep_timer_stop(objspace); #endif + psweep_debug(1, "[gc] gc_sweep_step: finished for heap:%p (%ld), got free page:%d\n", heap, heap - heaps, heap->free_pages != NULL); return heap->free_pages != NULL; } +static bool +background_sweep_done_p(rb_objspace_t *objspace) +{ + // must have sweep_lock acquired (TODO: add assertion) + return objspace->heaps_done_background_sweep == HEAP_COUNT; +} + +unsigned long long sweep_rest_count = 0; + static void gc_sweep_rest(rb_objspace_t *objspace) { + sweep_rest_count++; + sweep_lock_lock(&objspace->sweep_lock); + { + objspace->sweep_rest = true; // reset to false in `gc_sweeping_exit` + if (background_sweep_done_p(objspace)) { + psweep_debug(-2, "[gc] gc_sweep_rest: bg done, not requesting\n"); + } + else { + if (objspace->use_background_sweep_thread && !objspace->sweep_thread_sweeping && !objspace->sweep_thread_sweep_requested) { + psweep_debug(-2, "[gc] gc_sweep_rest: request sweep thread\n"); + objspace->sweep_thread_sweep_requested = true; + 
rb_native_cond_broadcast(&objspace->sweep_cond); + } + else if (objspace->use_background_sweep_thread) { + psweep_debug(-2, "[gc] gc_sweep_rest: restart sweep thread\n"); + objspace->background_sweep_restart_heaps = true; // restart sweeping heaps from heap 0 + } + } + } + sweep_lock_unlock(&objspace->sweep_lock); + for (int i = 0; i < HEAP_COUNT; i++) { rb_heap_t *heap = &heaps[i]; - while (heap->sweeping_page) { + while (!heap_is_sweep_done(objspace, heap)) { + psweep_debug(0, "[gc] gc_sweep_rest: gc_sweep_step heap:%p (heap %ld)\n", heap, heap - heaps); gc_sweep_step(objspace, heap); } + GC_ASSERT(heap->is_finished_sweeping); + heap->background_sweep_steps = heap->foreground_sweep_steps; } + + GC_ASSERT(!has_sweeping_pages(objspace)); + GC_ASSERT(gc_mode(objspace) == gc_mode_none); } +unsigned long long sweep_continue_count = 0; + static void gc_sweep_continue(rb_objspace_t *objspace, rb_heap_t *sweep_heap) { GC_ASSERT(dont_gc_val() == FALSE || objspace->profile.latest_gc_info & GPR_FLAG_METHOD); if (!GC_ENABLE_LAZY_SWEEP) return; - gc_sweeping_enter(objspace); + psweep_debug(-2, "[gc] gc_sweep_continue\n"); + + sweep_continue_count++; + + gc_sweeping_enter(objspace, "gc_sweep_continue"); + sweep_lock_lock(&objspace->sweep_lock); + { + if (objspace->use_background_sweep_thread) { + if (background_sweep_done_p(objspace)) { + psweep_debug(-2, "[gc] gc_sweep_continue: bg done, not requesting\n"); + } + else { + int num_heaps_need_continue = 0; + for (int i = 0; i < HEAP_COUNT; i++) { + rb_heap_t *heap = &heaps[i]; + heap->background_sweep_steps = heap->foreground_sweep_steps; + if (heap->pre_swept_slots_deferred >= (GC_INCREMENTAL_SWEEP_SLOT_COUNT + GC_INCREMENTAL_SWEEP_POOL_SLOT_COUNT)) { + heap->skip_sweep_continue = true; + } + else { + if (!heap->is_finished_sweeping && !heap->done_background_sweep) { + num_heaps_need_continue++; + } + heap->skip_sweep_continue = false; + } + heap->pre_swept_slots_deferred = 0; + } + if (num_heaps_need_continue > 0) { + if 
(!objspace->sweep_thread_sweeping && !objspace->sweep_thread_sweep_requested) { + psweep_debug(-2, "[gc] gc_sweep_continue: requesting sweep thread\n"); + objspace->sweep_thread_sweep_requested = true; + rb_native_cond_broadcast(&objspace->sweep_cond); + } + else { + psweep_debug(-2, "[gc] gc_sweep_continue: sweep thread restart heaps\n"); + objspace->background_sweep_restart_heaps = true; + } + } + } + } + else { + psweep_debug(-2, "[gc] gc_sweep_continue: !use_background_sweep_thread\n"); + } + } + sweep_lock_unlock(&objspace->sweep_lock); for (int i = 0; i < HEAP_COUNT; i++) { rb_heap_t *heap = &heaps[i]; + if (gc_sweep_step(objspace, heap)) { GC_ASSERT(heap->free_pages != NULL); } else if (heap == sweep_heap) { if (objspace->empty_pages_count > 0 || objspace->heap_pages.allocatable_bytes > 0) { + GC_ASSERT(!sweep_heap->sweeping_page); // went through whole heap, couldn't find free page /* [Bug #21548] * * If this heap is the heap we want to sweep, but we weren't able @@ -3991,7 +5453,7 @@ gc_sweep_continue(rb_objspace_t *objspace, rb_heap_t *sweep_heap) * empty/allocatable pages. If other heaps are not finished sweeping * then we do not finish this GC and we will end up triggering a new * GC cycle during this GC phase. 
*/ - heap_page_allocate_and_initialize(objspace, heap); + heap_page_allocate_and_initialize(objspace, heap, false); GC_ASSERT(heap->free_pages != NULL); } @@ -4059,7 +5521,7 @@ invalidate_moved_plane(rb_objspace_t *objspace, struct heap_page *page, uintptr_ struct heap_page *orig_page = GET_HEAP_PAGE(object); orig_page->free_slots++; RVALUE_AGE_SET_BITMAP(object, 0); - heap_page_add_freeobj(objspace, orig_page, object); + heap_page_add_freeobj(objspace, orig_page, object, false); GC_ASSERT(RVALUE_MARKED(objspace, forwarding_object)); GC_ASSERT(BUILTIN_TYPE(forwarding_object) != T_MOVED); @@ -4103,10 +5565,16 @@ gc_compact_start(rb_objspace_t *objspace) struct heap_page *page = NULL; gc_mode_transition(objspace, gc_mode_compacting); +#if RUBY_DEBUG + sweep_lock_lock(&objspace->sweep_lock); + GC_ASSERT(!objspace->sweep_thread_sweeping && !objspace->sweep_thread_sweep_requested); + sweep_lock_unlock(&objspace->sweep_lock); +#endif + for (int i = 0; i < HEAP_COUNT; i++) { rb_heap_t *heap = &heaps[i]; ccan_list_for_each(&heap->pages, page, page_node) { - page->flags.before_sweep = TRUE; + page->before_sweep = 1; } heap->compact_cursor = ccan_list_tail(&heap->pages, struct heap_page, page_node); @@ -4132,10 +5600,10 @@ static void gc_sweep_compact(rb_objspace_t *objspace); static void gc_sweep(rb_objspace_t *objspace) { - gc_sweeping_enter(objspace); - const unsigned int immediate_sweep = objspace->flags.immediate_sweep; + gc_sweeping_enter(objspace, "gc_sweep"); + gc_report(1, objspace, "gc_sweep: immediate: %d\n", immediate_sweep); gc_sweep_start(objspace); @@ -4148,12 +5616,12 @@ gc_sweep(rb_objspace_t *objspace) gc_prof_sweep_timer_start(objspace); #endif gc_sweep_rest(objspace); + #if !GC_ENABLE_LAZY_SWEEP gc_prof_sweep_timer_stop(objspace); #endif } else { - /* Sweep every size pool. 
*/ for (int i = 0; i < HEAP_COUNT; i++) { rb_heap_t *heap = &heaps[i]; @@ -4931,6 +6399,7 @@ struct verify_internal_consistency_struct { int err_count; size_t live_object_count; size_t zombie_object_count; + size_t zombie_ran_finalizer_object_count; VALUE parent; size_t old_object_count; @@ -5027,7 +6496,11 @@ verify_internal_consistency_i(void *page_start, void *page_end, size_t stride, if (BUILTIN_TYPE(obj) == T_ZOMBIE) { data->zombie_object_count++; - if ((RBASIC(obj)->flags & ~ZOMBIE_OBJ_KEPT_FLAGS) != T_ZOMBIE) { + if (FL_TEST(obj, ZOMBIE_NEEDS_FREE_FLAG)) { + data->zombie_ran_finalizer_object_count++; + } + + if ((RBASIC(obj)->flags & ~(ZOMBIE_OBJ_KEPT_FLAGS|ZOMBIE_NEEDS_FREE_FLAG)) != T_ZOMBIE) { fprintf(stderr, "verify_internal_consistency_i: T_ZOMBIE has extra flags set: %s\n", rb_obj_info(obj)); data->err_count++; @@ -5164,6 +6637,7 @@ gc_verify_internal_consistency_(rb_objspace_t *objspace) uintptr_t end = start + page->total_slots * slot_size; verify_internal_consistency_i((void *)start, (void *)end, slot_size, &data); + data.live_object_count += (page->pre_freed_slots + page->pre_final_slots + page->pre_zombie_slots); } if (data.err_count != 0) { @@ -5216,7 +6690,7 @@ gc_verify_internal_consistency_(rb_objspace_t *objspace) } if (total_final_slots_count(objspace) != data.zombie_object_count || - total_final_slots_count(objspace) != list_count) { + (data.zombie_object_count - data.zombie_ran_finalizer_object_count) != list_count) { rb_bug("inconsistent finalizing object count:\n" " expect %"PRIuSIZE"\n" @@ -5241,6 +6715,7 @@ gc_verify_internal_consistency(void *objspace_ptr) rb_gc_vm_barrier(); // stop other ractors unsigned int prev_during_gc = during_gc; + wait_for_background_sweeping_to_finish(objspace, true, false, "verify_internal_consistency"); during_gc = FALSE; // stop gc here { gc_verify_internal_consistency_(objspace); @@ -5427,7 +6902,14 @@ gc_marks_finish(rb_objspace_t *objspace) min_free_slots = gc_params.heap_free_slots * r_mul; } + int 
full_marking = is_full_marking(objspace); +#if RUBY_DEBUG + if (!objspace->flags.during_compacting) { + objspace->have_swept_slots = 0; + objspace->will_be_swept_slots = sweep_slots; + } +#endif GC_ASSERT(objspace_available_slots(objspace) >= objspace->marked_slots); @@ -5465,7 +6947,13 @@ gc_marks_finish(rb_objspace_t *objspace) } if (full_marking) { - heap_allocatable_bytes_expand(objspace, NULL, sweep_slots, total_slots, heaps[0].slot_size); + /* Use weighted average slot size since total_slots spans all heaps */ + size_t total_heap_bytes = 0; + for (int i = 0; i < HEAP_COUNT; i++) { + total_heap_bytes += heaps[i].total_slots * heaps[i].slot_size; + } + size_t avg_slot_size = total_slots > 0 ? total_heap_bytes / total_slots : heaps[0].slot_size; + heap_allocatable_bytes_expand(objspace, NULL, sweep_slots, total_slots, avg_slot_size); } } @@ -5563,7 +7051,7 @@ gc_compact_move(rb_objspace_t *objspace, rb_heap_t *heap, VALUE src) unlock_page_body(objspace, GET_PAGE_BODY(src)); if (dest_pool->sweeping_page->free_slots > 0) { - heap_add_freepage(dest_pool, dest_pool->sweeping_page); + heap_add_freepage(dest_pool, dest_pool->sweeping_page, "gc_compact_move"); } dest_pool->sweeping_page = ccan_list_next(&dest_pool->pages, dest_pool->sweeping_page, page_node); @@ -5753,6 +7241,7 @@ gc_marks_continue(rb_objspace_t *objspace, rb_heap_t *heap) static void gc_marks_start(rb_objspace_t *objspace, int full_mark) { + // NOTE: background sweeping cannot be running during marking. /* start marking */ gc_report(1, objspace, "gc_marks_start: (%s)\n", full_mark ? 
"full" : "minor"); gc_mode_transition(objspace, gc_mode_marking); @@ -5765,7 +7254,7 @@ gc_marks_start(rb_objspace_t *objspace, int full_mark) "objspace->rincgc.pooled_page_num: %"PRIdSIZE", " "objspace->rincgc.step_slots: %"PRIdSIZE", \n", objspace->marked_slots, objspace->rincgc.pooled_slots, objspace->rincgc.step_slots); - objspace->flags.during_minor_gc = FALSE; + objspace->during_minor_gc = FALSE; if (ruby_enable_autocompact) { objspace->flags.during_compacting |= TRUE; } @@ -5790,7 +7279,7 @@ gc_marks_start(rb_objspace_t *objspace, int full_mark) } } else { - objspace->flags.during_minor_gc = TRUE; + objspace->during_minor_gc = TRUE; objspace->marked_slots = objspace->rgengc.old_objects + objspace->rgengc.uncollectible_wb_unprotected_objects; /* uncollectible objects are marked already */ objspace->profile.minor_gc_count++; @@ -6278,9 +7767,9 @@ static void heap_ready_to_gc(rb_objspace_t *objspace, rb_heap_t *heap) { if (!heap->free_pages) { - if (!heap_page_allocate_and_initialize(objspace, heap)) { + if (!heap_page_allocate_and_initialize(objspace, heap, false)) { objspace->heap_pages.allocatable_bytes = HEAP_PAGE_SIZE; - heap_page_allocate_and_initialize(objspace, heap); + heap_page_allocate_and_initialize(objspace, heap, false); } } } @@ -6400,6 +7889,8 @@ gc_start(rb_objspace_t *objspace, unsigned int reason) if (!rb_darray_size(objspace->heap_pages.sorted)) return TRUE; /* heap is not ready */ if (!(reason & GPR_FLAG_METHOD) && !ready_to_gc(objspace)) return TRUE; /* GC is not allowed */ + wait_for_background_sweeping_to_finish(objspace, true, false, "gc_start"); // in case user called `GC.start` explicitly + GC_ASSERT(gc_mode(objspace) == gc_mode_none, "gc_mode is %s\n", gc_mode_name(gc_mode(objspace))); GC_ASSERT(!is_lazy_sweeping(objspace)); GC_ASSERT(!is_incremental_marking(objspace)); @@ -6451,12 +7942,18 @@ gc_start(rb_objspace_t *objspace, unsigned int reason) /* Explicitly enable compaction (GC.compact) */ if (do_full_mark && 
ruby_enable_autocompact) { objspace->flags.during_compacting = TRUE; +#if RUBY_DEBUG + objspace->flags.was_compacting = TRUE; +#endif #if RGENGC_CHECK_MODE objspace->rcompactor.compare_func = ruby_autocompact_compare_func; #endif } else { objspace->flags.during_compacting = !!(reason & GPR_FLAG_COMPACT); +#if RUBY_DEBUG + objspace->flags.was_compacting = objspace->flags.during_compacting; +#endif } if (!GC_ENABLE_LAZY_SWEEP || objspace->flags.dont_incremental) { @@ -6479,6 +7976,11 @@ gc_start(rb_objspace_t *objspace, unsigned int reason) #if RGENGC_ESTIMATE_OLDMALLOC (void)RB_DEBUG_COUNTER_INC_IF(gc_major_oldmalloc, reason & GPR_FLAG_MAJOR_BY_OLDMALLOC); #endif + if (reason & GPR_FLAG_MAJOR_BY_NOFREE) objspace->profile.major_gc_count_by_nofree++; + if (reason & GPR_FLAG_MAJOR_BY_OLDGEN) objspace->profile.major_gc_count_by_oldgen++; + if (reason & GPR_FLAG_MAJOR_BY_SHADY) objspace->profile.major_gc_count_by_shady++; + if (reason & GPR_FLAG_MAJOR_BY_FORCE) objspace->profile.major_gc_count_by_force++; + if (reason & GPR_FLAG_MAJOR_BY_OLDMALLOC) objspace->profile.major_gc_count_by_oldmalloc++; } else { (void)RB_DEBUG_COUNTER_INC_IF(gc_minor_newobj, reason & GPR_FLAG_NEWOBJ); @@ -6531,7 +8033,7 @@ gc_rest(rb_objspace_t *objspace) } if (is_lazy_sweeping(objspace)) { - gc_sweeping_enter(objspace); + gc_sweeping_enter(objspace, "gc_rest"); gc_sweep_rest(objspace); gc_sweeping_exit(objspace); } @@ -6668,6 +8170,30 @@ gc_clock_end(struct timespec *ts) return 0; } +#if PSWEEP_COLLECT_TIMINGS > 0 +/* Wall time clock functions using CLOCK_MONOTONIC */ +static void +gc_wall_clock_start(struct timespec *ts) +{ + if (clock_gettime(CLOCK_MONOTONIC, ts) != 0) { + ts->tv_sec = 0; + ts->tv_nsec = 0; + } +} + +static unsigned long long +gc_wall_clock_end(struct timespec *ts) +{ + struct timespec end_time; + + if ((ts->tv_sec > 0 || ts->tv_nsec > 0) && + clock_gettime(CLOCK_MONOTONIC, &end_time) == 0) { + return (unsigned long long)(end_time.tv_sec - ts->tv_sec) * (1000 * 1000 * 1000) 
+ (end_time.tv_nsec - ts->tv_nsec); + } + return 0; +} +#endif + static inline void gc_enter(rb_objspace_t *objspace, enum gc_enter_event event, unsigned int *lock_lev) { @@ -6688,6 +8214,7 @@ gc_enter(rb_objspace_t *objspace, enum gc_enter_event event, unsigned int *lock_ if (RB_UNLIKELY(during_gc != 0)) rb_bug("during_gc != 0"); if (RGENGC_CHECK_MODE >= 3) gc_verify_internal_consistency(objspace); + GC_ASSERT(!is_sweep_thread_p()); during_gc = TRUE; RUBY_DEBUG_LOG("%s (%s)",gc_enter_event_cstr(event), gc_current_status(objspace)); gc_report(1, objspace, "gc_enter: %s [%s]\n", gc_enter_event_cstr(event), gc_current_status(objspace)); @@ -6706,7 +8233,8 @@ gc_exit(rb_objspace_t *objspace, enum gc_enter_event event, unsigned int *lock_l gc_record(objspace, 1, gc_enter_event_cstr(event)); RUBY_DEBUG_LOG("%s (%s)", gc_enter_event_cstr(event), gc_current_status(objspace)); gc_report(1, objspace, "gc_exit: %s [%s]\n", gc_enter_event_cstr(event), gc_current_status(objspace)); - during_gc = FALSE; + GC_ASSERT(!is_sweep_thread_p()); + during_gc = FALSE; // NOTE: background thread could still be sweeping even if !during_gc RB_GC_VM_UNLOCK(*lock_lev); } @@ -6735,24 +8263,80 @@ gc_marking_exit(rb_objspace_t *objspace) } } +unsigned long long sweeping_enter_count = 0; + static void -gc_sweeping_enter(rb_objspace_t *objspace) +gc_sweeping_enter(rb_objspace_t *objspace, const char *from_fn) { + MAYBE_UNUSED(const unsigned int immediate_sweep) = objspace->flags.immediate_sweep; + psweep_debug(1, "[gc] gc_sweeping_enter from %s (immediate:%u)\n", from_fn, immediate_sweep); GC_ASSERT(during_gc != 0); + sweep_lock_lock(&objspace->sweep_lock); + { + objspace->background_sweep_mode = false; + } + sweep_lock_unlock(&objspace->sweep_lock); + if (MEASURE_GC) { gc_clock_start(&objspace->profile.sweeping_start_time); } + + sweeping_enter_count++; + /* Always track Ruby thread sweep time */ +#if PSWEEP_COLLECT_TIMINGS > 0 + 
gc_clock_start(&objspace->profile.ruby_thread_sweep_cpu_start_time); + gc_wall_clock_start(&objspace->profile.ruby_thread_sweep_wall_start_time); +#endif } static void gc_sweeping_exit(rb_objspace_t *objspace) { GC_ASSERT(during_gc != 0); + psweep_debug(1, "[gc] gc_sweeping_exit\n"); + MAYBE_UNUSED(bool was_rest) = objspace->sweep_rest; + + bool continue_sweep_in_background = objspace->use_background_sweep_thread && + !objspace->sweep_rest && !dont_gc_val() && is_lazy_sweeping(objspace); + + if (continue_sweep_in_background) { + if (background_sweep_done_p(objspace)) { + psweep_debug(-2, "[gc] gc_sweeping_exit: bg done, not requesting\n"); + } + else { + psweep_debug(-2, "[gc] gc_sweeping_exit: continue in background\n"); + sweep_lock_lock(&objspace->sweep_lock); + objspace->background_sweep_mode = true; + if (!objspace->sweep_thread_sweeping && !objspace->sweep_thread_sweep_requested) { + psweep_debug(-2, "[gc] gc_sweeping_exit: requested\n"); + objspace->sweep_thread_sweep_requested = true; + rb_native_cond_broadcast(&objspace->sweep_cond); + } + else { + psweep_debug(-2, "[gc] gc_sweeping_exit: restart heaps\n"); + objspace->background_sweep_restart_heaps = true; // restart sweeping heaps from heap 0 + } + sweep_lock_unlock(&objspace->sweep_lock); + } + } + else { + GC_ASSERT(!objspace->background_sweep_mode); + psweep_debug(-2, "[gc] gc_sweeping_exit: don't continue (rest:%d, use:%d)\n", was_rest, objspace->use_background_sweep_thread); + sweep_lock_lock(&objspace->sweep_lock); + objspace->sweep_rest = false; + sweep_lock_unlock(&objspace->sweep_lock); + } if (MEASURE_GC) { objspace->profile.sweeping_time_ns += gc_clock_end(&objspace->profile.sweeping_start_time); } + + /* Always track Ruby thread sweep time */ +#if PSWEEP_COLLECT_TIMINGS > 0 + objspace->profile.ruby_thread_sweep_cpu_time_ns += gc_clock_end(&objspace->profile.ruby_thread_sweep_cpu_start_time); + objspace->profile.ruby_thread_sweep_wall_time_ns += 
gc_wall_clock_end(&objspace->profile.ruby_thread_sweep_wall_start_time); +#endif } static void * @@ -6836,11 +8420,32 @@ rb_gc_impl_start(void *objspace_ptr, bool full_mark, bool immediate_mark, bool i } garbage_collect(objspace, reason); +#if RUBY_DEBUG + if (immediate_sweep) { + sweep_lock_lock(&objspace->sweep_lock); + { + GC_ASSERT(!objspace->sweep_thread_sweeping); + for (int j = 0; j < HEAP_COUNT; j++) { + rb_heap_t *heap = &heaps[j]; + GC_ASSERT(!heap->swept_pages); + GC_ASSERT(!heap->sweeping_page); + } + } + sweep_lock_unlock(&objspace->sweep_lock); + } +#endif + // NOTE: background sweeping can still be active here. We also may enter a new GC cycle from finalizers below. gc_finalize_deferred(objspace); gc_config_full_mark_set(full_marking_p); } +void +rb_gc_stop_background_threads(rb_objspace_t *objspace, const char *from_fn) +{ + wait_for_background_sweeping_to_finish(objspace, true, true, from_fn); +} + void rb_gc_impl_prepare_heap(void *objspace_ptr) { @@ -7064,8 +8669,8 @@ gc_sort_heap_by_compare_func(rb_objspace_t *objspace, gc_compact_compare_func co for (i = 0; i < total_pages; i++) { ccan_list_add(&heap->pages, &page_list[i]->page_node); - if (page_list[i]->free_slots != 0) { - heap_add_freepage(heap, page_list[i]); + if (page_list[i]->free_slots != 0 && page_list[i]->start) { + heap_add_freepage(heap, page_list[i], "sort_by_compare_func"); } } @@ -7109,7 +8714,7 @@ gc_ref_update(void *vstart, void *vend, size_t stride, rb_objspace_t *objspace, if (RVALUE_REMEMBERED(objspace, v)) { page->flags.has_remembered_objects = TRUE; } - if (page->flags.before_sweep) { + if (page->before_sweep) { if (RVALUE_MARKED(objspace, v)) { rb_gc_update_object_references(objspace, v); } @@ -7448,6 +9053,11 @@ enum gc_stat_sym { gc_stat_sym_malloc_increase_bytes_limit, gc_stat_sym_minor_gc_count, gc_stat_sym_major_gc_count, + gc_stat_sym_major_gc_count_by_nofree, + gc_stat_sym_major_gc_count_by_oldgen, + gc_stat_sym_major_gc_count_by_shady, + 
gc_stat_sym_major_gc_count_by_force, + gc_stat_sym_major_gc_count_by_oldmalloc, gc_stat_sym_compact_count, gc_stat_sym_read_barrier_faults, gc_stat_sym_total_moved_objects, @@ -7455,6 +9065,8 @@ enum gc_stat_sym { gc_stat_sym_remembered_wb_unprotected_objects_limit, gc_stat_sym_old_objects, gc_stat_sym_old_objects_limit, + gc_stat_sym_pages_swept_by_sweep_thread, + gc_stat_sym_pages_swept_by_sweep_thread_had_deferred_free_objects, #if RGENGC_ESTIMATE_OLDMALLOC gc_stat_sym_oldmalloc_increase_bytes, gc_stat_sym_oldmalloc_increase_bytes_limit, @@ -7498,6 +9110,11 @@ setup_gc_stat_symbols(void) S(malloc_increase_bytes_limit); S(minor_gc_count); S(major_gc_count); + S(major_gc_count_by_nofree); + S(major_gc_count_by_oldgen); + S(major_gc_count_by_shady); + S(major_gc_count_by_force); + S(major_gc_count_by_oldmalloc); S(compact_count); S(read_barrier_faults); S(total_moved_objects); @@ -7505,6 +9122,8 @@ setup_gc_stat_symbols(void) S(remembered_wb_unprotected_objects_limit); S(old_objects); S(old_objects_limit); + S(pages_swept_by_sweep_thread); + S(pages_swept_by_sweep_thread_had_deferred_free_objects); #if RGENGC_ESTIMATE_OLDMALLOC S(oldmalloc_increase_bytes); S(oldmalloc_increase_bytes_limit); @@ -7527,7 +9146,7 @@ ns_to_ms(uint64_t ns) return ns / (1000 * 1000); } -static void malloc_increase_local_flush(rb_objspace_t *objspace); +static size_t malloc_increase_local_flush(rb_objspace_t *objspace); VALUE rb_gc_impl_stat(void *objspace_ptr, VALUE hash_or_sym) @@ -7579,6 +9198,11 @@ rb_gc_impl_stat(void *objspace_ptr, VALUE hash_or_sym) SET(malloc_increase_bytes_limit, malloc_limit); SET(minor_gc_count, objspace->profile.minor_gc_count); SET(major_gc_count, objspace->profile.major_gc_count); + SET(major_gc_count_by_nofree, objspace->profile.major_gc_count_by_nofree); + SET(major_gc_count_by_oldgen, objspace->profile.major_gc_count_by_oldgen); + SET(major_gc_count_by_shady, objspace->profile.major_gc_count_by_shady); + SET(major_gc_count_by_force, 
objspace->profile.major_gc_count_by_force); + SET(major_gc_count_by_oldmalloc, objspace->profile.major_gc_count_by_oldmalloc); SET(compact_count, objspace->profile.compact_count); SET(read_barrier_faults, objspace->profile.read_barrier_faults); SET(total_moved_objects, objspace->rcompactor.total_moved); @@ -7586,6 +9210,8 @@ rb_gc_impl_stat(void *objspace_ptr, VALUE hash_or_sym) SET(remembered_wb_unprotected_objects_limit, objspace->rgengc.uncollectible_wb_unprotected_objects_limit); SET(old_objects, objspace->rgengc.old_objects); SET(old_objects_limit, objspace->rgengc.old_objects_limit); + SET(pages_swept_by_sweep_thread, objspace->profile.pages_swept_by_sweep_thread); + SET(pages_swept_by_sweep_thread_had_deferred_free_objects, objspace->profile.pages_swept_by_sweep_thread_had_deferred_free_objects); #if RGENGC_ESTIMATE_OLDMALLOC SET(oldmalloc_increase_bytes, objspace->malloc_counters.oldmalloc_increase); SET(oldmalloc_increase_bytes_limit, objspace->rgengc.oldmalloc_increase_limit); @@ -7794,7 +9420,7 @@ rb_gc_impl_stress_set(void *objspace_ptr, VALUE flag) { rb_objspace_t *objspace = objspace_ptr; - objspace->flags.gc_stressful = RTEST(flag); + objspace->gc_stressful = RTEST(flag); objspace->gc_stress_mode = flag; } @@ -8001,9 +9627,9 @@ atomic_sub_nounderflow(size_t *var, size_t sub) if (sub == 0) return; while (1) { - size_t val = *var; + size_t val = rbimpl_atomic_size_load(var, RBIMPL_ATOMIC_RELAXED); if (val < sub) sub = val; - if (RUBY_ATOMIC_SIZE_CAS(*var, val, val-sub) == val) break; + if (rbimpl_atomic_size_cas(var, val, val-sub, RBIMPL_ATOMIC_RELAXED, RBIMPL_ATOMIC_RELAXED) == val) break; } } @@ -8024,42 +9650,53 @@ objspace_malloc_gc_stress(rb_objspace_t *objspace) } } -static void -malloc_increase_commit(rb_objspace_t *objspace, size_t new_size, size_t old_size) +static size_t +malloc_increase_commit(rb_objspace_t *objspace, size_t new_size, size_t old_size, struct heap_page *sweep_thread_page) { if (new_size > old_size) { - 
RUBY_ATOMIC_SIZE_ADD(malloc_increase, new_size - old_size); + GC_ASSERT(!is_sweep_thread_p()); + size_t delta = new_size - old_size; + size_t old_val = rbimpl_atomic_size_fetch_add(&malloc_increase, delta, RBIMPL_ATOMIC_RELAXED); #if RGENGC_ESTIMATE_OLDMALLOC - RUBY_ATOMIC_SIZE_ADD(objspace->malloc_counters.oldmalloc_increase, new_size - old_size); + rbimpl_atomic_size_add(&objspace->malloc_counters.oldmalloc_increase, delta, RBIMPL_ATOMIC_RELAXED); #endif + return old_val + delta; } else { - atomic_sub_nounderflow(&malloc_increase, old_size - new_size); + size_t delta = old_size - new_size; + if (sweep_thread_page) { + sweep_thread_page->pre_freed_malloc_bytes += delta; + } + else { + atomic_sub_nounderflow(&malloc_increase, delta); #if RGENGC_ESTIMATE_OLDMALLOC - atomic_sub_nounderflow(&objspace->malloc_counters.oldmalloc_increase, old_size - new_size); + atomic_sub_nounderflow(&objspace->malloc_counters.oldmalloc_increase, delta); #endif + } + return 0; } } #if USE_MALLOC_INCREASE_LOCAL -static void +static size_t malloc_increase_local_flush(rb_objspace_t *objspace) { int delta = malloc_increase_local; - if (delta == 0) return; + if (delta == 0) return 0; malloc_increase_local = 0; if (delta > 0) { - malloc_increase_commit(objspace, (size_t)delta, 0); + return malloc_increase_commit(objspace, (size_t)delta, 0, NULL); } else { - malloc_increase_commit(objspace, 0, (size_t)(-delta)); + return malloc_increase_commit(objspace, 0, (size_t)(-delta), current_sweep_thread_page); } } #else -static void +static size_t malloc_increase_local_flush(rb_objspace_t *objspace) { + return 0; } #endif @@ -8078,6 +9715,8 @@ objspace_malloc_increase_report(rb_objspace_t *objspace, void *mem, size_t new_s static bool objspace_malloc_increase_body(rb_objspace_t *objspace, void *mem, size_t new_size, size_t old_size, enum memop_type type, bool gc_allowed) { + size_t current_malloc_increase = 0; + #if USE_MALLOC_INCREASE_LOCAL if (new_size < GC_MALLOC_INCREASE_LOCAL_THRESHOLD && 
old_size < GC_MALLOC_INCREASE_LOCAL_THRESHOLD) { @@ -8085,22 +9724,23 @@ objspace_malloc_increase_body(rb_objspace_t *objspace, void *mem, size_t new_siz if (malloc_increase_local >= GC_MALLOC_INCREASE_LOCAL_THRESHOLD || malloc_increase_local <= -GC_MALLOC_INCREASE_LOCAL_THRESHOLD) { - malloc_increase_local_flush(objspace); + current_malloc_increase = malloc_increase_local_flush(objspace); } } else { malloc_increase_local_flush(objspace); - malloc_increase_commit(objspace, new_size, old_size); + current_malloc_increase = malloc_increase_commit(objspace, new_size, old_size, current_sweep_thread_page); } #else - malloc_increase_commit(objspace, new_size, old_size); + current_malloc_increase = malloc_increase_commit(objspace, new_size, old_size, is_sweep_thread_p() ? current_sweep_thread_page : NULL); #endif if (type == MEMOP_TYPE_MALLOC && gc_allowed) { retry: - if (malloc_increase > malloc_limit && ruby_native_thread_p() && !dont_gc_val()) { + if (current_malloc_increase > malloc_limit && ruby_native_thread_p() && !dont_gc_val()) { if (ruby_thread_has_gvl_p() && is_lazy_sweeping(objspace)) { gc_rest(objspace); /* gc_rest can reduce malloc_increase */ + current_malloc_increase = rbimpl_atomic_size_load(&malloc_increase, RBIMPL_ATOMIC_RELAXED); goto retry; } garbage_collect_with_gvl(objspace, GPR_FLAG_MALLOC); @@ -8168,13 +9808,19 @@ objspace_malloc_prepare(rb_objspace_t *objspace, size_t size) } static bool -malloc_during_gc_p(rb_objspace_t *objspace) +bad_malloc_during_gc_p(rb_objspace_t *objspace) { /* malloc is not allowed during GC when we're not using multiple ractors * (since ractors can run while another thread is sweeping) and when we * have the GVL (since if we don't have the GVL, we'll try to acquire the * GVL which will block and ensure the other thread finishes GC). 
*/ - return during_gc && !dont_gc_val() && !rb_gc_multi_ractor_p() && ruby_thread_has_gvl_p(); + if (is_sweep_thread_p()) { + fprintf(stderr, "ERROR: bad malloc/calloc call family during GC in sweep thread!\n"); + return true; + } + else { + return during_gc && !dont_gc_val() && !rb_gc_multi_ractor_p() && ruby_thread_has_gvl_p(); + } } static inline void * @@ -8233,10 +9879,16 @@ objspace_malloc_fixup(rb_objspace_t *objspace, void *mem, size_t size, bool gc_a static void check_malloc_not_in_gc(rb_objspace_t *objspace, const char *msg) { - if (RB_UNLIKELY(malloc_during_gc_p(objspace))) { - dont_gc_on(); - during_gc = false; - rb_bug("Cannot %s during GC", msg); + if (RB_UNLIKELY(bad_malloc_during_gc_p(objspace))) { + if (is_sweep_thread_p()) { + fprintf(stderr, "Bad %s in sweep thread, exiting\n", msg); + exit(EXIT_FAILURE); + } + else { + dont_gc_on(); + during_gc = false; + rb_bug("Cannot %s during GC", msg); + } } } @@ -8295,11 +9947,16 @@ rb_gc_impl_calloc(void *objspace_ptr, size_t size, bool gc_allowed) { rb_objspace_t *objspace = objspace_ptr; - if (RB_UNLIKELY(malloc_during_gc_p(objspace))) { - rb_warn("calloc during GC detected, this could cause crashes if it triggers another GC"); + if (RB_UNLIKELY(bad_malloc_during_gc_p(objspace))) { + if (is_sweep_thread_p()) { + fprintf(stderr, "calloc in sweep thread detected! 
This could cause crashes!\n"); + } + else { + rb_warn("calloc during GC detected, this could cause crashes if it triggers another GC"); #if RGENGC_CHECK_MODE || RUBY_DEBUG - rb_bug("Cannot calloc during GC"); + rb_bug("Cannot calloc during GC"); #endif + } } void *mem; @@ -9312,7 +10969,7 @@ gc_verify_compaction_references(int argc, VALUE* argv, VALUE self) */ objspace->heap_pages.allocatable_bytes = desired_compaction.required_slots[i] * heap->slot_size; while (objspace->heap_pages.allocatable_bytes > 0) { - heap_page_allocate_and_initialize(objspace, heap); + heap_page_allocate_and_initialize(objspace, heap, false); } /* * Step 3: Add two more pages so that the compact & sweep cursors will meet _after_ all objects @@ -9321,7 +10978,7 @@ gc_verify_compaction_references(int argc, VALUE* argv, VALUE self) pages_to_add += 2; for (; pages_to_add > 0; pages_to_add--) { - heap_page_allocate_and_initialize_force(objspace, heap); + heap_page_allocate_and_initialize_force(objspace, heap, false); } } } @@ -9350,29 +11007,51 @@ rb_gc_impl_objspace_free(void *objspace_ptr) { rb_objspace_t *objspace = objspace_ptr; - if (is_lazy_sweeping(objspace)) - rb_bug("lazy sweeping underway when freeing object space"); +// if (is_lazy_sweeping(objspace)) +// rb_bug("lazy sweeping underway when freeing object space"); + + rb_gc_stop_background_threads(objspace, "objspace_free"); + +#if PSWEEP_LOCK_STATS > 0 + /* Print lock contention statistics before freeing */ + print_lock_stats(); +#endif + +#if PSWEEP_COLLECT_TIMINGS > 0 + /* Print Ruby thread sweep time to stdout */ + double ruby_thread_sweep_cpu_time_ms = (double)(objspace->profile.ruby_thread_sweep_cpu_time_ns) / 1000000.0; + double ruby_thread_sweep_wall_time_ms = ((double)objspace->profile.ruby_thread_sweep_wall_time_ns) / 1000000.0; + fprintf(stderr, "\nSweep Time (CPU): %.3f ms (%.6f seconds)\n", ruby_thread_sweep_cpu_time_ms, ruby_thread_sweep_cpu_time_ms / 1000.0); + fprintf(stderr, "\nSweep Time (Wall): %.3f ms (%.6f 
seconds)\n", ruby_thread_sweep_wall_time_ms, ruby_thread_sweep_wall_time_ms / 1000.0); + fprintf(stderr, "\nSweeping enter count: %llu\n", sweeping_enter_count); + fprintf(stderr, "\nSweep continue count: %llu\n", sweep_continue_count); + fprintf(stderr, "\nSweep rest count: %llu\n", sweep_rest_count); +#endif free(objspace->profile.records); objspace->profile.records = NULL; for (size_t i = 0; i < rb_darray_size(objspace->heap_pages.sorted); i++) { - heap_page_free(objspace, rb_darray_get(objspace->heap_pages.sorted, i)); + heap_page_free(objspace, rb_darray_get(objspace->heap_pages.sorted, i), false); } rb_darray_free_without_gc(objspace->heap_pages.sorted); heap_pages_lomem = 0; heap_pages_himem = 0; + free_stack_chunks(&objspace->mark_stack); + mark_stack_free_cache(&objspace->mark_stack); + for (int i = 0; i < HEAP_COUNT; i++) { rb_heap_t *heap = &heaps[i]; + rb_native_mutex_destroy(&heap->swept_pages_lock); + rb_native_cond_destroy(&heap->sweep_page_cond); heap->total_pages = 0; heap->total_slots = 0; } - free_stack_chunks(&objspace->mark_stack); - mark_stack_free_cache(&objspace->mark_stack); - rb_darray_free_without_gc(objspace->weak_references); + rb_native_cond_destroy(&objspace->sweep_cond); + rb_native_mutex_destroy(&objspace->sweep_lock); free(objspace); } @@ -9416,8 +11095,11 @@ rb_gc_impl_before_fork(void *objspace_ptr) { rb_objspace_t *objspace = objspace_ptr; + wait_for_background_sweeping_to_finish(objspace, true, false, "impl_before_fork"); + objspace->fork_vm_lock_lev = RB_GC_VM_LOCK(); rb_gc_vm_barrier(); + GC_ASSERT(!during_gc); } void @@ -9428,8 +11110,44 @@ rb_gc_impl_after_fork(void *objspace_ptr, rb_pid_t pid) RB_GC_VM_UNLOCK(objspace->fork_vm_lock_lev); objspace->fork_vm_lock_lev = 0; + void fiber_pool_lock_reset(void); + fiber_pool_lock_reset(); + // TODO: reset the id_table lock in case of Ractors. 
+ + GC_ASSERT(!during_gc); if (pid == 0) { /* child process */ + objspace->sweep_thread = 0; + rb_native_mutex_initialize(&objspace->sweep_lock); + rb_native_cond_initialize(&objspace->sweep_cond); + for (int i = 0; i < HEAP_COUNT; i++) { + rb_heap_t *heap = &heaps[i]; + + rb_native_mutex_initialize(&heap->swept_pages_lock); + rb_native_cond_initialize(&heap->sweep_page_cond); + heap->pre_sweeping_page = NULL; + heap->background_sweep_steps = heap->foreground_sweep_steps; + } rb_gc_ractor_newobj_cache_foreach(gc_ractor_newobj_cache_clear, NULL); + + sweep_lock_owner = 0; + /* Start the sweep thread after fork */ + objspace->sweep_thread_running = true; + objspace->sweep_thread_sweep_requested = false; + objspace->sweep_thread_sweeping = false; + objspace->sweep_thread_waiting_request = false; + GC_ASSERT(!objspace->background_sweep_mode); + GC_ASSERT(!objspace->background_sweep_abort); + GC_ASSERT(!objspace->background_sweep_restart_heaps); + pthread_create(&objspace->sweep_thread, NULL, gc_sweep_thread_func, objspace); + GET_VM()->gc.sweep_thread = objspace->sweep_thread; + sweep_lock_lock(&objspace->sweep_lock); + // The thread needs to be ready to accept sweep requests. 
+ while (!objspace->sweep_thread_waiting_request) { + sweep_lock_unlock(&objspace->sweep_lock); + usleep(50); + sweep_lock_lock(&objspace->sweep_lock); + } + sweep_lock_unlock(&objspace->sweep_lock); } } @@ -9516,6 +11234,8 @@ rb_gc_impl_objspace_init(void *objspace_ptr) slot_div_magics[i] = (uint32_t)((uint64_t)UINT32_MAX / heap->slot_size + 1); ccan_list_head_init(&heap->pages); + rb_native_mutex_initialize(&heap->swept_pages_lock); + rb_native_cond_initialize(&heap->sweep_page_cond); } init_size_to_heap_idx(); @@ -9536,6 +11256,12 @@ rb_gc_impl_objspace_init(void *objspace_ptr) objspace->profile.invoke_time = getrusage_time(); finalizer_table = st_init_numtable(); + + rb_native_mutex_initialize(&objspace->sweep_lock); + rb_native_cond_initialize(&objspace->sweep_cond); + objspace->sweep_thread_running = true; + pthread_create(&objspace->sweep_thread, NULL, gc_sweep_thread_func, objspace); + GET_VM()->gc.sweep_thread = objspace->sweep_thread; } void diff --git a/gc/gc.h b/gc/gc.h index 469a4902f03365..44ff018aa123e7 100644 --- a/gc/gc.h +++ b/gc/gc.h @@ -81,7 +81,8 @@ MODULAR_GC_FN void *rb_gc_get_objspace(void); MODULAR_GC_FN void rb_gc_run_obj_finalizer(VALUE objid, long count, VALUE (*callback)(long i, void *data), void *data); MODULAR_GC_FN void rb_gc_set_pending_interrupt(void); MODULAR_GC_FN void rb_gc_unset_pending_interrupt(void); -MODULAR_GC_FN void rb_gc_obj_free_vm_weak_references(VALUE obj); +MODULAR_GC_FN bool rb_gc_obj_free_vm_weak_references(VALUE obj); +MODULAR_GC_FN bool rb_gc_obj_free_concurrency_safe_vm_weak_references(VALUE obj); MODULAR_GC_FN bool rb_gc_obj_free(void *objspace, VALUE obj); MODULAR_GC_FN void rb_gc_save_machine_context(void); MODULAR_GC_FN void rb_gc_mark_roots(void *objspace, const char **categoryp); diff --git a/hash.c b/hash.c index 773df7e78d8c7f..0df553db67853b 100644 --- a/hash.c +++ b/hash.c @@ -6905,7 +6905,7 @@ static const rb_data_type_t env_data_type = { NULL, NULL, }, - 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | 
RUBY_TYPED_WB_PROTECTED, + 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED | RUBY_TYPED_CONCURRENT_FREE_SAFE, }; /* diff --git a/id_table.c b/id_table.c index 76841d0cff8d07..c15867cc8715e1 100644 --- a/id_table.c +++ b/id_table.c @@ -349,7 +349,7 @@ const rb_data_type_t rb_managed_id_table_type = { .dfree = managed_id_table_free, .dsize = managed_id_table_memsize, }, - .flags = RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED | RUBY_TYPED_EMBEDDABLE, + .flags = RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED | RUBY_TYPED_EMBEDDABLE | RUBY_TYPED_CONCURRENT_FREE_SAFE, }; static inline struct rb_id_table * diff --git a/include/ruby/atomic.h b/include/ruby/atomic.h index fcc48f532c89ba..32371953f4037c 100644 --- a/include/ruby/atomic.h +++ b/include/ruby/atomic.h @@ -36,6 +36,7 @@ #if RBIMPL_COMPILER_IS(MSVC) # pragma intrinsic(_InterlockedOr) +# pragma intrinsic(_InterlockedAnd) #elif defined(__sun) && defined(HAVE_ATOMIC_H) # include #endif @@ -140,6 +141,48 @@ typedef unsigned int rb_atomic_t; */ #define RUBY_ATOMIC_OR(var, val) rbimpl_atomic_or(&(var), (val), RBIMPL_ATOMIC_SEQ_CST) +/** + * Atomically replaces the value pointed by `var` with the result of + * bitwise AND between `val` and the old value of `var`. + * + * @param var A variable of ::rb_atomic_t. + * @param val Value to mask. + * @return void + * @post `var` holds `var & val`. + */ +#define RUBY_ATOMIC_AND(var, val) rbimpl_atomic_and(&(var), (val), RBIMPL_ATOMIC_SEQ_CST) + +/** + * Atomically replaces the value pointed by `var` with the result of + * bitwise AND between `val` and the old value of `var`. + * + * @param var A variable of ::rb_atomic_t. + * @param val Value to mask. + * @return What was stored in `var` before the operation. + * @post `var` holds `var & val`. + */ +#define RUBY_ATOMIC_FETCH_AND(var, val) rbimpl_atomic_fetch_and(&(var), (val), RBIMPL_ATOMIC_SEQ_CST) + +/** + * Identical to #RUBY_ATOMIC_OR, except it expects its arguments are ::VALUE. 
+ * + * @param var A variable of ::VALUE. + * @param val Value to mix. + * @return void + * @post `var` holds `var | val`. + */ +#define RUBY_ATOMIC_VALUE_OR(var, val) rbimpl_atomic_size_or((volatile size_t *)&(var), (size_t)(val), RBIMPL_ATOMIC_SEQ_CST) + +/** + * Identical to #RUBY_ATOMIC_AND, except it expects its arguments are ::VALUE. + * + * @param var A variable of ::VALUE. + * @param val Value to mask. + * @return void + * @post `var` holds `var & val`. + */ +#define RUBY_ATOMIC_VALUE_AND(var, val) rbimpl_atomic_size_and((volatile size_t *)&(var), (size_t)(val), RBIMPL_ATOMIC_SEQ_CST) + /** * Atomically replaces the value pointed by `var` with `val`. This is just an * assignment, but you can additionally know the previous value. @@ -559,6 +602,76 @@ rbimpl_atomic_size_add(volatile size_t *ptr, size_t val, int memory_order) #endif } +RBIMPL_ATTR_ARTIFICIAL() +RBIMPL_ATTR_NOALIAS() +RBIMPL_ATTR_NONNULL((1)) +static inline void +rbimpl_atomic_size_or(volatile size_t *ptr, size_t val, int memory_order) +{ + (void)memory_order; +#if 0 + +#elif defined(HAVE_GCC_ATOMIC_BUILTINS) + __atomic_or_fetch(ptr, val, memory_order); + +#elif defined(HAVE_GCC_SYNC_BUILTINS) + __sync_or_and_fetch(ptr, val); + +#elif defined(_WIN64) + InterlockedOr64(ptr, val); + +#elif defined(__sun) && defined(HAVE_ATOMIC_H) && (defined(_LP64) || defined(_I32LPx)) + atomic_or_ulong(ptr, val); + +#elif defined(_WIN32) || (defined(__sun) && defined(HAVE_ATOMIC_H)) + RBIMPL_STATIC_ASSERT(size_of_size_t, sizeof *ptr == sizeof(rb_atomic_t)); + + volatile rb_atomic_t *const tmp = RBIMPL_CAST((volatile rb_atomic_t *)ptr); + rbimpl_atomic_or(tmp, val, memory_order); + +#elif defined(HAVE_STDATOMIC_H) + atomic_fetch_or_explicit((_Atomic volatile size_t *)ptr, val, memory_order); + +#else +# error Unsupported platform. 
+#endif +} + +RBIMPL_ATTR_ARTIFICIAL() +RBIMPL_ATTR_NOALIAS() +RBIMPL_ATTR_NONNULL((1)) +static inline void +rbimpl_atomic_size_and(volatile size_t *ptr, size_t val, int memory_order) +{ + (void)memory_order; +#if 0 + +#elif defined(HAVE_GCC_ATOMIC_BUILTINS) + __atomic_and_fetch(ptr, val, memory_order); + +#elif defined(HAVE_GCC_SYNC_BUILTINS) + __sync_and_and_fetch(ptr, val); + +#elif defined(_WIN64) + InterlockedAnd64(ptr, val); + +#elif defined(__sun) && defined(HAVE_ATOMIC_H) && (defined(_LP64) || defined(_I32LPx)) + atomic_and_ulong(ptr, val); + +#elif defined(_WIN32) || (defined(__sun) && defined(HAVE_ATOMIC_H)) + RBIMPL_STATIC_ASSERT(size_of_size_t, sizeof *ptr == sizeof(rb_atomic_t)); + + volatile rb_atomic_t *const tmp = RBIMPL_CAST((volatile rb_atomic_t *)ptr); + rbimpl_atomic_and(tmp, val, memory_order); + +#elif defined(HAVE_STDATOMIC_H) + atomic_fetch_and_explicit((_Atomic volatile size_t *)ptr, val, memory_order); + +#else +# error Unsupported platform. +#endif +} + RBIMPL_ATTR_ARTIFICIAL() RBIMPL_ATTR_NOALIAS() RBIMPL_ATTR_NONNULL((1)) @@ -804,6 +917,70 @@ rbimpl_atomic_or(volatile rb_atomic_t *ptr, rb_atomic_t val, int memory_order) #endif } +RBIMPL_ATTR_ARTIFICIAL() +RBIMPL_ATTR_NOALIAS() +RBIMPL_ATTR_NONNULL((1)) +static inline rb_atomic_t +rbimpl_atomic_fetch_and(volatile rb_atomic_t *ptr, rb_atomic_t val, int memory_order) +{ + (void)memory_order; +#if 0 + +#elif defined(HAVE_GCC_ATOMIC_BUILTINS) + return __atomic_fetch_and(ptr, val, memory_order); + +#elif defined(HAVE_GCC_SYNC_BUILTINS) + return __sync_fetch_and_and(ptr, val); + +#elif RBIMPL_COMPILER_IS(MSVC) + return _InterlockedAnd(ptr, val); + +#elif defined(__sun) && defined(HAVE_ATOMIC_H) + /* TODO: Solaris atomic_and_uint does not return the old value. + * Using CAS loop as fallback. 
*/ + rb_atomic_t old = *ptr; + while (atomic_cas_uint(ptr, old, old & val) != old) { + old = *ptr; + } + return old; + +#elif !defined(_WIN32) && defined(HAVE_STDATOMIC_H) + return atomic_fetch_and_explicit((_Atomic volatile rb_atomic_t *)ptr, val, memory_order); + +#else +# error Unsupported platform. +#endif +} + +RBIMPL_ATTR_ARTIFICIAL() +RBIMPL_ATTR_NOALIAS() +RBIMPL_ATTR_NONNULL((1)) +static inline void +rbimpl_atomic_and(volatile rb_atomic_t *ptr, rb_atomic_t val, int memory_order) +{ + (void)memory_order; +#if 0 + +#elif defined(HAVE_GCC_ATOMIC_BUILTINS) + __atomic_and_fetch(ptr, val, memory_order); + +#elif defined(HAVE_GCC_SYNC_BUILTINS) + __sync_and_and_fetch(ptr, val); + +#elif RBIMPL_COMPILER_IS(MSVC) + _InterlockedAnd(ptr, val); + +#elif defined(__sun) && defined(HAVE_ATOMIC_H) + atomic_and_uint(ptr, val); + +#elif !defined(_WIN32) && defined(HAVE_STDATOMIC_H) + atomic_fetch_and_explicit((_Atomic volatile rb_atomic_t *)ptr, val, memory_order); + +#else +# error Unsupported platform. +#endif +} + RBIMPL_ATTR_ARTIFICIAL() RBIMPL_ATTR_NOALIAS() RBIMPL_ATTR_NONNULL((1)) @@ -869,6 +1046,15 @@ rbimpl_atomic_size_exchange(volatile size_t *ptr, size_t val, int memory_order) #endif } +RBIMPL_ATTR_ARTIFICIAL() +RBIMPL_ATTR_NOALIAS() +RBIMPL_ATTR_NONNULL((1)) +static inline size_t +rbimpl_atomic_size_load(volatile size_t *ptr, int memory_order) +{ + return rbimpl_atomic_size_fetch_add(ptr, 0, memory_order); +} + RBIMPL_ATTR_ARTIFICIAL() RBIMPL_ATTR_NOALIAS() RBIMPL_ATTR_NONNULL((1)) diff --git a/include/ruby/internal/core/rtypeddata.h b/include/ruby/internal/core/rtypeddata.h index 22bf46eb031bba..204cf0b539c689 100644 --- a/include/ruby/internal/core/rtypeddata.h +++ b/include/ruby/internal/core/rtypeddata.h @@ -120,6 +120,7 @@ static inline VALUE rbimpl_check_external_typeddata(VALUE obj); * Macros to see if each corresponding flag is defined. 
*/ #define RUBY_TYPED_FREE_IMMEDIATELY RUBY_TYPED_FREE_IMMEDIATELY +#define RUBY_TYPED_CONCURRENT_FREE_SAFE RUBY_TYPED_CONCURRENT_FREE_SAFE #define RUBY_TYPED_FROZEN_SHAREABLE RUBY_TYPED_FROZEN_SHAREABLE #define RUBY_TYPED_WB_PROTECTED RUBY_TYPED_WB_PROTECTED #define RUBY_TYPED_EMBEDDABLE RUBY_TYPED_EMBEDDABLE @@ -164,6 +165,14 @@ rbimpl_typeddata_flags { */ RUBY_TYPED_EMBEDDABLE = 2, + /** + * This flag indicates that the dfree function for this type is safe to + * call concurrently from a background sweep thread. When set, the GC + * may free objects of this type without holding the GVL. Only set this + * flag if the dfree function does not access shared mutable state. + */ + RUBY_TYPED_CONCURRENT_FREE_SAFE = 4, + /** * This flag has something to do with Ractor. Multiple Ractors run without * protecting each other. Sharing an object among Ractors is basically diff --git a/include/ruby/internal/intern/variable.h b/include/ruby/internal/intern/variable.h index 479c3950c1e373..d983a0b0ebc30e 100644 --- a/include/ruby/internal/intern/variable.h +++ b/include/ruby/internal/intern/variable.h @@ -214,7 +214,7 @@ void rb_alias_variable(ID dst, ID src); * This just destroys the given object. @shyouhei has no idea why extension * libraries should use this API. 
*/ -void rb_free_generic_ivar(VALUE obj); +bool rb_free_generic_ivar(VALUE obj); /** * Identical to rb_iv_get(), except it accepts the name as an ::ID instead of a diff --git a/include/ruby/internal/value_type.h b/include/ruby/internal/value_type.h index b47d8afb97b2a7..88c9027f7ee537 100644 --- a/include/ruby/internal/value_type.h +++ b/include/ruby/internal/value_type.h @@ -81,6 +81,7 @@ #define T_TRUE RUBY_T_TRUE /**< @old{RUBY_T_TRUE} */ #define T_UNDEF RUBY_T_UNDEF /**< @old{RUBY_T_UNDEF} */ #define T_ZOMBIE RUBY_T_ZOMBIE /**< @old{RUBY_T_ZOMBIE} */ +#define T_LAST RUBY_T_MOVED #define BUILTIN_TYPE RB_BUILTIN_TYPE /**< @old{RB_BUILTIN_TYPE} */ #define DYNAMIC_SYM_P RB_DYNAMIC_SYM_P /**< @old{RB_DYNAMIC_SYM_P} */ diff --git a/internal/concurrent_set.h b/internal/concurrent_set.h index 76cbefab0413ec..ce0b366a3cdc66 100644 --- a/internal/concurrent_set.h +++ b/internal/concurrent_set.h @@ -11,11 +11,11 @@ struct rb_concurrent_set_funcs { void (*free)(VALUE key); }; -VALUE rb_concurrent_set_new(const struct rb_concurrent_set_funcs *funcs, int capacity); +VALUE rb_concurrent_set_new(const struct rb_concurrent_set_funcs *funcs, int capacity, int key_type); rb_atomic_t rb_concurrent_set_size(VALUE set_obj); VALUE rb_concurrent_set_find(VALUE *set_obj_ptr, VALUE key); VALUE rb_concurrent_set_find_or_insert(VALUE *set_obj_ptr, VALUE key, void *data); -VALUE rb_concurrent_set_delete_by_identity(VALUE set_obj, VALUE key); +VALUE rb_concurrent_set_delete_by_identity(VALUE *set_obj_ptr, VALUE key); void rb_concurrent_set_foreach_with_replace(VALUE set_obj, int (*callback)(VALUE *key, void *data), void *data); #endif diff --git a/io.c b/io.c index ab04d8df22864c..596f7db352bee4 100644 --- a/io.c +++ b/io.c @@ -10017,7 +10017,7 @@ argf_memsize(const void *ptr) static const rb_data_type_t argf_type = { "ARGF", {argf_mark_and_move, RUBY_TYPED_DEFAULT_FREE, argf_memsize, argf_mark_and_move}, - 0, 0, RUBY_TYPED_FREE_IMMEDIATELY + 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | 
RUBY_TYPED_CONCURRENT_FREE_SAFE }; static inline void diff --git a/io_buffer.c b/io_buffer.c index 3c7b3eb16a756b..684bb8e1c53717 100644 --- a/io_buffer.c +++ b/io_buffer.c @@ -332,7 +332,7 @@ static const rb_data_type_t rb_io_buffer_type = { .dcompact = rb_io_buffer_type_compact, }, .data = NULL, - .flags = RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED | RUBY_TYPED_EMBEDDABLE, + .flags = RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED | RUBY_TYPED_EMBEDDABLE | RUBY_TYPED_CONCURRENT_FREE_SAFE, }; static inline enum rb_io_buffer_flags diff --git a/iseq.c b/iseq.c index 6f87b2df3e085b..2c4ecb1caedf4a 100644 --- a/iseq.c +++ b/iseq.c @@ -1606,7 +1606,7 @@ static const rb_data_type_t iseqw_data_type = { iseqw_memsize, iseqw_mark_and_move, }, - 0, 0, RUBY_TYPED_FREE_IMMEDIATELY|RUBY_TYPED_WB_PROTECTED + 0, 0, RUBY_TYPED_FREE_IMMEDIATELY|RUBY_TYPED_WB_PROTECTED|RUBY_TYPED_CONCURRENT_FREE_SAFE }; static VALUE @@ -2846,7 +2846,7 @@ iseq_inspect(const rb_iseq_t *iseq) static const rb_data_type_t tmp_set = { "tmpset", {(void (*)(void *))rb_mark_set, (void (*)(void *))st_free_table, 0, 0,}, - 0, 0, RUBY_TYPED_FREE_IMMEDIATELY + 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_CONCURRENT_FREE_SAFE }; static VALUE @@ -3324,7 +3324,7 @@ cdhash_each(VALUE key, VALUE value, VALUE ary) static const rb_data_type_t label_wrapper = { "label_wrapper", {(void (*)(void *))rb_mark_tbl, (void (*)(void *))st_free_table, 0, 0,}, - 0, 0, RUBY_TYPED_FREE_IMMEDIATELY + 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_CONCURRENT_FREE_SAFE }; #define DECL_ID(name) \ diff --git a/marshal.c b/marshal.c index 967855529e6d76..c592f7fe387b4e 100644 --- a/marshal.c +++ b/marshal.c @@ -237,7 +237,7 @@ memsize_dump_arg(const void *ptr) static const rb_data_type_t dump_arg_data = { "dump_arg", {mark_dump_arg, free_dump_arg, memsize_dump_arg,}, - 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_EMBEDDABLE + 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_EMBEDDABLE | RUBY_TYPED_CONCURRENT_FREE_SAFE 
}; static VALUE @@ -1317,7 +1317,7 @@ memsize_load_arg(const void *ptr) static const rb_data_type_t load_arg_data = { "load_arg", {mark_load_arg, free_load_arg, memsize_load_arg,}, - 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_EMBEDDABLE + 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_EMBEDDABLE | RUBY_TYPED_CONCURRENT_FREE_SAFE }; #define r_entry(v, arg) r_entry0((v), (arg)->data->num_entries, (arg)) @@ -2626,7 +2626,7 @@ static const rb_data_type_t marshal_compat_type = { .dsize = marshal_compat_table_memsize, .dcompact = marshal_compat_table_mark_and_move, }, - .flags = RUBY_TYPED_WB_PROTECTED | RUBY_TYPED_FREE_IMMEDIATELY, + .flags = RUBY_TYPED_WB_PROTECTED | RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_CONCURRENT_FREE_SAFE, }; static st_table * diff --git a/memory_view.c b/memory_view.c index 9f5d6715804b22..f360c6c88091de 100644 --- a/memory_view.c +++ b/memory_view.c @@ -65,7 +65,7 @@ const rb_data_type_t rb_memory_view_exported_object_registry_data_type = { exported_object_registry_free, 0, }, - 0, 0, RUBY_TYPED_FREE_IMMEDIATELY + 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_CONCURRENT_FREE_SAFE }; static int @@ -124,7 +124,7 @@ static const rb_data_type_t memory_view_entry_data_type = { 0, 0, }, - 0, 0, RUBY_TYPED_FREE_IMMEDIATELY + 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_CONCURRENT_FREE_SAFE }; /* Register memory view functions for the given class */ diff --git a/parse.y b/parse.y index bcff7918bfa4c3..170ed08a5e87a7 100644 --- a/parse.y +++ b/parse.y @@ -2773,7 +2773,7 @@ rb_parser_ary_free(rb_parser_t *p, rb_parser_ary_t *ary) %type if_tail opt_else case_body case_args cases opt_rescue exc_list exc_var opt_ensure %type args arg_splat call_args opt_call_args %type paren_args opt_paren_args -%type args_tail block_args_tail block_args-opt_tail +%type args_tail block_args_tail %type command_args aref_args %type opt_block_arg block_arg %type var_ref var_lhs @@ -2798,7 +2798,7 @@ rb_parser_ary_free(rb_parser_t *p, rb_parser_ary_t *ary) %type p_value 
p_primitive p_variable p_var_ref p_expr_ref p_const %type p_kwargs p_kwarg p_kw %type keyword_variable user_variable sym operation2 operation3 -%type cname fname op f_rest_arg f_block_arg opt_f_block_arg f_norm_arg f_bad_arg +%type cname fname op f_rest_arg f_block_arg opt_comma f_norm_arg f_bad_arg %type f_kwrest f_label f_arg_asgn call_op call_op2 reswords relop dot_or_colon %type p_kwrest p_kwnorest p_any_kwrest p_kw_label %type f_no_kwarg f_any_kwrest args_forward excessed_comma nonlocal_var def_name @@ -2923,18 +2923,18 @@ rb_parser_ary_free(rb_parser_t *p, rb_parser_ary_t *ary) } ; -%rule args_tail_basic(value) - : f_kwarg(value) ',' f_kwrest opt_f_block_arg +%rule args_tail_basic(value, trailing) + : f_kwarg(value) ',' f_kwrest opt_f_block_arg(trailing) { $$ = new_args_tail(p, $1, $3, $4, &@3); /*% ripper: [$:1, $:3, $:4] %*/ } - | f_kwarg(value) opt_f_block_arg + | f_kwarg(value) opt_f_block_arg(trailing) { $$ = new_args_tail(p, $1, 0, $2, &@1); /*% ripper: [$:1, Qnil, $:2] %*/ } - | f_any_kwrest opt_f_block_arg + | f_any_kwrest opt_f_block_arg(trailing) { $$ = new_args_tail(p, 0, $1, $2, &@1); /*% ripper: [Qnil, $:1, $:2] %*/ @@ -2946,6 +2946,15 @@ rb_parser_ary_free(rb_parser_t *p, rb_parser_ary_t *ary) } ; +%rule opt_f_block_arg(trailing) + : ',' f_block_arg + { + $$ = $2; + /*% ripper: $:2 %*/ + } + | trailing + ; + %rule def_endless_method(bodystmt) : defn_head[head] f_opt_paren_args[args] '=' bodystmt { @@ -3087,13 +3096,13 @@ rb_parser_ary_free(rb_parser_t *p, rb_parser_ary_t *ary) } ; -%rule opt_args_tail(tail) +%rule opt_args_tail(tail, trailing) : ',' tail { $$ = $tail; /*% ripper: $:tail %*/ } - | /* none */ + | trailing { $$ = new_empty_args_tail(p, &@$); /*% ripper: [Qnil, Qnil, Qnil] %*/ @@ -4973,10 +4982,7 @@ f_any_kwrest : f_kwrest f_eq : {p->ctxt.in_argdef = 0;} '='; -block_args_tail : args_tail_basic(primary_value) - ; - -block_args-opt_tail : opt_args_tail(block_args_tail) +block_args_tail : args_tail_basic(primary_value, none) ; 
excessed_comma : ',' @@ -4987,14 +4993,14 @@ excessed_comma : ',' } ; -block_param : args-list(primary_value, block_args-opt_tail) +block_param : args-list(primary_value, opt_args_tail(block_args_tail, none)) | f_arg[pre] excessed_comma { $$ = new_empty_args_tail(p, &@excessed_comma); $$ = new_args(p, $pre, 0, $excessed_comma, 0, $$, &@$); /*% ripper: params!($:pre, Qnil, $:excessed_comma, Qnil, Qnil, Qnil, Qnil) %*/ } - | f_arg[pre] opt_args_tail(block_args_tail)[tail] + | f_arg[pre] opt_args_tail(block_args_tail, none)[tail] { $$ = new_args(p, $pre, 0, 0, 0, $tail, &@$); /*% ripper: params!($:pre, Qnil, Qnil, Qnil, *$:tail[0..2]) %*/ @@ -6240,7 +6246,7 @@ f_arglist : f_paren_args } ; -args_tail : args_tail_basic(arg_value) +args_tail : args_tail_basic(arg_value, opt_comma) | args_forward { add_forwarding_args(p); @@ -6250,7 +6256,7 @@ args_tail : args_tail_basic(arg_value) } ; -largs_tail : args_tail_basic(arg_value) +largs_tail : args_tail_basic(arg_value, none) | args_forward { yyerror1(&@args_forward, "unexpected ... in lambda argument"); @@ -6331,14 +6337,9 @@ largs_tail : args_tail_basic(arg_value) } ; -%rule f_args-opt_tail(tail) - : opt_args_tail(tail) - ; - - -%rule f_args-list(tail) - : args-list(arg_value, f_args-opt_tail(tail)) - | f_arg[pre] opt_args_tail(tail)[tail] +%rule f_args-list(tail, trailing) + : args-list(arg_value, opt_args_tail(tail, trailing)) + | f_arg[pre] opt_args_tail(tail, trailing)[tail] { $$ = new_args(p, $pre, 0, 0, 0, $tail, &@$); /*% ripper: params!($:pre, Qnil, Qnil, Qnil, *$:tail[0..2]) %*/ @@ -6347,10 +6348,10 @@ largs_tail : args_tail_basic(arg_value) | f_empty_arg ; -f_args : f_args-list(args_tail) +f_args : f_args-list(args_tail, opt_comma) ; -f_largs : f_args-list(largs_tail) +f_largs : f_args-list(largs_tail, none) ; args_forward : tBDOT3 @@ -6538,12 +6539,11 @@ f_block_arg : blkarg_mark tIDENTIFIER } ; -opt_f_block_arg : ',' f_block_arg +opt_comma : ','? 
{ - $$ = $2; - /*% ripper: $:2 %*/ + $$ = 0; + /*% ripper: Qnil %*/ } - | none ; diff --git a/proc.c b/proc.c index 99fb880881b9d8..1550b9ad8c5ec1 100644 --- a/proc.c +++ b/proc.c @@ -106,7 +106,7 @@ const rb_data_type_t ruby_proc_data_type = { proc_memsize, proc_mark_and_move, }, - 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED + 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED | RUBY_TYPED_CONCURRENT_FREE_SAFE }; #define proc_data_type ruby_proc_data_type @@ -285,7 +285,7 @@ const rb_data_type_t ruby_binding_data_type = { binding_memsize, binding_mark_and_move, }, - 0, 0, RUBY_TYPED_WB_PROTECTED | RUBY_TYPED_FREE_IMMEDIATELY + 0, 0, RUBY_TYPED_WB_PROTECTED | RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_CONCURRENT_FREE_SAFE }; VALUE @@ -1795,7 +1795,7 @@ static const rb_data_type_t method_data_type = { NULL, // No external memory to report, bm_mark_and_move, }, - 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED | RUBY_TYPED_EMBEDDABLE | RUBY_TYPED_FROZEN_SHAREABLE_NO_REC + 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED | RUBY_TYPED_EMBEDDABLE | RUBY_TYPED_FROZEN_SHAREABLE_NO_REC | RUBY_TYPED_CONCURRENT_FREE_SAFE }; VALUE diff --git a/process.c b/process.c index 126e36ee8d0d2a..be912be27cc754 100644 --- a/process.c +++ b/process.c @@ -597,7 +597,7 @@ static const rb_data_type_t rb_process_status_type = { .dfree = RUBY_DEFAULT_FREE, .dsize = NULL, }, - .flags = RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED | RUBY_TYPED_EMBEDDABLE, + .flags = RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED | RUBY_TYPED_EMBEDDABLE | RUBY_TYPED_CONCURRENT_FREE_SAFE, }; static VALUE @@ -1582,8 +1582,6 @@ before_fork_ruby(void) static void after_fork_ruby(rb_pid_t pid) { - rb_gc_after_fork(pid); - if (pid == 0) { // child clear_pid_cache(); @@ -1593,6 +1591,8 @@ after_fork_ruby(rb_pid_t pid) // parent after_exec(); } + + rb_gc_after_fork(pid); } #endif @@ -1740,7 +1740,7 @@ memsize_exec_arg(const void *ptr) static const 
rb_data_type_t exec_arg_data_type = { "exec_arg", {mark_exec_arg, RUBY_TYPED_DEFAULT_FREE, memsize_exec_arg}, - 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_EMBEDDABLE + 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_EMBEDDABLE | RUBY_TYPED_CONCURRENT_FREE_SAFE }; #ifdef _WIN32 @@ -4131,7 +4131,7 @@ rb_fork_ruby(int *status) struct child_handler_disabler_state old; do { - prefork(); + prefork(); // NOTE: can context switch before_fork_ruby(); rb_thread_acquire_fork_lock(); diff --git a/ractor.c b/ractor.c index 4726cf107bfb03..3deef5f6719410 100644 --- a/ractor.c +++ b/ractor.c @@ -321,7 +321,7 @@ static const rb_data_type_t ractor_data_type = { ractor_memsize, NULL, // update }, - 0, 0, RUBY_TYPED_FREE_IMMEDIATELY /* | RUBY_TYPED_WB_PROTECTED */ + 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_CONCURRENT_FREE_SAFE /* | RUBY_TYPED_WB_PROTECTED */ }; bool @@ -2450,7 +2450,7 @@ static const rb_data_type_t cross_ractor_require_data_type = { NULL, // memsize NULL, // compact }, - 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED | RUBY_TYPED_DECL_MARKING | RUBY_TYPED_EMBEDDABLE + 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED | RUBY_TYPED_DECL_MARKING | RUBY_TYPED_EMBEDDABLE | RUBY_TYPED_CONCURRENT_FREE_SAFE }; static VALUE diff --git a/ractor_core.h b/ractor_core.h index c692ebbbbfc638..8f53e599bbc3f0 100644 --- a/ractor_core.h +++ b/ractor_core.h @@ -5,6 +5,8 @@ #include "id_table.h" #include "vm_debug.h" +// FIXME: parallel sweep +#define RACTOR_CHECK_MODE 0 #ifndef RACTOR_CHECK_MODE #define RACTOR_CHECK_MODE (VM_CHECK_MODE || RUBY_DEBUG) && (SIZEOF_UINT64_T == SIZEOF_VALUE) #endif diff --git a/ractor_sync.c b/ractor_sync.c index 44c84ded92696f..405a7f8248eb08 100644 --- a/ractor_sync.c +++ b/ractor_sync.c @@ -36,7 +36,7 @@ static const rb_data_type_t ractor_port_data_type = { NULL, // memsize NULL, // update }, - 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED | RUBY_TYPED_FROZEN_SHAREABLE | RUBY_TYPED_EMBEDDABLE, + 0, 0, 
RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_CONCURRENT_FREE_SAFE | RUBY_TYPED_WB_PROTECTED | RUBY_TYPED_FROZEN_SHAREABLE | RUBY_TYPED_EMBEDDABLE, }; static st_data_t diff --git a/random.c b/random.c index b6c96f1b4d25ff..6795165962fe86 100644 --- a/random.c +++ b/random.c @@ -272,7 +272,7 @@ const rb_data_type_t rb_random_data_type = { random_free, random_memsize, }, - 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED + 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED | RUBY_TYPED_CONCURRENT_FREE_SAFE }; #define random_mt_mark rb_random_mark @@ -293,7 +293,7 @@ static const rb_data_type_t random_mt_type = { }, &rb_random_data_type, (void *)&random_mt_if, - RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED + RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED | RUBY_TYPED_CONCURRENT_FREE_SAFE }; static rb_random_t * @@ -578,7 +578,7 @@ release_crypt(void *p) static const rb_data_type_t crypt_prov_type = { "HCRYPTPROV", {0, release_crypt,}, - 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_EMBEDDABLE + 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_EMBEDDABLE | RUBY_TYPED_CONCURRENT_FREE_SAFE }; static int diff --git a/regexec.c b/regexec.c index 3210c7cc1b5603..73f49d2963ad5a 100644 --- a/regexec.c +++ b/regexec.c @@ -905,16 +905,13 @@ onig_region_resize(OnigRegion* region, int n) if (n < ONIG_NREGION) n = ONIG_NREGION; + size_t region_half_sz = n * sizeof(OnigPosition); if (region->allocated == 0) { - region->beg = (OnigPosition* )xmalloc(n * sizeof(OnigPosition)); + region->beg = (OnigPosition* )xmalloc(region_half_sz * 2); if (region->beg == 0) return ONIGERR_MEMORY; - region->end = (OnigPosition* )xmalloc(n * sizeof(OnigPosition)); - if (region->end == 0) { - xfree(region->beg); - return ONIGERR_MEMORY; - } + region->end = (OnigPosition* )region->beg + n; region->allocated = n; } @@ -922,20 +919,13 @@ onig_region_resize(OnigRegion* region, int n) OnigPosition *tmp; region->allocated = 0; - tmp = (OnigPosition* )xrealloc(region->beg, n * 
sizeof(OnigPosition)); + tmp = (OnigPosition* )xrealloc(region->beg, region_half_sz * 2); if (tmp == 0) { xfree(region->beg); - xfree(region->end); return ONIGERR_MEMORY; } region->beg = tmp; - tmp = (OnigPosition* )xrealloc(region->end, n * sizeof(OnigPosition)); - if (tmp == 0) { - xfree(region->beg); - xfree(region->end); - return ONIGERR_MEMORY; - } - region->end = tmp; + region->end = (OnigPosition*)region->beg + n; region->allocated = n; } @@ -998,7 +988,6 @@ onig_region_free(OnigRegion* r, int free_self) if (r) { if (r->allocated > 0) { xfree(r->beg); - xfree(r->end); } #ifdef USE_CAPTURE_HISTORY history_root_free(r); diff --git a/ruby_parser.c b/ruby_parser.c index 267f619bf9cd18..d58d69de535f59 100644 --- a/ruby_parser.c +++ b/ruby_parser.c @@ -508,7 +508,7 @@ static const rb_data_type_t ruby_parser_data_type = { parser_free, parser_memsize, }, - 0, 0, RUBY_TYPED_FREE_IMMEDIATELY + 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_CONCURRENT_FREE_SAFE }; #ifdef UNIVERSAL_PARSER @@ -736,7 +736,7 @@ static const rb_data_type_t ast_data_type = { ast_free, NULL, // No dsize() because this object does not appear in ObjectSpace. 
}, - 0, 0, RUBY_TYPED_FREE_IMMEDIATELY + 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_CONCURRENT_FREE_SAFE }; static VALUE diff --git a/scheduler.c b/scheduler.c index c2f370a22aee4e..d542702d45b86d 100644 --- a/scheduler.c +++ b/scheduler.c @@ -90,7 +90,7 @@ static const rb_data_type_t blocking_operation_data_type = { RUBY_DEFAULT_FREE, blocking_operation_memsize, }, - 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED | RUBY_TYPED_EMBEDDABLE + 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_CONCURRENT_FREE_SAFE | RUBY_TYPED_WB_PROTECTED | RUBY_TYPED_EMBEDDABLE }; /* diff --git a/set.c b/set.c index 6bfded02a414ee..fc826aa5f6eeee 100644 --- a/set.c +++ b/set.c @@ -186,7 +186,7 @@ static const rb_data_type_t set_data_type = { .dsize = set_size, .dcompact = set_update_references, }, - .flags = RUBY_TYPED_EMBEDDABLE | RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED | RUBY_TYPED_FROZEN_SHAREABLE + .flags = RUBY_TYPED_EMBEDDABLE | RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED | RUBY_TYPED_FROZEN_SHAREABLE | RUBY_TYPED_CONCURRENT_FREE_SAFE }; static inline set_table * diff --git a/shape.c b/shape.c index 90036722f10026..bd9c2fc089c3b3 100644 --- a/shape.c +++ b/shape.c @@ -322,7 +322,7 @@ static const rb_data_type_t shape_tree_type = { .dsize = shape_tree_memsize, .dcompact = shape_tree_mark_and_move, }, - .flags = RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED, + .flags = RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED | RUBY_TYPED_CONCURRENT_FREE_SAFE, }; diff --git a/string.c b/string.c index 55a229f37c3b5c..a6b6427bc1f5d2 100644 --- a/string.c +++ b/string.c @@ -549,7 +549,7 @@ static const struct rb_concurrent_set_funcs fstring_concurrent_set_funcs = { void Init_fstring_table(void) { - fstring_table_obj = rb_concurrent_set_new(&fstring_concurrent_set_funcs, 8192); + fstring_table_obj = rb_concurrent_set_new(&fstring_concurrent_set_funcs, 8192, T_STRING); rb_gc_register_address(&fstring_table_obj); } @@ -593,13 +593,11 @@ 
rb_obj_is_fstring_table(VALUE obj) void rb_gc_free_fstring(VALUE obj) { - ASSERT_vm_locking_with_barrier(); - RUBY_ASSERT(FL_TEST(obj, RSTRING_FSTR)); RUBY_ASSERT(OBJ_FROZEN(obj)); RUBY_ASSERT(!FL_TEST(obj, STR_SHARED)); - rb_concurrent_set_delete_by_identity(fstring_table_obj, obj); + rb_concurrent_set_delete_by_identity(&fstring_table_obj, obj); RB_DEBUG_COUNTER_INC(obj_str_fstr); @@ -7835,7 +7833,7 @@ mapping_buffer_free(void *p) static const rb_data_type_t mapping_buffer_type = { "mapping_buffer", {0, mapping_buffer_free,}, - 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED + 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED | RUBY_TYPED_CONCURRENT_FREE_SAFE }; static VALUE diff --git a/symbol.c b/symbol.c index d3d7e13ea43626..e7a74b2550e6ce 100644 --- a/symbol.c +++ b/symbol.c @@ -233,17 +233,19 @@ static VALUE dup_string_for_create(VALUE str) { rb_encoding *enc = rb_enc_get(str); + VALUE new_str; - str = rb_enc_str_new(RSTRING_PTR(str), RSTRING_LEN(str), enc); + new_str = rb_enc_str_new(RSTRING_PTR(str), RSTRING_LEN(str), enc); + RB_GC_GUARD(str); rb_encoding *ascii = rb_usascii_encoding(); - if (enc != ascii && sym_check_asciionly(str, false)) { - rb_enc_associate(str, ascii); + if (enc != ascii && sym_check_asciionly(new_str, false)) { + rb_enc_associate(new_str, ascii); } - OBJ_FREEZE(str); + OBJ_FREEZE(new_str); - str = rb_fstring(str); - return str; + new_str = rb_fstring(new_str); + return new_str; } static int @@ -338,6 +340,7 @@ sym_set_create(VALUE sym, void *data) RB_VM_LOCKING() { set_id_entry(&ruby_global_symbols, rb_id_to_serial(STATIC_SYM2ID(static_sym)), str, static_sym); } + RB_GC_GUARD(str); return sym_set_static_sym_tag(new_static_sym_entry); } @@ -415,7 +418,7 @@ Init_sym(void) { rb_symbols_t *symbols = &ruby_global_symbols; - symbols->sym_set = rb_concurrent_set_new(&sym_set_funcs, 1024); + symbols->sym_set = rb_concurrent_set_new(&sym_set_funcs, 1024, T_SYMBOL); symbols->ids = rb_ary_hidden_new(0); Init_op_tbl(); @@ 
-950,7 +953,7 @@ rb_gc_free_dsymbol(VALUE sym) VALUE str = RSYMBOL(sym)->fstr; if (str) { - rb_concurrent_set_delete_by_identity(ruby_global_symbols.sym_set, sym); + rb_concurrent_set_delete_by_identity(&ruby_global_symbols.sym_set, sym); RSYMBOL(sym)->fstr = 0; } diff --git a/test/-ext-/tracepoint/test_tracepoint.rb b/test/-ext-/tracepoint/test_tracepoint.rb index 603fd01fd5c7e6..4805b323baa9af 100644 --- a/test/-ext-/tracepoint/test_tracepoint.rb +++ b/test/-ext-/tracepoint/test_tracepoint.rb @@ -47,7 +47,6 @@ def test_tracks_objspace_count assert_operator stat2[:total_allocated_objects] - stat1[:total_allocated_objects], :>=, newobj_count assert_operator 1_000_000, :<=, newobj_count - assert_operator stat2[:total_freed_objects] + stat2[:heap_final_slots] - stat1[:total_freed_objects], :>=, free_count assert_operator stat2[:count] - stat1[:count], :==, gc_start_count assert_operator gc_start_count, :==, gc_end_mark_count diff --git a/test/ruby/test_process.rb b/test/ruby/test_process.rb index d99e356e69bfd4..276a18e931b63f 100644 --- a/test/ruby/test_process.rb +++ b/test/ruby/test_process.rb @@ -1941,7 +1941,7 @@ def test_daemon_no_threads puts Dir.entries("/proc/self/task") - %W[. ..] 
end bug4920 = '[ruby-dev:43873]' - assert_include(1..2, data.size, bug4920) + assert_include(1..3, data.size, bug4920) assert_not_include(data.map(&:to_i), pid) end else # darwin diff --git a/test/ruby/test_syntax.rb b/test/ruby/test_syntax.rb index 70e19568160d0e..2d04858bde46da 100644 --- a/test/ruby/test_syntax.rb +++ b/test/ruby/test_syntax.rb @@ -222,6 +222,16 @@ def test_no_block_argument_in_method assert_raise_with_message(ArgumentError, /block accepted/) {obj.f(&proc {})} end + def test_trailing_comma_in_method_parameters + assert_valid_syntax("def f(a,b,c,); end") + assert_valid_syntax("def f(a,b,*c,); end") + assert_valid_syntax("def f(a,b,*,); end") + assert_valid_syntax("def f(a,b,**c,); end") + assert_valid_syntax("def f(a,b,**,); end") + assert_syntax_error("def f(a,b,&block,); end", /unexpected/) + assert_syntax_error("def f(a,b,...,); end", /unexpected/) + end + def test_no_block_argument_in_block assert_valid_syntax("proc do |&nil| end") assert_valid_syntax("proc do |a, &nil| end") diff --git a/test/ruby/test_zjit.rb b/test/ruby/test_zjit.rb index 0c7d76bdf67292..84ded50300b114 100644 --- a/test/ruby/test_zjit.rb +++ b/test/ruby/test_zjit.rb @@ -395,7 +395,9 @@ def array.itself = :not_itself test(array) fxt_files = Dir.glob("/tmp/perfetto-\#{Process.pid}.fxt") - fxt_files.length == 1 && !File.empty?(fxt_files.first) + result = fxt_files.length == 1 && !File.empty?(fxt_files.first) + File.unlink(*fxt_files) + result RUBY end diff --git a/thread.c b/thread.c index f876b4bd05c80e..444cd14d955e3b 100644 --- a/thread.c +++ b/thread.c @@ -446,18 +446,26 @@ rb_threadptr_join_list_wakeup(rb_thread_t *thread) } } +void mutexes_lock_lock(void); +void mutexes_lock_unlock(void); + void rb_threadptr_unlock_all_locking_mutexes(rb_thread_t *th) { + mutexes_lock_lock(); while (th->keeping_mutexes) { rb_mutex_t *mutex = th->keeping_mutexes; - th->keeping_mutexes = mutex->next_mutex; - + rb_mutex_t *next = mutex->next_mutex; + th->keeping_mutexes = next; + 
mutex->next_mutex = NULL; + mutexes_lock_unlock(); // rb_warn("mutex #<%p> was not unlocked by thread #<%p>", (void *)mutex, (void*)th); VM_ASSERT(mutex->ec_serial); - const char *error_message = rb_mutex_unlock_th(mutex, th, 0); + const char *error_message = rb_mutex_unlock_th(mutex, th, 0, false); if (error_message) rb_bug("invalid keeping_mutexes: %s", error_message); + mutexes_lock_lock(); } + mutexes_lock_unlock(); } void @@ -5011,6 +5019,9 @@ rb_thread_atfork_internal(rb_thread_t *th, void (*atfork)(rb_thread_t *, const r rb_thread_reset_timer_thread(); rb_thread_start_timer_thread(); + void mutexes_lock_reset(void); + mutexes_lock_reset(); // TODO: should be on thread + VM_ASSERT(vm->ractor.blocking_cnt == 0); VM_ASSERT(vm->ractor.cnt == 1); } @@ -5081,7 +5092,7 @@ static const rb_data_type_t thgroup_data_type = { RUBY_TYPED_DEFAULT_FREE, NULL, // No external memory to report }, - 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED | RUBY_TYPED_EMBEDDABLE + 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED | RUBY_TYPED_EMBEDDABLE | RUBY_TYPED_CONCURRENT_FREE_SAFE }; /* @@ -5250,7 +5261,7 @@ thread_shield_mark(void *ptr) static const rb_data_type_t thread_shield_data_type = { "thread_shield", {thread_shield_mark, 0, 0,}, - 0, 0, RUBY_TYPED_FREE_IMMEDIATELY + 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_CONCURRENT_FREE_SAFE }; static VALUE diff --git a/thread_sync.c b/thread_sync.c index cf4e3843ff6c2f..1ee77b6aeeae0f 100644 --- a/thread_sync.c +++ b/thread_sync.c @@ -80,7 +80,7 @@ static void rb_mutex_abandon_all(rb_mutex_t *mutexes); static void rb_mutex_abandon_keeping_mutexes(rb_thread_t *th); static void rb_mutex_abandon_locking_mutex(rb_thread_t *th); #endif -static const char* rb_mutex_unlock_th(rb_mutex_t *mutex, rb_thread_t *th, rb_serial_t ec_serial); +static const char* rb_mutex_unlock_th(rb_mutex_t *mutex, rb_thread_t *th, rb_serial_t ec_serial, bool unlink_from_keeping); static size_t rb_mutex_num_waiting(rb_mutex_t *mutex) @@ 
-95,7 +95,52 @@ rb_mutex_num_waiting(rb_mutex_t *mutex) return n; } -rb_thread_t* rb_fiber_threadptr(const rb_fiber_t *fiber); +// TODO: mutexes_lock should be per-thread (on rb_thread_struct) +rb_nativethread_lock_t mutexes_lock = PTHREAD_MUTEX_INITIALIZER; +#ifdef RUBY_THREAD_PTHREAD_H +pthread_t mutexes_lock_lock_owner; +#endif + +static inline void +ASSERT_mutexes_lock_locked(void) +{ +#ifdef RUBY_THREAD_PTHREAD_H + VM_ASSERT(pthread_self() == mutexes_lock_lock_owner); +#endif +} + +static inline void +ASSERT_mutexes_lock_unlocked(void) +{ +#ifdef RUBY_THREAD_PTHREAD_H + VM_ASSERT(pthread_self() != mutexes_lock_lock_owner); +#endif +} + +void +mutexes_lock_lock(void) { + ASSERT_mutexes_lock_unlocked(); + rb_native_mutex_lock(&mutexes_lock); +#ifdef RUBY_THREAD_PTHREAD_H + mutexes_lock_lock_owner = pthread_self(); +#endif +} + +void +mutexes_lock_unlock(void) { + ASSERT_mutexes_lock_locked(); +#ifdef RUBY_THREAD_PTHREAD_H + mutexes_lock_lock_owner = 0; +#endif + rb_native_mutex_unlock(&mutexes_lock); +} + +void +mutexes_lock_reset(void) +{ + rb_native_mutex_initialize(&mutexes_lock); +} + static bool mutex_locked_p(rb_mutex_t *mutex) @@ -108,7 +153,7 @@ mutex_free(void *ptr) { rb_mutex_t *mutex = ptr; if (mutex_locked_p(mutex)) { - const char *err = rb_mutex_unlock_th(mutex, mutex->th, 0); + const char *err = rb_mutex_unlock_th(mutex, mutex->th, 0, true); if (err) rb_bug("%s", err); } ruby_xfree(ptr); @@ -123,7 +168,7 @@ mutex_memsize(const void *ptr) static const rb_data_type_t mutex_data_type = { "mutex", {NULL, mutex_free, mutex_memsize,}, - 0, 0, RUBY_TYPED_FREE_IMMEDIATELY + 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_CONCURRENT_FREE_SAFE }; static rb_mutex_t * @@ -172,27 +217,35 @@ static void thread_mutex_insert(rb_thread_t *thread, rb_mutex_t *mutex) { RUBY_ASSERT(!mutex->next_mutex); - if (thread->keeping_mutexes) { - mutex->next_mutex = thread->keeping_mutexes; - } + mutexes_lock_lock(); + { + if (thread->keeping_mutexes) { + mutex->next_mutex = 
thread->keeping_mutexes; + } - thread->keeping_mutexes = mutex; + thread->keeping_mutexes = mutex; + } + mutexes_lock_unlock(); } static void thread_mutex_remove(rb_thread_t *thread, rb_mutex_t *mutex) { - rb_mutex_t **keeping_mutexes = &thread->keeping_mutexes; + mutexes_lock_lock(); + { + rb_mutex_t **keeping_mutexes = &thread->keeping_mutexes; - while (*keeping_mutexes && *keeping_mutexes != mutex) { - // Move to the next mutex in the list: - keeping_mutexes = &(*keeping_mutexes)->next_mutex; - } + while (*keeping_mutexes && *keeping_mutexes != mutex) { + // Move to the next mutex in the list: + keeping_mutexes = &(*keeping_mutexes)->next_mutex; + } - if (*keeping_mutexes) { - *keeping_mutexes = mutex->next_mutex; - mutex->next_mutex = NULL; + if (*keeping_mutexes) { + *keeping_mutexes = mutex->next_mutex; + mutex->next_mutex = NULL; + } } + mutexes_lock_unlock(); } static void @@ -441,7 +494,10 @@ rb_mutex_owned_p(VALUE self) } static const char * -rb_mutex_unlock_th(rb_mutex_t *mutex, rb_thread_t *th, rb_serial_t ec_serial) +// m = Mutex.new +// m.lock() Thread.current.keeping_mutexes << m +// +rb_mutex_unlock_th(rb_mutex_t *mutex, rb_thread_t *th, rb_serial_t ec_serial, bool unlink_from_keeping) { RUBY_DEBUG_LOG("%p", mutex); @@ -455,7 +511,9 @@ rb_mutex_unlock_th(rb_mutex_t *mutex, rb_thread_t *th, rb_serial_t ec_serial) struct sync_waiter *cur = 0, *next; mutex->ec_serial = 0; - thread_mutex_remove(th, mutex); + if (unlink_from_keeping) { + thread_mutex_remove(th, mutex); + } ccan_list_for_each_safe(&mutex->waitq, cur, next, node) { ccan_list_del_init(&cur->node); @@ -492,7 +550,7 @@ do_mutex_unlock(struct mutex_args *args) rb_mutex_t *mutex = args->mutex; rb_thread_t *th = rb_ec_thread_ptr(args->ec); - err = rb_mutex_unlock_th(mutex, th, rb_ec_serial(args->ec)); + err = rb_mutex_unlock_th(mutex, th, rb_ec_serial(args->ec), true); if (err) rb_raise(rb_eThreadError, "%s", err); } @@ -535,8 +593,12 @@ rb_mut_unlock(rb_execution_context_t *ec, VALUE self) 
static void rb_mutex_abandon_keeping_mutexes(rb_thread_t *th) { - rb_mutex_abandon_all(th->keeping_mutexes); - th->keeping_mutexes = NULL; + mutexes_lock_lock(); + { + rb_mutex_abandon_all(th->keeping_mutexes); + th->keeping_mutexes = NULL; + } + mutexes_lock_unlock(); } static void @@ -727,7 +789,7 @@ static const rb_data_type_t queue_data_type = { .dsize = queue_memsize, .dcompact = queue_mark_and_move, }, - .flags = RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED, + .flags = RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED | RUBY_TYPED_CONCURRENT_FREE_SAFE, }; static VALUE @@ -833,7 +895,7 @@ static const rb_data_type_t szqueue_data_type = { .dcompact = szqueue_mark_and_move, }, .parent = &queue_data_type, - .flags = RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED, + .flags = RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED | RUBY_TYPED_CONCURRENT_FREE_SAFE, }; static VALUE @@ -1173,7 +1235,7 @@ condvar_memsize(const void *ptr) static const rb_data_type_t cv_data_type = { "condvar", {0, RUBY_TYPED_DEFAULT_FREE, condvar_memsize,}, - 0, 0, RUBY_TYPED_FREE_IMMEDIATELY|RUBY_TYPED_WB_PROTECTED + 0, 0, RUBY_TYPED_FREE_IMMEDIATELY|RUBY_TYPED_WB_PROTECTED|RUBY_TYPED_CONCURRENT_FREE_SAFE }; static struct rb_condvar * diff --git a/time.c b/time.c index c3bda3f6af0472..261437a747a2f7 100644 --- a/time.c +++ b/time.c @@ -1909,7 +1909,7 @@ static const rb_data_type_t time_data_type = { .dsize = NULL, .dcompact = time_mark_and_move, }, - .flags = RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_FROZEN_SHAREABLE | RUBY_TYPED_WB_PROTECTED | RUBY_TYPED_EMBEDDABLE, + .flags = RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_FROZEN_SHAREABLE | RUBY_TYPED_WB_PROTECTED | RUBY_TYPED_EMBEDDABLE | RUBY_TYPED_CONCURRENT_FREE_SAFE, }; static VALUE diff --git a/transcode.c b/transcode.c index f8b0fec42ef275..ede9002d7d8152 100644 --- a/transcode.c +++ b/transcode.c @@ -3019,7 +3019,7 @@ econv_memsize(const void *ptr) static const rb_data_type_t econv_data_type = { "econv", {0, 
econv_free, econv_memsize,}, - 0, 0, RUBY_TYPED_FREE_IMMEDIATELY + 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_CONCURRENT_FREE_SAFE }; static VALUE diff --git a/variable.c b/variable.c index 9d0e4e4a2b9eac..35eb86443a9d75 100644 --- a/variable.c +++ b/variable.c @@ -579,6 +579,7 @@ void rb_free_generic_fields_tbl_(void) { st_free_table(generic_fields_tbl_); + generic_fields_tbl_ = NULL; } static struct rb_global_entry* @@ -1225,11 +1226,71 @@ ivar_ractor_check(VALUE obj, ID id) } } +// TODO: platforms other than pthread +static rb_nativethread_lock_t gen_fields_tbl_lock_ = PTHREAD_MUTEX_INITIALIZER; +#ifdef RUBY_THREAD_PTHREAD_H +static pthread_t gen_fields_tbl_lock_owner; +#endif +static unsigned int gen_fields_tbl_lock_lvl; + +static inline void +ASSERT_gen_fields_tbl_locked(void) +{ +#ifdef RUBY_THREAD_PTHREAD_H + VM_ASSERT(pthread_self() == gen_fields_tbl_lock_owner); +#endif +} + +static inline void +ASSERT_gen_fields_tbl_unlocked(void) +{ +#ifdef RUBY_THREAD_PTHREAD_H + VM_ASSERT(pthread_self() != gen_fields_tbl_lock_owner); +#endif +} + +static inline void +gen_fields_tbl_lock(bool allow_reentry) +{ + if (allow_reentry && pthread_self() == gen_fields_tbl_lock_owner) { + } else { + ASSERT_gen_fields_tbl_unlocked(); + rb_native_mutex_lock(&gen_fields_tbl_lock_); + gen_fields_tbl_lock_owner = pthread_self(); + } + gen_fields_tbl_lock_lvl++; +} + +static inline bool +gen_fields_tbl_trylock(bool allow_reentry) +{ + if (allow_reentry && pthread_self() == gen_fields_tbl_lock_owner) { + } else { + ASSERT_gen_fields_tbl_unlocked(); + if (rb_native_mutex_trylock(&gen_fields_tbl_lock_) == EBUSY) { + return false; + } + gen_fields_tbl_lock_owner = pthread_self(); + } + gen_fields_tbl_lock_lvl++; + return true; +} + +static inline void +gen_fields_tbl_unlock(void) +{ + ASSERT_gen_fields_tbl_locked(); + RUBY_ASSERT(gen_fields_tbl_lock_lvl > 0); + gen_fields_tbl_lock_lvl--; + if (gen_fields_tbl_lock_lvl == 0) { + gen_fields_tbl_lock_owner = 0; + 
rb_native_mutex_unlock(&gen_fields_tbl_lock_); + } +} + static inline struct st_table * generic_fields_tbl_no_ractor_check(void) { - ASSERT_vm_locking(); - return generic_fields_tbl_; } @@ -1243,21 +1304,27 @@ void rb_mark_generic_ivar(VALUE obj) { VALUE data; - // Bypass ASSERT_vm_locking() check because marking may happen concurrently with mmtk - if (st_lookup(generic_fields_tbl_, (st_data_t)obj, (st_data_t *)&data)) { - rb_gc_mark_movable(data); + gen_fields_tbl_lock(true); + { + // Bypass ASSERT_vm_locking() check because marking may happen concurrently with mmtk + if (st_lookup(generic_fields_tbl_, (st_data_t)obj, (st_data_t *)&data)) { + rb_gc_mark_movable(data); + } } + gen_fields_tbl_unlock(); } VALUE rb_obj_fields_generic_uncached(VALUE obj) { VALUE fields_obj = 0; - RB_VM_LOCKING() { + gen_fields_tbl_lock(false); + { if (!st_lookup(generic_fields_tbl_, (st_data_t)obj, (st_data_t *)&fields_obj)) { rb_bug("Object is missing entry in generic_fields_tbl"); } } + gen_fields_tbl_unlock(); return fields_obj; } @@ -1301,9 +1368,10 @@ rb_obj_fields(VALUE obj, ID field_name) return fields_obj; } -void +bool rb_free_generic_ivar(VALUE obj) { + bool result = true; if (rb_obj_gen_fields_p(obj)) { st_data_t key = (st_data_t)obj, value; switch (BUILTIN_TYPE(obj)) { @@ -1324,20 +1392,32 @@ rb_free_generic_ivar(VALUE obj) { // Other EC may have stale caches, so fields_obj should be // invalidated and the GC will replace with Qundef - rb_execution_context_t *ec = GET_EC(); - if (ec->gen_fields_cache.obj == obj) { + rb_execution_context_t *ec = rb_current_execution_context(false); + if (ec && ec->gen_fields_cache.obj == obj) { ec->gen_fields_cache.obj = Qundef; ec->gen_fields_cache.fields_obj = Qundef; } - RB_VM_LOCKING() { + if (ec) { + gen_fields_tbl_lock(true); // needs to be re-entrant + } + else { + bool did_lock = gen_fields_tbl_trylock(false); + // If we can't acquire it, bail (could lead to deadlock) + if (!did_lock) return false; + } + // gen_fields_tbl_lock(); + { 
if (!st_delete(generic_fields_tbl_no_ractor_check(), &key, &value)) { + gen_fields_tbl_unlock(); rb_bug("Object is missing entry in generic_fields_tbl"); } } + gen_fields_tbl_unlock(); } } RBASIC_SET_SHAPE_ID(obj, ROOT_SHAPE_ID); } + return result; } static void @@ -1372,8 +1452,12 @@ rb_obj_set_fields(VALUE obj, VALUE fields_obj, ID field_name, VALUE original_fie default: generic_fields: { - RB_VM_LOCKING() { - st_insert(generic_fields_tbl_, (st_data_t)obj, (st_data_t)fields_obj); + RB_VM_LOCKING() { // needed in case insert triggers GC + gen_fields_tbl_lock(false); + { + st_insert(generic_fields_tbl_, (st_data_t)obj, (st_data_t)fields_obj); + } + gen_fields_tbl_unlock(); } RB_OBJ_WRITTEN(obj, original_fields_obj, fields_obj); @@ -2296,6 +2380,7 @@ rb_replace_generic_ivar(VALUE clone, VALUE obj) { RB_VM_LOCKING() { st_data_t fields_tbl, obj_data = (st_data_t)obj; + // We've STW at this point, no need to lock gen_fields_tbl_lock if (st_delete(generic_fields_tbl_, &obj_data, &fields_tbl)) { st_insert(generic_fields_tbl_, (st_data_t)clone, fields_tbl); RB_OBJ_WRITTEN(clone, Qundef, fields_tbl); @@ -2584,6 +2669,45 @@ rb_mod_const_missing(VALUE klass, VALUE name) UNREACHABLE_RETURN(Qnil); } +rb_nativethread_lock_t autoload_free_lock = PTHREAD_MUTEX_INITIALIZER; +#ifdef RUBY_THREAD_PTHREAD_H +pthread_t autoload_free_lock_owner; +#endif + +static inline void +ASSERT_autoload_free_lock_locked(void) +{ +#ifdef RUBY_THREAD_PTHREAD_H + VM_ASSERT(pthread_self() == autoload_free_lock_owner); +#endif +} + +static inline void +ASSERT_autoload_free_lock_unlocked(void) +{ +#ifdef RUBY_THREAD_PTHREAD_H + VM_ASSERT(pthread_self() != autoload_free_lock_owner); +#endif +} + +static inline void +autoload_free_lock_lock(void) { + ASSERT_autoload_free_lock_unlocked(); + rb_native_mutex_lock(&autoload_free_lock); +#ifdef RUBY_THREAD_PTHREAD_H + autoload_free_lock_owner = pthread_self(); +#endif +} + +static inline void +autoload_free_lock_unlock(void) { + 
ASSERT_autoload_free_lock_locked(); +#ifdef RUBY_THREAD_PTHREAD_H + autoload_free_lock_owner = 0; +#endif + rb_native_mutex_unlock(&autoload_free_lock); +} + static void autoload_table_mark(void *ptr) { @@ -2612,7 +2736,7 @@ autoload_table_compact(void *ptr) static const rb_data_type_t autoload_table_type = { "autoload_table", {autoload_table_mark, autoload_table_free, autoload_table_memsize, autoload_table_compact,}, - 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED + 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED | RUBY_TYPED_CONCURRENT_FREE_SAFE }; #define check_autoload_table(av) \ @@ -2705,10 +2829,14 @@ autoload_data_free(void *ptr) { struct autoload_data *p = ptr; - struct autoload_const *autoload_const, *next; - ccan_list_for_each_safe(&p->constants, autoload_const, next, cnode) { - ccan_list_del_init(&autoload_const->cnode); + autoload_free_lock_lock(); + { + struct autoload_const *autoload_const, *next; + ccan_list_for_each_safe(&p->constants, autoload_const, next, cnode) { + ccan_list_del_init(&autoload_const->cnode); + } } + autoload_free_lock_unlock(); SIZED_FREE(p); } @@ -2722,7 +2850,7 @@ autoload_data_memsize(const void *ptr) static const rb_data_type_t autoload_data_type = { "autoload_data", {autoload_data_mark_and_move, autoload_data_free, autoload_data_memsize, autoload_data_mark_and_move}, - 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED + 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED | RUBY_TYPED_CONCURRENT_FREE_SAFE }; static void @@ -2748,14 +2876,19 @@ autoload_const_free(void *ptr) { struct autoload_const *autoload_const = ptr; - ccan_list_del(&autoload_const->cnode); + autoload_free_lock_lock(); + { + ccan_list_del(&autoload_const->cnode); + } + autoload_free_lock_unlock(); + SIZED_FREE(autoload_const); } static const rb_data_type_t autoload_const_type = { "autoload_const", {autoload_const_mark_and_move, autoload_const_free, autoload_const_memsize, autoload_const_mark_and_move,}, - 0, 0, 
RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED + 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED | RUBY_TYPED_CONCURRENT_FREE_SAFE }; static struct autoload_data * diff --git a/vm.c b/vm.c index 0398b9f74c9683..0eabdfeaa5f1df 100644 --- a/vm.c +++ b/vm.c @@ -3415,6 +3415,8 @@ ruby_vm_destruct(rb_vm_t *vm) if (vm) { rb_thread_t *th = vm->ractor.main_thread; + void wait_for_background_sweeping_to_finish(void *, bool, bool, const char*); + wait_for_background_sweeping_to_finish(vm->gc.objspace, true, false, "vm_destruct"); if (rb_free_at_exit) { rb_free_encoded_insn_data(); @@ -3559,7 +3561,7 @@ vm_memsize(const void *ptr) const rb_data_type_t ruby_vm_data_type = { "VM", {0, 0, vm_memsize,}, - 0, 0, RUBY_TYPED_FREE_IMMEDIATELY + 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_CONCURRENT_FREE_SAFE }; #define vm_data_type ruby_vm_data_type @@ -3897,7 +3899,7 @@ const rb_data_type_t ruby_threadptr_data_type = { thread_memsize, thread_compact, }, - 0, 0, RUBY_TYPED_FREE_IMMEDIATELY + 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_CONCURRENT_FREE_SAFE }; VALUE @@ -4724,7 +4726,7 @@ static const rb_data_type_t pin_array_list_type = { .dsize = pin_array_list_memsize, .dcompact = pin_array_list_update_references, }, - .flags = RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED | RUBY_TYPED_EMBEDDABLE, + .flags = RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED | RUBY_TYPED_EMBEDDABLE | RUBY_TYPED_CONCURRENT_FREE_SAFE, }; static VALUE diff --git a/vm_backtrace.c b/vm_backtrace.c index c0bc46b8caf5c7..35faedc6e487e7 100644 --- a/vm_backtrace.c +++ b/vm_backtrace.c @@ -157,7 +157,7 @@ static const rb_data_type_t location_data_type = { NULL, // No external memory to report, location_ref_update, }, - 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED | RUBY_TYPED_EMBEDDABLE + 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED | RUBY_TYPED_EMBEDDABLE | RUBY_TYPED_CONCURRENT_FREE_SAFE }; int @@ -567,7 +567,7 @@ static const rb_data_type_t 
backtrace_data_type = { /* Cannot set the RUBY_TYPED_EMBEDDABLE flag because the loc of frame_info * points elements in the backtrace array. This can cause the loc to become * incorrect if this backtrace object is moved by compaction. */ - 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED + 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED | RUBY_TYPED_CONCURRENT_FREE_SAFE }; int diff --git a/vm_core.h b/vm_core.h index 85664e18b8396b..4ca92b431b17e7 100644 --- a/vm_core.h +++ b/vm_core.h @@ -806,6 +806,7 @@ typedef struct rb_vm_struct { void *data; void (*mark_func)(VALUE v, void *data); } *mark_func_data; + pthread_t sweep_thread; } gc; rb_at_exit_list *at_exit; @@ -1631,10 +1632,17 @@ VM_ENV_BOX_UNCHECKED(const VALUE *ep) int rb_vm_ep_in_heap_p(const VALUE *ep); #endif +static rb_execution_context_t *rb_current_execution_context(bool expect_ec); + static inline int VM_ENV_ESCAPED_P(const VALUE *ep) { - VM_ASSERT(rb_vm_ep_in_heap_p(ep) == !!VM_ENV_FLAGS(ep, VM_ENV_FLAG_ESCAPED)); +#if VM_CHECK_MODE > 0 + if (rb_current_execution_context(false)) { + // Can be called from background sweep thread, and this uses GET_EC() + VM_ASSERT(rb_vm_ep_in_heap_p(ep) == !!VM_ENV_FLAGS(ep, VM_ENV_FLAG_ESCAPED)); + } +#endif return VM_ENV_FLAGS(ep, VM_ENV_FLAG_ESCAPED) ? 
1 : 0; } @@ -2158,11 +2166,6 @@ rb_current_ractor_raw(bool expect) } } -static inline rb_ractor_t * -rb_current_ractor(void) -{ - return rb_current_ractor_raw(true); -} static inline rb_vm_t * rb_current_vm(void) @@ -2178,6 +2181,16 @@ rb_current_vm(void) return ruby_current_vm_ptr; } +static inline rb_ractor_t * +rb_current_ractor(void) +{ + rb_vm_t *vm = GET_VM(); + if (vm) { + VM_ASSERT(vm->gc.sweep_thread != pthread_self()); + } + return rb_current_ractor_raw(true); +} + void rb_ec_vm_lock_rec_release(const rb_execution_context_t *ec, unsigned int recorded_lock_rec, unsigned int current_lock_rec); diff --git a/vm_method.c b/vm_method.c index 021b06bf00109b..03038ef688eef9 100644 --- a/vm_method.c +++ b/vm_method.c @@ -135,7 +135,7 @@ static const rb_data_type_t cc_table_type = { .dcompact = vm_cc_table_compact, }, .parent = &rb_managed_id_table_type, - .flags = RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED | RUBY_TYPED_EMBEDDABLE, + .flags = RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED | RUBY_TYPED_EMBEDDABLE | RUBY_TYPED_CONCURRENT_FREE_SAFE, }; VALUE diff --git a/vm_sync.c b/vm_sync.c index aca83dde5a73aa..5b33309ebbd572 100644 --- a/vm_sync.c +++ b/vm_sync.c @@ -8,10 +8,12 @@ void rb_ractor_sched_barrier_start(rb_vm_t *vm, rb_ractor_t *cr); void rb_ractor_sched_barrier_join(rb_vm_t *vm, rb_ractor_t *cr); void rb_ractor_sched_barrier_end(rb_vm_t *vm, rb_ractor_t *cr); +bool is_sweep_thread_p(void); static bool vm_locked(rb_vm_t *vm) { + if (!vm) return false; return vm_locked_by_ractor_p(vm, GET_RACTOR()); } @@ -68,6 +70,7 @@ vm_need_barrier_waiting(const rb_vm_t *vm) static bool vm_need_barrier(bool no_barrier, const rb_ractor_t *cr, const rb_vm_t *vm) { + VM_ASSERT(cr); #ifdef RUBY_THREAD_PTHREAD_H return !no_barrier && cr->threads.sched.running != NULL && vm_need_barrier_waiting(vm); // ractor has running threads. 
#else @@ -80,6 +83,8 @@ vm_lock_enter(rb_ractor_t *cr, rb_vm_t *vm, bool locked, bool no_barrier, unsign { RUBY_DEBUG_LOG2(file, line, "start locked:%d", locked); + VM_ASSERT(!is_sweep_thread_p()); + if (locked) { ASSERT_vm_locking(); } @@ -152,6 +157,7 @@ void rb_vm_lock_enter_body(unsigned int *lev APPEND_LOCATION_ARGS) { rb_vm_t *vm = GET_VM(); + VM_ASSERT(vm); if (vm_locked(vm)) { vm_lock_enter(NULL, vm, true, false, lev APPEND_LOCATION_PARAMS); } @@ -164,6 +170,7 @@ void rb_vm_lock_enter_body_nb(unsigned int *lev APPEND_LOCATION_ARGS) { rb_vm_t *vm = GET_VM(); + VM_ASSERT(vm); if (vm_locked(vm)) { vm_lock_enter(NULL, vm, true, true, lev APPEND_LOCATION_PARAMS); } @@ -176,6 +183,7 @@ void rb_vm_lock_enter_body_cr(rb_ractor_t *cr, unsigned int *lev APPEND_LOCATION_ARGS) { rb_vm_t *vm = GET_VM(); + VM_ASSERT(vm); vm_lock_enter(cr, vm, vm_locked(vm), false, lev APPEND_LOCATION_PARAMS); } @@ -188,13 +196,14 @@ rb_vm_lock_leave_body_nb(unsigned int *lev APPEND_LOCATION_ARGS) void rb_vm_lock_leave_body(unsigned int *lev APPEND_LOCATION_ARGS) { - vm_lock_leave(GET_VM(), false, lev APPEND_LOCATION_PARAMS); + vm_lock_leave(GET_VM(), false, lev APPEND_LOCATION_PARAMS); } void rb_vm_lock_body(LOCATION_ARGS) { rb_vm_t *vm = GET_VM(); + VM_ASSERT(vm); ASSERT_vm_unlocking(); vm_lock_enter(GET_RACTOR(), vm, false, false, &vm->ractor.sync.lock_rec APPEND_LOCATION_PARAMS); @@ -254,6 +263,7 @@ void rb_vm_barrier(void) { RB_DEBUG_COUNTER_INC(vm_sync_barrier); + VM_ASSERT(!is_sweep_thread_p()); if (!rb_multi_ractor_p()) { // no other ractors diff --git a/vm_sync.h b/vm_sync.h index 314a2238a96581..761c1795eeb09d 100644 --- a/vm_sync.h +++ b/vm_sync.h @@ -44,7 +44,7 @@ rb_multi_ractor_p(void) { if (LIKELY(ruby_single_main_ractor)) { // 0 on boot time. 
- RUBY_ASSERT(GET_VM()->ractor.cnt <= 1); + RUBY_ASSERT(!GET_VM() || GET_VM()->ractor.cnt <= 1); return false; } else { diff --git a/vm_trace.c b/vm_trace.c index 42b9991e7141bc..5457cc4627a8e5 100644 --- a/vm_trace.c +++ b/vm_trace.c @@ -905,7 +905,7 @@ static const rb_data_type_t tp_data_type = { RUBY_TYPED_DEFAULT_FREE, NULL, // Nothing allocated externally, so don't need a memsize function }, - 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED | RUBY_TYPED_EMBEDDABLE + 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED | RUBY_TYPED_EMBEDDABLE | RUBY_TYPED_CONCURRENT_FREE_SAFE }; static VALUE diff --git a/weakmap.c b/weakmap.c index 7cef1fd46a63a7..256d0887655a03 100644 --- a/weakmap.c +++ b/weakmap.c @@ -141,7 +141,7 @@ const rb_data_type_t rb_weakmap_type = { wmap_compact, wmap_handle_weak_references, }, - 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED | RUBY_TYPED_EMBEDDABLE + 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED | RUBY_TYPED_EMBEDDABLE | RUBY_TYPED_CONCURRENT_FREE_SAFE }; static int @@ -627,7 +627,7 @@ static const rb_data_type_t rb_weakkeymap_type = { wkmap_compact, wkmap_handle_weak_references, }, - 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED | RUBY_TYPED_EMBEDDABLE + 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED | RUBY_TYPED_EMBEDDABLE | RUBY_TYPED_CONCURRENT_FREE_SAFE }; static int