From 997f09cacbc1bbbdd3c83cccbf46eabf8df353d4 Mon Sep 17 00:00:00 2001 From: Vlad Brezae Date: Fri, 5 Apr 2024 11:47:45 +0300 Subject: [PATCH] [mono][interp] Reduce false pinning from interp stack (#100400) * [mono][interp] Reduce false pinning from interp stack Interpreter opcodes operate on the interp stack, an area of memory separately allocated. Each interp var will have an allocated stack offset in the current interpreter stack frame. When we allocate the storage for an interp var we can take into account the var type. If the type can represent a potential ref to an object or an interior ref then we mark the pointer slot as potentially containing refs, for the method that is being compiled. During GC, we used to conservatively scan the entire interp stack space used by each thread. After this change, in the first stage, we do a stack walk where we detect slots in each interp frame where no refs can reside. We mark these slots in a bit array. Afterwards we conservatively scan the interp stack of the thread, while ignoring slots that were previously marked as not containing any refs. System.Runtime.Tests suite was used for testing the effectiveness of the change, by computing the cumulative number of pinned objects throughout all GCs (about 1100). minijit - avg 702000 pinned objects old-interp - avg 641000 pinned objects precise-interp - avg 578000 pinned objects This resulted in 10% reduction in the number of pinned objects during collection. This change is meant to reduce memory usage of apps by making objects die earlier. We could further improve by being more precise. For example, for call sites we could reuse liveness information to precisely know which slots actually contain refs. This is a bit more complex to implement and it is unclear yet how impactful it would be. 
* [mono][interp] Add option to disable precise scanning of stack * [mono][interp] Fix pushing of byrefs on execution stack A lot of times, when we were pushing a byref type on the stack during compilation, we would first get the mint_type which would be MINT_TYPE_I4/I8. From the mint_type we would then obtain the STACK_TYPE_I4/I8, losing information because it should have been STACK_TYPE_MP. Because of this, the underlying interp var would end up being created as MONO_TYPE_I4/I8 instead of MONO_TYPE_I. Add another method for pushing directly a MonoType, with less confusing indirections. Code around here could further be refactored. This is only relevant for GC stack scanning, since we would want to scan only slots containing MONO_TYPE_I. --- src/mono/mono/metadata/class-getters.h | 1 + src/mono/mono/mini/interp/interp-internals.h | 4 + src/mono/mono/mini/interp/interp.c | 73 ++++++++++- src/mono/mono/mini/interp/interp.h | 3 +- src/mono/mono/mini/interp/transform-opt.c | 7 +- src/mono/mono/mini/interp/transform.c | 127 +++++++++++++++++-- src/mono/mono/mini/interp/transform.h | 4 + 7 files changed, 203 insertions(+), 16 deletions(-) diff --git a/src/mono/mono/metadata/class-getters.h b/src/mono/mono/metadata/class-getters.h index eb69558a1d01b..57ff9afefceb3 100644 --- a/src/mono/mono/metadata/class-getters.h +++ b/src/mono/mono/metadata/class-getters.h @@ -39,6 +39,7 @@ MONO_CLASS_GETTER(m_class_is_delegate, gboolean, , MonoClass, delegate) MONO_CLASS_GETTER(m_class_is_gc_descr_inited, gboolean, , MonoClass, gc_descr_inited) MONO_CLASS_GETTER(m_class_has_cctor, gboolean, , MonoClass, has_cctor) MONO_CLASS_GETTER(m_class_has_references, gboolean, , MonoClass, has_references) +MONO_CLASS_GETTER(m_class_has_ref_fields, gboolean, , MonoClass, has_ref_fields) MONO_CLASS_GETTER(m_class_has_static_refs, gboolean, , MonoClass, has_static_refs) MONO_CLASS_GETTER(m_class_has_no_special_static_fields, gboolean, , MonoClass, no_special_static_fields) 
MONO_CLASS_GETTER(m_class_is_nested_classes_inited, gboolean, , MonoClass, nested_classes_inited) diff --git a/src/mono/mono/mini/interp/interp-internals.h b/src/mono/mono/mini/interp/interp-internals.h index c5f3707ab1ca5..a815f39f9c8e6 100644 --- a/src/mono/mono/mini/interp/interp-internals.h +++ b/src/mono/mono/mini/interp/interp-internals.h @@ -145,6 +145,7 @@ struct InterpMethod { MonoFtnDesc *ftndesc_unbox; MonoDelegateTrampInfo *del_info; + /* locals_size is equal to the offset of the param_area */ guint32 locals_size; guint32 alloca_size; int num_clauses; // clauses @@ -153,6 +154,7 @@ struct InterpMethod { unsigned int hasthis; // boolean MonoProfilerCallInstrumentationFlags prof_flags; InterpMethodCodeType code_type; + MonoBitSet *ref_slots; #ifdef ENABLE_EXPERIMENT_TIERED MiniTieredCounter tiered_counter; #endif @@ -268,6 +270,8 @@ typedef struct { guchar *stack_pointer; /* Used for allocation of localloc regions */ FrameDataAllocator data_stack; + /* If bit n is set, it means that the n-th stack slot (pointer sized) from stack_start doesn't contain any refs */ + guint8 *no_ref_slots; } ThreadContext; typedef struct { diff --git a/src/mono/mono/mini/interp/interp.c b/src/mono/mono/mini/interp/interp.c index ee9566afc2339..f8e2ad02ad95c 100644 --- a/src/mono/mono/mini/interp/interp.c +++ b/src/mono/mono/mini/interp/interp.c @@ -412,6 +412,9 @@ get_context (void) if (context == NULL) { context = g_new0 (ThreadContext, 1); context->stack_start = (guchar*)mono_valloc_aligned (INTERP_STACK_SIZE, MINT_STACK_ALIGNMENT, MONO_MMAP_READ | MONO_MMAP_WRITE, MONO_MEM_ACCOUNT_INTERP_STACK); + // A bit for every pointer sized slot in the stack. 
FIXME don't allocate whole bit array + if (mono_interp_opt & INTERP_OPT_PRECISE_GC) + context->no_ref_slots = (guchar*)mono_valloc (NULL, INTERP_STACK_SIZE / (8 * sizeof (gpointer)), MONO_MMAP_READ | MONO_MMAP_WRITE, MONO_MEM_ACCOUNT_INTERP_STACK); context->stack_end = context->stack_start + INTERP_STACK_SIZE - INTERP_REDZONE_SIZE; context->stack_real_end = context->stack_start + INTERP_STACK_SIZE; /* We reserve a stack slot at the top of the interp stack to make temp objects visible to GC */ @@ -8011,6 +8014,8 @@ interp_parse_options (const char *options) #endif else if (strncmp (arg, "ssa", 3) == 0) opt = INTERP_OPT_SSA; + else if (strncmp (arg, "precise", 7) == 0) + opt = INTERP_OPT_PRECISE_GC; else if (strncmp (arg, "all", 3) == 0) opt = ~INTERP_OPT_NONE; @@ -8473,6 +8478,57 @@ interp_stop_single_stepping (void) ss_enabled = FALSE; } + +static void +interp_mark_frame_no_ref_slots (ThreadContext *context, InterpFrame *frame, gpointer *top_limit) +{ + InterpMethod *imethod = frame->imethod; + gpointer *frame_stack = (gpointer*)frame->stack; + gpointer *frame_stack_end = (gpointer*)((guchar*)frame->stack + imethod->alloca_size); + // The way interpreter implements calls is by moving arguments to the param area, at the + // top of the stack and then proceed with the call. Up to the moment of the call these slots + // are owned by the calling frame. Once we do the call, the stack pointer of the called + // frame will point inside the param area of the calling frame. + // + // We mark no ref slots from top to bottom and we use the top limit to ignore slots + // that were already handled in the called frame. 
+ if (top_limit && top_limit < frame_stack_end) + frame_stack_end = top_limit; + + for (gpointer *current = frame_stack; current < frame_stack_end; current++) { + gsize slot_index = current - frame_stack; + if (!mono_bitset_test_fast (imethod->ref_slots, slot_index)) { + gsize global_slot_index = current - (gpointer*)context->stack_start; + gsize table_index = global_slot_index / 8; + int bit_index = global_slot_index % 8; + context->no_ref_slots [table_index] |= 1 << bit_index; + } + } +} + +static void +interp_mark_no_ref_slots (ThreadContext *context, MonoLMF* lmf) +{ + memset (context->no_ref_slots, 0, (context->stack_pointer - context->stack_start) / (8 * sizeof (gpointer)) + 1); + while (lmf) { + if ((gsize)lmf->previous_lmf & 2) { + MonoLMFExt *lmf_ext = (MonoLMFExt*) lmf; + if (lmf_ext->kind == MONO_LMFEXT_INTERP_EXIT || lmf_ext->kind == MONO_LMFEXT_INTERP_EXIT_WITH_CTX) { + InterpFrame *frame = (InterpFrame*)lmf_ext->interp_exit_data; + gpointer *top_limit = NULL; + while (frame) { + if (frame->imethod) { + interp_mark_frame_no_ref_slots (context, frame, top_limit); + top_limit = (gpointer*)frame->stack; + } + frame = frame->parent; + } + } + } + lmf = (MonoLMF*)((gsize)lmf->previous_lmf & ~3); + } +} + /* * interp_mark_stack: * @@ -8505,9 +8561,20 @@ interp_mark_stack (gpointer thread_data, GcScanFunc func, gpointer gc_data, gboo if (!context || !context->stack_start) return; - // FIXME: Scan the whole area with 1 call - for (gpointer *p = (gpointer*)context->stack_start; p < (gpointer*)context->stack_pointer; p++) - func (p, gc_data); + if (mono_interp_opt & INTERP_OPT_PRECISE_GC) { + MonoLMF **lmf_addr = (MonoLMF**)info->tls [TLS_KEY_LMF_ADDR]; + if (lmf_addr) + interp_mark_no_ref_slots (context, *lmf_addr); + } + + int slot_index = 0; + for (gpointer *p = (gpointer*)context->stack_start; p < (gpointer*)context->stack_pointer; p++) { + if (context->no_ref_slots && (context->no_ref_slots [slot_index / 8] & (1 << (slot_index % 8)))) + ;// This slot is 
marked as no ref, we don't scan it + else + func (p, gc_data); + slot_index++; + } FrameDataFragment *frag; for (frag = context->data_stack.first; frag; frag = frag->next) { diff --git a/src/mono/mono/mini/interp/interp.h b/src/mono/mono/mini/interp/interp.h index 742e93bf06e59..a09111c490bec 100644 --- a/src/mono/mono/mini/interp/interp.h +++ b/src/mono/mono/mini/interp/interp.h @@ -42,7 +42,8 @@ enum { INTERP_OPT_JITERPRETER = 64, #endif INTERP_OPT_SSA = 128, - INTERP_OPT_DEFAULT = INTERP_OPT_INLINE | INTERP_OPT_CPROP | INTERP_OPT_SUPER_INSTRUCTIONS | INTERP_OPT_BBLOCKS | INTERP_OPT_TIERING | INTERP_OPT_SIMD | INTERP_OPT_SSA + INTERP_OPT_PRECISE_GC = 256, + INTERP_OPT_DEFAULT = INTERP_OPT_INLINE | INTERP_OPT_CPROP | INTERP_OPT_SUPER_INSTRUCTIONS | INTERP_OPT_BBLOCKS | INTERP_OPT_TIERING | INTERP_OPT_SIMD | INTERP_OPT_SSA | INTERP_OPT_PRECISE_GC #if HOST_BROWSER | INTERP_OPT_JITERPRETER #endif diff --git a/src/mono/mono/mini/interp/transform-opt.c b/src/mono/mono/mini/interp/transform-opt.c index 88231ac8bd40f..4ee96b7a541d2 100644 --- a/src/mono/mono/mini/interp/transform-opt.c +++ b/src/mono/mono/mini/interp/transform-opt.c @@ -32,7 +32,9 @@ alloc_var_offset (TransformData *td, int local, gint32 *ptos) int interp_alloc_global_var_offset (TransformData *td, int var) { - return alloc_var_offset (td, var, &td->total_locals_size); + int offset = alloc_var_offset (td, var, &td->total_locals_size); + interp_mark_ref_slots_for_var (td, var); + return offset; } static void @@ -464,6 +466,8 @@ interp_alloc_offsets (TransformData *td) add_active_call (td, &ac, td->vars [var].call); } else if (!td->vars [var].global && td->vars [var].offset == -1) { alloc_var_offset (td, var, ¤t_offset); + interp_mark_ref_slots_for_var (td, var); + if (current_offset > final_total_locals_size) final_total_locals_size = current_offset; @@ -492,6 +496,7 @@ interp_alloc_offsets (TransformData *td) // These are allocated separately at the end of the stack if (td->vars [i].call_args) { td->vars 
[i].offset += td->param_area_offset; + interp_mark_ref_slots_for_var (td, i); final_total_locals_size = MAX (td->vars [i].offset + td->vars [i].size, final_total_locals_size); } } diff --git a/src/mono/mono/mini/interp/transform.c b/src/mono/mono/mini/interp/transform.c index 144967c4e8806..70773797a47f7 100644 --- a/src/mono/mono/mini/interp/transform.c +++ b/src/mono/mono/mini/interp/transform.c @@ -551,6 +551,22 @@ set_simple_type_and_var (TransformData *td, StackInfo *sp, int type) set_type_and_var (td, sp, type, NULL); } +static void +push_mono_type (TransformData *td, MonoType *type, int mt, MonoClass *k) +{ + if (mt == -1) + mt = mono_mint_type (type); + if (!k) + k = mono_class_from_mono_type_internal (type); + + g_assert (mt != MINT_TYPE_VT); + + if (m_type_is_byref (type)) + push_type_explicit (td, STACK_TYPE_MP, k, MINT_STACK_SLOT_SIZE); + else + push_type_explicit (td, stack_type [mt], k, MINT_STACK_SLOT_SIZE); +} + static void push_type (TransformData *td, int type, MonoClass *k) { @@ -1006,7 +1022,7 @@ load_arg(TransformData *td, int n) if (hasthis && n == 0) { mt = MINT_TYPE_I; klass = NULL; - push_type (td, stack_type [mt], klass); + push_type (td, STACK_TYPE_MP, klass); } else { g_assert (size < G_MAXUINT16); push_type_vt (td, klass, size); @@ -1020,7 +1036,7 @@ load_arg(TransformData *td, int n) if (mt == MINT_TYPE_O) klass = mono_class_from_mono_type_internal (type); } - push_type (td, stack_type [mt], klass); + push_mono_type (td, type, mt, klass); } interp_add_ins (td, interp_get_mov_for_type (mt, TRUE)); interp_ins_set_sreg (td->last_ins, n); @@ -1069,7 +1085,7 @@ load_local (TransformData *td, int local) MonoClass *klass = NULL; if (mt == MINT_TYPE_O) klass = mono_class_from_mono_type_internal (type); - push_type (td, stack_type [mt], klass); + push_mono_type (td, type, mt, klass); } interp_add_ins (td, interp_get_mov_for_type (mt, TRUE)); interp_ins_set_sreg (td->last_ins, local); @@ -3699,7 +3715,7 @@ interp_transform_call (TransformData 
*td, MonoMethod *method, MonoMethod *target return FALSE; } } else { - push_type (td, stack_type[mt], klass); + push_mono_type (td, csignature->ret, mt, klass); } dreg = td->sp [-1].var; } else { @@ -4346,6 +4362,7 @@ interp_method_compute_offsets (TransformData *td, InterpMethod *imethod, MonoMet td->vars [i].size = size; offset = ALIGN_TO (offset, align); td->vars [i].offset = offset; + interp_mark_ref_slots_for_var (td, i); offset += size; } offset = ALIGN_TO (offset, MINT_STACK_ALIGNMENT); @@ -4371,6 +4388,7 @@ interp_method_compute_offsets (TransformData *td, InterpMethod *imethod, MonoMet td->vars [index].mt = mono_mint_type (header->locals [i]); td->vars [index].ext_index = -1; td->vars [index].size = size; + interp_mark_ref_slots_for_var (td, index); // Every local takes a MINT_STACK_SLOT_SIZE so IL locals have same behavior as execution locals offset += size; } @@ -4568,7 +4586,7 @@ interp_emit_sfld_access (TransformData *td, MonoClassField *field, MonoClass *fi interp_add_ins (td, interp_get_ldind_for_mt (mt)); interp_ins_set_sreg (td->last_ins, td->sp [-1].var); td->sp--; - push_type (td, stack_type [mt], field_class); + push_mono_type (td, ftype, mt, field_class); interp_ins_set_dreg (td->last_ins, td->sp [-1].var); } } else { @@ -4595,14 +4613,14 @@ interp_emit_sfld_access (TransformData *td, MonoClassField *field, MonoClass *fi if (mt == MINT_TYPE_VT) { push_type_vt (td, field_class, size); } else { - push_type (td, stack_type [mt], field_class); + push_mono_type (td, ftype, mt, field_class); } } else if (mt == MINT_TYPE_VT) { interp_add_ins (td, MINT_LDSFLD_VT); push_type_vt (td, field_class, size); } else { interp_add_ins (td, MINT_LDSFLD_I1 + mt - MINT_TYPE_I1); - push_type (td, stack_type [mt], field_class); + push_mono_type (td, ftype, mt, field_class); } interp_ins_set_dreg (td->last_ins, td->sp [-1].var); } else { @@ -6709,7 +6727,7 @@ generate_code (TransformData *td, MonoMethod *method, MonoMethodHeader *header, if (mt == MINT_TYPE_VT) 
push_type_vt (td, field_klass, field_size); else - push_type (td, stack_type [mt], field_klass); + push_mono_type (td, ftype, mt, field_klass); interp_ins_set_dreg (td->last_ins, td->sp [-1].var); } else { if (G_UNLIKELY (m_field_is_from_update (field))) { @@ -6739,7 +6757,7 @@ generate_code (TransformData *td, MonoMethod *method, MonoMethodHeader *header, if (mt == MINT_TYPE_VT) push_type_vt (td, field_klass, field_size); else - push_type (td, stack_type [mt], field_klass); + push_mono_type (td, ftype, mt, field_klass); interp_ins_set_dreg (td->last_ins, td->sp [-1].var); } } @@ -7695,8 +7713,7 @@ generate_code (TransformData *td, MonoMethod *method, MonoMethodHeader *header, int param_offset = get_tos_offset (td); if (!MONO_TYPE_IS_VOID (info->sig->ret)) { - mt = mono_mint_type (info->sig->ret); - push_simple_type (td, stack_type [mt]); + push_mono_type (td, info->sig->ret, -1, NULL); dreg = td->sp [-1].var; } else { // dummy dreg @@ -8507,6 +8524,78 @@ get_short_brop (int opcode) return opcode; } +static void +interp_mark_ref_slots_for_vt (TransformData *td, int base_offset, MonoClass *klass) +{ + if (!m_class_has_references (klass) && !m_class_has_ref_fields (klass)) + return; + + gpointer iter = NULL; + MonoClassField *field; + while ((field = mono_class_get_fields_internal (klass, &iter))) { + MonoType *ftype = mono_field_get_type_internal (field); + if (ftype->attrs & FIELD_ATTRIBUTE_STATIC) + continue; + int offset = base_offset + m_field_get_offset (field) - MONO_ABI_SIZEOF (MonoObject); +retry: + if (mini_type_is_reference (ftype) || ftype->type == MONO_TYPE_I || ftype->type == MONO_TYPE_U || m_type_is_byref (ftype)) { + int index = offset / sizeof (gpointer); + mono_bitset_set_fast (td->ref_slots, index); + if (td->verbose_level) + g_print ("Stack ref slot vt field at off %d\n", offset); + } else if (ftype->type == MONO_TYPE_VALUETYPE || ftype->type == MONO_TYPE_GENERICINST) { + interp_mark_ref_slots_for_vt (td, offset, mono_class_from_mono_type_internal 
(ftype)); + } + + if (m_class_is_inlinearray (klass)) { + int max_offset = base_offset + m_class_get_instance_size (klass) - MONO_ABI_SIZEOF (MonoObject); + int align; + int field_size = mono_type_size (ftype, &align); + offset += field_size; + offset = ALIGN_TO (offset, align); + if (offset < max_offset) + goto retry; + } + } +} + +void +interp_mark_ref_slots_for_var (TransformData *td, int var) +{ + if (!(mono_interp_opt & INTERP_OPT_PRECISE_GC)) + return; + + g_assert (td->vars [var].offset != -1); + + gsize max_index = (td->vars [var].offset + td->vars [var].size) / sizeof (gpointer); + + if (!td->ref_slots || max_index >= td->ref_slots->size) { + guint32 old_size = td->ref_slots ? (guint32)td->ref_slots->size : 0; + guint32 new_size = old_size ? old_size * 2 : 32; + + gpointer mem = mono_mempool_alloc0 (td->mempool, mono_bitset_alloc_size (new_size, 0)); + MonoBitSet *new_ref_slots = mono_bitset_mem_new (mem, new_size, 0); + + if (old_size) + memcpy (&new_ref_slots->data, &td->ref_slots->data, old_size / 8); + td->ref_slots = new_ref_slots; + } + + MonoType *type = td->vars [var].type; + if (td->vars [var].mt == MINT_TYPE_VT) { + MonoClass *klass = mono_class_from_mono_type_internal (type); + interp_mark_ref_slots_for_vt (td, td->vars [var].offset, klass); + } else { + // Managed pointers in interp are normally MONO_TYPE_I + if (mini_type_is_reference (type) || type->type == MONO_TYPE_I || type->type == MONO_TYPE_U || m_type_is_byref (type)) { + int index = td->vars [var].offset / sizeof (gpointer); + mono_bitset_set_fast (td->ref_slots, index); + if (td->verbose_level) + g_print ("Stack ref slot at off %d for var %d\n", index * sizeof (gpointer), var); + } + } +} + static int get_var_offset (TransformData *td, int var) { @@ -8526,6 +8615,7 @@ get_var_offset (TransformData *td, int var) g_assert (td->vars [var].execution_stack); td->vars [var].offset = td->total_locals_size + td->vars [var].stack_offset; + interp_mark_ref_slots_for_var (td, var); return 
td->vars [var].offset; } @@ -9155,6 +9245,21 @@ generate (MonoMethod *method, MonoMethodHeader *header, InterpMethod *rtm, MonoG mono_interp_register_imethod_data_items (rtm->data_items, td->imethod_items); rtm->patchpoint_data = td->patchpoint_data; + if (td->ref_slots) { + gpointer ref_slots_mem = mono_mem_manager_alloc0 (td->mem_manager, mono_bitset_alloc_size (rtm->alloca_size / sizeof (gpointer), 0)); + rtm->ref_slots = mono_bitset_mem_new (ref_slots_mem, rtm->alloca_size / sizeof (gpointer), 0); + gsize copy_size = rtm->ref_slots->size; + if (td->ref_slots->size < copy_size) + copy_size = td->ref_slots->size; + memcpy (&rtm->ref_slots->data, &td->ref_slots->data, copy_size / 8); + if (!td->optimized) { + // Unoptimized code can have some stack slot moving patterns as part of calls. + // Just conservatively mark all these slots as potentially containing refs. + for (guint32 offset = rtm->locals_size; offset < rtm->alloca_size; offset += sizeof (gpointer)) + mono_bitset_set (rtm->ref_slots, offset / sizeof (gpointer)); + } + } + /* Save debug info */ interp_save_debug_info (rtm, header, td, td->line_numbers); diff --git a/src/mono/mono/mini/interp/transform.h b/src/mono/mono/mini/interp/transform.h index f05556a44c7c2..1e6185f8089c8 100644 --- a/src/mono/mono/mini/interp/transform.h +++ b/src/mono/mono/mini/interp/transform.h @@ -340,6 +340,8 @@ typedef struct int inline_depth; int patchpoint_data_n; int *patchpoint_data; + // This marks each stack slot offset that might contain refs throughout the execution of this method + MonoBitSet *ref_slots; guint has_localloc : 1; // If method compilation fails due to certain limits being exceeded, we disable inlining // and retry compilation. 
@@ -543,6 +545,8 @@ interp_foreach_ins_var (TransformData *td, InterpInst *ins, gpointer data, void void interp_foreach_ins_svar (TransformData *td, InterpInst *ins, gpointer data, void (*callback)(TransformData*, int*, gpointer)); +void +interp_mark_ref_slots_for_var (TransformData *td, int var); /* Forward definitions for simd methods */ static gboolean