From 055b86e0f0b4325117055d8d31c49011258f4af3 Mon Sep 17 00:00:00 2001
From: Richard Henderson
Date: Sat, 22 Jul 2023 20:43:45 +0100
Subject: [PATCH 01/10] util/interval-tree: Use qatomic_read for left/right while searching

Fixes a race condition (generally without optimization) in which
the subtree is re-read after the protecting if condition.

Cc: qemu-stable@nongnu.org
Reviewed-by: Peter Maydell
Signed-off-by: Richard Henderson
---
 util/interval-tree.c | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/util/interval-tree.c b/util/interval-tree.c
index 4c0baf108f..5a0ad21b2d 100644
--- a/util/interval-tree.c
+++ b/util/interval-tree.c
@@ -745,8 +745,9 @@ static IntervalTreeNode *interval_tree_subtree_search(IntervalTreeNode *node,
          * Loop invariant: start <= node->subtree_last
          * (Cond2 is satisfied by one of the subtree nodes)
          */
-        if (node->rb.rb_left) {
-            IntervalTreeNode *left = rb_to_itree(node->rb.rb_left);
+        RBNode *tmp = qatomic_read(&node->rb.rb_left);
+        if (tmp) {
+            IntervalTreeNode *left = rb_to_itree(tmp);
 
             if (start <= left->subtree_last) {
                 /*
@@ -765,8 +766,9 @@ static IntervalTreeNode *interval_tree_subtree_search(IntervalTreeNode *node,
         if (start <= node->last) { /* Cond2 */
             return node; /* node is leftmost match */
         }
-        if (node->rb.rb_right) {
-            node = rb_to_itree(node->rb.rb_right);
+        tmp = qatomic_read(&node->rb.rb_right);
+        if (tmp) {
+            node = rb_to_itree(tmp);
             if (start <= node->subtree_last) {
                 continue;
             }
@@ -814,8 +816,9 @@ IntervalTreeNode *interval_tree_iter_first(IntervalTreeRoot *root,
 IntervalTreeNode *interval_tree_iter_next(IntervalTreeNode *node,
                                           uint64_t start, uint64_t last)
 {
-    RBNode *rb = node->rb.rb_right, *prev;
+    RBNode *rb, *prev;
 
+    rb = qatomic_read(&node->rb.rb_right);
     while (true) {
         /*
          * Loop invariants:
@@ -840,7 +843,7 @@ IntervalTreeNode *interval_tree_iter_next(IntervalTreeNode *node,
             }
             prev = &node->rb;
             node = rb_to_itree(rb);
-            rb = node->rb.rb_right;
+            rb = qatomic_read(&node->rb.rb_right);
         } while (prev == rb);
 
         /* Check if the node intersects [start;last] */
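
The race fixed above is the classic test-then-reload pattern: without an
atomic read, the compiler may load node->rb.rb_left once for the NULL test
and again for the use, and a concurrent writer can clear the pointer in
between. A minimal sketch of the two shapes (hypothetical names, not the
QEMU sources; qatomic_read() is from include/qemu/atomic.h):

    typedef struct Node { struct Node *left; } Node;

    /* Racy: two loads of n->left; a concurrent erase may set it to
     * NULL between the test and the second load. */
    Node *first_child_racy(Node *n)
    {
        if (n->left) {          /* load #1 */
            return n->left;     /* load #2: may now observe NULL */
        }
        return NULL;
    }

    /* Fixed: one atomic load, so the tested pointer and the returned
     * pointer are guaranteed to be the same value. */
    Node *first_child_fixed(Node *n)
    {
        Node *tmp = qatomic_read(&n->left);
        if (tmp) {
            return tmp;
        }
        return NULL;
    }
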
From 4c8baa02d36379507afd17bdea87aabe0aa32ed3 Mon Sep 17 00:00:00 2001
From: Richard Henderson
Date: Sat, 22 Jul 2023 15:25:30 +0100
Subject: [PATCH 02/10] util/interval-tree: Use qatomic_set_mb in rb_link_node

Ensure that the stores to rb_left and rb_right are complete before
inserting the new node into the tree. Otherwise a concurrent reader
could see garbage in the new leaf.

Cc: qemu-stable@nongnu.org
Reviewed-by: Peter Maydell
Signed-off-by: Richard Henderson
---
 util/interval-tree.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/util/interval-tree.c b/util/interval-tree.c
index 5a0ad21b2d..759562db7d 100644
--- a/util/interval-tree.c
+++ b/util/interval-tree.c
@@ -128,7 +128,11 @@ static inline void rb_link_node(RBNode *node, RBNode *parent, RBNode **rb_link)
     node->rb_parent_color = (uintptr_t)parent;
     node->rb_left = node->rb_right = NULL;
 
-    qatomic_set(rb_link, node);
+    /*
+     * Ensure that node is initialized before insertion,
+     * as viewed by a concurrent search.
+     */
+    qatomic_set_mb(rb_link, node);
 }
 
 static RBNode *rb_next(RBNode *node)
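
The barrier matters because rb_link_node() publishes a freshly initialized
node: the stores to rb_left/rb_right must become visible before the store
that links the node into the tree. A sketch of the ordering, using the same
QEMU primitives (this condenses the patch, it is not additional code):

    /* Writer: initialize, then publish.  qatomic_set_mb() orders the
     * plain stores above before the pointer store; a plain
     * qatomic_set() would allow them to be reordered. */
    node->rb_left = node->rb_right = NULL;
    qatomic_set_mb(rb_link, node);

    /* Reader (cf. patch 01): a search that reaches the node through
     * qatomic_read(rb_link) can no longer observe garbage fields. */
    RBNode *n = qatomic_read(rb_link);
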
From d37a259fa4f5164e300e8a20cdd83fe39c7fdadb Mon Sep 17 00:00:00 2001
From: Richard Henderson
Date: Sat, 22 Jul 2023 21:20:05 +0100
Subject: [PATCH 03/10] util/interval-tree: Introduce pc_parent

Reviewed-by: Peter Maydell
Signed-off-by: Richard Henderson
---
 util/interval-tree.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/util/interval-tree.c b/util/interval-tree.c
index 759562db7d..d86c0752db 100644
--- a/util/interval-tree.c
+++ b/util/interval-tree.c
@@ -68,9 +68,14 @@ typedef struct RBAugmentCallbacks {
     void (*rotate)(RBNode *old, RBNode *new);
 } RBAugmentCallbacks;
 
+static inline RBNode *pc_parent(uintptr_t pc)
+{
+    return (RBNode *)(pc & ~1);
+}
+
 static inline RBNode *rb_parent(const RBNode *n)
 {
-    return (RBNode *)(n->rb_parent_color & ~1);
+    return pc_parent(n->rb_parent_color);
 }
 
 static inline RBNode *rb_red_parent(const RBNode *n)
@@ -532,7 +537,7 @@ static void rb_erase_augmented(RBNode *node, RBRoot *root,
      * so as to bypass rb_erase_color() later on.
      */
     pc = node->rb_parent_color;
-    parent = rb_parent(node);
+    parent = pc_parent(pc);
     rb_change_child(node, child, parent, root);
     if (child) {
         child->rb_parent_color = pc;
@@ -544,7 +549,7 @@ static void rb_erase_augmented(RBNode *node, RBRoot *root,
     } else if (!child) {
         /* Still case 1, but this time the child is node->rb_left */
         pc = node->rb_parent_color;
-        parent = rb_parent(node);
+        parent = pc_parent(pc);
         tmp->rb_parent_color = pc;
         rb_change_child(node, tmp, parent, root);
         rebalance = NULL;
@@ -600,7 +605,7 @@ static void rb_erase_augmented(RBNode *node, RBRoot *root,
         rb_set_parent(tmp, successor);
 
         pc = node->rb_parent_color;
-        tmp = rb_parent(node);
+        tmp = pc_parent(pc);
         rb_change_child(node, successor, tmp, root);
 
         if (child2) {

From 79e29851bf57741252a20838cdc59074ab5bac29 Mon Sep 17 00:00:00 2001
From: Richard Henderson
Date: Sat, 22 Jul 2023 15:17:38 +0100
Subject: [PATCH 04/10] util/interval-tree: Use qatomic_read/set for rb_parent_color

While less susceptible to optimization problems than left and right,
interval_tree_iter_next also reads rb_parent(), so make sure that
stores and loads are atomic.

This goes further than technically required, changing all loads to be
atomic, rather than simply the ones in the iteration side. But it
doesn't really affect the code generation on the rebalance side and
is cleaner to handle everything the same.

Reviewed-by: Peter Maydell
Signed-off-by: Richard Henderson
---
 util/interval-tree.c | 47 ++++++++++++++++++++++++--------------------
 1 file changed, 26 insertions(+), 21 deletions(-)

diff --git a/util/interval-tree.c b/util/interval-tree.c
index d86c0752db..f2866aa7d3 100644
--- a/util/interval-tree.c
+++ b/util/interval-tree.c
@@ -48,12 +48,6 @@
  *
  * It also guarantees that if the lookup returns an element it is the 'correct'
  * one. But not returning an element does _NOT_ mean it's not present.
- *
- * NOTE:
- *
- * Stores to __rb_parent_color are not important for simple lookups so those
- * are left undone as of now. Nor did I check for loops involving parent
- * pointers.
  */
 
 typedef enum RBColor
@@ -68,6 +62,16 @@ typedef struct RBAugmentCallbacks {
     void (*rotate)(RBNode *old, RBNode *new);
 } RBAugmentCallbacks;
 
+static inline uintptr_t rb_pc(const RBNode *n)
+{
+    return qatomic_read(&n->rb_parent_color);
+}
+
+static inline void rb_set_pc(RBNode *n, uintptr_t pc)
+{
+    qatomic_set(&n->rb_parent_color, pc);
+}
+
 static inline RBNode *pc_parent(uintptr_t pc)
 {
     return (RBNode *)(pc & ~1);
@@ -75,12 +79,12 @@ static inline RBNode *pc_parent(uintptr_t pc)
 
 static inline RBNode *rb_parent(const RBNode *n)
 {
-    return pc_parent(n->rb_parent_color);
+    return pc_parent(rb_pc(n));
 }
 
 static inline RBNode *rb_red_parent(const RBNode *n)
 {
-    return (RBNode *)n->rb_parent_color;
+    return (RBNode *)rb_pc(n);
 }
 
 static inline RBColor pc_color(uintptr_t pc)
@@ -100,27 +104,27 @@ static inline bool pc_is_black(uintptr_t pc)
 
 static inline RBColor rb_color(const RBNode *n)
 {
-    return pc_color(n->rb_parent_color);
+    return pc_color(rb_pc(n));
 }
 
 static inline bool rb_is_red(const RBNode *n)
 {
-    return pc_is_red(n->rb_parent_color);
+    return pc_is_red(rb_pc(n));
 }
 
 static inline bool rb_is_black(const RBNode *n)
 {
-    return pc_is_black(n->rb_parent_color);
+    return pc_is_black(rb_pc(n));
 }
 
 static inline void rb_set_black(RBNode *n)
 {
-    n->rb_parent_color |= RB_BLACK;
+    rb_set_pc(n, rb_pc(n) | RB_BLACK);
 }
 
 static inline void rb_set_parent_color(RBNode *n, RBNode *p, RBColor color)
 {
-    n->rb_parent_color = (uintptr_t)p | color;
+    rb_set_pc(n, (uintptr_t)p | color);
 }
 
 static inline void rb_set_parent(RBNode *n, RBNode *p)
@@ -186,9 +190,10 @@ static inline void rb_change_child(RBNode *old, RBNode *new,
 static inline void rb_rotate_set_parents(RBNode *old, RBNode *new,
                                          RBRoot *root, RBColor color)
 {
-    RBNode *parent = rb_parent(old);
+    uintptr_t pc = rb_pc(old);
+    RBNode *parent = pc_parent(pc);
 
-    new->rb_parent_color = old->rb_parent_color;
+    rb_set_pc(new, pc);
     rb_set_parent_color(old, new, color);
     rb_change_child(old, new, parent, root);
 }
@@ -536,11 +541,11 @@ static void rb_erase_augmented(RBNode *node, RBRoot *root,
      * and node must be black due to 4). We adjust colors locally
      * so as to bypass rb_erase_color() later on.
      */
-    pc = node->rb_parent_color;
+    pc = rb_pc(node);
     parent = pc_parent(pc);
     rb_change_child(node, child, parent, root);
     if (child) {
-        child->rb_parent_color = pc;
+        rb_set_pc(child, pc);
         rebalance = NULL;
     } else {
         rebalance = pc_is_black(pc) ? parent : NULL;
@@ -548,9 +553,9 @@ static void rb_erase_augmented(RBNode *node, RBRoot *root,
         tmp = parent;
     } else if (!child) {
         /* Still case 1, but this time the child is node->rb_left */
-        pc = node->rb_parent_color;
+        pc = rb_pc(node);
         parent = pc_parent(pc);
-        tmp->rb_parent_color = pc;
+        rb_set_pc(tmp, pc);
         rb_change_child(node, tmp, parent, root);
         rebalance = NULL;
         tmp = parent;
@@ -604,7 +609,7 @@ static void rb_erase_augmented(RBNode *node, RBRoot *root,
         qatomic_set(&successor->rb_left, tmp);
         rb_set_parent(tmp, successor);
 
-        pc = node->rb_parent_color;
+        pc = rb_pc(node);
         tmp = pc_parent(pc);
         rb_change_child(node, successor, tmp, root);
 
         if (child2) {
@@ -614,7 +619,7 @@ static void rb_erase_augmented(RBNode *node, RBRoot *root,
         } else {
             rebalance = rb_is_black(successor) ? parent : NULL;
         }
-        successor->rb_parent_color = pc;
+        rb_set_pc(successor, pc);
         tmp = successor;
     }
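
Both of these patches lean on the same encoding: rb_parent_color packs the
parent pointer and the node color into one uintptr_t, with the color in
bit 0 (node alignment guarantees that bit of the address is zero). A sketch
of the round trip, consistent with the helpers above (that RB_BLACK is the
bit-0 value is implied by the `| RB_BLACK` and `& ~1` usages, not spelled
out in the patch):

    uintptr_t pc = (uintptr_t)parent | RB_BLACK;  /* pack: black child  */

    RBNode *p     = (RBNode *)(pc & ~1);          /* pc_parent(pc)      */
    bool is_black = (pc & 1) != 0;                /* cf. pc_is_black()  */

Reading rb_parent_color once via rb_pc() and decoding both fields from that
single snapshot is what keeps a concurrent iterator from seeing a parent
and a color that belong to different versions of the node.
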
From ad17868eb162a5466d8ad43e5ccb428776403308 Mon Sep 17 00:00:00 2001
From: Richard Henderson
Date: Wed, 26 Jul 2023 12:58:08 -0700
Subject: [PATCH 05/10] accel/tcg: Clear tcg_ctx->gen_tb on buffer overflow

On overflow of code_gen_buffer, we unlock the guest pages we had
been translating, but failed to clear gen_tb. On restart, if we
cannot allocate a TB, we exit to the main loop to perform the flush
of all TBs as soon as possible. With garbage in gen_tb, we hit an
assert:

  ../src/accel/tcg/tb-maint.c:348:page_unlock__debug: \
    assertion failed: (page_is_locked(pd))

Fixes: deba78709ae8 ("accel/tcg: Always lock pages before translation")
Signed-off-by: Richard Henderson
---
 accel/tcg/translate-all.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
index a1782db5dd..b2d4e22c17 100644
--- a/accel/tcg/translate-all.c
+++ b/accel/tcg/translate-all.c
@@ -374,6 +374,7 @@ TranslationBlock *tb_gen_code(CPUState *cpu,
                           "Restarting code generation for "
                           "code_gen_buffer overflow\n");
             tb_unlock_pages(tb);
+            tcg_ctx->gen_tb = NULL;
             goto buffer_overflow;
 
         case -2:

From 28b61d49ac80bac8ef74aff0b75058bdd0b2f108 Mon Sep 17 00:00:00 2001
From: Richard Henderson
Date: Thu, 27 Jul 2023 09:11:48 -0700
Subject: [PATCH 06/10] bsd-user: Allocate guest virtual address space

With reserved_va, mmap.c expects to have pre-allocated host address
space for the entire guest address space. When combined with the -B
command-line option, ensure that the chosen address does not overlap
anything else.

Ensure that mmap_next_start is within reserved_va, as we use it
within mmap.c without checking.

Reviewed-by: Warner Losh
Signed-off-by: Richard Henderson
Message-Id: <20230727161148.444988-1-richard.henderson@linaro.org>
---
 bsd-user/main.c | 48 +++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 43 insertions(+), 5 deletions(-)

diff --git a/bsd-user/main.c b/bsd-user/main.c
index b597328118..381bb18df8 100644
--- a/bsd-user/main.c
+++ b/bsd-user/main.c
@@ -473,10 +473,6 @@ int main(int argc, char **argv)
     target_environ = envlist_to_environ(envlist, NULL);
     envlist_free(envlist);
 
-    if (reserved_va) {
-        mmap_next_start = reserved_va + 1;
-    }
-
     {
         Error *err = NULL;
         if (seed_optarg != NULL) {
@@ -494,7 +490,49 @@ int main(int argc, char **argv)
      * Now that page sizes are configured we can do
      * proper page alignment for guest_base.
      */
-    guest_base = HOST_PAGE_ALIGN(guest_base);
+    if (have_guest_base) {
+        if (guest_base & ~qemu_host_page_mask) {
+            error_report("Selected guest base not host page aligned");
+            exit(1);
+        }
+    }
+
+    /*
+     * If reserving host virtual address space, do so now.
+     * Combined with '-B', ensure that the chosen range is free.
+     */
+    if (reserved_va) {
+        void *p;
+
+        if (have_guest_base) {
+            p = mmap((void *)guest_base, reserved_va + 1, PROT_NONE,
+                     MAP_ANON | MAP_PRIVATE | MAP_FIXED | MAP_EXCL, -1, 0);
+        } else {
+            p = mmap(NULL, reserved_va + 1, PROT_NONE,
+                     MAP_ANON | MAP_PRIVATE, -1, 0);
+        }
+        if (p == MAP_FAILED) {
+            const char *err = strerror(errno);
+            char *sz = size_to_str(reserved_va + 1);
+
+            if (have_guest_base) {
+                error_report("Cannot allocate %s bytes at -B %p for guest "
+                             "address space: %s", sz, (void *)guest_base, err);
+            } else {
+                error_report("Cannot allocate %s bytes for guest "
+                             "address space: %s", sz, err);
+            }
+            exit(1);
+        }
+        guest_base = (uintptr_t)p;
+        have_guest_base = true;
+
+        /* Ensure that mmap_next_start is within range. */
+        if (reserved_va <= mmap_next_start) {
+            mmap_next_start = (reserved_va / 4 * 3)
+                              & TARGET_PAGE_MASK & qemu_host_page_mask;
+        }
+    }
 
     if (loader_exec(filename, argv + optind, target_environ, regs,
                     info, &bprm) != 0) {
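
The key ingredient in the '-B' path above is FreeBSD's MAP_EXCL: combined
with MAP_FIXED, mmap() fails instead of silently replacing whatever is
already mapped at that address, which is how an overlapping guest base is
detected. A stripped-down sketch of the probe (a hypothetical wrapper
around the same flags the patch uses, not QEMU code):

    #include <sys/mman.h>

    /* Reserve [base, base + size) with no access rights, failing if
     * anything already lives in the range (MAP_FIXED | MAP_EXCL). */
    static void *reserve_exact(void *base, size_t size)
    {
        return mmap(base, size, PROT_NONE,
                    MAP_ANON | MAP_PRIVATE | MAP_FIXED | MAP_EXCL, -1, 0);
        /* MAP_FAILED => the range is not free (or out of memory). */
    }

The PROT_NONE reservation is never used directly; later guest mappings are
carved out of it, so nothing else in the host process can creep into the
guest address space.
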
From 0f2f3247d456e08baa345768824dae6864d9acb6 Mon Sep 17 00:00:00 2001
From: Warner Losh
Date: Fri, 28 Jul 2023 10:29:27 -0600
Subject: [PATCH 07/10] bsd-user: Specify host page alignment if none specified

We're hitting an assert when we pass in alignment == 0, since that's
not a power of two. So pass in the ideal page size instead.

Signed-off-by: Warner Losh
Message-Id: <20230728162927.5009-1-imp@bsdimp.com>
Reviewed-by: Richard Henderson
Signed-off-by: Richard Henderson
---
 bsd-user/mmap.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/bsd-user/mmap.c b/bsd-user/mmap.c
index 74ed00b9fe..b62a69bd07 100644
--- a/bsd-user/mmap.c
+++ b/bsd-user/mmap.c
@@ -260,7 +260,8 @@ static abi_ulong mmap_find_vma_aligned(abi_ulong start, abi_ulong size,
 
     if (reserved_va) {
         return mmap_find_vma_reserved(start, size,
-                                      (alignment != 0 ? 1 << alignment : 0));
+                                      (alignment != 0 ? 1 << alignment :
+                                       MAX(qemu_host_page_size, TARGET_PAGE_SIZE)));
     }
 
     addr = start;
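
Note that the alignment parameter here is a log2 shift count rather than a
byte count, and 0 doubles as the "no alignment requested" sentinel, which
is why it cannot be forwarded as-is to a callee that asserts a power of
two. A sketch of the convention (hypothetical helper name, same expression
as the patch):

    /* alignment is log2(bytes); 0 means the caller did not ask. */
    static abi_ulong effective_alignment(int alignment)
    {
        return alignment != 0
               ? (abi_ulong)1 << alignment
               : MAX(qemu_host_page_size, TARGET_PAGE_SIZE);
    }
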
From 2e718e665706d5fcc3e3501bda26f277f055ed85 Mon Sep 17 00:00:00 2001
From: Richard Henderson
Date: Fri, 28 Jul 2023 10:22:52 -0700
Subject: [PATCH 08/10] target/ppc: Disable goto_tb with architectural singlestep
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The change to use translator_use_goto_tb went too far, as the
CF_SINGLE_STEP flag managed by the translator only handles gdb
single stepping and not the architectural single stepping
modeled in DisasContext.singlestep_enabled.

Fixes: 6e9cc373ec5 ("target/ppc: Use translator_use_goto_tb")
Resolves: https://gitlab.com/qemu-project/qemu/-/issues/1795
Reviewed-by: Cédric Le Goater
Reviewed-by: Philippe Mathieu-Daudé
Signed-off-by: Richard Henderson
---
 target/ppc/translate.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/target/ppc/translate.c b/target/ppc/translate.c
index e6a0709066..74796ec7ba 100644
--- a/target/ppc/translate.c
+++ b/target/ppc/translate.c
@@ -4175,6 +4175,9 @@ static void pmu_count_insns(DisasContext *ctx)
 
 static inline bool use_goto_tb(DisasContext *ctx, target_ulong dest)
 {
+    if (unlikely(ctx->singlestep_enabled)) {
+        return false;
+    }
     return translator_use_goto_tb(&ctx->base, dest);
 }

From 38dd78c41eaf08b490c9e7ec68fc508bbaa5cb1d Mon Sep 17 00:00:00 2001
From: Helge Deller
Date: Fri, 28 Jul 2023 21:23:10 +0200
Subject: [PATCH 09/10] linux-user/armeb: Fix __kernel_cmpxchg() for armeb
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Commit 7f4f0d9ea870 ("linux-user/arm: Implement __kernel_cmpxchg with
host atomics") switched to qatomic_cmpxchg() to swap a word with the
memory content, but neglected to endianness-swap the oldval and newval
values when emulating an armeb CPU, which expects words to be stored
big-endian in guest memory.

The bug can be verified with qemu >= v7.0 on any little-endian host,
when starting the armeb binary of the upx program, which just hangs
without this patch.

Cc: qemu-stable@nongnu.org
Signed-off-by: Helge Deller
Reported-by: "Markus F.X.J. Oberhumer"
Reported-by: John Reiser
Closes: https://github.com/upx/upx/issues/687
Message-Id:
Reviewed-by: Philippe Mathieu-Daudé
Reviewed-by: Richard Henderson
Signed-off-by: Richard Henderson
---
 linux-user/arm/cpu_loop.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/linux-user/arm/cpu_loop.c b/linux-user/arm/cpu_loop.c
index a992423257..b404117ff3 100644
--- a/linux-user/arm/cpu_loop.c
+++ b/linux-user/arm/cpu_loop.c
@@ -117,8 +117,9 @@ static void arm_kernel_cmpxchg32_helper(CPUARMState *env)
 {
     uint32_t oldval, newval, val, addr, cpsr, *host_addr;
 
-    oldval = env->regs[0];
-    newval = env->regs[1];
+    /* Swap if host != guest endianness, for the host cmpxchg below */
+    oldval = tswap32(env->regs[0]);
+    newval = tswap32(env->regs[1]);
     addr = env->regs[2];
 
     mmap_lock();
@@ -174,6 +175,10 @@ static void arm_kernel_cmpxchg64_helper(CPUARMState *env)
         return;
     }
 
+    /* Swap if host != guest endianness, for the host cmpxchg below */
+    oldval = tswap64(oldval);
+    newval = tswap64(newval);
+
 #ifdef CONFIG_ATOMIC64
     val = qatomic_cmpxchg__nocheck(host_addr, oldval, newval);
     cpsr = (val == oldval) * CPSR_C;
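
tswapN() byte-swaps only when guest and host endianness differ, so in the
affected configuration (armeb guest on a little-endian host) it behaves as
bswapN(), putting the register values into guest byte order before the
host-side cmpxchg operates on guest memory. For example, on a little-endian
host:

    /* armeb guest on an LE host: guest memory holds big-endian words,
     * so the host cmpxchg must compare against swapped values. */
    uint32_t oldval = tswap32(env->regs[0]);  /* 0x00000001 -> 0x01000000 */
    uint32_t newval = tswap32(env->regs[1]);
    uint32_t val = qatomic_cmpxchg(host_addr, oldval, newval);
    /* val is likewise in guest byte order; it must be swapped again
     * before being placed back into a guest register. */

On a big-endian host running the same armeb binary, tswap32() is the
identity, so the patch changes nothing there.
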
From 8b94ec53f367db7adcc9b59c483ce3e6c7bc3740 Mon Sep 17 00:00:00 2001
From: Richard Henderson
Date: Fri, 28 Jul 2023 12:53:59 -0700
Subject: [PATCH 10/10] target/s390x: Move trans_exc_code update to do_program_interrupt

This solves a problem in which the store to LowCore during tlb_fill
triggers a clean-page TB invalidation for page0 during translation,
which results in an assertion failure for locked pages. By delaying
the store until after the exception has been raised, we will have
unwound the pages locked for translation and the problem does not
arise.

There are plenty of other updates to LowCore while delivering an
interrupt/exception; trans_exc_code does not need to be special.

Reviewed-by: Ilya Leoshkevich
Signed-off-by: Richard Henderson
---
 target/s390x/tcg/excp_helper.c | 40 ++++++++++++++++++++++++----------
 1 file changed, 28 insertions(+), 12 deletions(-)

diff --git a/target/s390x/tcg/excp_helper.c b/target/s390x/tcg/excp_helper.c
index 3da337f7c7..b7116d0577 100644
--- a/target/s390x/tcg/excp_helper.c
+++ b/target/s390x/tcg/excp_helper.c
@@ -190,11 +190,6 @@ bool s390_cpu_tlb_fill(CPUState *cs, vaddr address, int size,
         return false;
     }
 
-    if (excp != PGM_ADDRESSING) {
-        stq_phys(env_cpu(env)->as,
-                 env->psa + offsetof(LowCore, trans_exc_code), tec);
-    }
-
     /*
      * For data accesses, ILEN will be filled in from the unwind info,
      * within cpu_loop_exit_restore. For code accesses, retaddr == 0,
@@ -211,20 +206,33 @@ static void do_program_interrupt(CPUS390XState *env)
     uint64_t mask, addr;
     LowCore *lowcore;
     int ilen = env->int_pgm_ilen;
+    bool set_trans_exc_code = false;
+    bool advance = false;
 
     assert((env->int_pgm_code == PGM_SPECIFICATION && ilen == 0) ||
            ilen == 2 || ilen == 4 || ilen == 6);
 
     switch (env->int_pgm_code) {
     case PGM_PER:
-        if (env->per_perc_atmid & PER_CODE_EVENT_NULLIFICATION) {
-            break;
-        }
-        /* FALL THROUGH */
+        advance = !(env->per_perc_atmid & PER_CODE_EVENT_NULLIFICATION);
+        break;
+    case PGM_ASCE_TYPE:
+    case PGM_REG_FIRST_TRANS:
+    case PGM_REG_SEC_TRANS:
+    case PGM_REG_THIRD_TRANS:
+    case PGM_SEGMENT_TRANS:
+    case PGM_PAGE_TRANS:
+        assert(env->int_pgm_code == env->tlb_fill_exc);
+        set_trans_exc_code = true;
+        break;
+    case PGM_PROTECTION:
+        assert(env->int_pgm_code == env->tlb_fill_exc);
+        set_trans_exc_code = true;
+        advance = true;
+        break;
     case PGM_OPERATION:
     case PGM_PRIVILEGED:
     case PGM_EXECUTE:
-    case PGM_PROTECTION:
     case PGM_ADDRESSING:
     case PGM_SPECIFICATION:
     case PGM_DATA:
@@ -243,11 +251,15 @@ static void do_program_interrupt(CPUS390XState *env)
     case PGM_PC_TRANS_SPEC:
     case PGM_ALET_SPEC:
     case PGM_MONITOR:
-        /* advance the PSW if our exception is not nullifying */
-        env->psw.addr += ilen;
+        advance = true;
         break;
     }
 
+    /* advance the PSW if our exception is not nullifying */
+    if (advance) {
+        env->psw.addr += ilen;
+    }
+
     qemu_log_mask(CPU_LOG_INT,
                   "%s: code=0x%x ilen=%d psw: %" PRIx64 " %" PRIx64 "\n",
                   __func__, env->int_pgm_code, ilen, env->psw.mask,
@@ -263,6 +275,10 @@ static void do_program_interrupt(CPUS390XState *env)
         env->per_perc_atmid = 0;
     }
 
+    if (set_trans_exc_code) {
+        lowcore->trans_exc_code = cpu_to_be64(env->tlb_fill_tec);
+    }
+
     lowcore->pgm_ilen = cpu_to_be16(ilen);
     lowcore->pgm_code = cpu_to_be16(env->int_pgm_code);
     lowcore->program_old_psw.mask = cpu_to_be64(s390_cpu_get_psw_mask(env));
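
The shape of this fix generalizes: an exception raised out of tlb_fill runs
while translation pages are still locked, so side effects on guest memory
are only recorded in the CPU state there and replayed at delivery time.
Schematically (condensed from the patch; tlb_fill_exc and tlb_fill_tec are
the fields the new asserts and the lowcore store rely on):

    /* In s390_cpu_tlb_fill(): record only, no stores to guest memory. */
    env->tlb_fill_exc = excp;
    env->tlb_fill_tec = tec;
    /* ... raise the program interrupt ... */
    cpu_loop_exit_restore(cs, retaddr);   /* unwinds, unlocks pages */

    /* In do_program_interrupt(): no translation in progress any more,
     * so touching LowCore cannot invalidate a locked page. */
    if (set_trans_exc_code) {
        lowcore->trans_exc_code = cpu_to_be64(env->tlb_fill_tec);
    }
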