From 872cad2d63b3a07f65323fe80a7abb29ea276b44 Mon Sep 17 00:00:00 2001 From: Tao Liu Date: Tue, 10 Jan 2023 14:56:27 +0800 Subject: [PATCH] Port the maple tree data structures and functions There have been two ways to iterate vm_area_struct until Linux 6.0: 1) by rbtree, aka vma.vm_rb; 2) by linked list, aka vma.vm_{next,prev}. However with the maple tree patches[1][2] in Linux 6.1, vm_rb and vm_{next,prev} are removed from vm_area_struct. The vm_area_dump() in crash mainly uses the linked list for vma iteration, which will not work for this case. So the maple tree iteration needs to be ported to crash. For crash, currently it only iteratively reads the maple tree, no more rcu safe or maple tree modification features needed. So we only port a subset of kernel maple tree features. In addition, we need to modify the ported kernel source code, making it compatible with crash. This patch deals with the two issues: 1) Poring mt_dump() function and all its dependencies from kernel source to crash, to enable crash maple tree iteration, 2) adapting the ported code with crash. [1]: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=524e00b36e8c547f5582eef3fb645a8d9fc5e3df [2]: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=763ecb035029f500d7e6dc99acd1ad299b7726a1 Signed-off-by: Tao Liu --- Makefile | 10 +- defs.h | 19 +++ maple_tree.c | 407 +++++++++++++++++++++++++++++++++++++++++++++++++++ maple_tree.h | 82 +++++++++++ 4 files changed, 515 insertions(+), 3 deletions(-) create mode 100644 maple_tree.c create mode 100644 maple_tree.h diff --git a/Makefile b/Makefile index 1506dd4..102597f 100644 --- a/Makefile +++ b/Makefile @@ -59,6 +59,7 @@ IBM_HFILES=ibm_common.h SADUMP_HFILES=sadump.h UNWIND_HFILES=unwind.h unwind_i.h rse.h unwind_x86.h unwind_x86_64.h VMWARE_HFILES=vmware_vmss.h +MAPLE_TREE_HFILES=maple_tree.h CFILES=main.c tools.c global_data.c memory.c filesys.c help.c task.c \ kernel.c test.c gdb_interface.c configure.c net.c dev.c bpf.c \ @@ -73,12 +74,12 @@ CFILES=main.c tools.c global_data.c memory.c filesys.c help.c task.c \ xen_hyper.c xen_hyper_command.c xen_hyper_global_data.c \ xen_hyper_dump_tables.c kvmdump.c qemu.c qemu-load.c sadump.c ipcs.c \ ramdump.c vmware_vmss.c vmware_guestdump.c \ - xen_dom0.c kaslr_helper.c sbitmap.c + xen_dom0.c kaslr_helper.c sbitmap.c maple_tree.c SOURCE_FILES=${CFILES} ${GENERIC_HFILES} ${MCORE_HFILES} \ ${REDHAT_CFILES} ${REDHAT_HFILES} ${UNWIND_HFILES} \ ${LKCD_DUMP_HFILES} ${LKCD_TRACE_HFILES} ${LKCD_OBSOLETE_HFILES}\ - ${IBM_HFILES} ${SADUMP_HFILES} ${VMWARE_HFILES} + ${IBM_HFILES} ${SADUMP_HFILES} ${VMWARE_HFILES} ${MAPLE_TREE_HFILES} OBJECT_FILES=main.o tools.o global_data.o memory.o filesys.o help.o task.o \ build_data.o kernel.o test.o gdb_interface.o net.o dev.o bpf.o \ @@ -93,7 +94,7 @@ OBJECT_FILES=main.o tools.o global_data.o memory.o filesys.o help.o task.o \ xen_hyper.o xen_hyper_command.o xen_hyper_global_data.o \ xen_hyper_dump_tables.o kvmdump.o qemu.o qemu-load.o sadump.o ipcs.o \ ramdump.o vmware_vmss.o vmware_guestdump.o \ - xen_dom0.o kaslr_helper.o sbitmap.o + xen_dom0.o kaslr_helper.o sbitmap.o maple_tree.o MEMORY_DRIVER_FILES=memory_driver/Makefile memory_driver/crash.c memory_driver/README @@ -539,6 +540,9 @@ kaslr_helper.o: ${GENERIC_HFILES} kaslr_helper.c bpf.o: ${GENERIC_HFILES} bpf.c ${CC} -c ${CRASH_CFLAGS} bpf.c ${WARNING_OPTIONS} ${WARNING_ERROR} +maple_tree.o: ${GENERIC_HFILES} ${MAPLE_TREE_HFILES} maple_tree.c + ${CC} -c ${CRASH_CFLAGS} maple_tree.c ${WARNING_OPTIONS} ${WARNING_ERROR} + ${PROGRAM}: force @$(MAKE) all diff --git a/defs.h b/defs.h index 08ac4dc..46bfd4a 100644 --- a/defs.h +++ b/defs.h @@ -2189,6 +2189,21 @@ struct offset_table { /* stash of commonly-used offsets */ long request_queue_hctx_table; long percpu_counter_counters; long slab_slab_list; + long mm_struct_mm_mt; + long maple_tree_ma_root; + long maple_tree_ma_flags; + long maple_node_parent; + long maple_node_ma64; + long maple_node_mr64; + long maple_node_slot; + long maple_arange_64_pivot; + long maple_arange_64_slot; + long maple_arange_64_gap; + long maple_arange_64_meta; + long maple_range_64_pivot; + long maple_range_64_slot; + long maple_metadata_end; + long maple_metadata_gap; }; struct size_table { /* stash of commonly-used sizes */ @@ -2360,6 +2375,8 @@ struct size_table { /* stash of commonly-used sizes */ long sbq_wait_state; long blk_mq_tags; long percpu_counter; + long maple_tree; + long maple_node; }; struct array_table { @@ -5742,6 +5759,8 @@ int file_dump(ulong, ulong, ulong, int, int); int same_file(char *, char *); int cleanup_memory_driver(void); +void maple_init(void); +int do_mptree(struct tree_data *); /* * help.c diff --git a/maple_tree.c b/maple_tree.c new file mode 100644 index 0000000..474faed --- /dev/null +++ b/maple_tree.c @@ -0,0 +1,407 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Maple Tree implementation + * Copyright (c) 2018-2022 Oracle Corporation + * Authors: Liam R. Howlett + * Matthew Wilcox + * + * The following are copied and modified from lib/maple_tree.c + */ + +#include "maple_tree.h" +#include "defs.h" + +unsigned char *mt_slots = NULL; +unsigned char *mt_pivots = NULL; +ulong mt_max[4] = {0}; + +#define MAPLE_BUFSIZE 512 + +static inline ulong mte_to_node(ulong maple_enode_entry) +{ + return maple_enode_entry & ~MAPLE_NODE_MASK; +} + +static inline enum maple_type mte_node_type(ulong maple_enode_entry) +{ + return (maple_enode_entry >> MAPLE_NODE_TYPE_SHIFT) & + MAPLE_NODE_TYPE_MASK; +} + +static inline ulong mt_slot(void **slots, unsigned char offset) +{ + return (ulong)slots[offset]; +} + +static inline bool ma_is_leaf(const enum maple_type type) +{ + return type < maple_range_64; +} + +/*************** For cmd_tree ********************/ + +struct maple_tree_ops { + void (*entry)(ulong node, ulong slot, const char *path, + ulong index, void *private); + void *private; + bool is_td; +}; + +static const char spaces[] = " "; + +static void do_mt_range64(ulong, ulong, ulong, uint, char *, ulong *, + struct maple_tree_ops *); +static void do_mt_arange64(ulong, ulong, ulong, uint, char *, ulong *, + struct maple_tree_ops *); +static void do_mt_entry(ulong, ulong, ulong, uint, uint, char *, ulong *, + struct maple_tree_ops *); +static void do_mt_node(ulong, ulong, ulong, uint, char *, ulong *, + struct maple_tree_ops *); +struct req_entry *fill_member_offsets(char *); +void dump_struct_members_fast(struct req_entry *, int, ulong); +void dump_struct_members_for_tree(struct tree_data *, int, ulong); + +static void mt_dump_range(ulong min, ulong max, uint depth) +{ + if (min == max) + fprintf(fp, "%.*s%lu: ", depth * 2, spaces, min); + else + fprintf(fp, "%.*s%lu-%lu: ", depth * 2, spaces, min, max); +} + +static inline bool mt_is_reserved(ulong entry) +{ + return (entry < MAPLE_RESERVED_RANGE) && xa_is_internal(entry); +} + +static inline bool mte_is_leaf(ulong maple_enode_entry) +{ + return ma_is_leaf(mte_node_type(maple_enode_entry)); +} + +static uint mt_height(char *mt_buf) +{ + return (UINT(mt_buf + OFFSET(maple_tree_ma_flags)) & + MT_FLAGS_HEIGHT_MASK) + >> MT_FLAGS_HEIGHT_OFFSET; +} + +static void dump_mt_range64(char *mr64_buf) +{ + int i; + + fprintf(fp, " contents: "); + for (i = 0; i < mt_slots[maple_range_64] - 1; i++) + fprintf(fp, "%p %lu ", + VOID_PTR(mr64_buf + OFFSET(maple_range_64_slot) + + sizeof(void *) * i), + ULONG(mr64_buf + OFFSET(maple_range_64_pivot) + + sizeof(ulong) * i)); + fprintf(fp, "%p\n", VOID_PTR(mr64_buf + OFFSET(maple_range_64_slot) + + sizeof(void *) * i)); +} + +static void dump_mt_arange64(char *ma64_buf) +{ + int i; + + fprintf(fp, " contents: "); + for (i = 0; i < mt_slots[maple_arange_64]; i++) + fprintf(fp, "%lu ", ULONG(ma64_buf + OFFSET(maple_arange_64_gap) + + sizeof(ulong) * i)); + + fprintf(fp, "| %02X %02X| ", + UCHAR(ma64_buf + OFFSET(maple_arange_64_meta) + + OFFSET(maple_metadata_end)), + UCHAR(ma64_buf + OFFSET(maple_arange_64_meta) + + OFFSET(maple_metadata_gap))); + + for (i = 0; i < mt_slots[maple_arange_64] - 1; i++) + fprintf(fp, "%p %lu ", + VOID_PTR(ma64_buf + OFFSET(maple_arange_64_slot) + + sizeof(void *) * i), + ULONG(ma64_buf + OFFSET(maple_arange_64_pivot) + + sizeof(ulong) * i)); + fprintf(fp, "%p\n", VOID_PTR(ma64_buf + OFFSET(maple_arange_64_slot) + + sizeof(void *) * i)); +} + +static void dump_mt_entry(ulong entry, ulong min, ulong max, uint depth) +{ + mt_dump_range(min, max, depth); + + if (xa_is_value(entry)) + fprintf(fp, "value %ld (0x%lx) [0x%lx]\n", xa_to_value(entry), + xa_to_value(entry), entry); + else if (xa_is_zero(entry)) + fprintf(fp, "zero (%ld)\n", xa_to_internal(entry)); + else if (mt_is_reserved(entry)) + fprintf(fp, "UNKNOWN ENTRY (0x%lx)\n", entry); + else + fprintf(fp, "0x%lx\n", entry); +} + +static void dump_mt_node(ulong maple_node, char *node_data, uint type, + ulong min, ulong max, uint depth) +{ + mt_dump_range(min, max, depth); + + fprintf(fp, "node 0x%lx depth %d type %d parent %p", + maple_node, depth, type, + maple_node ? VOID_PTR(node_data + OFFSET(maple_node_parent)) : + NULL); +} + +static void do_mt_range64(ulong entry, ulong min, ulong max, + uint depth, char *path, ulong *global_index, + struct maple_tree_ops *ops) +{ + ulong maple_node_m_node = mte_to_node(entry); + char node_buf[MAPLE_BUFSIZE]; + bool leaf = mte_is_leaf(entry); + ulong first = min, last; + int i; + int len = strlen(path); + struct tree_data *td = ops->is_td ? (struct tree_data *)ops->private : NULL; + char *mr64_buf; + + if (SIZE(maple_node) > MAPLE_BUFSIZE) + error(FATAL, "MAPLE_BUFSIZE should be larger than maple_node struct"); + + readmem(maple_node_m_node, KVADDR, node_buf, SIZE(maple_node), + "mt_dump_range64 read maple_node", FAULT_ON_ERROR); + + mr64_buf = node_buf + OFFSET(maple_node_mr64); + + for (i = 0; i < mt_slots[maple_range_64]; i++) { + last = max; + + if (i < (mt_slots[maple_range_64] - 1)) + last = ULONG(mr64_buf + OFFSET(maple_range_64_pivot) + + sizeof(ulong) * i); + + else if (!VOID_PTR(mr64_buf + OFFSET(maple_range_64_slot) + + sizeof(void *) * i) && + max != mt_max[mte_node_type(entry)]) + break; + if (last == 0 && i > 0) + break; + if (leaf) + do_mt_entry(mt_slot((void **)(mr64_buf + + OFFSET(maple_range_64_slot)), i), + first, last, depth + 1, i, path, global_index, ops); + else if (VOID_PTR(mr64_buf + OFFSET(maple_range_64_slot) + + sizeof(void *) * i)) { + sprintf(path + len, "/%d", i); + do_mt_node(mt_slot((void **)(mr64_buf + + OFFSET(maple_range_64_slot)), i), + first, last, depth + 1, path, global_index, ops); + } + + if (last == max) + break; + if (last > max) { + fprintf(fp, "node %p last (%lu) > max (%lu) at pivot %d!\n", + mr64_buf, last, max, i); + break; + } + first = last + 1; + } +} + +static void do_mt_arange64(ulong entry, ulong min, ulong max, + uint depth, char *path, ulong *global_index, + struct maple_tree_ops *ops) +{ + ulong maple_node_m_node = mte_to_node(entry); + char node_buf[MAPLE_BUFSIZE]; + bool leaf = mte_is_leaf(entry); + ulong first = min, last; + int i; + int len = strlen(path); + struct tree_data *td = ops->is_td ? (struct tree_data *)ops->private : NULL; + char *ma64_buf; + + if (SIZE(maple_node) > MAPLE_BUFSIZE) + error(FATAL, "MAPLE_BUFSIZE should be larger than maple_node struct"); + + readmem(maple_node_m_node, KVADDR, node_buf, SIZE(maple_node), + "mt_dump_arange64 read maple_node", FAULT_ON_ERROR); + + ma64_buf = node_buf + OFFSET(maple_node_ma64); + + for (i = 0; i < mt_slots[maple_arange_64]; i++) { + last = max; + + if (i < (mt_slots[maple_arange_64] - 1)) + last = ULONG(ma64_buf + OFFSET(maple_arange_64_pivot) + + sizeof(ulong) * i); + else if (!VOID_PTR(ma64_buf + OFFSET(maple_arange_64_slot) + + sizeof(void *) * i)) + break; + if (last == 0 && i > 0) + break; + + if (leaf) + do_mt_entry(mt_slot((void **)(ma64_buf + + OFFSET(maple_arange_64_slot)), i), + first, last, depth + 1, i, path, global_index, ops); + else if (VOID_PTR(ma64_buf + OFFSET(maple_arange_64_slot) + + sizeof(void *) * i)) { + sprintf(path + len, "/%d", i); + do_mt_node(mt_slot((void **)(ma64_buf + + OFFSET(maple_arange_64_slot)), i), + first, last, depth + 1, path, global_index, ops); + } + + if (last == max) + break; + if (last > max) { + fprintf(fp, "node %p last (%lu) > max (%lu) at pivot %d!\n", + ma64_buf, last, max, i); + break; + } + first = last + 1; + } +} + +static void do_mt_entry(ulong entry, ulong min, ulong max, uint depth, + uint index, char *path, ulong *global_index, + struct maple_tree_ops *ops) +{ + int print_radix = 0, i; + static struct req_entry **e = NULL; + struct tree_data *td = ops->is_td ? (struct tree_data *)ops->private : NULL; + + if (!td) + return; +} + +static void do_mt_node(ulong entry, ulong min, ulong max, + uint depth, char *path, ulong *global_index, + struct maple_tree_ops *ops) +{ + ulong maple_node = mte_to_node(entry); + uint type = mte_node_type(entry); + uint i; + char node_buf[MAPLE_BUFSIZE]; + struct tree_data *td = ops->is_td ? (struct tree_data *)ops->private : NULL; + + if (SIZE(maple_node) > MAPLE_BUFSIZE) + error(FATAL, "MAPLE_BUFSIZE should be larger than maple_node struct"); + + readmem(maple_node, KVADDR, node_buf, SIZE(maple_node), + "mt_dump_node read maple_node", FAULT_ON_ERROR); + + switch (type) { + case maple_dense: + for (i = 0; i < mt_slots[maple_dense]; i++) { + if (min + i > max) + fprintf(fp, "OUT OF RANGE: "); + do_mt_entry(mt_slot((void **)(node_buf + OFFSET(maple_node_slot)), i), + min + i, min + i, depth, i, path, global_index, ops); + } + break; + case maple_leaf_64: + case maple_range_64: + do_mt_range64(entry, min, max, depth, path, global_index, ops); + break; + case maple_arange_64: + do_mt_arange64(entry, min, max, depth, path, global_index, ops); + break; + default: + fprintf(fp, " UNKNOWN TYPE\n"); + } +} + +static int do_maple_tree_traverse(ulong ptr, int is_root, + struct maple_tree_ops *ops) +{ + char path[BUFSIZE] = {0}; + char tree_buf[MAPLE_BUFSIZE]; + ulong entry; + struct tree_data *td = ops->is_td ? (struct tree_data *)ops->private : NULL; + ulong global_index = 0; + + if (SIZE(maple_tree) > MAPLE_BUFSIZE) + error(FATAL, "MAPLE_BUFSIZE should be larger than maple_tree struct"); + + if (!is_root) { + strcpy(path, "direct"); + do_mt_node(ptr, 0, mt_max[mte_node_type(ptr)], + 0, path, &global_index, ops); + } else { + readmem(ptr, KVADDR, tree_buf, SIZE(maple_tree), + "mt_dump read maple_tree", FAULT_ON_ERROR); + entry = ULONG(tree_buf + OFFSET(maple_tree_ma_root)); + + if (!xa_is_node(entry)) + do_mt_entry(entry, 0, 0, 0, 0, path, &global_index, ops); + else if (entry) { + strcpy(path, "root"); + do_mt_node(entry, 0, mt_max[mte_node_type(entry)], 0, + path, &global_index, ops); + } + } + return 0; +} + +int do_mptree(struct tree_data *td) +{ + struct maple_tree_ops ops = { + .entry = NULL, + .private = td, + .is_td = true, + }; + + int is_root = !(td->flags & TREE_NODE_POINTER); + + do_maple_tree_traverse(td->start, is_root, &ops); + + return 0; +} + +/***********************************************/ +void maple_init(void) +{ + int array_len; + + STRUCT_SIZE_INIT(maple_tree, "maple_tree"); + STRUCT_SIZE_INIT(maple_node, "maple_node"); + + MEMBER_OFFSET_INIT(maple_tree_ma_root, "maple_tree", "ma_root"); + MEMBER_OFFSET_INIT(maple_tree_ma_flags, "maple_tree", "ma_flags"); + + MEMBER_OFFSET_INIT(maple_node_parent, "maple_node", "parent"); + MEMBER_OFFSET_INIT(maple_node_ma64, "maple_node", "ma64"); + MEMBER_OFFSET_INIT(maple_node_mr64, "maple_node", "mr64"); + MEMBER_OFFSET_INIT(maple_node_slot, "maple_node", "slot"); + + MEMBER_OFFSET_INIT(maple_arange_64_pivot, "maple_arange_64", "pivot"); + MEMBER_OFFSET_INIT(maple_arange_64_slot, "maple_arange_64", "slot"); + MEMBER_OFFSET_INIT(maple_arange_64_gap, "maple_arange_64", "gap"); + MEMBER_OFFSET_INIT(maple_arange_64_meta, "maple_arange_64", "meta"); + + MEMBER_OFFSET_INIT(maple_range_64_pivot, "maple_range_64", "pivot"); + MEMBER_OFFSET_INIT(maple_range_64_slot, "maple_range_64", "slot"); + + MEMBER_OFFSET_INIT(maple_metadata_end, "maple_metadata", "end"); + MEMBER_OFFSET_INIT(maple_metadata_gap, "maple_metadata", "gap"); + + array_len = get_array_length("mt_slots", NULL, sizeof(char)); + mt_slots = calloc(array_len, sizeof(char)); + readmem(symbol_value("mt_slots"), KVADDR, mt_slots, + array_len * sizeof(char), "maple_init read mt_slots", + RETURN_ON_ERROR); + + array_len = get_array_length("mt_pivots", NULL, sizeof(char)); + mt_pivots = calloc(array_len, sizeof(char)); + readmem(symbol_value("mt_pivots"), KVADDR, mt_pivots, + array_len * sizeof(char), "maple_init read mt_pivots", + RETURN_ON_ERROR); + + mt_max[maple_dense] = mt_slots[maple_dense]; + mt_max[maple_leaf_64] = ULONG_MAX; + mt_max[maple_range_64] = ULONG_MAX; + mt_max[maple_arange_64] = ULONG_MAX; +} diff --git a/maple_tree.h b/maple_tree.h new file mode 100644 index 0000000..f53d5aa --- /dev/null +++ b/maple_tree.h @@ -0,0 +1,82 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +#ifndef _MAPLE_TREE_H +#define _MAPLE_TREE_H +/* + * Maple Tree - An RCU-safe adaptive tree for storing ranges + * Copyright (c) 2018-2022 Oracle + * Authors: Liam R. Howlett + * Matthew Wilcox + * + * eXtensible Arrays + * Copyright (c) 2017 Microsoft Corporation + * Author: Matthew Wilcox + * + * See Documentation/core-api/xarray.rst for how to use the XArray. + */ +#include +#include +#include + +/* + * The following are copied and modified from include/linux/maple_tree.h + */ + +enum maple_type { + maple_dense, + maple_leaf_64, + maple_range_64, + maple_arange_64, +}; + +#define MAPLE_NODE_MASK 255UL + +#define MT_FLAGS_HEIGHT_OFFSET 0x02 +#define MT_FLAGS_HEIGHT_MASK 0x7C + +#define MAPLE_NODE_TYPE_MASK 0x0F +#define MAPLE_NODE_TYPE_SHIFT 0x03 + +#define MAPLE_RESERVED_RANGE 4096 + +/* + * The following are copied and modified from include/linux/xarray.h + */ + +#define XA_ZERO_ENTRY xa_mk_internal(257) + +static inline ulong xa_mk_internal(ulong v) +{ + return (v << 2) | 2; +} + +static inline bool xa_is_internal(ulong entry) +{ + return (entry & 3) == 2; +} + +static inline bool xa_is_node(ulong entry) +{ + return xa_is_internal(entry) && entry > 4096; +} + +static inline bool xa_is_value(ulong entry) +{ + return entry & 1; +} + +static inline bool xa_is_zero(ulong entry) +{ + return entry == XA_ZERO_ENTRY; +} + +static inline unsigned long xa_to_internal(ulong entry) +{ + return entry >> 2; +} + +static inline unsigned long xa_to_value(ulong entry) +{ + return entry >> 1; +} + +#endif /* _MAPLE_TREE_H */