// (c) 2011 Thomas Schoebel-Theuer / 1&1 Internet AG #include #include #include #include #include #include "brick_mem.h" #include "brick_say.h" #include "brick_locks.h" #define BRICK_DEBUG_MEM 10000 //#define LIMIT_MEM #define USE_KERNEL_PAGES // currently mandatory (vmalloc does not work) //#define BUMP_LIMITS // try to avoid this #define ALLOW_DYNAMIC_RAISE 512 #ifndef CONFIG_DEBUG_KERNEL #undef BRICK_DEBUG_MEM #endif #define MAGIC_BLOCK (int)0x8B395D7B #define MAGIC_BEND (int)0x8B395D7C #define MAGIC_MEM (int)0x8B395D7D #define MAGIC_END (int)0x8B395D7E #define MAGIC_STR (int)0x8B395D7F #define INT_ACCESS(ptr,offset) (*(int*)(((char*)(ptr)) + (offset))) #define _BRICK_FMT(_fmt) __BASE_FILE__ " %d %s(): " _fmt, __LINE__, __FUNCTION__ #define _BRICK_MSG(_dump, PREFIX, _fmt, _args...) do { say(PREFIX _BRICK_FMT(_fmt), ##_args); if (_dump) dump_stack(); } while (0) #define BRICK_ERROR "MEM_ERROR " #define BRICK_WARNING "MEM_WARN " #define BRICK_INFO "MEM_INFO " #define BRICK_ERR(_fmt, _args...) _BRICK_MSG(true, BRICK_ERROR, _fmt, ##_args) #define BRICK_WRN(_fmt, _args...) _BRICK_MSG(false, BRICK_WARNING, _fmt, ##_args) #define BRICK_INF(_fmt, _args...) _BRICK_MSG(false, BRICK_INFO, _fmt, ##_args) ///////////////////////////////////////////////////////////////////////// // limit handling #ifdef LIMIT_MEM #include #include #endif long long brick_global_memlimit = 0; EXPORT_SYMBOL_GPL(brick_global_memlimit); ///////////////////////////////////////////////////////////////////////// // small memory allocation (use this only for len < PAGE_SIZE) #ifdef BRICK_DEBUG_MEM static atomic_t mem_count[BRICK_DEBUG_MEM] = {}; static atomic_t mem_free[BRICK_DEBUG_MEM] = {}; static int mem_len[BRICK_DEBUG_MEM] = {}; #define PLUS_SIZE (4 * sizeof(int)) #else #define PLUS_SIZE (1 * sizeof(int)) #endif static inline void *__brick_mem_alloc(int len) { void *res; if (len >= PAGE_SIZE) { res = _brick_block_alloc(0, len, 0); } else { #ifdef CONFIG_MARS_MEM_RETRY for (;;) { res = kmalloc(len, GFP_BRICK); if (likely(res)) break; msleep(1000); } #else res = kmalloc(len, GFP_BRICK); #endif } return res; } static inline void __brick_mem_free(void *data, int len) { if (len >= PAGE_SIZE) { _brick_block_free(data, len, 0); } else { kfree(data); } } void *_brick_mem_alloc(int len, int line) { void *res; #ifdef CONFIG_DEBUG_KERNEL might_sleep(); #endif res = __brick_mem_alloc(len + PLUS_SIZE); if (likely(res)) { #ifdef BRICK_DEBUG_MEM if (unlikely(line < 0)) line = 0; else if (unlikely(line >= BRICK_DEBUG_MEM)) line = BRICK_DEBUG_MEM - 1; INT_ACCESS(res, 0 * sizeof(int)) = MAGIC_MEM; INT_ACCESS(res, 1 * sizeof(int)) = len; INT_ACCESS(res, 2 * sizeof(int)) = line; res += 3 * sizeof(int); INT_ACCESS(res, len) = MAGIC_END; atomic_inc(&mem_count[line]); mem_len[line] = len; #else INT_ACCESS(res, 0 * sizeof(int)) = len; res += 1 * sizeof(int); #endif } return res; } EXPORT_SYMBOL_GPL(_brick_mem_alloc); void _brick_mem_free(void *data, int cline) { if (data) { #ifdef BRICK_DEBUG_MEM void *test = data - 3 * sizeof(int); int magic = INT_ACCESS(test, 0 * sizeof(int)); int len = INT_ACCESS(test, 1 * sizeof(int)); int line = INT_ACCESS(test, 2 * sizeof(int)); if (unlikely(magic != MAGIC_MEM)) { BRICK_ERR("line %d memory corruption: magix %08x != %08x, len = %d\n", cline, magic, MAGIC_MEM, len); return; } if (unlikely(line < 0 || line >= BRICK_DEBUG_MEM)) { BRICK_ERR("line %d memory corruption: alloc line = %d, len = %d\n", cline, line, len); return; } INT_ACCESS(test, 0) = 0xffffffff; magic = INT_ACCESS(data, len); if (unlikely(magic != MAGIC_END)) { BRICK_ERR("line %d memory corruption: magix %08x != %08x, len = %d\n", cline, magic, MAGIC_END, len); return; } INT_ACCESS(data, len) = 0xffffffff; atomic_dec(&mem_count[line]); atomic_inc(&mem_free[line]); #else void *test = data - 1 * sizeof(int); int len = INT_ACCESS(test, 0 * sizeof(int)); #endif data = test; __brick_mem_free(data, len + PLUS_SIZE); } } EXPORT_SYMBOL_GPL(_brick_mem_free); ///////////////////////////////////////////////////////////////////////// // string memory allocation #ifdef BRICK_DEBUG_MEM static atomic_t string_count[BRICK_DEBUG_MEM] = {}; static atomic_t string_free[BRICK_DEBUG_MEM] = {}; #endif char *_brick_string_alloc(int len, int line) { char *res; #ifdef CONFIG_DEBUG_KERNEL might_sleep(); #endif if (len <= 0) { len = BRICK_STRING_LEN; } #ifdef BRICK_DEBUG_MEM len += sizeof(int) * 4; #endif #ifdef CONFIG_MARS_MEM_RETRY for (;;) { #endif #ifdef CONFIG_DEBUG_KERNEL res = kzalloc(len + 1024, GFP_BRICK); #else res = kzalloc(len, GFP_BRICK); #endif #ifdef CONFIG_MARS_MEM_RETRY if (likely(res)) break; msleep(1000); } #endif #ifdef BRICK_DEBUG_MEM if (likely(res)) { if (unlikely(line < 0)) line = 0; else if (unlikely(line >= BRICK_DEBUG_MEM)) line = BRICK_DEBUG_MEM - 1; INT_ACCESS(res, 0) = MAGIC_STR; INT_ACCESS(res, sizeof(int)) = len; INT_ACCESS(res, sizeof(int) * 2) = line; INT_ACCESS(res, len - sizeof(int)) = MAGIC_END; atomic_inc(&string_count[line]); res += sizeof(int) * 3; } #endif return res; } EXPORT_SYMBOL_GPL(_brick_string_alloc); void _brick_string_free(const char *data, int cline) { if (data) { #ifdef BRICK_DEBUG_MEM int magic; int len; int line; data -= sizeof(int) * 3; magic = INT_ACCESS(data, 0); if (unlikely(magic != MAGIC_STR)) { BRICK_ERR("cline %d stringmem corruption: magix %08x != %08x\n", cline, magic, MAGIC_STR); return; } len = INT_ACCESS(data, sizeof(int)); line = INT_ACCESS(data, sizeof(int) * 2); if (unlikely(line < 0 || line >= BRICK_DEBUG_MEM)) { BRICK_ERR("cline %d stringmem corruption: line = %d (len = %d)\n", cline, line, len); return; } magic = INT_ACCESS(data, len - sizeof(int)); if (unlikely(magic != MAGIC_END)) { BRICK_ERR("cline %d stringmem corruption: end_magix %08x != %08x, line = %d len = %d\n", cline, magic, MAGIC_END, len, line); return; } INT_ACCESS(data, len - sizeof(int)) = 0xffffffff; atomic_dec(&string_count[line]); atomic_inc(&string_free[line]); #endif kfree(data); } } EXPORT_SYMBOL_GPL(_brick_string_free); ///////////////////////////////////////////////////////////////////////// // block memory allocation static int len2order(int len) { int order = 0; while ((PAGE_SIZE << order) < len) order++; if (unlikely(order > BRICK_MAX_ORDER || len <= 0)) { BRICK_ERR("trying to allocate %d bytes (max = %d)\n", len, (int)(PAGE_SIZE << order)); return -1; } return order; } #ifdef BRICK_DEBUG_MEM // indexed by line static atomic_t block_count[BRICK_DEBUG_MEM] = {}; static atomic_t block_free[BRICK_DEBUG_MEM] = {}; static int block_len[BRICK_DEBUG_MEM] = {}; // indexed by order static atomic_t op_count[BRICK_MAX_ORDER+1] = {}; static atomic_t raw_count[BRICK_MAX_ORDER+1] = {}; static atomic_t alloc_count[BRICK_MAX_ORDER+1] = {}; static int alloc_max[BRICK_MAX_ORDER+1] = {}; static int alloc_line[BRICK_MAX_ORDER+1] = {}; #endif static inline void *__brick_block_alloc(gfp_t gfp, int order) { void *res; #ifdef BRICK_DEBUG_MEM atomic_inc(&raw_count[order]); #endif #ifdef CONFIG_MARS_MEM_RETRY for (;;) { #endif #ifdef USE_KERNEL_PAGES res = (void*)__get_free_pages(gfp, order); #else res = __vmalloc(PAGE_SIZE << order, gfp, PAGE_KERNEL_IO); #endif #ifdef CONFIG_MARS_MEM_RETRY if (likely(res)) break; msleep(1000); } #endif return res; } static inline void __brick_block_free(void *data, int order) { #ifdef USE_KERNEL_PAGES __free_pages(virt_to_page((unsigned long)data), order); #else vfree(data); #endif #ifdef BRICK_DEBUG_MEM atomic_dec(&raw_count[order]); #endif } bool brick_allow_freelist = true; EXPORT_SYMBOL_GPL(brick_allow_freelist); #ifdef CONFIG_MARS_MEM_PREALLOC /* Note: we have no separate lists per CPU. * This should not hurt because the freelists are only used * for higher-order pages which should be rather low-frequency. */ static spinlock_t freelist_lock[BRICK_MAX_ORDER+1]; static void *brick_freelist[BRICK_MAX_ORDER+1] = {}; static atomic_t freelist_count[BRICK_MAX_ORDER+1] = {}; static int freelist_max[BRICK_MAX_ORDER+1] = {}; static void *_get_free(int order) { void *data; unsigned long flags; traced_lock(&freelist_lock[order], flags); data = brick_freelist[order]; if (likely(data)) { void *next = *(void**)data; #ifdef BRICK_DEBUG_MEM // check for corruptions void *copy = *(((void**)data)+1); if (unlikely(next != copy)) { // found a corruption // prevent further trouble by leaving a memleak brick_freelist[order] = NULL; traced_unlock(&freelist_lock[order], flags); BRICK_ERR("freelist corruption at %p (next %p != %p, murdered = %d), order = %d\n", data, next, copy, atomic_read(&freelist_count[order]), order); return NULL; } #endif brick_freelist[order] = next; atomic_dec(&freelist_count[order]); } traced_unlock(&freelist_lock[order], flags); return data; } static void _put_free(void *data, int order) { void *next; unsigned long flags; traced_lock(&freelist_lock[order], flags); next = brick_freelist[order]; *(void**)data = next; #ifdef BRICK_DEBUG_MEM // insert redundant copy for checking *(((void**)data)+1) = next; #endif brick_freelist[order] = data; traced_unlock(&freelist_lock[order], flags); atomic_inc(&freelist_count[order]); } static void _free_all(void) { int order; for (order = BRICK_MAX_ORDER; order >= 0; order--) { for (;;) { void *data = _get_free(order); if (!data) break; __brick_block_free(data, order); } } } int brick_mem_reserve(struct mem_reservation *r) { int order; int status = 0; for (order = BRICK_MAX_ORDER; order >= 0; order--) { int max = r->amount[order]; int i; freelist_max[order] += max; BRICK_INF("preallocating %d at order %d (new maxlevel = %d)\n", max, order, freelist_max[order]); max = freelist_max[order] - atomic_read(&freelist_count[order]); if (max >= 0) { for (i = 0; i < max; i++) { void *data = __brick_block_alloc(GFP_KERNEL, order); if (likely(data)) { _put_free(data, order); } else { status = -ENOMEM; } } } else { for (i = 0; i < -max; i++) { void *data = _get_free(order); if (likely(data)) { __brick_block_free(data, order); } } } } return status; } #else int brick_mem_reserve(struct mem_reservation *r) { BRICK_INF("preallocation is not compiled in\n"); return 0; } #endif EXPORT_SYMBOL_GPL(brick_mem_reserve); void *_brick_block_alloc(loff_t pos, int len, int line) { void *data; #ifdef BRICK_DEBUG_MEM int count; const int plus = len <= PAGE_SIZE ? 0 : PAGE_SIZE * 2; #else const int plus = 0; #endif int order = len2order(len + plus); if (unlikely(order < 0)) { BRICK_ERR("trying to allocate %d bytes (max = %d)\n", len, (int)(PAGE_SIZE << order)); return NULL; } #ifdef CONFIG_DEBUG_KERNEL might_sleep(); #endif #ifdef BRICK_DEBUG_MEM atomic_inc(&op_count[order]); atomic_inc(&alloc_count[order]); count = atomic_read(&alloc_count[order]); // statistics alloc_line[order] = line; if (count > alloc_max[order]) alloc_max[order] = count; /* Dynamic increase of limits, in order to reduce * fragmentation on higher-order pages. * This comes on cost of higher memory usage. */ #if defined(ALLOW_DYNAMIC_RAISE) && defined(CONFIG_MARS_MEM_PREALLOC) if (order > 0 && count > freelist_max[order] && count <= ALLOW_DYNAMIC_RAISE) freelist_max[order] = count; #endif #endif #ifdef CONFIG_MARS_MEM_PREALLOC data = _get_free(order); if (!data) #endif data = __brick_block_alloc(GFP_BRICK, order); #ifdef BRICK_DEBUG_MEM if (likely(data) && order > 0) { if (unlikely(line < 0)) line = 0; else if (unlikely(line >= BRICK_DEBUG_MEM)) line = BRICK_DEBUG_MEM - 1; atomic_inc(&block_count[line]); block_len[line] = len; INT_ACCESS(data, 0) = MAGIC_BLOCK; INT_ACCESS(data, sizeof(int)) = line; INT_ACCESS(data, sizeof(int) * 2) = len; data += PAGE_SIZE; INT_ACCESS(data, len) = MAGIC_BEND; } #endif return data; } EXPORT_SYMBOL_GPL(_brick_block_alloc); void _brick_block_free(void *data, int len, int cline) { int order; #ifdef BRICK_DEBUG_MEM const int plus = len <= PAGE_SIZE ? 0 : PAGE_SIZE * 2; #else const int plus = 0; #endif if (!data) { return; } order = len2order(len + plus); #ifdef BRICK_DEBUG_MEM if (order > 0) { void *test = data - PAGE_SIZE; int magic = INT_ACCESS(test, 0); int line = INT_ACCESS(test, sizeof(int)); int oldlen = INT_ACCESS(test, sizeof(int)*2); int magic2; if (unlikely(magic != MAGIC_BLOCK)) { BRICK_ERR("line %d memory corruption: magix %08x != %08x\n", cline, magic, MAGIC_BLOCK); return; } if (unlikely(line < 0 || line >= BRICK_DEBUG_MEM)) { BRICK_ERR("line %d memory corruption: alloc line = %d\n", cline, line); return; } if (unlikely(oldlen != len)) { BRICK_ERR("line %d memory corruption: len != oldlen (%d != %d)\n", cline, len, oldlen); return; } magic2 = INT_ACCESS(data, len); if (unlikely(magic2 != MAGIC_BEND)) { BRICK_ERR("line %d memory corruption: magix %08x != %08x\n", cline, magic, MAGIC_BEND); return; } INT_ACCESS(test, 0) = 0xffffffff; INT_ACCESS(data, len) = 0xffffffff; atomic_dec(&block_count[line]); atomic_inc(&block_free[line]); data = test; } #endif #ifdef CONFIG_MARS_MEM_PREALLOC if (order > 0 && brick_allow_freelist && atomic_read(&freelist_count[order]) <= freelist_max[order]) { _put_free(data, order); } else #endif __brick_block_free(data, order); #ifdef BRICK_DEBUG_MEM atomic_dec(&alloc_count[order]); #endif } EXPORT_SYMBOL_GPL(_brick_block_free); struct page *brick_iomap(void *data, int *offset, int *len) { int _offset = ((unsigned long)data) & (PAGE_SIZE-1); struct page *page; *offset = _offset; if (*len > PAGE_SIZE - _offset) { *len = PAGE_SIZE - _offset; } if (is_vmalloc_addr(data)) { page = vmalloc_to_page(data); } else { page = virt_to_page(data); } return page; } EXPORT_SYMBOL_GPL(brick_iomap); ///////////////////////////////////////////////////////////////////////// // module void brick_mem_statistics(void) { #ifdef BRICK_DEBUG_MEM int i; int count = 0; int places = 0; BRICK_INF("======== page allocation:\n"); #ifdef CONFIG_MARS_MEM_PREALLOC for (i = 0; i <= BRICK_MAX_ORDER; i++) { BRICK_INF("pages order = %2d operations = %9d freelist_count = %4d / %3d raw_count = %5d alloc_count = %5d line = %5d max_count = %5d\n", i, atomic_read(&op_count[i]), atomic_read(&freelist_count[i]), freelist_max[i], atomic_read(&raw_count[i]), atomic_read(&alloc_count[i]), alloc_line[i], alloc_max[i]); } #endif for (i = 0; i < BRICK_DEBUG_MEM; i++) { int val = atomic_read(&block_count[i]); if (val) { count += val; places++; BRICK_INF("line %4d: %6d allocated (last size = %4d, freed = %6d)\n", i, val, block_len[i], atomic_read(&block_free[i])); } } BRICK_INF("======== %d block allocations in %d places\n", count, places); count = places = 0; for (i = 0; i < BRICK_DEBUG_MEM; i++) { int val = atomic_read(&mem_count[i]); if (val) { count += val; places++; BRICK_INF("line %4d: %6d allocated (last size = %4d, freed = %6d)\n", i, val, mem_len[i], atomic_read(&mem_free[i])); } } BRICK_INF("======== %d memory allocations in %d places\n", count, places); count = places = 0; for (i = 0; i < BRICK_DEBUG_MEM; i++) { int val = atomic_read(&string_count[i]); if (val) { count += val; places++; BRICK_INF("line %4d: %6d allocated (freed = %6d)\n", i, val, atomic_read(&string_free[i])); } } BRICK_INF("======== %d string allocations in %d places\n", count, places); #endif } EXPORT_SYMBOL_GPL(brick_mem_statistics); // module init stuff #ifdef BUMP_LIMITS // quirk: bump the memory reserve limits. extern int min_free_kbytes; static int old_free_kbytes = 0; #endif int __init init_brick_mem(void) { #ifdef CONFIG_MARS_MEM_PREALLOC int i; for (i = BRICK_MAX_ORDER; i >= 0; i--) { spin_lock_init(&freelist_lock[i]); } #endif #ifdef LIMIT_MEM // provisionary brick_global_memlimit = total_swapcache_pages * (PAGE_SIZE / 4); BRICK_INF("brick_global_memlimit = %lld\n", brick_global_memlimit); #endif #ifdef BUMP_LIMITS // quirk: bump the memory reserve limits. TODO: determine right values. old_free_kbytes = min_free_kbytes; min_free_kbytes *= 4; setup_per_zone_wmarks(); #endif return 0; } void __exit exit_brick_mem(void) { #ifdef CONFIG_MARS_MEM_PREALLOC _free_all(); #endif #ifdef BUMP_LIMITS // quirk: bump the memory reserve limits. min_free_kbytes = old_free_kbytes; setup_per_zone_wmarks(); #endif brick_mem_statistics(); } #ifndef CONFIG_MARS_HAVE_BIGMODULE MODULE_DESCRIPTION("generic brick infrastructure"); MODULE_AUTHOR("Thomas Schoebel-Theuer "); MODULE_LICENSE("GPL"); module_init(init_brick_mem); module_exit(exit_brick_mem); #endif