New configure flags to set the size and alignment of tcmalloc pages

Added two new configure flags, --with-tcmalloc-pagesize and
--with-tcmalloc-alignment, to set the tcmalloc internal page size and
alignment without requiring a compile-time directive, and to make the
choice of page size independent of the alignment.

Raphael Moreira Zinsly 2014-12-11 17:04:52 -02:00
parent 1ecc068be9
commit 0d9b42839b
3 changed files with 65 additions and 28 deletions

INSTALL

@@ -102,19 +102,19 @@ cost of using more space (due to internal fragmentation).
 Internally, tcmalloc divides its memory into "pages."  The default
 page size is chosen to minimize memory use by reducing fragmentation.
 The cost is that keeping track of these pages can cost tcmalloc time.
-We've added a new, experimental flag to tcmalloc that enables a larger
-page size.  In general, this will increase the memory needs of
-applications using tcmalloc.  However, in many cases it will speed up
-the applications as well, particularly if they allocate and free a lot
-of memory.  We've seen average speedups of 3-5% on Google
-applications.
+We've added a new flag to tcmalloc that enables a larger page size.
+In general, this will increase the memory needs of applications using
+tcmalloc.  However, in many cases it will speed up the applications
+as well, particularly if they allocate and free a lot of memory.  We've
+seen average speedups of 3-5% on Google applications.
 
-This feature is still very experimental; it's not even a configure
-flag yet.  To build libtcmalloc with large pages, run
-   ./configure <normal flags> CXXFLAGS=-DTCMALLOC_LARGE_PAGES
-(or add -DTCMALLOC_LARGE_PAGES to your existing CXXFLAGS argument).
+To build libtcmalloc with large pages you need to use the
+--with-tcmalloc-pagesize=ARG configure flag, e.g.:
+   ./configure <other flags> --with-tcmalloc-pagesize=32
+The ARG argument can be 8, 32 or 64, which sets the internal page size to
+8K, 32K or 64K respectively.  The default is 8K.
 
 *** SMALL TCMALLOC CACHES: TRADING SPACE FOR TIME
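
For reference, here is a small standalone sketch (not part of the patch) of the
arithmetic behind the three accepted page-size values. The kPageShift values
13, 15 and 16 are the ones introduced in src/common.h further down; the helper
PageShiftFor and the printing are purely illustrative.

    // Illustrative only: maps the --with-tcmalloc-pagesize argument (8, 32, 64)
    // to the kPageShift values used in src/common.h and prints the resulting
    // internal page size.  Not part of the patch.
    #include <cstddef>
    #include <cstdio>

    static size_t PageShiftFor(int pagesize_kb) {
      switch (pagesize_kb) {
        case 32: return 15;   // TCMALLOC_32K_PAGES -> 1 << 15 = 32K
        case 64: return 16;   // TCMALLOC_64K_PAGES -> 1 << 16 = 64K
        default: return 13;   // default            -> 1 << 13 = 8K
      }
    }

    int main() {
      const int options[] = {8, 32, 64};
      for (int kb : options) {
        const size_t shift = PageShiftFor(kb);
        std::printf("--with-tcmalloc-pagesize=%d -> kPageShift=%zu, page size=%zu bytes\n",
                    kb, shift, static_cast<size_t>(1) << shift);
      }
      return 0;
    }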

configure.ac

@@ -22,7 +22,6 @@ AM_INIT_AUTOMAKE([dist-zip])
 AC_CONFIG_HEADERS([src/config.h])
 AM_MAINTAINER_MODE()
 # Export the version information (for tc_version and friends)
 TC_VERSION_MAJOR=`expr "$PACKAGE_VERSION" : '\([[0-9]]*\)'`
 TC_VERSION_MINOR=`expr "$PACKAGE_VERSION" : '[[0-9]]*\.\([[0-9]]*\)'`
@@ -42,6 +41,8 @@ default_enable_heap_profiler=yes
 default_enable_heap_checker=yes
 default_enable_debugalloc=yes
 default_enable_minimal=no
+default_tcmalloc_pagesize=8
+default_tcmalloc_alignment=16
 need_nanosleep=yes   # Used later, to decide if to run ACX_NANOSLEEP
 case "$host" in
   *-mingw*) default_enable_minimal=yes; default_enable_debugalloc=no;
@@ -95,6 +96,40 @@ AC_ARG_ENABLE([libunwind],
                             [enable libunwind linking])],
               [],
               [enable_libunwind="$default_enable_libunwind"])
+AC_ARG_WITH([tcmalloc-pagesize],
+            [AS_HELP_STRING([--with-tcmalloc-pagesize],
+                            [Set the tcmalloc internal page size to 8K, 32K or 64K])],
+            [],
+            [with_tcmalloc_pagesize=$default_tcmalloc_pagesize])
+AC_ARG_WITH([tcmalloc-alignment],
+            [AS_HELP_STRING([--with-tcmalloc-alignment],
+                            [Set the tcmalloc internal page alignment to 8 or 16 bytes])],
+            [],
+            [with_tcmalloc_alignment=$default_tcmalloc_alignment])
+case "$with_tcmalloc_pagesize" in
+  8)
+     # Default tcmalloc page size.
+     ;;
+  32)
+     AC_DEFINE(TCMALLOC_32K_PAGES, 1,
+               [Define 32K of internal pages size for tcmalloc]);;
+  64)
+     AC_DEFINE(TCMALLOC_64K_PAGES, 1,
+               [Define 64K of internal pages size for tcmalloc]);;
+  *)
+     AC_MSG_WARN([${with_tcmalloc_pagesize}K size not supported, using default tcmalloc page size.])
+esac
+case "$with_tcmalloc_alignment" in
+  8)
+     AC_DEFINE(TCMALLOC_ALIGN_8BYTES, 1,
+               [Define 8 bytes of internal pages alignment for tcmalloc]);;
+  16)
+     # Default tcmalloc page alignment.
+     ;;
+  *)
+     AC_MSG_WARN([${with_tcmalloc_alignment} bytes not supported, using default tcmalloc page alignment.])
+esac
 
 # Checks for programs.
 AC_PROG_CXX
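
To make the alignment trade-off concrete, here is an illustrative sketch (not
the actual tcmalloc size-class logic) of how a small request is rounded up
under the two accepted --with-tcmalloc-alignment values; the 25% space-waste
figure mentioned in the src/common.h comment below corresponds to the 24-byte
case. The RoundUp helper is an assumption for illustration only.

    // Illustrative only: shows how a small allocation request is rounded up to
    // a multiple of the minimum alignment (kMinAlign is 16 by default, 8 with
    // --with-tcmalloc-alignment=8).  Not the actual size-class logic.
    #include <cstddef>
    #include <cstdio>

    static size_t RoundUp(size_t bytes, size_t align) {
      return (bytes + align - 1) & ~(align - 1);
    }

    int main() {
      const size_t request = 24;
      std::printf("16-byte alignment: %zu -> %zu bytes (%.0f%% overhead)\n",
                  request, RoundUp(request, 16),
                  100.0 * (RoundUp(request, 16) - request) / RoundUp(request, 16));
      std::printf(" 8-byte alignment: %zu -> %zu bytes (no rounding waste)\n",
                  request, RoundUp(request, 8));
      return 0;
    }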

src/common.h

@@ -62,6 +62,19 @@ typedef uintptr_t Length;
 // Configuration
 //-------------------------------------------------------------------
 
+#if defined(TCMALLOC_ALIGN_8BYTES)
+// Unless we force 8-byte alignment we use an alignment of at least
+// 16 bytes to satisfy requirements for some SSE types.
+// Keep in mind that with 16-byte alignment you can waste up to 25% of
+// the space due to alignment (e.g. a malloc of 24 bytes will get 32 bytes).
+static const size_t kMinAlign = 8;
+// Number of size classes created until reaching 128 bytes.
+static const size_t kBaseClasses = 16;
+#else
+static const size_t kMinAlign = 16;
+static const size_t kBaseClasses = 9;
+#endif
+
 // Using large pages speeds up the execution at a cost of larger memory use.
 // Deallocation may speed up by a factor as the page map gets 8x smaller, so
 // lookups in the page map result in fewer L2 cache misses, which translates to
@@ -70,28 +83,17 @@ typedef uintptr_t Length;
 // the thread cache allowance to avoid passing more free ranges to and from
 // central lists.  Also, larger pages are less likely to get freed.
 // These two factors cause a bounded increase in memory use.
-#if defined(TCMALLOC_LARGE_PAGES)
+#if defined(TCMALLOC_32K_PAGES)
 static const size_t kPageShift = 15;
-static const size_t kNumClasses = 78;
-static const size_t kMinAlign = 16;
-#elif defined(TCMALLOC_LARGE_PAGES64K)
+static const size_t kNumClasses = kBaseClasses + 69;
+#elif defined(TCMALLOC_64K_PAGES)
 static const size_t kPageShift = 16;
-static const size_t kNumClasses = 82;
-static const size_t kMinAlign = 16;
-#elif defined(TCMALLOC_ALIGN_8BYTES)
-static const size_t kPageShift = 13;
-static const size_t kNumClasses = 95;
-// Unless we force to use 8 bytes alignment we use an alignment of
-// at least 16 bytes to statisfy requirements for some SSE types.
-// Keep in mind when using the 16 bytes alignment you can have a space
-// waste due alignment of 25%. (eg malloc of 24 bytes will get 32 bytes)
-static const size_t kMinAlign = 8;
+static const size_t kNumClasses = kBaseClasses + 73;
 #else
 static const size_t kPageShift = 13;
-static const size_t kNumClasses = 88;
-static const size_t kMinAlign = 16;
+static const size_t kNumClasses = kBaseClasses + 79;
 #endif
 
 static const size_t kMaxThreadCacheSize = 4 << 20;
 static const size_t kPageSize = 1 << kPageShift;
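
As a sanity check on the refactoring above, the new kBaseClasses + N
expressions reproduce the previously hard-coded class counts (88, 78, 82 and
95). The following standalone sketch is not part of the patch; the Config
table is ours, but the numbers come straight from the diff.

    // Illustrative only: reproduces the kNumClasses arithmetic from
    // src/common.h and checks that the kBaseClasses rewrite matches the old
    // hard-coded values (88, 78, 82 and 95).
    #include <cstddef>

    struct Config {
      size_t base_classes;     // 9 with 16-byte alignment, 16 with 8-byte alignment
      size_t extra_classes;    // 79 for 8K pages, 69 for 32K, 73 for 64K
      size_t old_num_classes;  // value hard-coded before this patch
    };

    constexpr Config kConfigs[] = {
      {9, 79, 88},   // default: 8K pages, 16-byte alignment
      {9, 69, 78},   // TCMALLOC_32K_PAGES
      {9, 73, 82},   // TCMALLOC_64K_PAGES
      {16, 79, 95},  // TCMALLOC_ALIGN_8BYTES with default 8K pages
    };

    static_assert(kConfigs[0].base_classes + kConfigs[0].extra_classes == kConfigs[0].old_num_classes, "8K default");
    static_assert(kConfigs[1].base_classes + kConfigs[1].extra_classes == kConfigs[1].old_num_classes, "32K pages");
    static_assert(kConfigs[2].base_classes + kConfigs[2].extra_classes == kConfigs[2].old_num_classes, "64K pages");
    static_assert(kConfigs[3].base_classes + kConfigs[3].extra_classes == kConfigs[3].old_num_classes, "8-byte alignment");

    int main() { return 0; }

With 16-byte alignment there are 9 size classes up to 128 bytes; with 8-byte
alignment there are 16, which is where the two kBaseClasses values come from.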