re-organize docs and convert htmls to asciidoc

Aliaksei Kandratsenka 2024-12-06 15:39:28 -05:00
parent ddfa828b3f
commit 023376d651
44 changed files with 1613 additions and 3029 deletions

.gitignore

@@ -38,6 +38,7 @@
/debugallocation_test
/debugallocation_test.sh
/depcomp
/docs/*.html
/frag_unittest
/frag_unittest.exe
/function_ref_test

Makefile.am

@@ -99,9 +99,6 @@ EXTRA_INSTALL =
## vvvv RULES TO MAKE THE LIBRARIES, BINARIES, AND UNITTESTS
dist_doc_DATA += docs/index.html docs/designstyle.css
### ------- various support library routines
# Having set of common helpers helps with unit testing various "guts"
@@ -223,9 +220,6 @@ check_address_test_SOURCES = src/tests/check_address_test.cc
check_address_test_CPPFLAGS = $(gtest_CPPFLAGS)
check_address_test_LDADD = libcommon.la libgtest.la
### Documentation
dist_doc_DATA +=
endif WITH_STACK_TRACE
### ------- tcmalloc_minimal (thread-caching malloc)
@@ -436,43 +430,19 @@ min_per_thread_cache_size_test_CPPFLAGS = $(gtest_CPPFLAGS)
min_per_thread_cache_size_test_LDADD = libtcmalloc_minimal.la libgtest.la
### Documentation
dist_doc_DATA += docs/tcmalloc.html \
docs/overview.gif \
docs/pageheap.gif \
docs/spanmap.gif \
docs/threadheap.gif \
docs/t-test1.times.txt \
docs/tcmalloc-opspercpusec.vs.threads.1024.bytes.png \
docs/tcmalloc-opspercpusec.vs.threads.128.bytes.png \
docs/tcmalloc-opspercpusec.vs.threads.131072.bytes.png \
docs/tcmalloc-opspercpusec.vs.threads.16384.bytes.png \
docs/tcmalloc-opspercpusec.vs.threads.2048.bytes.png \
docs/tcmalloc-opspercpusec.vs.threads.256.bytes.png \
docs/tcmalloc-opspercpusec.vs.threads.32768.bytes.png \
docs/tcmalloc-opspercpusec.vs.threads.4096.bytes.png \
docs/tcmalloc-opspercpusec.vs.threads.512.bytes.png \
docs/tcmalloc-opspercpusec.vs.threads.64.bytes.png \
docs/tcmalloc-opspercpusec.vs.threads.65536.bytes.png \
docs/tcmalloc-opspercpusec.vs.threads.8192.bytes.png \
docs/tcmalloc-opspersec.vs.size.1.threads.png \
docs/tcmalloc-opspersec.vs.size.12.threads.png \
docs/tcmalloc-opspersec.vs.size.16.threads.png \
docs/tcmalloc-opspersec.vs.size.2.threads.png \
docs/tcmalloc-opspersec.vs.size.20.threads.png \
docs/tcmalloc-opspersec.vs.size.3.threads.png \
docs/tcmalloc-opspersec.vs.size.4.threads.png \
docs/tcmalloc-opspersec.vs.size.5.threads.png \
docs/tcmalloc-opspersec.vs.size.8.threads.png
dist_doc_DATA += $(top_srcdir)/docs/*adoc $(top_srcdir)/docs/*gif $(top_srcdir)/docs/*png $(top_srcdir)/docs/dots/*dot
# I don't know how to say "distribute the .dot files but don't install them";
# noinst doesn't seem to work with data. I separate them out anyway, in case
# one day we figure it out. Regardless, installing the dot files isn't the
# end of the world.
dist_doc_DATA += docs/overview.dot \
docs/pageheap.dot \
docs/spanmap.dot \
docs/threadheap.dot
gperftools_HTMLDOCS = docs/tcmalloc.html docs/heapprofile.html \
docs/cpuprofile.html docs/cpuprofile-fileformat.html \
docs/pprof_integration.html
if !MISSING_ASCIIDOCTOR
doc_DATA = $(gperftools_HTMLDOCS)
MOSTLYCLEANFILES = $(gperftools_HTMLDOCS)
.adoc.html:
$(ASCIIDOCTOR) $(ASCIIDOCTOR_FLAGS) -o $@ $<
endif !MISSING_ASCIIDOCTOR
### ------- tcmalloc_minimal_debug (thread-caching malloc with debugallocation)
@@ -715,11 +685,6 @@ endif !SKIP_PPROF_TESTS
endif WITH_HEAP_PROFILER
### Documentation (above and beyond tcmalloc_minimal documentation)
if WITH_HEAP_PROFILER
dist_doc_DATA += docs/heapprofile.html docs/heap-example1.png
endif WITH_HEAP_PROFILER
### ------- tcmalloc with debugallocation
if WITH_DEBUGALLOC
@@ -837,14 +802,6 @@ profiler4_unittest_LDADD = -lstacktrace -lprofiler
profiler4_unittest_DEPENDENCIES = libprofiler.la
endif !SKIP_PPROF_TESTS
### Documentation
dist_doc_DATA += docs/cpuprofile.html \
docs/cpuprofile-fileformat.html \
docs/pprof-test-big.gif \
docs/pprof-test.gif \
docs/pprof-vsnprintf-big.gif \
docs/pprof-vsnprintf.gif
endif WITH_CPU_PROFILER
@@ -954,7 +911,7 @@ $(top_distdir)/ChangeLog:
EXTRA_DIST = $(SCRIPTS) \
src/windows/get_mangled_names.cc src/windows/override_functions.cc \
src/windows/CMakeLists.txt \
docs/pprof.see_also $(WINDOWS_EXTRA) \
$(WINDOWS_EXTRA) \
gperftools.sln vsprojects vendor \
$(top_srcdir)/src/*h $(top_srcdir)/src/base/*h \
$(top_srcdir)/benchmark/*h \

configure.ac

@@ -519,6 +519,12 @@ AM_CONDITIONAL(SKIP_PPROF_TESTS, [test "x$PPROF_PATH" = "x"])
AS_IF([test "x$PPROF_PATH" = "x"],
[AC_MSG_WARN([pprof tool not found. Will skip several unit tests that need it. Install via go install github.com/google/pprof@latest then add \$HOME/go/bin to PATH])])
AC_PATH_PROG([ASCIIDOCTOR], [asciidoctor])
AM_CONDITIONAL([MISSING_ASCIIDOCTOR], [test "x$ASCIIDOCTOR" = "x"])
AS_IF([test "x$ASCIIDOCTOR" = "x"],
[AC_MSG_WARN([asciidoctor tool not found. Will skip building .html documentation from .adoc])])
AC_ARG_VAR(ASCIIDOCTOR_FLAGS, [flags to pass to asciidoctor])
# Write generated configuration file
AC_CONFIG_FILES([Makefile])
AC_OUTPUT

docs/cpuprofile-fileformat.adoc (new file)

@@ -0,0 +1,161 @@
= Gperftools CPU Profiler Binary Data File Format
:reproducible:
[.normal]
This file documents the binary data file format produced by the
gperftools CPU Profiler. It is one of the "legacy" formats supported by
the pprof tool. For information about using the CPU Profiler, see
link:cpuprofile.html[its user guide].
The profiler source code, which generates files using this format, is at
`src/profiler.cc`.
== CPU Profile Data File Structure
CPU profile data files each consist of four parts, in order:
* Binary header
* Binary profile records
* Binary trailer
* Text list of mapped objects
The binary data is expressed in terms of "slots." These are words large
enough to hold the program's pointer type, i.e., for 32-bit programs
they are 4 bytes in size, and for 64-bit programs they are 8 bytes. They
are stored in the profile data file in the native byte order (i.e.,
little-endian for x86 and x86_64).
== Binary Header
The binary header format is shown below. Values written by the profiler,
along with requirements currently enforced by the analysis tools, are
shown in parentheses.
[cols=",",options="header",]
|===
|slot |data
|0 |header count (0; must be 0)
|1 |header slots after this one (3; must be >= 3)
|2 |format version (0; must be 0)
|3 |sampling period, in microseconds
|4 |padding (0)
|===
The headers currently generated for 32-bit and 64-bit little-endian (x86
and x86_64) profiles are shown below, for comparison.
[cols=",,,,,",options="header",]
|===
| |hdr count |hdr words |version |sampling period |pad
|32-bit or 64-bit (slots) |0 |3 |0 |10000 |0
|32-bit (4-byte words in file) |`0x00000` |`0x00003` |`0x00000`
|`0x02710` |`0x00000`
|64-bit LE (4-byte words in file) |`0x00000 0x00000`
|`0x00003 0x00000` |`0x00000 0x00000` |`0x02710 0x00000`
|`0x00000 0x00000`
|===
The contents are shown in terms of slots, and in terms of 4-byte words
in the profile data file. The slot contents for 32-bit and 64-bit
headers are identical. For 32-bit profiles, the 4-byte word view matches
the slot view. For 64-bit profiles, each (8-byte) slot is shown as two
4-byte words, ordered as they would appear in the file.
The profiling tools examine the contents of the file and use the
expected locations and values of the header words field to detect
whether the file is 32-bit or 64-bit.
== Binary Profile Records
The binary profile record format is shown below.
[cols=2*]
|===
|slot
|data
|0
|sample count, must be >= 1
|1
|number of call chain PCs (num_pcs), must be >= 1
|2 .. (num_pcs + 1)
|call chain PCs, most-recently-called function first.
|===
The total length of a given record is 2 + num_pcs.
Note that the profiler can emit multiple profile records having an
identical call chain. In that case, analysis tools should sum the
counts of all records having identical call chains.
*Note:* Some profile analysis tools terminate if they see _any_ profile
record with a call chain with its first entry having the address 0.
(This is similar to the binary trailer.)
=== Example
This example shows the slots contained in a sample profile record.
[cols=",,,,",]
|===
|5 |3 |0xa0000 |0xc0000 |0xe0000
|===
In this example, 5 ticks were received at PC 0xa0000, whose function had
been called by the function containing 0xc0000, which had been called
from the function containing 0xe0000.
== Binary Trailer
The binary trailer consists of three slots of data with fixed values,
shown below.
[cols=",",options="header",]
|===
|slot |value
|0 |0
|1 |1
|2 |0
|===
Note that this is the same data that would be contained in a profile record
with sample count = 0, num_pcs = 1, and a one-element call chain
containing the address 0.
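To make the slot layout above concrete, here is a minimal sketch of a
reader for a 64-bit little-endian profile. It is not the official
consumer of this format (that is the pprof tool); the error handling and
output below are illustrative only.
....
// Hedged sketch: walks the header, profile records, and trailer of a
// 64-bit little-endian CPU profile as described above. Not the official
// parser; output and error handling are for illustration only.
#include <cstdint>
#include <cstdio>
#include <vector>

int main(int argc, char** argv) {
  if (argc < 2) { std::fprintf(stderr, "usage: %s <profile>\n", argv[0]); return 1; }
  std::FILE* f = std::fopen(argv[1], "rb");
  if (!f) { std::perror("fopen"); return 1; }
  auto read_slot = [&](uint64_t* out) {           // one 8-byte slot
    return std::fread(out, sizeof(*out), 1, f) == 1;
  };
  uint64_t hdr[5];                                // header is 5 slots
  for (uint64_t& s : hdr) if (!read_slot(&s)) return 1;
  // hdr[0]=0, hdr[1]>=3, hdr[2]=0, hdr[3]=sampling period (us), hdr[4]=0
  std::printf("sampling period: %llu us\n", (unsigned long long)hdr[3]);
  for (;;) {
    uint64_t count, num_pcs;
    if (!read_slot(&count) || !read_slot(&num_pcs)) break;
    if (count == 0 && num_pcs == 1) break;        // binary trailer reached
    std::vector<uint64_t> pcs(num_pcs);
    for (uint64_t& pc : pcs) if (!read_slot(&pc)) return 1;
    std::printf("%llu samples, leaf PC 0x%llx\n",
                (unsigned long long)count, (unsigned long long)pcs[0]);
  }
  std::fclose(f);                                 // mapped-object text follows
  return 0;
}
....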
== Text List of Mapped Objects
The binary data in the file is followed immediately by a list of mapped
objects. This list consists of lines of text separated by newline
characters.
Each line describes one mapping as produced by SaveProcSelfMaps. For
example:
....
40000000-40015000 r-xp 00000000 03:01 12845071 /lib/ld-2.3.2.so
....
The first address must start at the beginning of the line. This is
essentially the same format as Linux's `/proc/<pid>/maps` file.
Recent Linux systems have this format documented in
link:https://man7.org/linux/man-pages/man5/proc_pid_maps.5.html[`man 5
proc_pid_maps`]. Less recent systems document it under `man 5 proc`.
Tools ignore the device (major:minor) and inode numbers, and only
executable mappings really need to be present. See
`src/base/proc_maps_iterator.{h,cc}` for how this list is produced.
Unrecognized lines should be ignored by analysis tools.
Note: the original pprof tool also supported a `$build` "variable" when
processing mappings, but we never produced such mappings, so it is no
longer documented here.
'''''
Original author: Chris Demetriou (cgd) +
Last updated by: Aliaksei Kandratsenka

docs/cpuprofile-fileformat.html (deleted)

@@ -1,264 +0,0 @@
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
<HTML>
<HEAD>
<link rel="stylesheet" href="designstyle.css">
<title>Google CPU Profiler Binary Data File Format</title>
</HEAD>
<BODY>
<h1>Google CPU Profiler Binary Data File Format</h1>
<p align=right>
<i>Last modified
<script type=text/javascript>
var lm = new Date(document.lastModified);
document.write(lm.toDateString());
</script></i>
</p>
<p>This file documents the binary data file format produced by the
Google CPU Profiler. For information about using the CPU Profiler,
see <a href="cpuprofile.html">its user guide</a>.
<p>The profiler source code, which generates files using this format, is at
<code>src/profiler.cc</code></a>.
<h2>CPU Profile Data File Structure</h2>
<p>CPU profile data files each consist of four parts, in order:
<ul>
<li> Binary header
<li> Binary profile records
<li> Binary trailer
<li> Text list of mapped objects
</ul>
<p>The binary data is expressed in terms of "slots." These are words
large enough to hold the program's pointer type, i.e., for 32-bit
programs they are 4 bytes in size, and for 64-bit programs they are 8
bytes. They are stored in the profile data file in the native byte
order (i.e., little-endian for x86 and x86_64).
<h2>Binary Header</h2>
<p>The binary header format is show below. Values written by the
profiler, along with requirements currently enforced by the analysis
tools, are shown in parentheses.
<p>
<table summary="Header Format"
frame="box" rules="sides" cellpadding="5" width="50%">
<tr>
<th width="30%">slot</th>
<th width="70%">data</th>
</tr>
<tr>
<td>0</td>
<td>header count (0; must be 0)</td>
</tr>
<tr>
<td>1</td>
<td>header slots after this one (3; must be &gt;= 3)</td>
</tr>
<tr>
<td>2</td>
<td>format version (0; must be 0)</td>
</tr>
<tr>
<td>3</td>
<td>sampling period, in microseconds</td>
</tr>
<tr>
<td>4</td>
<td>padding (0)</td>
</tr>
</table>
<p>The headers currently generated for 32-bit and 64-bit little-endian
(x86 and x86_64) profiles are shown below, for comparison.
<p>
<table summary="Header Example" frame="box" rules="sides" cellpadding="5">
<tr>
<th></th>
<th>hdr count</th>
<th>hdr words</th>
<th>version</th>
<th>sampling period</th>
<th>pad</th>
</tr>
<tr>
<td>32-bit or 64-bit (slots)</td>
<td>0</td>
<td>3</td>
<td>0</td>
<td>10000</td>
<td>0</td>
</tr>
<tr>
<td>32-bit (4-byte words in file)</td>
<td><tt>0x00000</tt></td>
<td><tt>0x00003</tt></td>
<td><tt>0x00000</tt></td>
<td><tt>0x02710</tt></td>
<td><tt>0x00000</tt></td>
</tr>
<tr>
<td>64-bit LE (4-byte words in file)</td>
<td><tt>0x00000&nbsp;0x00000</tt></td>
<td><tt>0x00003&nbsp;0x00000</tt></td>
<td><tt>0x00000&nbsp;0x00000</tt></td>
<td><tt>0x02710&nbsp;0x00000</tt></td>
<td><tt>0x00000&nbsp;0x00000</tt></td>
</tr>
</table>
<p>The contents are shown in terms of slots, and in terms of 4-byte
words in the profile data file. The slot contents for 32-bit and
64-bit headers are identical. For 32-bit profiles, the 4-byte word
view matches the slot view. For 64-bit profiles, each (8-byte) slot
is shown as two 4-byte words, ordered as they would appear in the
file.
<p>The profiling tools examine the contents of the file and use the
expected locations and values of the header words field to detect
whether the file is 32-bit or 64-bit.
<h2>Binary Profile Records</h2>
<p>The binary profile record format is shown below.
<p>
<table summary="Profile Record Format"
frame="box" rules="sides" cellpadding="5" width="50%">
<tr>
<th width="30%">slot</th>
<th width="70%">data</th>
</tr>
<tr>
<td>0</td>
<td>sample count, must be &gt;= 1</td>
</tr>
<tr>
<td>1</td>
<td>number of call chain PCs (num_pcs), must be &gt;= 1</td>
</tr>
<tr>
<td>2 .. (num_pcs + 1)</td>
<td>call chain PCs, most-recently-called function first.
</tr>
</table>
<p>The total length of a given record is 2 + num_pcs.
<p>Note that multiple profile records can be emitted by the profiler
having an identical call chain. In that case, analysis tools should
sum the counts of all records having identical call chains.
<p><b>Note:</b> Some profile analysis tools terminate if they see
<em>any</em> profile record with a call chain with its first entry
having the address 0. (This is similar to the binary trailer.)
<h3>Example</h3>
This example shows the slots contained in a sample profile record.
<p>
<table summary="Profile Record Example"
frame="box" rules="sides" cellpadding="5">
<tr>
<td>5</td>
<td>3</td>
<td>0xa0000</td>
<td>0xc0000</td>
<td>0xe0000</td>
</tr>
</table>
<p>In this example, 5 ticks were received at PC 0xa0000, whose
function had been called by the function containing 0xc0000, which had
been called from the function containing 0xe0000.
<h2>Binary Trailer</h2>
<p>The binary trailer consists of three slots of data with fixed
values, shown below.
<p>
<table summary="Trailer Format"
frame="box" rules="sides" cellpadding="5" width="50%">
<tr>
<th width="30%">slot</th>
<th width="70%">value</th>
</tr>
<tr>
<td>0</td>
<td>0</td>
</tr>
<tr>
<td>1</td>
<td>1</td>
</tr>
<tr>
<td>2</td>
<td>0</td>
</tr>
</table>
<p>Note that this is the same data that would contained in a profile
record with sample count = 0, num_pcs = 1, and a one-element call
chain containing the address 0.
<h2>Text List of Mapped Objects</h2>
<p>The binary data in the file is followed immediately by a list of
mapped objects. This list consists of lines of text separated by
newline characters.
<p>Each line is one of the following types:
<ul>
<li>Build specifier, starting with "<tt>build=</tt>". For example:
<pre> build=/path/to/binary</pre>
Leading spaces on the line are ignored.
<li>Mapping line from ProcMapsIterator::FormatLine. For example:
<pre> 40000000-40015000 r-xp 00000000 03:01 12845071 /lib/ld-2.3.2.so</pre>
The first address must start at the beginning of the line.
</ul>
<p>Unrecognized lines should be ignored by analysis tools.
<p>When processing the paths see in mapping lines, occurrences of
<tt>$build</tt> followed by a non-word character (i.e., characters
other than underscore or alphanumeric characters), should be replaced
by the path given on the last build specifier line.
<hr>
<address>Chris Demetriou<br>
<!-- Created: Mon Aug 27 12:18:26 PDT 2007 -->
<!-- hhmts start -->
Last modified: Mon Aug 27 12:18:26 PDT 2007 (cgd)
<!-- hhmts end -->
</address>
</BODY>
</HTML>

docs/cpuprofile.adoc (new file)

@@ -0,0 +1,422 @@
= Using CPU Profiler
:reproducible:
[.normal]
This is the CPU profiler originally developed at Google. There are
three parts to using it: linking the library into an application,
running the code, and analyzing the output.
On the off-chance that you should need to understand it, the CPU
profiler data file format is documented separately,
link:cpuprofile-fileformat.html[here].
== Linking in the Library
To install the CPU profiler into your executable, add `-lprofiler` to
the link-time step for your executable. It's also possible to
add in the profiler at run-time using `+LD_PRELOAD+`, e.g.
% LD_PRELOAD="/usr/lib/libprofiler.so" <binary>
This does _not_ turn on CPU profiling; it just inserts the code. For
that reason, it's practical to just always link `+-lprofiler+` into a
binary while developing; that's what we do at Google. (However, since
any user can turn on the profiler by setting an environment variable,
it's not necessarily recommended to install profiler-linked binaries
into a production, running system.)
== Running the Code
There are several alternatives to actually turn on CPU profiling for a
given run of an executable:
. Define the environment variable CPUPROFILE to the filename to dump the
profile to. For instance, if you had a version of `+/bin/ls+` that had
been linked against libprofiler, you could run:
+
....
% env CPUPROFILE=ls.prof /bin/ls
....
. In addition to defining the environment variable CPUPROFILE you can
also define CPUPROFILESIGNAL. This allows profiling to be controlled via
the signal number that you specify. The signal number must be unused by
the program under normal operation. Internally it acts as a switch,
triggered by the signal, which is off by default. For instance, if you
had a copy of `+/bin/chrome+` that had been linked against
libprofiler, you could run:
+
....
% env CPUPROFILE=chrome.prof CPUPROFILESIGNAL=12 /bin/chrome &
....
+
You can then trigger profiling to start:
+
....
% killall -12 chrome
....
+
Then after a period of time you can tell it to stop which will generate
the profile:
+
....
% killall -12 chrome
....
. In your code, bracket the code you want profiled in calls to
`+ProfilerStart()+` and `+ProfilerStop()+`. (These functions are
declared in `+<gperftools/profiler.h>+`.) `+ProfilerStart()+` will take
the profile-filename as an argument.
Profiling works correctly with sub-processes: each child process gets
its own profile with its own name (generated by combining CPUPROFILE
with the child's process id).
For security reasons, CPU profiling will not write to a file -- and is
thus not usable -- for setuid programs.
See the include-file `+gperftools/profiler.h+` for advanced-use
functions, including `+ProfilerFlush()+` and
`+ProfilerStartWithOptions()+`.
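As a concrete illustration of the bracketing approach described in the
third item above, here is a minimal sketch. Only the
`+ProfilerStart()+`/`+ProfilerStop()+` calls come from
`+<gperftools/profiler.h>+`; the workload and file name are made-up
stand-ins.
....
// Minimal sketch of bracketing a region of interest with the profiler
// API from <gperftools/profiler.h>. Build with: g++ example.cc -lprofiler
// busy_work() is just an illustrative stand-in workload.
#include <gperftools/profiler.h>
#include <cmath>
#include <cstdio>

static double busy_work() {
  double s = 0;
  for (int i = 1; i < 50000000; ++i) s += std::sqrt(static_cast<double>(i));
  return s;
}

int main() {
  ProfilerStart("myrun.prof");   // samples from here on go to ./myrun.prof
  double r = busy_work();
  ProfilerStop();                // stop sampling and flush the profile
  std::printf("result=%f\n", r);
  return 0;
}
....
Afterwards the profile can be inspected with, e.g.,
`+pprof --text ./a.out myrun.prof+`.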
=== Modifying Runtime Behavior
You can more finely control the behavior of the CPU profiler via
environment variables.
[cols=",,",]
|===
|`CPUPROFILE_FREQUENCY=__x__` |default: 100 |How many
interrupts/second the cpu-profiler samples.
|`+CPUPROFILE_REALTIME=1+` |default: [not set] |If set to any value
(including 0 or the empty string), use ITIMER_REAL instead of
ITIMER_PROF to gather profiles. In general, ITIMER_REAL is not as
accurate as ITIMER_PROF, and also interacts badly with use of alarm(),
so prefer ITIMER_PROF unless you have a reason to prefer ITIMER_REAL.
|===
== [#pprof]#Analyzing the Output#
`+pprof+` is the program used to analyze profiles. Get it from
link:https://github.com/google/pprof[], for example by running:
% go install github.com/google/pprof@latest
You can then add `$HOME/go/bin` to your `$PATH`. Note that pprof has
its own documentation as well; check it out
link:https://github.com/google/pprof/blob/main/doc/README.md[here].
It has many output modes, both textual and graphical. Some give just
raw numbers, much like the `+-pg+` output of `+gcc+`, and others show
the data in the form of a dependency graph.
Here are some ways to call pprof. These are described in more detail
below.
....
% pprof /bin/ls ls.prof
Enters "interactive" mode
% pprof --text /bin/ls ls.prof
Outputs one line per procedure
% pprof --gv /bin/ls ls.prof
Displays annotated call-graph via 'gv'
% pprof --gv --focus=Mutex /bin/ls ls.prof
Restricts to code paths including a .*Mutex.* entry
% pprof --gv --focus=Mutex --ignore=string /bin/ls ls.prof
Code paths including Mutex but not string
% pprof --list=getdir /bin/ls ls.prof
(Per-line) annotated source listing for getdir()
% pprof --disasm=getdir /bin/ls ls.prof
(Per-PC) annotated disassembly for getdir()
% pprof --text localhost:1234
Outputs one line per procedure for localhost:1234
% pprof --callgrind /bin/ls ls.prof
Outputs the call information in callgrind format
% pprof --http=:<port> /bin/ls ls.prof
Starts Web UI and launches web browser
for interactive profile inspection
....
=== Analyzing Text Output
Text mode has lines of output that look like this:
....
14 2.1% 17.2% 58 8.7% std::_Rb_tree::find
....
Here is how to interpret the columns:
. Number of profiling samples in this function
. Percentage of profiling samples in this function
. Percentage of profiling samples in the functions printed so far
. Number of profiling samples in this function and its callees
. Percentage of profiling samples in this function and its callees
. Function name
=== Analyzing Callgrind Output
Use http://kcachegrind.sourceforge.net[kcachegrind] to analyze your
callgrind output:
....
% pprof --callgrind /bin/ls ls.prof > ls.callgrind
% kcachegrind ls.callgrind
....
The cost is specified in 'hits', i.e. how many times a function appears
in the recorded call stack information. The 'calls' from function a to b
record how many times function b was found in the stack traces directly
below function a.
Tip: if you use a debug build the output will include file and line
number information and kcachegrind will show an annotated source code
view.
=== Node Information
In the various graphical modes of pprof, the output is a call graph
annotated with timing information, like so:
link:pprof-test-big.gif[]
image:pprof-test.gif[pprof-test]
Each node represents a procedure. The directed edges indicate caller to
callee relations. Each node is formatted as follows:
....
Class Name
Method Name
local (percentage)
of cumulative (percentage)
....
The last one or two lines contain the timing information. (The
profiling is done via a sampling method, where by default we take 100
samples a second. Therefore, one unit of time in the output corresponds to
about 10 milliseconds of execution time.) The "local" time is the time
spent executing the instructions directly contained in the procedure
(and in any other procedures that were inlined into the procedure). The
"cumulative" time is the sum of the "local" time and the time spent in
any callees. If the cumulative time is the same as the local time, it is
not printed.
For instance, the timing information for test_main_thread() indicates
that 155 units (about 1.55 seconds) were spent executing the code in
`+test_main_thread()+` and 200 units were spent while executing
`+test_main_thread()+` and its callees such as `+snprintf()+`.
The size of the node is proportional to the local count. The percentage
displayed in the node corresponds to the count divided by the total run
time of the program (that is, the cumulative count for `+main()+`).
=== Edge Information
An edge from one node to another indicates a caller to callee
relationship. Each edge is labelled with the time spent by the callee on
behalf of the caller. E.g., the edge from `+test_main_thread()+` to
`+snprintf()+` indicates that of the 200 samples in
`+test_main_thread()+`, 37 are because of calls to `+snprintf()+`.
Note that `+test_main_thread()+` has an edge to `+vsnprintf()+`, even
though `+test_main_thread()+` doesn't call that function directly. This
is because the code was compiled with `+-O2+`; the profile reflects the
optimized control flow.
=== Meta Information
The top of the display should contain some meta information like:
....
/tmp/profiler2_unittest
Total samples: 202
Focusing on: 202
Dropped nodes with <= 1 abs(samples)
Dropped edges with <= 0 samples
....
This section contains the name of the program, and the total samples
collected during the profiling run. If the `+--focus+` option is on (see
the link:#focus[Focus] section below), the legend also contains the
number of samples being shown in the focused display. Furthermore, some
unimportant nodes and edges are dropped to reduce clutter. The
characteristics of the dropped nodes and edges are also displayed in the
legend.
=== [#focus]#Focus and Ignore#
You can ask pprof to generate a display focused on a particular piece of
the program. You specify a regular expression. Any portion of the
call-graph that is on a path which contains at least one node matching
the regular expression is preserved. The rest of the call-graph is
dropped on the floor. For example, you can focus on the `+vsnprintf()+`
libc call in `+profiler2_unittest+` as follows:
....
% pprof --gv --focus=vsnprintf /tmp/profiler2_unittest test.prof
....
link:pprof-vsnprintf-big.gif[]
[cols="",]
|===
|image:pprof-vsnprintf.gif[pprof-vsnprintf]
|===
Similarly, you can supply the `+--ignore+` option to ignore samples that
match a specified regular expression. E.g., if you are interested in
everything except calls to `+snprintf()+`, you can say:
....
% pprof --gv --ignore=snprintf /tmp/profiler2_unittest test.prof
....
=== Text interactive mode
By default -- if you don't specify any flags to the contrary -- pprof
runs in interactive mode. At the `+(pprof)+` prompt, you can run many of
the commands described above. You can type `+help+` for a list of what
commands are available in interactive mode.
=== [#options]#pprof Options#
For a complete list of pprof options, you can run `+pprof --help+`.
==== Output Type
[width="100%",cols="50%,50%",]
|===
|`+--text+` |Produces a textual listing. (Note: If you have an X
display, and `+dot+` and `+gv+` installed, you will probably be happier
with the `+--gv+` output.)
|`+--gv+` |Generates annotated call-graph, converts to postscript, and
displays via gv (requires `+dot+` and `+gv+` be installed).
|`+--dot+` |Generates the annotated call-graph in dot format and emits
to stdout (requires `+dot+` be installed).
|`+--ps+` |Generates the annotated call-graph in Postscript format and
emits to stdout (requires `+dot+` be installed).
|`+--pdf+` |Generates the annotated call-graph in PDF format and emits
to stdout (requires `+dot+` and `+ps2pdf+` be installed).
|`+--gif+` |Generates the annotated call-graph in GIF format and emits
to stdout (requires `+dot+` be installed).
|`--list=<__regexp__>` |
Outputs source-code listing of routines whose name matches <regexp>.
Each line in the listing is annotated with flat and cumulative sample
counts.
In the presence of inlined calls, the samples associated with inlined
code tend to get assigned to a line that follows the location of the
inlined call. A more precise accounting can be obtained by disassembling
the routine using the --disasm flag.
|`--disasm=<__regexp__>` |Generates disassembly of routines that
match <regexp>, annotated with flat and cumulative sample counts and
emits to stdout.
|===
==== Reporting Granularity
By default, pprof produces one entry per procedure. However you can use
one of the following options to change the granularity of the output.
[cols=2*]
|===
|`+--addresses+`
|Produce one node per program address.
|`+--lines+`
|Produce one node per source line.
|`+--functions+`
|Produce one node per function (this is the default).
|`+--files+`
|Produce one node per source file.
|===
==== Controlling the Call Graph Display
Some nodes and edges are dropped to reduce clutter in the output
display. The following options control this effect:
[cols=",",]
|===
|`+--nodecount=<n>+` |This option controls the number of displayed
nodes. The nodes are first sorted by decreasing cumulative count, and
then only the top N nodes are kept. The default value is 80.
|`+--nodefraction=<f>+` |This option provides another mechanism for
discarding nodes from the display. If the cumulative count for a node is
less than this option's value multiplied by the total count for the
profile, the node is dropped. The default value is 0.005; i.e. nodes
that account for less than half a percent of the total time are dropped.
A node is dropped if either this condition is satisfied, or the
--nodecount condition is satisfied.
|`+--edgefraction=<f>+` |This option controls the number of displayed
edges. First of all, an edge is dropped if either its source or
destination node is dropped. Otherwise, the edge is dropped if the
sample count along the edge is less than this option's value multiplied
by the total count for the profile. The default value is 0.001; i.e.,
edges that account for less than 0.1% of the total time are dropped.
|`+--focus=<re>+` |This option controls what region of the graph is
displayed based on the regular expression supplied with the option. For
any path in the callgraph, we check all nodes in the path against the
supplied regular expression. If none of the nodes match, the path is
dropped from the output.
|`+--ignore=<re>+` |This option controls what region of the graph is
displayed based on the regular expression supplied with the option. For
any path in the callgraph, we check all nodes in the path against the
supplied regular expression. If any of the nodes match, the path is
dropped from the output.
|===
The dropped edges and nodes account for some count mismatches in the
display. For example, the cumulative count for `+snprintf()+` in the
first diagram above was 41. However the local count (1) and the count
along the outgoing edges (12+1+20+6) add up to only 40.
== Caveats
* If the program exits because of a signal, the generated profile will
be incomplete, and may perhaps be completely empty.
* The displayed graph may have disconnected regions because of the
edge-dropping heuristics described above.
* If the program linked in a library that was not compiled with enough
symbolic information, all samples associated with the library may be
charged to the last symbol found in the program before the
library. This will artificially inflate the count for that symbol.
* If you run the program on one machine, and profile it on another,
and the shared libraries are different on the two machines, the
profiling output may be confusing: samples that fall within shared
libraries may be assigned to arbitrary procedures.
* If your program forks, the children will also be profiled (since
they inherit the same CPUPROFILE setting). Each process is profiled
separately; to distinguish the child profiles from the parent profile
and from each other, all children will have their process-id appended
to the CPUPROFILE name.
* Due to a hack we use to trigger appending of pid in child processes,
your profiles may end up named strangely if the first character of
your CPUPROFILE variable has an ASCII value greater than 127. This should
be exceedingly rare, but if you need to use such a name, just
prepend `+./+` to your filename: `+CPUPROFILE=./Ägypten+`.
'''''
Original author: Sanjay Ghemawat +
Last updated by: Aliaksei Kandratsenka

docs/cpuprofile.html (deleted)

@@ -1,536 +0,0 @@
<!DOCTYPE HTML PUBLIC "-//IETF//DTD HTML//EN">
<HTML>
<HEAD>
<link rel="stylesheet" href="designstyle.css">
<title>Gperftools CPU Profiler</title>
</HEAD>
<BODY>
<p align=right>
<i>Last modified
<script type=text/javascript>
var lm = new Date(document.lastModified);
document.write(lm.toDateString());
</script></i>
</p>
<p>This is the CPU profiler we use at Google. There are three parts
to using it: linking the library into an application, running the
code, and analyzing the output.</p>
<p>On the off-chance that you should need to understand it, the CPU
profiler data file format is documented separately,
<a href="cpuprofile-fileformat.html">here</a>.
<H1>Linking in the Library</H1>
<p>To install the CPU profiler into your executable, add
<code>-lprofiler</code> to the link-time step for your executable.
(It's also probably possible to add in the profiler at run-time using
<code>LD_PRELOAD</code>, e.g.
<code>% env LD_PRELOAD="/usr/lib/libprofiler.so" &lt;binary&gt;</code>,
but this isn't necessarily recommended.)</p>
<p>This does <i>not</i> turn on CPU profiling; it just inserts the
code. For that reason, it's practical to just always link
<code>-lprofiler</code> into a binary while developing; that's what we
do at Google. (However, since any user can turn on the profiler by
setting an environment variable, it's not necessarily recommended to
install profiler-linked binaries into a production, running
system.)</p>
<H1>Running the Code</H1>
<p>There are several alternatives to actually turn on CPU profiling
for a given run of an executable:</p>
<ol>
<li> <p>Define the environment variable CPUPROFILE to the filename
to dump the profile to. For instance, if you had a version of
<code>/bin/ls</code> that had been linked against libprofiler,
you could run:</p>
<pre>% env CPUPROFILE=ls.prof /bin/ls</pre>
</li>
<li> <p>In addition to defining the environment variable CPUPROFILE
you can also define CPUPROFILESIGNAL. This allows profiling to be
controlled via the signal number that you specify. The signal number
must be unused by the program under normal operation. Internally it
acts as a switch, triggered by the signal, which is off by default.
For instance, if you had a copy of <code>/bin/chrome</code> that had been
been linked against libprofiler, you could run:</p>
<pre>% env CPUPROFILE=chrome.prof CPUPROFILESIGNAL=12 /bin/chrome &</pre>
<p>You can then trigger profiling to start:</p>
<pre>% killall -12 chrome</pre>
<p>Then after a period of time you can tell it to stop which will
generate the profile:</p>
<pre>% killall -12 chrome</pre>
</li>
<li> <p>In your code, bracket the code you want profiled in calls to
<code>ProfilerStart()</code> and <code>ProfilerStop()</code>.
(These functions are declared in <code>&lt;gperftools/profiler.h&gt;</code>.)
<code>ProfilerStart()</code> will take
the profile-filename as an argument.</p>
</li>
</ol>
<p>In Linux 2.6 and above, profiling works correctly with threads,
automatically profiling all threads. In Linux 2.4, profiling only
profiles the main thread (due to a kernel bug involving itimers and
threads). Profiling works correctly with sub-processes: each child
process gets its own profile with its own name (generated by combining
CPUPROFILE with the child's process id).</p>
<p>For security reasons, CPU profiling will not write to a file -- and
is thus not usable -- for setuid programs.</p>
<p>See the include-file <code>gperftools/profiler.h</code> for
advanced-use functions, including <code>ProfilerFlush()</code> and
<code>ProfilerStartWithOptions()</code>.</p>
<H2>Modifying Runtime Behavior</H2>
<p>You can more finely control the behavior of the CPU profiler via
environment variables.</p>
<table frame=box rules=sides cellpadding=5 width=100%>
<tr valign=top>
<td><code>CPUPROFILE_FREQUENCY=<i>x</i></code></td>
<td>default: 100</td>
<td>
How many interrupts/second the cpu-profiler samples.
</td>
</tr>
<tr valign=top>
<td><code>CPUPROFILE_REALTIME=1</code></td>
<td>default: [not set]</td>
<td>
If set to any value (including 0 or the empty string), use
ITIMER_REAL instead of ITIMER_PROF to gather profiles. In
general, ITIMER_REAL is not as accurate as ITIMER_PROF, and also
interacts badly with use of alarm(), so prefer ITIMER_PROF unless
you have a reason prefer ITIMER_REAL.
</td>
</tr>
</table>
<h1><a name="pprof">Analyzing the Output</a></h1>
<p><code>pprof</code> is the script used to analyze a profile. It has
many output modes, both textual and graphical. Some give just raw
numbers, much like the <code>-pg</code> output of <code>gcc</code>,
and others show the data in the form of a dependency graph.</p>
<p>pprof <b>requires</b> <code>perl5</code> to be installed to run.
It also requires <code>dot</code> to be installed for any of the
graphical output routines, and <code>gv</code> to be installed for
<code>--gv</code> mode (described below).
</p>
<p>Here are some ways to call pprof. These are described in more
detail below.</p>
<pre>
% pprof /bin/ls ls.prof
Enters "interactive" mode
% pprof --text /bin/ls ls.prof
Outputs one line per procedure
% pprof --gv /bin/ls ls.prof
Displays annotated call-graph via 'gv'
% pprof --gv --focus=Mutex /bin/ls ls.prof
Restricts to code paths including a .*Mutex.* entry
% pprof --gv --focus=Mutex --ignore=string /bin/ls ls.prof
Code paths including Mutex but not string
% pprof --list=getdir /bin/ls ls.prof
(Per-line) annotated source listing for getdir()
% pprof --disasm=getdir /bin/ls ls.prof
(Per-PC) annotated disassembly for getdir()
% pprof --text localhost:1234
Outputs one line per procedure for localhost:1234
% pprof --callgrind /bin/ls ls.prof
Outputs the call information in callgrind format
</pre>
<h3>Analyzing Text Output</h3>
<p>Text mode has lines of output that look like this:</p>
<pre>
14 2.1% 17.2% 58 8.7% std::_Rb_tree::find
</pre>
<p>Here is how to interpret the columns:</p>
<ol>
<li> Number of profiling samples in this function
<li> Percentage of profiling samples in this function
<li> Percentage of profiling samples in the functions printed so far
<li> Number of profiling samples in this function and its callees
<li> Percentage of profiling samples in this function and its callees
<li> Function name
</ol>
<h3>Analyzing Callgrind Output</h3>
<p>Use <a href="http://kcachegrind.sourceforge.net">kcachegrind</a> to
analyze your callgrind output:</p>
<pre>
% pprof --callgrind /bin/ls ls.prof > ls.callgrind
% kcachegrind ls.callgrind
</pre>
<p>The cost is specified in 'hits', i.e. how many times a function
appears in the recorded call stack information. The 'calls' from
function a to b record how many times function b was found in the
stack traces directly below function a.</p>
<p>Tip: if you use a debug build the output will include file and line
number information and kcachegrind will show an annotated source
code view.</p>
<h3>Node Information</h3>
<p>In the various graphical modes of pprof, the output is a call graph
annotated with timing information, like so:</p>
<A HREF="pprof-test-big.gif">
<center><table><tr><td>
<img src="pprof-test.gif">
</td></tr></table></center>
</A>
<p>Each node represents a procedure. The directed edges indicate
caller to callee relations. Each node is formatted as follows:</p>
<center><pre>
Class Name
Method Name
local (percentage)
<b>of</b> cumulative (percentage)
</pre></center>
<p>The last one or two lines contains the timing information. (The
profiling is done via a sampling method, where by default we take 100
samples a second. Therefor one unit of time in the output corresponds
to about 10 milliseconds of execution time.) The "local" time is the
time spent executing the instructions directly contained in the
procedure (and in any other procedures that were inlined into the
procedure). The "cumulative" time is the sum of the "local" time and
the time spent in any callees. If the cumulative time is the same as
the local time, it is not printed.</p>
<p>For instance, the timing information for test_main_thread()
indicates that 155 units (about 1.55 seconds) were spent executing the
code in <code>test_main_thread()</code> and 200 units were spent while
executing <code>test_main_thread()</code> and its callees such as
<code>snprintf()</code>.</p>
<p>The size of the node is proportional to the local count. The
percentage displayed in the node corresponds to the count divided by
the total run time of the program (that is, the cumulative count for
<code>main()</code>).</p>
<h3>Edge Information</h3>
<p>An edge from one node to another indicates a caller to callee
relationship. Each edge is labelled with the time spent by the callee
on behalf of the caller. E.g, the edge from
<code>test_main_thread()</code> to <code>snprintf()</code> indicates
that of the 200 samples in <code>test_main_thread()</code>, 37 are
because of calls to <code>snprintf()</code>.</p>
<p>Note that <code>test_main_thread()</code> has an edge to
<code>vsnprintf()</code>, even though <code>test_main_thread()</code>
doesn't call that function directly. This is because the code was
compiled with <code>-O2</code>; the profile reflects the optimized
control flow.</p>
<h3>Meta Information</h3>
<p>The top of the display should contain some meta information
like:</p>
<pre>
/tmp/profiler2_unittest
Total samples: 202
Focusing on: 202
Dropped nodes with &lt;= 1 abs(samples)
Dropped edges with &lt;= 0 samples
</pre>
<p>This section contains the name of the program, and the total
samples collected during the profiling run. If the
<code>--focus</code> option is on (see the <a href="#focus">Focus</a>
section below), the legend also contains the number of samples being
shown in the focused display. Furthermore, some unimportant nodes and
edges are dropped to reduce clutter. The characteristics of the
dropped nodes and edges are also displayed in the legend.</p>
<h3><a name=focus>Focus and Ignore</a></h3>
<p>You can ask pprof to generate a display focused on a particular
piece of the program. You specify a regular expression. Any portion
of the call-graph that is on a path which contains at least one node
matching the regular expression is preserved. The rest of the
call-graph is dropped on the floor. For example, you can focus on the
<code>vsnprintf()</code> libc call in <code>profiler2_unittest</code>
as follows:</p>
<pre>
% pprof --gv --focus=vsnprintf /tmp/profiler2_unittest test.prof
</pre>
<A HREF="pprof-vsnprintf-big.gif">
<center><table><tr><td>
<img src="pprof-vsnprintf.gif">
</td></tr></table></center>
</A>
<p>Similarly, you can supply the <code>--ignore</code> option to
ignore samples that match a specified regular expression. E.g., if
you are interested in everything except calls to
<code>snprintf()</code>, you can say:</p>
<pre>
% pprof --gv --ignore=snprintf /tmp/profiler2_unittest test.prof
</pre>
<h3>Interactive mode</a></h3>
<p>By default -- if you don't specify any flags to the contrary --
pprof runs in interactive mode. At the <code>(pprof)</code> prompt,
you can run many of the commands described above. You can type
<code>help</code> for a list of what commands are available in
interactive mode.</p>
<h3><a name=options>pprof Options</a></h3>
For a complete list of pprof options, you can run <code>pprof
--help</code>.
<h4>Output Type</h4>
<p>
<center>
<table frame=box rules=sides cellpadding=5 width=100%>
<tr valign=top>
<td><code>--text</code></td>
<td>
Produces a textual listing. (Note: If you have an X display, and
<code>dot</code> and <code>gv</code> installed, you will probably
be happier with the <code>--gv</code> output.)
</td>
</tr>
<tr valign=top>
<td><code>--gv</code></td>
<td>
Generates annotated call-graph, converts to postscript, and
displays via gv (requres <code>dot</code> and <code>gv</code> be
installed).
</td>
</tr>
<tr valign=top>
<td><code>--dot</code></td>
<td>
Generates the annotated call-graph in dot format and
emits to stdout (requres <code>dot</code> be installed).
</td>
</tr>
<tr valign=top>
<td><code>--ps</code></td>
<td>
Generates the annotated call-graph in Postscript format and
emits to stdout (requres <code>dot</code> be installed).
</td>
</tr>
<tr valign=top>
<td><code>--pdf</code></td>
<td>
Generates the annotated call-graph in PDF format and emits to
stdout (requires <code>dot</code> and <code>ps2pdf</code> be
installed).
</td>
</tr>
<tr valign=top>
<td><code>--gif</code></td>
<td>
Generates the annotated call-graph in GIF format and
emits to stdout (requres <code>dot</code> be installed).
</td>
</tr>
<tr valign=top>
<td><code>--list=&lt;<i>regexp</i>&gt;</code></td>
<td>
<p>Outputs source-code listing of routines whose
name matches &lt;regexp&gt;. Each line
in the listing is annotated with flat and cumulative
sample counts.</p>
<p>In the presence of inlined calls, the samples
associated with inlined code tend to get assigned
to a line that follows the location of the
inlined call. A more precise accounting can be
obtained by disassembling the routine using the
--disasm flag.</p>
</td>
</tr>
<tr valign=top>
<td><code>--disasm=&lt;<i>regexp</i>&gt;</code></td>
<td>
Generates disassembly of routines that match
&lt;regexp&gt;, annotated with flat and
cumulative sample counts and emits to stdout.
</td>
</tr>
</table>
</center>
<h4>Reporting Granularity</h4>
<p>By default, pprof produces one entry per procedure. However you can
use one of the following options to change the granularity of the
output. The <code>--files</code> option seems to be particularly
useless, and may be removed eventually.</p>
<center>
<table frame=box rules=sides cellpadding=5 width=100%>
<tr valign=top>
<td><code>--addresses</code></td>
<td>
Produce one node per program address.
</td>
</tr>
<td><code>--lines</code></td>
<td>
Produce one node per source line.
</td>
</tr>
<td><code>--functions</code></td>
<td>
Produce one node per function (this is the default).
</td>
</tr>
<td><code>--files</code></td>
<td>
Produce one node per source file.
</td>
</tr>
</table>
</center>
<h4>Controlling the Call Graph Display</h4>
<p>Some nodes and edges are dropped to reduce clutter in the output
display. The following options control this effect:</p>
<center>
<table frame=box rules=sides cellpadding=5 width=100%>
<tr valign=top>
<td><code>--nodecount=&lt;n&gt;</code></td>
<td>
This option controls the number of displayed nodes. The nodes
are first sorted by decreasing cumulative count, and then only
the top N nodes are kept. The default value is 80.
</td>
</tr>
<tr valign=top>
<td><code>--nodefraction=&lt;f&gt;</code></td>
<td>
This option provides another mechanism for discarding nodes
from the display. If the cumulative count for a node is
less than this option's value multiplied by the total count
for the profile, the node is dropped. The default value
is 0.005; i.e. nodes that account for less than
half a percent of the total time are dropped. A node
is dropped if either this condition is satisfied, or the
--nodecount condition is satisfied.
</td>
</tr>
<tr valign=top>
<td><code>--edgefraction=&lt;f&gt;</code></td>
<td>
This option controls the number of displayed edges. First of all,
an edge is dropped if either its source or destination node is
dropped. Otherwise, the edge is dropped if the sample
count along the edge is less than this option's value multiplied
by the total count for the profile. The default value is
0.001; i.e., edges that account for less than
0.1% of the total time are dropped.
</td>
</tr>
<tr valign=top>
<td><code>--focus=&lt;re&gt;</code></td>
<td>
This option controls what region of the graph is displayed
based on the regular expression supplied with the option.
For any path in the callgraph, we check all nodes in the path
against the supplied regular expression. If none of the nodes
match, the path is dropped from the output.
</td>
</tr>
<tr valign=top>
<td><code>--ignore=&lt;re&gt;</code></td>
<td>
This option controls what region of the graph is displayed
based on the regular expression supplied with the option.
For any path in the callgraph, we check all nodes in the path
against the supplied regular expression. If any of the nodes
match, the path is dropped from the output.
</td>
</tr>
</table>
</center>
<p>The dropped edges and nodes account for some count mismatches in
the display. For example, the cumulative count for
<code>snprintf()</code> in the first diagram above was 41. However
the local count (1) and the count along the outgoing edges (12+1+20+6)
add up to only 40.</p>
<h1>Caveats</h1>
<ul>
<li> If the program exits because of a signal, the generated profile
will be <font color=red>incomplete, and may perhaps be
completely empty</font>.
<li> The displayed graph may have disconnected regions because
of the edge-dropping heuristics described above.
<li> If the program linked in a library that was not compiled
with enough symbolic information, all samples associated
with the library may be charged to the last symbol found
in the program before the library. This will artificially
inflate the count for that symbol.
<li> If you run the program on one machine, and profile it on
another, and the shared libraries are different on the two
machines, the profiling output may be confusing: samples that
fall within shared libaries may be assigned to arbitrary
procedures.
<li> If your program forks, the children will also be profiled
(since they inherit the same CPUPROFILE setting). Each process
is profiled separately; to distinguish the child profiles from
the parent profile and from each other, all children will have
their process-id appended to the CPUPROFILE name.
<li> Due to a hack we make to work around a possible gcc bug, your
profiles may end up named strangely if the first character of
your CPUPROFILE variable has ascii value greater than 127.
This should be exceedingly rare, but if you need to use such a
name, just set prepend <code>./</code> to your filename:
<code>CPUPROFILE=./&Auml;gypten</code>.
</ul>
<hr>
<address>Sanjay Ghemawat<br>
<!-- Created: Tue Dec 19 10:43:14 PST 2000 -->
<!-- hhmts start -->
Last modified: Fri May 9 14:41:29 PDT 2008
<!-- hhmts end -->
</address>
</BODY>
</HTML>

docs/designstyle.css (deleted)

@@ -1,109 +0,0 @@
body {
background-color: #ffffff;
color: black;
margin-right: 1in;
margin-left: 1in;
}
h1, h2, h3, h4, h5, h6 {
color: #3366ff;
font-family: sans-serif;
}
@media print {
/* Darker version for printing */
h1, h2, h3, h4, h5, h6 {
color: #000080;
font-family: helvetica, sans-serif;
}
}
h1 {
text-align: center;
font-size: 18pt;
}
h2 {
margin-left: -0.5in;
}
h3 {
margin-left: -0.25in;
}
h4 {
margin-left: -0.125in;
}
hr {
margin-left: -1in;
}
/* Definition lists: definition term bold */
dt {
font-weight: bold;
}
address {
text-align: right;
}
/* Use the <code> tag for bits of code and <var> for variables and objects. */
code,pre,samp,var {
color: #006000;
}
/* Use the <file> tag for file and directory paths and names. */
file {
color: #905050;
font-family: monospace;
}
/* Use the <kbd> tag for stuff the user should type. */
kbd {
color: #600000;
}
div.note p {
float: right;
width: 3in;
margin-right: 0%;
padding: 1px;
border: 2px solid #6060a0;
background-color: #fffff0;
}
UL.nobullets {
list-style-type: none;
list-style-image: none;
margin-left: -1em;
}
/* pretty printing styles. See prettify.js */
.str { color: #080; }
.kwd { color: #008; }
.com { color: #800; }
.typ { color: #606; }
.lit { color: #066; }
.pun { color: #660; }
.pln { color: #000; }
.tag { color: #008; }
.atn { color: #606; }
.atv { color: #080; }
pre.prettyprint { padding: 2px; border: 1px solid #888; }
.embsrc { background: #eee; }
@media print {
.str { color: #060; }
.kwd { color: #006; font-weight: bold; }
.com { color: #600; font-style: italic; }
.typ { color: #404; font-weight: bold; }
.lit { color: #044; }
.pun { color: #440; }
.pln { color: #000; }
.tag { color: #006; font-weight: bold; }
.atn { color: #404; }
.atv { color: #060; }
}
/* Table Column Headers */
.hdr {
color: #006;
font-weight: bold;
background-color: #dddddd; }
.hdr2 {
color: #006;
background-color: #eeeeee; }

docs/dots/README (new file)

@@ -0,0 +1,2 @@
This directory contains original graphviz sources of diagrams used in
gperftools docs.

docs/heapprofile.adoc (new file)

@@ -0,0 +1,293 @@
= Gperftools Heap Profiler
Original author: Sanjay Ghemawat
:reproducible:
[.normal]
This is the heap profiler originally developed at Google, to explore
how C++ programs manage memory. This facility can be useful for
* Figuring out what is in the program heap at any given time
* Locating memory leaks
* Finding places that do a lot of allocation
The profiling system instruments all allocations and frees. It
keeps track of various pieces of information per allocation site. An
allocation site is defined as the active stack trace at the call to
`malloc`, `calloc`, `realloc`, or `new`.
There are three parts to using it: linking the library into an
application, running the code, and analyzing the output.
== Linking in the Library
To install the heap profiler into your executable, add
`-ltcmalloc` to the link-time step for your executable.
Also, while we don't necessarily recommend this form of usage, it's
possible to add in the profiler at run-time using
`LD_PRELOAD`:
% env LD_PRELOAD="/usr/lib/libtcmalloc.so" <binary>
This does _not_ turn on heap profiling; it just inserts the
code. For that reason, it's practical to just always link
`-ltcmalloc` into a binary while developing; that's what we
do at Google. (However, since any user can turn on the profiler by
setting an environment variable, it's not necessarily recommended to
install profiler-linked binaries into a production, running
system.) Note that if you wish to use the heap profiler, you must
also use the tcmalloc memory-allocation library. There is no way
currently to use the heap profiler separate from tcmalloc.
== Running the Code
There are several alternatives to actually turn on heap profiling for
a given run of an executable:
. Define the environment variable HEAPPROFILE to the filename
to dump the profile to. For instance, to profile
`/usr/local/bin/my_binary_compiled_with_tcmalloc`:
% env HEAPPROFILE=/tmp/mybin.hprof /usr/local/bin/my_binary_compiled_with_tcmalloc
. In your code, bracket the code you want profiled in calls to
`HeapProfilerStart()` and `HeapProfilerStop()`. (These functions are
declared in `<gperftools/heap-profiler.h>`.) `HeapProfilerStart()`
will take the profile-filename-prefix as an argument. Then, as often
as you'd like before calling `HeapProfilerStop()`, you can use
`HeapProfilerDump()` or `GetHeapProfile()` to examine the profile. In
case it's useful, `IsHeapProfilerRunning()` will tell you whether
you've already called `HeapProfilerStart()` or not.
For security reasons, heap profiling will not write to a file -- and
is thus not usable -- for setuid programs.
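As a concrete illustration of the bracketing calls described above, here
is a minimal sketch. Only the `HeapProfiler*()` calls come from
`<gperftools/heap-profiler.h>`; the allocation loop and profile prefix
are made-up stand-ins.
....
// Minimal sketch of bracketing a region with the heap-profiler API from
// <gperftools/heap-profiler.h>. Build with: g++ example.cc -ltcmalloc
// The allocation loop is just an illustrative stand-in workload.
#include <gperftools/heap-profiler.h>
#include <vector>

int main() {
  HeapProfilerStart("/tmp/myrun");      // prefix for /tmp/myrun.0000.heap, ...
  std::vector<char*> blocks;
  for (int i = 0; i < 1000; ++i)
    blocks.push_back(new char[4096]);
  HeapProfilerDump("after allocation"); // force an intermediate dump now
  for (char* b : blocks) delete[] b;
  HeapProfilerStop();
  return 0;
}
....
The resulting `.heap` files can then be fed to `pprof` as described in
the Analyzing the Output section below.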
=== Modifying Runtime Behavior
You can more finely control the behavior of the heap profiler via
environment variables.
[cols=3*]
|===
|`HEAP_PROFILE_ALLOCATION_INTERVAL`
|default: 1073741824 (1 Gb)
|Dump heap profiling information each time the specified number of
bytes has been allocated by the program.
|`HEAP_PROFILE_INUSE_INTERVAL`
|default: 104857600 (100 Mb)
|Dump heap profiling information whenever the high-water memory
usage mark increases by the specified number of bytes.
|`HEAP_PROFILE_TIME_INTERVAL`
|default: 0
|Dump heap profiling information each time the specified
number of seconds has elapsed.
|`HEAPPROFILESIGNAL`
|default: disabled
|Dump heap profiling information whenever the specified signal is sent to the
process.
|`HEAP_PROFILE_MMAP`
|default: false
|Profile `mmap`, `mremap` and `sbrk`
calls in addition
to `malloc`, `calloc`, `realloc`,
and `new`. *NOTE:* this causes the profiler to
profile calls internal to tcmalloc, since tcmalloc and friends use
mmap and sbrk internally for allocations. One partial solution is
to filter these allocations out when running `pprof`,
with something like
`pprof --ignore='DoAllocWithArena\|SbrkSysAllocator::Alloc\|MmapSysAllocator::Alloc'`.
|`HEAP_PROFILE_ONLY_MMAP`
|default: false
|Only profile `mmap`, `mremap`, and `sbrk`
calls; do not profile
`malloc`, `calloc`, `realloc`,
or `new`.
|`HEAP_PROFILE_MMAP_LOG`
|default: false
|Log `mmap`/`munmap` calls.
|===
== Analyzing the Output
If heap-profiling is turned on in a program, the program will
periodically write profiles to the filesystem. The sequence of
profiles will be named:
<prefix>.0000.heap
<prefix>.0001.heap
<prefix>.0002.heap
...
where `<prefix>` is the filename-prefix supplied
when running the code (e.g. via the `HEAPPROFILE`
environment variable). Note that if the supplied prefix
does not start with a `/`, the profile files will be
written to the program's working directory.
The profile output can be viewed by passing it to the
`pprof` tool -- the same tool that's used to analyze
link:cpuprofile.html[CPU profiles].
Here are some examples. These examples assume the binary is named
`gfs_master`, and a sequence of heap profile files can be
found in files named:
/tmp/profile.0001.heap
/tmp/profile.0002.heap
...
/tmp/profile.0100.heap
=== Why is a process so big
% pprof --gv gfs_master /tmp/profile.0100.heap
This command will pop up a `gv` window that displays
the profile information as a directed graph. Here is a portion
of the resulting output:
image::heap-example1.png[]
A few explanations:
* `GFS_MasterChunk::AddServer` accounts for 255.6 MB
of the live memory, which is 25% of the total live memory.
* `GFS_MasterChunkTable::UpdateState` is directly
accountable for 176.2 MB of the live memory (i.e., it directly
allocated 176.2 MB that has not been freed yet). Furthermore,
it and its callees are responsible for 729.9 MB. The
labels on the outgoing edges give a good indication of the
amount allocated by each callee.
=== Comparing Profiles
You often want to skip allocations during the initialization phase
of a program so you can find gradual memory leaks. One simple way to
do this is to compare two profiles -- both collected after the program
has been running for a while. Specify the name of the first profile
using the `--base` option. For example:
% pprof --base=/tmp/profile.0004.heap gfs_master /tmp/profile.0100.heap
The memory-usage in `/tmp/profile.0004.heap` will be
subtracted from the memory-usage in
`/tmp/profile.0100.heap` and the result will be
displayed.
=== Text display
% pprof --text gfs_master /tmp/profile.0100.heap
255.6 24.7% 24.7% 255.6 24.7% GFS_MasterChunk::AddServer
184.6 17.8% 42.5% 298.8 28.8% GFS_MasterChunkTable::Create
176.2 17.0% 59.5% 729.9 70.5% GFS_MasterChunkTable::UpdateState
169.8 16.4% 75.9% 169.8 16.4% PendingClone::PendingClone
76.3 7.4% 83.3% 76.3 7.4% __default_alloc_template::_S_chunk_alloc
49.5 4.8% 88.0% 49.5 4.8% hashtable::resize
...
* The first column contains the direct memory use in MB.
* The fourth column contains memory use by the procedure
and all of its callees.
* The second and fifth columns are just percentage
representations of the numbers in the first and fourth columns.
* The third column is a cumulative sum of the second column
(i.e., the `k`-th entry in the third column is the
sum of the first `k` entries in the second column.)
=== Ignoring or focusing on specific regions
The following command will give a graphical display of a subset of
the call-graph. Only paths in the call-graph that match the regular
expression `DataBuffer` are included:
% pprof --gv --focus=DataBuffer gfs_master /tmp/profile.0100.heap
Similarly, the following command will omit a subset of the
call-graph: all paths in the call-graph that match the regular
expression `DataBuffer` are discarded:
% pprof --gv --ignore=DataBuffer gfs_master /tmp/profile.0100.heap
=== Total allocations + object-level information
All of the previous examples have displayed the amount of in-use
space, i.e., the number of bytes that have been allocated but not
freed. You can also get other types of information by supplying a
flag to `pprof`:
[cols=2*]
|===
|`--inuse_space`
|Display the number of in-use megabytes (i.e. space that has
been allocated but not freed). This is the default.
|`--inuse_objects`
|Display the number of in-use objects (i.e. number of
objects that have been allocated but not freed).
|`--alloc_space`
|Display the number of allocated megabytes. This includes
the space that has since been de-allocated. Use this
if you want to find the main allocation sites in the
program.
|`--alloc_objects`
|Display the number of allocated objects. This includes
the objects that have since been de-allocated. Use this
if you want to find the main allocation sites in the
program.
|===
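For example, to see total allocation volume (rather than live bytes)
in text form, one could combine these flags with the earlier
invocation:

 % pprof --text --alloc_space gfs_master /tmp/profile.0100.heap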
=== Interactive mode
By default -- if you don't specify any flags to the contrary --
pprof runs in interactive mode. At the `(pprof)` prompt,
you can run many of the commands described above. You can type
`help` for a list of what commands are available in
interactive mode.
== Caveats
* Heap profiling requires the use of libtcmalloc. This
requirement may be removed in a future version of the heap
profiler, and the heap profiler separated out into its own
library.
* If the program linked in a library that was not compiled
with enough symbolic information, all samples associated
with the library may be charged to the last symbol found
in the program before the library. This will artificially
inflate the count for that symbol.
* If you run the program on one machine, and profile it on
another, and the shared libraries are different on the two
machines, the profiling output may be confusing: samples that
fall within the shared libraries may be assigned to arbitrary
procedures.
* Several libraries, such as some STL implementations, do their
own memory management. This may cause strange profiling
results. We have code in libtcmalloc to cause STL to use
tcmalloc for memory management (which in our tests is better
than STL's internal management), though it only works for some
STL implementations.
* If your program forks, the children will also be profiled
(since they inherit the same HEAPPROFILE setting). Each
process is profiled separately; to distinguish the child
profiles from the parent profile and from each other, all
children will have their process-id attached to the HEAPPROFILE
name.
* Due to a hack we make to work around a possible gcc bug, your
profiles may end up named strangely if the first character of
your HEAPPROFILE variable has an ASCII value greater than 127.
This should be exceedingly rare, but if you need to use such a
name, just prepend `./` to your filename:
`HEAPPROFILE=./Ägypten`.

View File

@ -1,391 +0,0 @@
<!DOCTYPE HTML PUBLIC "-//IETF//DTD HTML//EN">
<HTML>
<HEAD>
<link rel="stylesheet" href="designstyle.css">
<title>Gperftools Heap Profiler</title>
</HEAD>
<BODY>
<p align=right>
<i>Last modified
<script type=text/javascript>
var lm = new Date(document.lastModified);
document.write(lm.toDateString());
</script></i>
</p>
<p>This is the heap profiler we use at Google, to explore how C++
programs manage memory. This facility can be useful for</p>
<ul>
<li> Figuring out what is in the program heap at any given time
<li> Locating memory leaks
<li> Finding places that do a lot of allocation
</ul>
<p>The profiling system instruments all allocations and frees. It
keeps track of various pieces of information per allocation site. An
allocation site is defined as the active stack trace at the call to
<code>malloc</code>, <code>calloc</code>, <code>realloc</code>, or,
<code>new</code>.</p>
<p>There are three parts to using it: linking the library into an
application, running the code, and analyzing the output.</p>
<h1>Linking in the Library</h1>
<p>To install the heap profiler into your executable, add
<code>-ltcmalloc</code> to the link-time step for your executable.
Also, while we don't necessarily recommend this form of usage, it's
possible to add in the profiler at run-time using
<code>LD_PRELOAD</code>:
<pre>% env LD_PRELOAD="/usr/lib/libtcmalloc.so" &lt;binary&gt;</pre>
<p>This does <i>not</i> turn on heap profiling; it just inserts the
code. For that reason, it's practical to just always link
<code>-ltcmalloc</code> into a binary while developing; that's what we
do at Google. (However, since any user can turn on the profiler by
setting an environment variable, it's not necessarily recommended to
install profiler-linked binaries into a production, running
system.) Note that if you wish to use the heap profiler, you must
also use the tcmalloc memory-allocation library. There is no way
currently to use the heap profiler separate from tcmalloc.</p>
<h1>Running the Code</h1>
<p>There are several alternatives to actually turn on heap profiling
for a given run of an executable:</p>
<ol>
<li> <p>Define the environment variable HEAPPROFILE to the filename
to dump the profile to. For instance, to profile
<code>/usr/local/bin/my_binary_compiled_with_tcmalloc</code>:</p>
<pre>% env HEAPPROFILE=/tmp/mybin.hprof /usr/local/bin/my_binary_compiled_with_tcmalloc</pre>
<li> <p>In your code, bracket the code you want profiled in calls to
<code>HeapProfilerStart()</code> and <code>HeapProfilerStop()</code>.
(These functions are declared in <code>&lt;gperftools/heap-profiler.h&gt;</code>.)
<code>HeapProfilerStart()</code> will take the
profile-filename-prefix as an argument. Then, as often as
you'd like before calling <code>HeapProfilerStop()</code>, you
can use <code>HeapProfilerDump()</code> or
<code>GetHeapProfile()</code> to examine the profile. In case
it's useful, <code>IsHeapProfilerRunning()</code> will tell you
whether you've already called HeapProfilerStart() or not.</p>
</ol>
<p>For security reasons, heap profiling will not write to a file --
and is thus not usable -- for setuid programs.</p>
<H2>Modifying Runtime Behavior</H2>
<p>You can more finely control the behavior of the heap profiler via
environment variables.</p>
<table frame=box rules=sides cellpadding=5 width=100%>
<tr valign=top>
<td><code>HEAP_PROFILE_ALLOCATION_INTERVAL</code></td>
<td>default: 1073741824 (1 Gb)</td>
<td>
Dump heap profiling information each time the specified number of
bytes has been allocated by the program.
</td>
</tr>
<tr valign=top>
<td><code>HEAP_PROFILE_INUSE_INTERVAL</code></td>
<td>default: 104857600 (100 Mb)</td>
<td>
Dump heap profiling information whenever the high-water memory
usage mark increases by the specified number of bytes.
</td>
</tr>
<tr valign=top>
<td><code>HEAP_PROFILE_TIME_INTERVAL</code></td>
<td>default: 0</td>
<td>
Dump heap profiling information each time the specified
number of seconds has elapsed.
</td>
</tr>
<tr valign=top>
<td><code>HEAPPROFILESIGNAL</code></td>
<td>default: disabled</td>
<td>
Dump heap profiling information whenever the specified signal is sent to the
process.
</td>
</tr>
<tr valign=top>
<td><code>HEAP_PROFILE_MMAP</code></td>
<td>default: false</td>
<td>
Profile <code>mmap</code>, <code>mremap</code> and <code>sbrk</code>
calls in addition
to <code>malloc</code>, <code>calloc</code>, <code>realloc</code>,
and <code>new</code>. <b>NOTE:</b> this causes the profiler to
profile calls internal to tcmalloc, since tcmalloc and friends use
mmap and sbrk internally for allocations. One partial solution is
to filter these allocations out when running <code>pprof</code>,
with something like
<code>pprof --ignore='DoAllocWithArena|SbrkSysAllocator::Alloc|MmapSysAllocator::Alloc</code>.
</td>
</tr>
<tr valign=top>
<td><code>HEAP_PROFILE_ONLY_MMAP</code></td>
<td>default: false</td>
<td>
Only profile <code>mmap</code>, <code>mremap</code>, and <code>sbrk</code>
calls; do not profile
<code>malloc</code>, <code>calloc</code>, <code>realloc</code>,
or <code>new</code>.
</td>
</tr>
<tr valign=top>
<td><code>HEAP_PROFILE_MMAP_LOG</code></td>
<td>default: false</td>
<td>
Log <code>mmap</code>/<code>munmap</code> calls.
</td>
</tr>
</table>
<H2>Checking for Leaks</H2>
<p>You can use the heap profiler to manually check for leaks, for
instance by reading the profiler output and looking for large
allocations. However, for that task, it's easier to use the <A
HREF="heap_checker.html">automatic heap-checking facility</A> built
into tcmalloc.</p>
<h1><a name="pprof">Analyzing the Output</a></h1>
<p>If heap-profiling is turned on in a program, the program will
periodically write profiles to the filesystem. The sequence of
profiles will be named:</p>
<pre>
&lt;prefix&gt;.0000.heap
&lt;prefix&gt;.0001.heap
&lt;prefix&gt;.0002.heap
...
</pre>
<p>where <code>&lt;prefix&gt;</code> is the filename-prefix supplied
when running the code (e.g. via the <code>HEAPPROFILE</code>
environment variable). Note that if the supplied prefix
does not start with a <code>/</code>, the profile files will be
written to the program's working directory.</p>
<p>The profile output can be viewed by passing it to the
<code>pprof</code> tool -- the same tool that's used to analyze <A
HREF="cpuprofile.html">CPU profiles</A>.
<p>Here are some examples. These examples assume the binary is named
<code>gfs_master</code>, and a sequence of heap profile files can be
found in files named:</p>
<pre>
/tmp/profile.0001.heap
/tmp/profile.0002.heap
...
/tmp/profile.0100.heap
</pre>
<h3>Why is a process so big</h3>
<pre>
% pprof --gv gfs_master /tmp/profile.0100.heap
</pre>
<p>This command will pop-up a <code>gv</code> window that displays
the profile information as a directed graph. Here is a portion
of the resulting output:</p>
<p><center>
<img src="heap-example1.png">
</center></p>
A few explanations:
<ul>
<li> <code>GFS_MasterChunk::AddServer</code> accounts for 255.6 MB
of the live memory, which is 25% of the total live memory.
<li> <code>GFS_MasterChunkTable::UpdateState</code> is directly
accountable for 176.2 MB of the live memory (i.e., it directly
allocated 176.2 MB that has not been freed yet). Furthermore,
it and its callees are responsible for 729.9 MB. The
labels on the outgoing edges give a good indication of the
amount allocated by each callee.
</ul>
<h3>Comparing Profiles</h3>
<p>You often want to skip allocations during the initialization phase
of a program so you can find gradual memory leaks. One simple way to
do this is to compare two profiles -- both collected after the program
has been running for a while. Specify the name of the first profile
using the <code>--base</code> option. For example:</p>
<pre>
% pprof --base=/tmp/profile.0004.heap gfs_master /tmp/profile.0100.heap
</pre>
<p>The memory-usage in <code>/tmp/profile.0004.heap</code> will be
subtracted from the memory-usage in
<code>/tmp/profile.0100.heap</code> and the result will be
displayed.</p>
<h3>Text display</h3>
<pre>
% pprof --text gfs_master /tmp/profile.0100.heap
255.6 24.7% 24.7% 255.6 24.7% GFS_MasterChunk::AddServer
184.6 17.8% 42.5% 298.8 28.8% GFS_MasterChunkTable::Create
176.2 17.0% 59.5% 729.9 70.5% GFS_MasterChunkTable::UpdateState
169.8 16.4% 75.9% 169.8 16.4% PendingClone::PendingClone
76.3 7.4% 83.3% 76.3 7.4% __default_alloc_template::_S_chunk_alloc
49.5 4.8% 88.0% 49.5 4.8% hashtable::resize
...
</pre>
<p>
<ul>
<li> The first column contains the direct memory use in MB.
<li> The fourth column contains memory use by the procedure
and all of its callees.
<li> The second and fifth columns are just percentage
representations of the numbers in the first and fourth columns.
<li> The third column is a cumulative sum of the second column
(i.e., the <code>k</code>th entry in the third column is the
sum of the first <code>k</code> entries in the second column.)
</ul>
<h3>Ignoring or focusing on specific regions</h3>
<p>The following command will give a graphical display of a subset of
the call-graph. Only paths in the call-graph that match the regular
expression <code>DataBuffer</code> are included:</p>
<pre>
% pprof --gv --focus=DataBuffer gfs_master /tmp/profile.0100.heap
</pre>
<p>Similarly, the following command will omit all paths subset of the
call-graph. All paths in the call-graph that match the regular
expression <code>DataBuffer</code> are discarded:</p>
<pre>
% pprof --gv --ignore=DataBuffer gfs_master /tmp/profile.0100.heap
</pre>
<h3>Total allocations + object-level information</h3>
<p>All of the previous examples have displayed the amount of in-use
space. I.e., the number of bytes that have been allocated but not
freed. You can also get other types of information by supplying a
flag to <code>pprof</code>:</p>
<center>
<table frame=box rules=sides cellpadding=5 width=100%>
<tr valign=top>
<td><code>--inuse_space</code></td>
<td>
Display the number of in-use megabytes (i.e. space that has
been allocated but not freed). This is the default.
</td>
</tr>
<tr valign=top>
<td><code>--inuse_objects</code></td>
<td>
Display the number of in-use objects (i.e. number of
objects that have been allocated but not freed).
</td>
</tr>
<tr valign=top>
<td><code>--alloc_space</code></td>
<td>
Display the number of allocated megabytes. This includes
the space that has since been de-allocated. Use this
if you want to find the main allocation sites in the
program.
</td>
</tr>
<tr valign=top>
<td><code>--alloc_objects</code></td>
<td>
Display the number of allocated objects. This includes
the objects that have since been de-allocated. Use this
if you want to find the main allocation sites in the
program.
</td>
</table>
</center>
<h3>Interactive mode</a></h3>
<p>By default -- if you don't specify any flags to the contrary --
pprof runs in interactive mode. At the <code>(pprof)</code> prompt,
you can run many of the commands described above. You can type
<code>help</code> for a list of what commands are available in
interactive mode.</p>
<h1>Caveats</h1>
<ul>
<li> Heap profiling requires the use of libtcmalloc. This
requirement may be removed in a future version of the heap
profiler, and the heap profiler separated out into its own
library.
<li> If the program linked in a library that was not compiled
with enough symbolic information, all samples associated
with the library may be charged to the last symbol found
in the program before the library. This will artificially
inflate the count for that symbol.
<li> If you run the program on one machine, and profile it on
another, and the shared libraries are different on the two
machines, the profiling output may be confusing: samples that
fall within the shared libaries may be assigned to arbitrary
procedures.
<li> Several libraries, such as some STL implementations, do their
own memory management. This may cause strange profiling
results. We have code in libtcmalloc to cause STL to use
tcmalloc for memory management (which in our tests is better
than STL's internal management), though it only works for some
STL implementations.
<li> If your program forks, the children will also be profiled
(since they inherit the same HEAPPROFILE setting). Each
process is profiled separately; to distinguish the child
profiles from the parent profile and from each other, all
children will have their process-id attached to the HEAPPROFILE
name.
<li> Due to a hack we make to work around a possible gcc bug, your
profiles may end up named strangely if the first character of
your HEAPPROFILE variable has ascii value greater than 127.
This should be exceedingly rare, but if you need to use such a
name, just set prepend <code>./</code> to your filename:
<code>HEAPPROFILE=./&Auml;gypten</code>.
</ul>
<hr>
<address>Sanjay Ghemawat
<!-- Created: Tue Dec 19 10:43:14 PST 2000 -->
</address>
</body>
</html>

View File

@ -7,14 +7,10 @@
<BODY>
<ul>
<li> <A HREF="tcmalloc.html">thread-caching malloc</A>
<li> <A HREF="heap_checker.html">heap-checking using tcmalloc</A>
<li> <A HREF="heapprofile.html">heap-profiling using tcmalloc</A>
<li> <A HREF="cpuprofile.html">CPU profiler</A>
</ul>
<hr>
Last modified: Thu Feb 2 14:40:47 PST 2012
</BODY>
</HTML>

View File

@ -1,131 +0,0 @@
.\" DO NOT MODIFY THIS FILE! It was generated by help2man 1.23.
.TH PPROF "1" "February 2005" "pprof (part of gperftools)" Google
.SH NAME
pprof \- manual page for pprof (part of gperftools)
.SH SYNOPSIS
.B pprof
[\fIoptions\fR] \fI<program> <profile>\fR
.SH DESCRIPTION
.IP
Prints specified cpu- or heap-profile
.SH OPTIONS
.TP
\fB\-\-cum\fR
Sort by cumulative data
.TP
\fB\-\-base=\fR<base>
Subtract <base> from <profile> before display
.SS "Reporting Granularity:"
.TP
\fB\-\-addresses\fR
Report at address level
.TP
\fB\-\-lines\fR
Report at source line level
.TP
\fB\-\-functions\fR
Report at function level [default]
.TP
\fB\-\-files\fR
Report at source file level
.SS "Output type:"
.TP
\fB\-\-text\fR
Generate text report [default]
.TP
\fB\-\-gv\fR
Generate Postscript and display
.TP
\fB\-\-list=\fR<regexp>
Generate source listing of matching routines
.TP
\fB\-\-disasm=\fR<regexp>
Generate disassembly of matching routines
.TP
\fB\-\-dot\fR
Generate DOT file to stdout
.TP
\fB\-\-ps\fR
Generate Postscript to stdout
.TP
\fB\-\-pdf\fR
Generate PDF to stdout
.TP
\fB\-\-gif\fR
Generate GIF to stdout
.SS "Heap-Profile Options:"
.TP
\fB\-\-inuse_space\fR
Display in-use (mega)bytes [default]
.TP
\fB\-\-inuse_objects\fR
Display in-use objects
.TP
\fB\-\-alloc_space\fR
Display allocated (mega)bytes
.TP
\fB\-\-alloc_objects\fR
Display allocated objects
.TP
\fB\-\-show_bytes\fR
Display space in bytes
.TP
\fB\-\-drop_negative\fR
Ignore negaive differences
.SS "Call-graph Options:"
.TP
\fB\-\-nodecount=\fR<n>
Show at most so many nodes [default=80]
.TP
\fB\-\-nodefraction=\fR<f>
Hide nodes below <f>*total [default=.005]
.TP
\fB\-\-edgefraction=\fR<f>
Hide edges below <f>*total [default=.001]
.TP
\fB\-\-focus=\fR<regexp>
Focus on nodes matching <regexp>
.TP
\fB\-\-ignore=\fR<regexp>
Ignore nodes matching <regexp>
.TP
\fB\-\-scale=\fR<n>
Set GV scaling [default=0]
.SH EXAMPLES
pprof /bin/ls ls.prof
.IP
Outputs one line per procedure
.PP
pprof \fB\-\-gv\fR /bin/ls ls.prof
.IP
Displays annotated call-graph via 'gv'
.PP
pprof \fB\-\-gv\fR \fB\-\-focus\fR=\fIMutex\fR /bin/ls ls.prof
.IP
Restricts to code paths including a .*Mutex.* entry
.PP
pprof \fB\-\-gv\fR \fB\-\-focus\fR=\fIMutex\fR \fB\-\-ignore\fR=\fIstring\fR /bin/ls ls.prof
.IP
Code paths including Mutex but not string
.PP
pprof \fB\-\-list\fR=\fIgetdir\fR /bin/ls ls.prof
.IP
Dissassembly (with per-line annotations) for getdir()
.PP
pprof \fB\-\-disasm\fR=\fIgetdir\fR /bin/ls ls.prof
.IP
Dissassembly (with per-PC annotations) for getdir()
.SH COPYRIGHT
Copyright \(co 2005 Google Inc.
.SH "SEE ALSO"
Further documentation for
.B pprof
is maintained as a web page called
.B cpu_profiler.html
and is likely installed at one of the following locations:
.IP
.B /usr/share/gperftools/cpu_profiler.html
.br
.B /usr/local/share/gperftools/cpu_profiler.html
.PP

View File

@ -1,11 +0,0 @@
[see also]
Further documentation for
.B pprof
is maintained as a web page called
.B cpu_profiler.html
and is likely installed at one of the following locations:
.IP
.B /usr/share/gperftools/cpu_profiler.html
.br
.B /usr/local/share/gperftools/cpu_profiler.html
.PP

119
docs/pprof_integration.adoc Normal file
View File

@ -0,0 +1,119 @@
== pprof integration
:reproducible:
gperftools was the original home of the pprof program. This program
is used to visualize and analyze profiles (CPU profiles, heap
profiles, heap samples, sets of thread stacks, etc.). The original
pprof was written in Perl, and as of this writing that is the version
the Linux distros ship. Meanwhile, pprof was completely modernized and
rewritten in Go; the Go version is a much better one. We have been
recommending that people switch to the Go version for a number of
years, and starting with gperftools 2.17 we no longer ship the
original pprof.
You can get the Go pprof binary by running:
% go install github.com/google/pprof@latest
The binary will normally appear in `$HOME/go/bin`, so you may want to
add it to your `$PATH`.
The main documentation of pprof can be found at
https://github.com/google/pprof/blob/main/doc/README.md
On this page, I'll point out some helpful integration aspects.
Here are the kinds of "profiles" that gperftools can feed into pprof.
=== CPU profiling
The CPU profiler is provided in a distinct library: libprofiler. Its C++
API is in `gperftools/profiler.h`. You can invoke
`ProfilerStart()`/`ProfilerStop()` to control it, or you can have
libprofiler automagically profile the full run of your program by setting
the `CPUPROFILE` environment variable.
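Here is a minimal sketch (link with `-lprofiler`); `BusyLoop()` and
the output path are placeholders:

[source,cpp]
----
#include <gperftools/profiler.h>

// Placeholder workload so the sampling profiler has something to see.
static void BusyLoop() {
  volatile double x = 0;
  for (long i = 0; i < 200000000; ++i) x += i * 0.5;
}

int main() {
  ProfilerStart("/tmp/myprog.prof");  // start writing the CPU profile here
  BusyLoop();
  ProfilerStop();                     // flush; analyze with: pprof <binary> /tmp/myprog.prof
  return 0;
}
----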
See link:cpuprofile.html[documentation of CPU profiler] for full
details.
A general description of how statistical sampling profilers work can be
found in this nice blog post: https://research.swtch.com/pprof.
We produce a "legacy" CPU profile format. The format is described
here: link:cpuprofile-fileformat.html[].
=== Heap sample
libtcmalloc supports very low overhead sampling of allocations. If this feature is enabled, you can call:
std::string sample_profile;
MallocExtension::instance()->GetHeapSample(&sample_profile);
And you'll get a statistical estimate of all currently in-use memory
allocations with backtraces of allocations. It will show you where
currently in-use memory was allocated. The heap sample can be saved and
fed to the pprof program for visualization and analysis.
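As an illustrative sketch, the sample can be written to a file that
pprof can then read (the path and helper name are arbitrary):

[source,cpp]
----
#include <fstream>
#include <string>

#include <gperftools/malloc_extension.h>

// Save the current heap sample to a file; analyze later with
//   pprof <binary> /tmp/heap.sample
void DumpHeapSample(const std::string& path) {
  std::string profile;
  MallocExtension::instance()->GetHeapSample(&profile);
  std::ofstream(path) << profile;
}
----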
At Google, this feature is enabled fleet-wide (and by default), but in
gperftools, our default is off. You can turn it on by setting the
environment variable `TCMALLOC_SAMPLE_PARAMETER`. However, please note
that libtcmalloc_minimal doesn't have this feature. In order to use
heap sampling, you need to link to "full" libtcmalloc.
A reasonable value for the sample parameter ranges from 524288 (512 KB,
the original default) to a few megabytes (the current default at
Google). A lower value gives you more samples and thus higher statistical
precision, but it also causes higher overhead and lock contention.
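For example, to enable sampling at the original 512 KB rate for a
single run (the binary name is a placeholder):

 % env TCMALLOC_SAMPLE_PARAMETER=524288 ./my_binary_linked_with_full_tcmalloc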
Our sibling project, "abseil" tcmalloc, also supports heap
sampling. Implementation has evolved a bit, but this is fundamentally
the same logic. In addition to sampling, they also have allocation and
deallocation profiling powered by the same sampling facility. Their
docs are at:
https://github.com/google/tcmalloc/blob/master/docs/sampling.md.
Go has a similar feature called heap profiling. Go's heap profiles
combine information about in-use memory and all the allocations ever
made. It is similar to gperftools' link:heapprofile.html[heap profiler], but works
via sampling, so it is low overhead and runs by default. You can read
about it here: https://pkg.go.dev/runtime/pprof. Approximately every
512 KB of allocated memory (the value of runtime.MemProfileRate), Go's
runtime triggers heap sampling. Heap sampling grabs a backtrace and
then updates per-call-site allocation counters. The heap profile is a
collection of call sites (identified by the backtrace chain) and
relevant statistics.
=== Heap Growth stacks
Every time tcmalloc extends its heap, it grabs a stack trace. A
collection of those stacks can be obtained by calling:
std::string output;
MallocExtension::instance()->GetHeapGrowthStacks(&output);
and fed to pprof for visualization and analysis. This kind of profile
gives you locations in your code that extended heap (either due to
regular usage, leaks, or fragmentation).
Heap growth tracking is always enabled in the full libtcmalloc and is
not available in libtcmalloc_minimal.
=== Heap Profiler
See link:heapprofile.html[Heap Profiler documentation]. Note that the
heap profiler intercepts every allocation and deallocation call, so it
runs with a much higher overhead than normal malloc and is not
suitable for production.
=== HTTP interfaces
The pprof integration point more commonly used at Google is HTTP
endpoints. The Go standard library provides a great example of how this
is done and how to use it; https://pkg.go.dev/net/http/pprof documents
it.
gperftools doesn't provide any HTTP handlers, but we do give you raw
profiling data, which you can serve with whatever HTTP-serving APIs you
like. Each profile kind (with the partial exception of the heap profiler)
has an API to obtain profile data, which can be returned from an HTTP
handler.
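For instance, a heap-sample endpoint might look roughly like the
sketch below; `SendHttpResponse` is a hypothetical stand-in for
whatever HTTP framework you use (here it just prints the response so
the sketch is self-contained):

[source,cpp]
----
#include <cstdio>
#include <string>

#include <gperftools/malloc_extension.h>

// Hypothetical stand-in for your HTTP framework's response API.
static void SendHttpResponse(int status, const std::string& content_type,
                             const std::string& body) {
  std::printf("HTTP %d\nContent-Type: %s\n\n%s", status,
              content_type.c_str(), body.c_str());
}

// Handler for something like GET /pprof/heap: return the current heap
// sample so a remote pprof can fetch it.
void HandleHeapSample() {
  std::string profile;
  MallocExtension::instance()->GetHeapSample(&profile);
  SendHttpResponse(200, "text/plain", profile);
}
----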

View File

@ -1,260 +0,0 @@
<HTML>
<HEAD>
<title>pprof and Remote Servers</title>
</HEAD>
<BODY>
<h1><code>pprof</code> and Remote Servers</h1>
<p>In mid-2006, we added an experimental facility to <A
HREF="cpu_profiler.html">pprof</A>, the tool that analyzes CPU and
heap profiles. This facility allows you to collect profile
information from running applications. It makes it easy to collect
profile information without having to stop the program first, and
without having to log into the machine where the application is
running. This is meant to be used on webservers, but will work on any
application that can be modified to accept TCP connections on a port
of its choosing, and to respond to HTTP requests on that port.</p>
<p>We do not currently have infrastructure, such as apache modules,
that you can pop into a webserver or other application to get the
necessary functionality "for free." However, it's easy to generate
the necessary data, which should allow the interested developer to add
the necessary support into his or her applications.</p>
<p>To use <code>pprof</code> in this experimental "server" mode, you
give the script a host and port it should query, replacing the normal
commandline arguments of application + profile file:</p>
<pre>
% pprof internalweb.mycompany.com:80
</pre>
<p>The host must be listening on that port, and be able to accept HTTP/1.0
requests -- sent via <code>wget</code> and <code>curl</code> -- for
several urls. The following sections list the urls that
<code>pprof</code> can send, and the responses it expects in
return.</p>
<p>Here are examples that pprof will recognize, when you give them
on the commandline, are urls. In general, you
specify the host and a port (the port-number is required), and put
the service-name at the end of the url.:</p>
<blockquote><pre>
http://myhost:80/pprof/heap # retrieves a heap profile
http://myhost:8008/pprof/profile # retrieves a CPU profile
http://myhost:80 # retrieves a CPU profile (the default)
http://myhost:8080/ # retrieves a CPU profile (the default)
myhost:8088/pprof/growth # "http://" is optional, but port is not
http://myhost:80/myservice/pprof/heap # /pprof/heap just has to come at the end
http://myhost:80/pprof/pmuprofile # CPU profile using performance counters
</pre></blockquote>
<h2> <code><b>/pprof/heap</b></code> </h2>
<p><code>pprof</code> asks for the url <code>/pprof/heap</code> to
get heap information. The actual url is controlled via the variable
<code>HEAP_PAGE</code> in the <code>pprof</code> script, so you
can change it if you'd like.</p>
<p>There are two ways to get this data. The first is to call</p>
<pre>
MallocExtension::instance()->GetHeapSample(&output);
</pre>
<p>and have the server send <code>output</code> back as an HTTP
response to <code>pprof</code>. <code>MallocExtension</code> is
defined in the header file <code>gperftools/malloc_extension.h</code>.</p>
<p>Note this will only only work if the binary is being run with
sampling turned on (which is not the default). To do this, set the
environment variable <code>TCMALLOC_SAMPLE_PARAMETER</code> to a
positive value, such as 524288, before running.</p>
<p>The other way is to call <code>HeapProfileStart(filename)</code>
(from <code>heap-profiler.h</code>), continue to do work, and then,
some number of seconds later, call <code>GetHeapProfile()</code>
(followed by <code>HeapProfilerStop()</code>). The server can send
the output of <code>GetHeapProfile</code> back as the HTTP response to
pprof. (Note you must <code>free()</code> this data after using it.)
This is similar to how <A HREF="#profile">profile requests</A> are
handled, below. This technique does not require the application to
run with sampling turned on.</p>
<p>Here's an example of what the output should look like:</p>
<pre>
heap profile: 1923: 127923432 [ 1923: 127923432] @ heap_v2/524288
1: 312 [ 1: 312] @ 0x2aaaabaf5ccc 0x2aaaaba4cd2c 0x2aaaac08c09a
928: 122586016 [ 928: 122586016] @ 0x2aaaabaf682c 0x400680 0x400bdd 0x2aaaab1c368a 0x2aaaab1c8f77 0x2aaaab1c0396 0x2aaaab1c86ed 0x4007ff 0x2aaaaca62afa
1: 16 [ 1: 16] @ 0x2aaaabaf5ccc 0x2aaaabb04bac 0x2aaaabc1b262 0x2aaaabc21496 0x2aaaabc214bb
[...]
</pre>
<p> Older code may produce "version 1" heap profiles which look like this:<p/>
<pre>
heap profile: 14933: 791700132 [ 14933: 791700132] @ heap
1: 848688 [ 1: 848688] @ 0xa4b142 0x7f5bfc 0x87065e 0x4056e9 0x4125f8 0x42b4f1 0x45b1ba 0x463248 0x460871 0x45cb7c 0x5f1744 0x607cee 0x5f4a5e 0x40080f 0x2aaaabad7afa
1: 1048576 [ 1: 1048576] @ 0xa4a9b2 0x7fd025 0x4ca6d8 0x4ca814 0x4caa88 0x2aaaab104cf0 0x404e20 0x4125f8 0x42b4f1 0x45b1ba 0x463248 0x460871 0x45cb7c 0x5f1744 0x607cee 0x5f4a5e 0x40080f 0x2aaaabad7afa
2942: 388629374 [ 2942: 388629374] @ 0xa4b142 0x4006a0 0x400bed 0x5f0cfa 0x5f1744 0x607cee 0x5f4a5e 0x40080f 0x2aaaabad7afa
[...]
</pre>
<p>pprof accepts both old and new heap profiles and automatically
detects which one you are using.</p>
<h2> <code><b>/pprof/growth</b></code> </h2>
<p><code>pprof</code> asks for the url <code>/pprof/growth</code> to
get heap-profiling delta (growth) information. The actual url is
controlled via the variable <code>GROWTH_PAGE</code> in the
<code>pprof</code> script, so you can change it if you'd like.</p>
<p>The server should respond by calling</p>
<pre>
MallocExtension::instance()->GetHeapGrowthStacks(&output);
</pre>
<p>and sending <code>output</code> back as an HTTP response to
<code>pprof</code>. <code>MallocExtension</code> is defined in the
header file <code>gperftools/malloc_extension.h</code>.</p>
<p>Here's an example, from an actual Google webserver, of what the
output should look like:</p>
<pre>
heap profile: 741: 812122112 [ 741: 812122112] @ growth
1: 1572864 [ 1: 1572864] @ 0x87da564 0x87db8a3 0x84787a4 0x846e851 0x836d12f 0x834cd1c 0x8349ba5 0x10a3177 0x8349961
1: 1048576 [ 1: 1048576] @ 0x87d92e8 0x87d9213 0x87d9178 0x87d94d3 0x87da9da 0x8a364ff 0x8a437e7 0x8ab7d23 0x8ab7da9 0x8ac7454 0x8348465 0x10a3161 0x8349961
[...]
</pre>
<h2> <A NAME="profile"><code><b>/pprof/profile</b></code></A> </h2>
<p><code>pprof</code> asks for the url
<code>/pprof/profile?seconds=XX</code> to get cpu-profiling
information. The actual url is controlled via the variable
<code>PROFILE_PAGE</code> in the <code>pprof</code> script, so you can
change it if you'd like.</p>
<p>The server should respond by calling
<code>ProfilerStart(filename)</code>, continuing to do its work, and
then, XX seconds later, calling <code>ProfilerStop()</code>. (These
functions are declared in <code>gperftools/profiler.h</code>.) The
application is responsible for picking a unique filename for
<code>ProfilerStart()</code>. After calling
<code>ProfilerStop()</code>, the server should read the contents of
<code>filename</code> and send them back as an HTTP response to
<code>pprof</code>.</p>
<p>Obviously, to get useful profile information the application must
continue to run in the XX seconds that the profiler is running. Thus,
the profile start-stop calls should be done in a separate thread, or
be otherwise non-blocking.</p>
<p>The profiler output file is binary, but near the end of it, it
should have lines of text somewhat like this:</p>
<pre>
01016000-01017000 rw-p 00015000 03:01 59314 /lib/ld-2.2.2.so
</pre>
<h2> <code><b>/pprof/pmuprofile</b></code> </h2>
<code>pprof</code> asks for a url of the form
<code>/pprof/pmuprofile?event=hw_event:unit_mask&period=nnn&seconds=xxx</code>
to get cpu-profiling information. The actual url is controlled via the variable
<code>PMUPROFILE_PAGE</code> in the <code>pprof</code> script, so you can
change it if you'd like.</p>
<p>
This is similar to pprof, but is meant to be used with your CPU's hardware
performance counters. The server could be implemented on top of a library
such as <a href="http://perfmon2.sourceforge.net/">
<code>libpfm</code></a>. It should collect a sample every nnn occurrences
of the event and stop the sampling after xxx seconds. Much of the code
for <code>/pprof/profile</code> can be reused for this purpose.
</p>
<p>The server side routines (the equivalent of
ProfilerStart/ProfilerStart) are not available as part of perftools,
so this URL is unlikely to be that useful.</p>
<h2> <code><b>/pprof/contention</b></code> </h2>
<p>This is intended to be able to profile (thread) lock contention in
addition to CPU and memory use. It's not yet usable.</p>
<h2> <code><b>/pprof/cmdline</b></code> </h2>
<p><code>pprof</code> asks for the url <code>/pprof/cmdline</code> to
figure out what application it's profiling. The actual url is
controlled via the variable <code>PROGRAM_NAME_PAGE</code> in the
<code>pprof</code> script, so you can change it if you'd like.</p>
<p>The server should respond by reading the contents of
<code>/proc/self/cmdline</code>, converting all internal NUL (\0)
characters to newlines, and sending the result back as an HTTP
response to <code>pprof</code>.</p>
<p>Here's an example return value:<p>
<pre>
/root/server/custom_webserver
80
--configfile=/root/server/ws.config
</pre>
<h2> <code><b>/pprof/symbol</b></code> </h2>
<p><code>pprof</code> asks for the url <code>/pprof/symbol</code> to
map from hex addresses to variable names. The actual url is
controlled via the variable <code>SYMBOL_PAGE</code> in the
<code>pprof</code> script, so you can change it if you'd like.</p>
<p>When the server receives a GET request for
<code>/pprof/symbol</code>, it should return a line formatted like
so:</p>
<pre>
num_symbols: ###
</pre>
<p>where <code>###</code> is the number of symbols found in the
binary. (For now, the only important distinction is whether the value
is 0, which it is for executables that lack debug information, or
not-0).</p>
<p>This is perhaps the hardest request to write code for, because in
addition to the GET request for this url, the server must accept POST
requests. This means that after the HTTP headers, pprof will pass in
a list of hex addresses connected by <code>+</code>, like so:</p>
<pre>
curl -d '0x0824d061+0x0824d1cf' http://remote_host:80/pprof/symbol
</pre>
<p>The server should read the POST data, which will be in one line,
and for each hex value, should write one line of output to the output
stream, like so:</p>
<pre>
&lt;hex address&gt;&lt;tab&gt;&lt;function name&gt;
</pre>
<p>For instance:</p>
<pre>
0x08b2dabd _Update
</pre>
<p>The other reason this is the most difficult request to implement,
is that the application will have to figure out for itself how to map
from address to function name. One possibility is to run <code>nm -C
-n &lt;program name&gt;</code> to get the mappings at
program-compile-time. Another, at least on Linux, is to call out to
addr2line for every <code>pprof/symbol</code> call, for instance
<code>addr2line -Cfse /proc/<getpid>/exe 0x12345678 0x876543210</code>
(presumably with some caching!)</p>
<p><code>pprof</code> itself does just this for local profiles (not
ones that talk to remote servers); look at the subroutine
<code>GetProcedureBoundaries</code>.</p>
<hr>
Last modified: Mon Jun 12 21:30:14 PDT 2006
</body>
</html>

View File

@ -1,480 +0,0 @@
time.1.ptmalloc.64:0.56 user 0.02 system 0.57 elapsed 100% CPU
time.1.tcmalloc.64:0.38 user 0.02 system 0.40 elapsed 98% CPU
time.1.ptmalloc.128:0.61 user 0.01 system 0.61 elapsed 101% CPU
time.1.tcmalloc.128:0.35 user 0.00 system 0.35 elapsed 99% CPU
time.1.ptmalloc.256:0.59 user 0.01 system 0.60 elapsed 100% CPU
time.1.tcmalloc.256:0.27 user 0.02 system 0.28 elapsed 102% CPU
time.1.ptmalloc.512:0.57 user 0.00 system 0.57 elapsed 100% CPU
time.1.tcmalloc.512:0.25 user 0.01 system 0.25 elapsed 101% CPU
time.1.ptmalloc.1024:0.52 user 0.00 system 0.52 elapsed 99% CPU
time.1.tcmalloc.1024:0.22 user 0.02 system 0.24 elapsed 97% CPU
time.1.ptmalloc.2048:0.47 user 0.00 system 0.47 elapsed 99% CPU
time.1.tcmalloc.2048:0.22 user 0.02 system 0.25 elapsed 95% CPU
time.1.ptmalloc.4096:0.48 user 0.01 system 0.48 elapsed 100% CPU
time.1.tcmalloc.4096:0.25 user 0.01 system 0.25 elapsed 100% CPU
time.1.ptmalloc.8192:0.49 user 0.02 system 0.49 elapsed 102% CPU
time.1.tcmalloc.8192:0.27 user 0.02 system 0.28 elapsed 101% CPU
time.1.ptmalloc.16384:0.51 user 0.04 system 0.55 elapsed 99% CPU
time.1.tcmalloc.16384:0.35 user 0.02 system 0.37 elapsed 100% CPU
time.1.ptmalloc.32768:0.53 user 0.14 system 0.66 elapsed 100% CPU
time.1.tcmalloc.32768:0.67 user 0.02 system 0.69 elapsed 99% CPU
time.1.ptmalloc.65536:0.68 user 0.31 system 0.98 elapsed 100% CPU
time.1.tcmalloc.65536:0.71 user 0.01 system 0.72 elapsed 99% CPU
time.1.ptmalloc.131072:0.90 user 0.72 system 1.62 elapsed 99% CPU
time.1.tcmalloc.131072:0.94 user 0.03 system 0.97 elapsed 99% CPU
time.2.ptmalloc.64:1.05 user 0.00 system 0.53 elapsed 196% CPU
time.2.tcmalloc.64:0.66 user 0.03 system 0.37 elapsed 185% CPU
time.2.ptmalloc.128:1.77 user 0.01 system 0.89 elapsed 198% CPU
time.2.tcmalloc.128:0.53 user 0.01 system 0.29 elapsed 184% CPU
time.2.ptmalloc.256:1.14 user 0.01 system 0.62 elapsed 182% CPU
time.2.tcmalloc.256:0.45 user 0.02 system 0.26 elapsed 180% CPU
time.2.ptmalloc.512:1.26 user 0.40 system 1.79 elapsed 92% CPU
time.2.tcmalloc.512:0.43 user 0.02 system 0.27 elapsed 166% CPU
time.2.ptmalloc.1024:0.98 user 0.03 system 0.56 elapsed 179% CPU
time.2.tcmalloc.1024:0.44 user 0.02 system 0.34 elapsed 134% CPU
time.2.ptmalloc.2048:0.87 user 0.02 system 0.44 elapsed 199% CPU
time.2.tcmalloc.2048:0.49 user 0.02 system 0.34 elapsed 148% CPU
time.2.ptmalloc.4096:0.92 user 0.03 system 0.48 elapsed 196% CPU
time.2.tcmalloc.4096:0.50 user 0.02 system 0.49 elapsed 105% CPU
time.2.ptmalloc.8192:1.05 user 0.04 system 0.55 elapsed 196% CPU
time.2.tcmalloc.8192:0.59 user 0.01 system 0.51 elapsed 116% CPU
time.2.ptmalloc.16384:1.30 user 0.14 system 0.72 elapsed 198% CPU
time.2.tcmalloc.16384:0.63 user 0.03 system 0.68 elapsed 96% CPU
time.2.ptmalloc.32768:1.33 user 0.56 system 1.00 elapsed 189% CPU
time.2.tcmalloc.32768:1.16 user 0.01 system 1.17 elapsed 99% CPU
time.2.ptmalloc.65536:1.86 user 1.79 system 2.01 elapsed 181% CPU
time.2.tcmalloc.65536:1.35 user 0.01 system 1.35 elapsed 100% CPU
time.2.ptmalloc.131072:2.61 user 5.19 system 4.81 elapsed 162% CPU
time.2.tcmalloc.131072:1.86 user 0.04 system 1.90 elapsed 100% CPU
time.3.ptmalloc.64:1.79 user 0.03 system 0.67 elapsed 268% CPU
time.3.tcmalloc.64:1.58 user 0.04 system 0.62 elapsed 260% CPU
time.3.ptmalloc.128:2.77 user 1.34 system 3.07 elapsed 133% CPU
time.3.tcmalloc.128:1.19 user 0.01 system 0.50 elapsed 236% CPU
time.3.ptmalloc.256:2.14 user 0.02 system 0.85 elapsed 252% CPU
time.3.tcmalloc.256:0.96 user 0.01 system 0.41 elapsed 236% CPU
time.3.ptmalloc.512:3.37 user 1.31 system 3.33 elapsed 140% CPU
time.3.tcmalloc.512:0.93 user 0.04 system 0.39 elapsed 243% CPU
time.3.ptmalloc.1024:1.66 user 0.01 system 0.64 elapsed 260% CPU
time.3.tcmalloc.1024:0.81 user 0.02 system 0.44 elapsed 187% CPU
time.3.ptmalloc.2048:2.07 user 0.01 system 0.82 elapsed 252% CPU
time.3.tcmalloc.2048:1.10 user 0.04 system 0.59 elapsed 191% CPU
time.3.ptmalloc.4096:2.01 user 0.03 system 0.79 elapsed 258% CPU
time.3.tcmalloc.4096:0.87 user 0.03 system 0.65 elapsed 137% CPU
time.3.ptmalloc.8192:2.22 user 0.11 system 0.83 elapsed 280% CPU
time.3.tcmalloc.8192:0.96 user 0.06 system 0.75 elapsed 135% CPU
time.3.ptmalloc.16384:2.56 user 0.47 system 1.02 elapsed 295% CPU
time.3.tcmalloc.16384:0.99 user 0.04 system 1.03 elapsed 99% CPU
time.3.ptmalloc.32768:3.29 user 1.75 system 1.96 elapsed 256% CPU
time.3.tcmalloc.32768:1.67 user 0.02 system 1.69 elapsed 99% CPU
time.3.ptmalloc.65536:4.04 user 6.62 system 4.92 elapsed 216% CPU
time.3.tcmalloc.65536:1.91 user 0.02 system 1.98 elapsed 97% CPU
time.3.ptmalloc.131072:5.55 user 17.86 system 12.44 elapsed 188% CPU
time.3.tcmalloc.131072:2.78 user 0.02 system 2.82 elapsed 99% CPU
time.4.ptmalloc.64:3.42 user 1.36 system 3.20 elapsed 149% CPU
time.4.tcmalloc.64:2.42 user 0.02 system 0.71 elapsed 341% CPU
time.4.ptmalloc.128:3.98 user 1.79 system 3.89 elapsed 148% CPU
time.4.tcmalloc.128:1.87 user 0.02 system 0.58 elapsed 325% CPU
time.4.ptmalloc.256:4.06 user 2.14 system 4.12 elapsed 150% CPU
time.4.tcmalloc.256:1.69 user 0.02 system 0.51 elapsed 331% CPU
time.4.ptmalloc.512:4.48 user 2.15 system 4.39 elapsed 150% CPU
time.4.tcmalloc.512:1.62 user 0.03 system 0.52 elapsed 314% CPU
time.4.ptmalloc.1024:3.18 user 0.03 system 0.84 elapsed 381% CPU
time.4.tcmalloc.1024:1.53 user 0.02 system 0.56 elapsed 274% CPU
time.4.ptmalloc.2048:3.24 user 0.02 system 0.84 elapsed 384% CPU
time.4.tcmalloc.2048:1.44 user 0.04 system 0.66 elapsed 221% CPU
time.4.ptmalloc.4096:3.50 user 0.04 system 0.91 elapsed 389% CPU
time.4.tcmalloc.4096:1.31 user 0.01 system 0.89 elapsed 148% CPU
time.4.ptmalloc.8192:6.77 user 3.85 system 4.14 elapsed 256% CPU
time.4.tcmalloc.8192:1.20 user 0.05 system 0.97 elapsed 127% CPU
time.4.ptmalloc.16384:7.08 user 5.06 system 4.63 elapsed 262% CPU
time.4.tcmalloc.16384:1.27 user 0.03 system 1.25 elapsed 103% CPU
time.4.ptmalloc.32768:5.57 user 4.22 system 3.31 elapsed 295% CPU
time.4.tcmalloc.32768:2.17 user 0.03 system 2.25 elapsed 97% CPU
time.4.ptmalloc.65536:6.11 user 15.05 system 9.19 elapsed 230% CPU
time.4.tcmalloc.65536:2.51 user 0.02 system 2.57 elapsed 98% CPU
time.4.ptmalloc.131072:7.58 user 33.15 system 21.28 elapsed 191% CPU
time.4.tcmalloc.131072:3.57 user 0.07 system 3.66 elapsed 99% CPU
time.5.ptmalloc.64:4.44 user 2.08 system 4.37 elapsed 148% CPU
time.5.tcmalloc.64:2.87 user 0.02 system 0.79 elapsed 361% CPU
time.5.ptmalloc.128:4.77 user 2.77 system 5.14 elapsed 146% CPU
time.5.tcmalloc.128:2.65 user 0.03 system 0.72 elapsed 367% CPU
time.5.ptmalloc.256:5.82 user 2.88 system 5.49 elapsed 158% CPU
time.5.tcmalloc.256:2.33 user 0.01 system 0.66 elapsed 352% CPU
time.5.ptmalloc.512:6.27 user 3.11 system 5.34 elapsed 175% CPU
time.5.tcmalloc.512:2.14 user 0.03 system 0.70 elapsed 307% CPU
time.5.ptmalloc.1024:6.82 user 3.18 system 5.23 elapsed 191% CPU
time.5.tcmalloc.1024:2.20 user 0.02 system 0.70 elapsed 313% CPU
time.5.ptmalloc.2048:6.57 user 3.46 system 5.22 elapsed 192% CPU
time.5.tcmalloc.2048:2.15 user 0.03 system 0.82 elapsed 264% CPU
time.5.ptmalloc.4096:8.75 user 5.09 system 5.26 elapsed 263% CPU
time.5.tcmalloc.4096:1.68 user 0.03 system 1.08 elapsed 158% CPU
time.5.ptmalloc.8192:4.48 user 0.61 system 1.51 elapsed 335% CPU
time.5.tcmalloc.8192:1.47 user 0.07 system 1.18 elapsed 129% CPU
time.5.ptmalloc.16384:5.71 user 1.98 system 2.14 elapsed 358% CPU
time.5.tcmalloc.16384:1.58 user 0.03 system 1.52 elapsed 105% CPU
time.5.ptmalloc.32768:7.19 user 7.81 system 5.53 elapsed 270% CPU
time.5.tcmalloc.32768:2.63 user 0.05 system 2.72 elapsed 98% CPU
time.5.ptmalloc.65536:8.45 user 23.51 system 14.30 elapsed 223% CPU
time.5.tcmalloc.65536:3.12 user 0.05 system 3.21 elapsed 98% CPU
time.5.ptmalloc.131072:10.22 user 43.63 system 27.84 elapsed 193% CPU
time.5.tcmalloc.131072:4.42 user 0.07 system 4.51 elapsed 99% CPU
time.6.ptmalloc.64:5.57 user 2.56 system 5.08 elapsed 159% CPU
time.6.tcmalloc.64:3.20 user 0.01 system 0.89 elapsed 360% CPU
time.6.ptmalloc.128:5.98 user 3.52 system 5.71 elapsed 166% CPU
time.6.tcmalloc.128:2.76 user 0.02 system 0.78 elapsed 355% CPU
time.6.ptmalloc.256:4.61 user 0.02 system 1.19 elapsed 389% CPU
time.6.tcmalloc.256:2.65 user 0.02 system 0.74 elapsed 356% CPU
time.6.ptmalloc.512:8.28 user 3.88 system 6.61 elapsed 183% CPU
time.6.tcmalloc.512:2.60 user 0.02 system 0.72 elapsed 362% CPU
time.6.ptmalloc.1024:4.75 user 0.00 system 1.22 elapsed 387% CPU
time.6.tcmalloc.1024:2.56 user 0.02 system 0.79 elapsed 325% CPU
time.6.ptmalloc.2048:8.90 user 4.59 system 6.15 elapsed 219% CPU
time.6.tcmalloc.2048:2.37 user 0.06 system 0.96 elapsed 250% CPU
time.6.ptmalloc.4096:11.41 user 7.02 system 6.31 elapsed 291% CPU
time.6.tcmalloc.4096:1.82 user 0.03 system 1.19 elapsed 154% CPU
time.6.ptmalloc.8192:11.64 user 8.25 system 5.97 elapsed 332% CPU
time.6.tcmalloc.8192:1.83 user 0.07 system 1.38 elapsed 136% CPU
time.6.ptmalloc.16384:7.44 user 2.98 system 3.01 elapsed 345% CPU
time.6.tcmalloc.16384:1.83 user 0.08 system 1.80 elapsed 105% CPU
time.6.ptmalloc.32768:8.69 user 12.35 system 8.04 elapsed 261% CPU
time.6.tcmalloc.32768:3.14 user 0.06 system 3.24 elapsed 98% CPU
time.6.ptmalloc.65536:10.52 user 35.43 system 20.75 elapsed 221% CPU
time.6.tcmalloc.65536:3.62 user 0.03 system 3.72 elapsed 98% CPU
time.6.ptmalloc.131072:11.74 user 59.00 system 36.93 elapsed 191% CPU
time.6.tcmalloc.131072:5.33 user 0.04 system 5.42 elapsed 98% CPU
time.7.ptmalloc.64:6.60 user 3.45 system 6.01 elapsed 167% CPU
time.7.tcmalloc.64:3.50 user 0.04 system 0.94 elapsed 376% CPU
time.7.ptmalloc.128:7.09 user 4.25 system 6.69 elapsed 169% CPU
time.7.tcmalloc.128:3.13 user 0.03 system 0.84 elapsed 374% CPU
time.7.ptmalloc.256:9.28 user 4.85 system 7.20 elapsed 196% CPU
time.7.tcmalloc.256:3.06 user 0.02 system 0.82 elapsed 375% CPU
time.7.ptmalloc.512:9.13 user 4.78 system 6.79 elapsed 204% CPU
time.7.tcmalloc.512:2.99 user 0.03 system 0.83 elapsed 359% CPU
time.7.ptmalloc.1024:10.85 user 6.41 system 7.52 elapsed 229% CPU
time.7.tcmalloc.1024:3.05 user 0.04 system 0.89 elapsed 345% CPU
time.7.ptmalloc.2048:5.65 user 0.08 system 1.47 elapsed 388% CPU
time.7.tcmalloc.2048:3.01 user 0.01 system 0.98 elapsed 306% CPU
time.7.ptmalloc.4096:6.09 user 0.08 system 1.58 elapsed 389% CPU
time.7.tcmalloc.4096:2.25 user 0.03 system 1.32 elapsed 171% CPU
time.7.ptmalloc.8192:6.73 user 0.85 system 1.99 elapsed 379% CPU
time.7.tcmalloc.8192:2.22 user 0.08 system 1.61 elapsed 142% CPU
time.7.ptmalloc.16384:8.87 user 4.66 system 4.04 elapsed 334% CPU
time.7.tcmalloc.16384:2.07 user 0.07 system 2.07 elapsed 103% CPU
time.7.ptmalloc.32768:10.61 user 17.85 system 11.22 elapsed 253% CPU
time.7.tcmalloc.32768:3.68 user 0.06 system 3.79 elapsed 98% CPU
time.7.ptmalloc.65536:13.05 user 45.97 system 27.28 elapsed 216% CPU
time.7.tcmalloc.65536:4.16 user 0.07 system 4.31 elapsed 98% CPU
time.7.ptmalloc.131072:13.22 user 62.67 system 41.33 elapsed 183% CPU
time.7.tcmalloc.131072:6.10 user 0.06 system 6.25 elapsed 98% CPU
time.8.ptmalloc.64:7.31 user 3.92 system 6.39 elapsed 175% CPU
time.8.tcmalloc.64:4.00 user 0.01 system 1.04 elapsed 383% CPU
time.8.ptmalloc.128:9.40 user 5.41 system 7.67 elapsed 192% CPU
time.8.tcmalloc.128:3.61 user 0.02 system 0.94 elapsed 386% CPU
time.8.ptmalloc.256:10.61 user 6.35 system 7.96 elapsed 212% CPU
time.8.tcmalloc.256:3.30 user 0.02 system 0.99 elapsed 335% CPU
time.8.ptmalloc.512:12.42 user 7.10 system 8.79 elapsed 221% CPU
time.8.tcmalloc.512:3.35 user 0.04 system 0.94 elapsed 358% CPU
time.8.ptmalloc.1024:13.63 user 8.54 system 8.95 elapsed 247% CPU
time.8.tcmalloc.1024:3.44 user 0.02 system 0.96 elapsed 359% CPU
time.8.ptmalloc.2048:6.45 user 0.03 system 1.67 elapsed 386% CPU
time.8.tcmalloc.2048:3.55 user 0.05 system 1.09 elapsed 328% CPU
time.8.ptmalloc.4096:6.83 user 0.26 system 1.80 elapsed 393% CPU
time.8.tcmalloc.4096:2.78 user 0.06 system 1.53 elapsed 185% CPU
time.8.ptmalloc.8192:7.59 user 1.29 system 2.36 elapsed 376% CPU
time.8.tcmalloc.8192:2.57 user 0.07 system 1.84 elapsed 142% CPU
time.8.ptmalloc.16384:10.15 user 6.20 system 5.20 elapsed 314% CPU
time.8.tcmalloc.16384:2.40 user 0.05 system 2.42 elapsed 101% CPU
time.8.ptmalloc.32768:11.82 user 24.48 system 14.60 elapsed 248% CPU
time.8.tcmalloc.32768:4.37 user 0.05 system 4.47 elapsed 98% CPU
time.8.ptmalloc.65536:15.41 user 58.94 system 34.42 elapsed 215% CPU
time.8.tcmalloc.65536:4.90 user 0.04 system 4.96 elapsed 99% CPU
time.8.ptmalloc.131072:16.07 user 82.93 system 52.51 elapsed 188% CPU
time.8.tcmalloc.131072:7.13 user 0.04 system 7.19 elapsed 99% CPU
time.9.ptmalloc.64:8.44 user 4.59 system 6.92 elapsed 188% CPU
time.9.tcmalloc.64:4.00 user 0.02 system 1.05 elapsed 382% CPU
time.9.ptmalloc.128:10.92 user 6.14 system 8.31 elapsed 205% CPU
time.9.tcmalloc.128:3.88 user 0.02 system 1.01 elapsed 382% CPU
time.9.ptmalloc.256:13.01 user 7.75 system 9.12 elapsed 227% CPU
time.9.tcmalloc.256:3.89 user 0.01 system 1.00 elapsed 386% CPU
time.9.ptmalloc.512:14.96 user 8.89 system 9.73 elapsed 244% CPU
time.9.tcmalloc.512:3.80 user 0.03 system 1.01 elapsed 377% CPU
time.9.ptmalloc.1024:15.42 user 10.20 system 9.80 elapsed 261% CPU
time.9.tcmalloc.1024:3.86 user 0.03 system 1.19 elapsed 325% CPU
time.9.ptmalloc.2048:7.24 user 0.02 system 1.87 elapsed 388% CPU
time.9.tcmalloc.2048:3.98 user 0.05 system 1.26 elapsed 319% CPU
time.9.ptmalloc.4096:7.96 user 0.18 system 2.06 elapsed 394% CPU
time.9.tcmalloc.4096:3.27 user 0.04 system 1.69 elapsed 195% CPU
time.9.ptmalloc.8192:9.00 user 1.63 system 2.79 elapsed 380% CPU
time.9.tcmalloc.8192:3.00 user 0.06 system 2.05 elapsed 148% CPU
time.9.ptmalloc.16384:12.07 user 8.13 system 6.55 elapsed 308% CPU
time.9.tcmalloc.16384:2.85 user 0.05 system 2.75 elapsed 105% CPU
time.9.ptmalloc.32768:13.99 user 29.65 system 18.02 elapsed 242% CPU
time.9.tcmalloc.32768:4.98 user 0.06 system 5.13 elapsed 98% CPU
time.9.ptmalloc.65536:16.89 user 70.42 system 42.11 elapsed 207% CPU
time.9.tcmalloc.65536:5.55 user 0.04 system 5.65 elapsed 98% CPU
time.9.ptmalloc.131072:18.53 user 94.11 system 61.17 elapsed 184% CPU
time.9.tcmalloc.131072:8.06 user 0.04 system 8.16 elapsed 99% CPU
time.10.ptmalloc.64:9.81 user 5.70 system 7.42 elapsed 208% CPU
time.10.tcmalloc.64:4.43 user 0.03 system 1.20 elapsed 370% CPU
time.10.ptmalloc.128:12.69 user 7.81 system 9.02 elapsed 227% CPU
time.10.tcmalloc.128:4.27 user 0.02 system 1.13 elapsed 378% CPU
time.10.ptmalloc.256:15.04 user 9.53 system 9.92 elapsed 247% CPU
time.10.tcmalloc.256:4.23 user 0.02 system 1.09 elapsed 388% CPU
time.10.ptmalloc.512:17.30 user 10.46 system 10.61 elapsed 261% CPU
time.10.tcmalloc.512:4.14 user 0.05 system 1.10 elapsed 379% CPU
time.10.ptmalloc.1024:16.96 user 9.38 system 9.30 elapsed 283% CPU
time.10.tcmalloc.1024:4.27 user 0.06 system 1.18 elapsed 366% CPU
time.10.ptmalloc.2048:8.07 user 0.03 system 2.06 elapsed 393% CPU
time.10.tcmalloc.2048:4.49 user 0.07 system 1.33 elapsed 342% CPU
time.10.ptmalloc.4096:8.66 user 0.25 system 2.25 elapsed 394% CPU
time.10.tcmalloc.4096:3.61 user 0.05 system 1.78 elapsed 205% CPU
time.10.ptmalloc.8192:21.52 user 17.43 system 10.41 elapsed 374% CPU
time.10.tcmalloc.8192:3.59 user 0.10 system 2.33 elapsed 158% CPU
time.10.ptmalloc.16384:20.55 user 24.85 system 12.55 elapsed 361% CPU
time.10.tcmalloc.16384:3.29 user 0.04 system 3.22 elapsed 103% CPU
time.10.ptmalloc.32768:15.23 user 38.13 system 22.49 elapsed 237% CPU
time.10.tcmalloc.32768:5.62 user 0.05 system 5.72 elapsed 99% CPU
time.10.ptmalloc.65536:19.80 user 85.42 system 49.98 elapsed 210% CPU
time.10.tcmalloc.65536:6.23 user 0.09 system 6.36 elapsed 99% CPU
time.10.ptmalloc.131072:20.91 user 106.97 system 69.08 elapsed 185% CPU
time.10.tcmalloc.131072:8.94 user 0.09 system 9.09 elapsed 99% CPU
time.11.ptmalloc.64:10.82 user 6.34 system 7.92 elapsed 216% CPU
time.11.tcmalloc.64:4.80 user 0.03 system 1.24 elapsed 387% CPU
time.11.ptmalloc.128:14.58 user 8.61 system 9.81 elapsed 236% CPU
time.11.tcmalloc.128:4.65 user 0.03 system 1.21 elapsed 384% CPU
time.11.ptmalloc.256:17.38 user 10.98 system 10.75 elapsed 263% CPU
time.11.tcmalloc.256:4.51 user 0.03 system 1.18 elapsed 384% CPU
time.11.ptmalloc.512:19.18 user 11.71 system 10.95 elapsed 282% CPU
time.11.tcmalloc.512:4.57 user 0.02 system 1.19 elapsed 384% CPU
time.11.ptmalloc.1024:19.94 user 12.41 system 10.48 elapsed 308% CPU
time.11.tcmalloc.1024:4.71 user 0.05 system 1.29 elapsed 367% CPU
time.11.ptmalloc.2048:8.70 user 0.04 system 2.35 elapsed 371% CPU
time.11.tcmalloc.2048:4.97 user 0.07 system 1.43 elapsed 350% CPU
time.11.ptmalloc.4096:22.47 user 18.43 system 10.82 elapsed 377% CPU
time.11.tcmalloc.4096:4.22 user 0.03 system 1.91 elapsed 221% CPU
time.11.ptmalloc.8192:11.61 user 2.38 system 3.73 elapsed 374% CPU
time.11.tcmalloc.8192:3.74 user 0.09 system 2.46 elapsed 155% CPU
time.11.ptmalloc.16384:14.13 user 13.38 system 9.60 elapsed 286% CPU
time.11.tcmalloc.16384:3.61 user 0.03 system 3.63 elapsed 100% CPU
time.11.ptmalloc.32768:17.92 user 43.84 system 26.74 elapsed 230% CPU
time.11.tcmalloc.32768:6.31 user 0.03 system 6.45 elapsed 98% CPU
time.11.ptmalloc.65536:22.40 user 96.38 system 58.30 elapsed 203% CPU
time.11.tcmalloc.65536:6.92 user 0.12 system 6.98 elapsed 100% CPU
time.11.ptmalloc.131072:21.03 user 108.04 system 72.78 elapsed 177% CPU
time.11.tcmalloc.131072:9.79 user 0.08 system 9.94 elapsed 99% CPU
time.12.ptmalloc.64:12.23 user 7.16 system 8.38 elapsed 231% CPU
time.12.tcmalloc.64:5.21 user 0.05 system 1.41 elapsed 371% CPU
time.12.ptmalloc.128:16.97 user 10.19 system 10.47 elapsed 259% CPU
time.12.tcmalloc.128:5.10 user 0.02 system 1.31 elapsed 390% CPU
time.12.ptmalloc.256:19.99 user 12.10 system 11.57 elapsed 277% CPU
time.12.tcmalloc.256:5.01 user 0.03 system 1.29 elapsed 390% CPU
time.12.ptmalloc.512:21.85 user 12.66 system 11.46 elapsed 300% CPU
time.12.tcmalloc.512:5.05 user 0.00 system 1.32 elapsed 379% CPU
time.12.ptmalloc.1024:9.40 user 0.04 system 2.40 elapsed 393% CPU
time.12.tcmalloc.1024:5.14 user 0.02 system 1.39 elapsed 369% CPU
time.12.ptmalloc.2048:9.72 user 0.04 system 2.49 elapsed 391% CPU
time.12.tcmalloc.2048:5.74 user 0.05 system 1.62 elapsed 355% CPU
time.12.ptmalloc.4096:10.64 user 0.20 system 2.75 elapsed 393% CPU
time.12.tcmalloc.4096:4.45 user 0.03 system 2.04 elapsed 218% CPU
time.12.ptmalloc.8192:12.66 user 3.30 system 4.30 elapsed 371% CPU
time.12.tcmalloc.8192:4.21 user 0.13 system 2.65 elapsed 163% CPU
time.12.ptmalloc.16384:15.73 user 15.68 system 11.14 elapsed 281% CPU
time.12.tcmalloc.16384:4.17 user 0.06 system 4.10 elapsed 102% CPU
time.12.ptmalloc.32768:19.45 user 56.00 system 32.74 elapsed 230% CPU
time.12.tcmalloc.32768:6.96 user 0.08 system 7.14 elapsed 98% CPU
time.12.ptmalloc.65536:23.33 user 110.45 system 65.06 elapsed 205% CPU
time.12.tcmalloc.65536:7.77 user 0.15 system 7.72 elapsed 102% CPU
time.12.ptmalloc.131072:24.03 user 124.74 system 82.94 elapsed 179% CPU
time.12.tcmalloc.131072:10.81 user 0.06 system 10.94 elapsed 99% CPU
time.13.ptmalloc.64:14.08 user 7.60 system 8.85 elapsed 244% CPU
time.13.tcmalloc.64:5.51 user 0.01 system 1.47 elapsed 375% CPU
time.13.ptmalloc.128:18.20 user 10.98 system 10.99 elapsed 265% CPU
time.13.tcmalloc.128:5.34 user 0.01 system 1.39 elapsed 382% CPU
time.13.ptmalloc.256:21.48 user 13.94 system 12.25 elapsed 289% CPU
time.13.tcmalloc.256:5.33 user 0.01 system 1.39 elapsed 381% CPU
time.13.ptmalloc.512:24.22 user 14.84 system 12.97 elapsed 301% CPU
time.13.tcmalloc.512:5.49 user 0.02 system 1.41 elapsed 389% CPU
time.13.ptmalloc.1024:25.26 user 17.03 system 12.85 elapsed 328% CPU
time.13.tcmalloc.1024:5.65 user 0.04 system 1.50 elapsed 378% CPU
time.13.ptmalloc.2048:10.41 user 0.03 system 2.69 elapsed 387% CPU
time.13.tcmalloc.2048:5.93 user 0.10 system 1.77 elapsed 339% CPU
time.13.ptmalloc.4096:11.37 user 0.52 system 3.04 elapsed 391% CPU
time.13.tcmalloc.4096:5.08 user 0.11 system 2.22 elapsed 233% CPU
time.13.ptmalloc.8192:21.76 user 18.54 system 10.58 elapsed 380% CPU
time.13.tcmalloc.8192:5.04 user 0.16 system 2.93 elapsed 177% CPU
time.13.ptmalloc.16384:26.35 user 34.47 system 17.01 elapsed 357% CPU
time.13.tcmalloc.16384:4.66 user 0.04 system 4.66 elapsed 100% CPU
time.13.ptmalloc.32768:21.41 user 63.59 system 38.14 elapsed 222% CPU
time.13.tcmalloc.32768:7.71 user 0.03 system 7.83 elapsed 98% CPU
time.13.ptmalloc.65536:24.99 user 120.80 system 71.59 elapsed 203% CPU
time.13.tcmalloc.65536:8.87 user 0.64 system 8.37 elapsed 113% CPU
time.13.ptmalloc.131072:25.97 user 142.27 system 96.00 elapsed 175% CPU
time.13.tcmalloc.131072:11.48 user 0.06 system 11.67 elapsed 98% CPU
time.14.ptmalloc.64:15.01 user 9.11 system 9.41 elapsed 256% CPU
time.14.tcmalloc.64:5.98 user 0.02 system 1.58 elapsed 378% CPU
time.14.ptmalloc.128:20.34 user 12.72 system 11.62 elapsed 284% CPU
time.14.tcmalloc.128:5.88 user 0.04 system 1.51 elapsed 392% CPU
time.14.ptmalloc.256:24.26 user 14.95 system 12.92 elapsed 303% CPU
time.14.tcmalloc.256:5.72 user 0.02 system 1.50 elapsed 381% CPU
time.14.ptmalloc.512:27.28 user 16.45 system 13.89 elapsed 314% CPU
time.14.tcmalloc.512:5.99 user 0.02 system 1.54 elapsed 388% CPU
time.14.ptmalloc.1024:25.84 user 16.99 system 12.61 elapsed 339% CPU
time.14.tcmalloc.1024:5.94 user 0.06 system 1.59 elapsed 375% CPU
time.14.ptmalloc.2048:11.96 user 0.01 system 3.12 elapsed 382% CPU
time.14.tcmalloc.2048:6.39 user 0.07 system 1.79 elapsed 359% CPU
time.14.ptmalloc.4096:20.19 user 11.77 system 8.26 elapsed 386% CPU
time.14.tcmalloc.4096:5.65 user 0.05 system 2.32 elapsed 244% CPU
time.14.ptmalloc.8192:22.01 user 16.39 system 9.89 elapsed 387% CPU
time.14.tcmalloc.8192:5.44 user 0.11 system 3.07 elapsed 180% CPU
time.14.ptmalloc.16384:18.15 user 22.40 system 15.02 elapsed 269% CPU
time.14.tcmalloc.16384:5.29 user 0.08 system 5.34 elapsed 100% CPU
time.14.ptmalloc.32768:24.29 user 72.07 system 42.63 elapsed 225% CPU
time.14.tcmalloc.32768:8.47 user 0.02 system 8.62 elapsed 98% CPU
time.14.ptmalloc.65536:27.63 user 130.56 system 78.64 elapsed 201% CPU
time.14.tcmalloc.65536:9.85 user 1.61 system 9.04 elapsed 126% CPU
time.14.ptmalloc.131072:28.87 user 146.38 system 100.54 elapsed 174% CPU
time.14.tcmalloc.131072:12.46 user 0.11 system 12.71 elapsed 98% CPU
time.15.ptmalloc.64:16.25 user 10.05 system 9.82 elapsed 267% CPU
time.15.tcmalloc.64:6.30 user 0.02 system 1.64 elapsed 385% CPU
time.15.ptmalloc.128:22.33 user 13.23 system 12.24 elapsed 290% CPU
time.15.tcmalloc.128:6.08 user 0.03 system 1.59 elapsed 384% CPU
time.15.ptmalloc.256:26.56 user 16.57 system 13.70 elapsed 314% CPU
time.15.tcmalloc.256:6.14 user 0.03 system 1.61 elapsed 382% CPU
time.15.ptmalloc.512:29.68 user 18.08 system 14.56 elapsed 327% CPU
time.15.tcmalloc.512:6.12 user 0.04 system 1.68 elapsed 364% CPU
time.15.ptmalloc.1024:17.07 user 6.22 system 6.26 elapsed 371% CPU
time.15.tcmalloc.1024:6.38 user 0.02 system 1.75 elapsed 364% CPU
time.15.ptmalloc.2048:26.64 user 17.25 system 11.51 elapsed 381% CPU
time.15.tcmalloc.2048:6.77 user 0.18 system 1.92 elapsed 361% CPU
time.15.ptmalloc.4096:13.21 user 0.74 system 3.57 elapsed 390% CPU
time.15.tcmalloc.4096:6.03 user 0.09 system 2.36 elapsed 258% CPU
time.15.ptmalloc.8192:22.92 user 17.51 system 10.50 elapsed 385% CPU
time.15.tcmalloc.8192:5.96 user 0.12 system 3.36 elapsed 180% CPU
time.15.ptmalloc.16384:19.37 user 24.87 system 16.69 elapsed 264% CPU
time.15.tcmalloc.16384:5.88 user 0.07 system 5.84 elapsed 101% CPU
time.15.ptmalloc.32768:25.43 user 82.30 system 48.98 elapsed 219% CPU
time.15.tcmalloc.32768:9.11 user 0.05 system 9.30 elapsed 98% CPU
time.15.ptmalloc.65536:29.31 user 140.07 system 83.78 elapsed 202% CPU
time.15.tcmalloc.65536:8.51 user 1.59 system 9.75 elapsed 103% CPU
time.15.ptmalloc.131072:30.22 user 163.15 system 109.50 elapsed 176% CPU
time.15.tcmalloc.131072:13.35 user 0.10 system 13.54 elapsed 99% CPU
time.16.ptmalloc.64:17.69 user 10.11 system 10.11 elapsed 274% CPU
time.16.tcmalloc.64:6.63 user 0.04 system 1.72 elapsed 387% CPU
time.16.ptmalloc.128:23.05 user 14.37 system 12.75 elapsed 293% CPU
time.16.tcmalloc.128:6.61 user 0.02 system 1.71 elapsed 387% CPU
time.16.ptmalloc.256:29.11 user 19.35 system 14.57 elapsed 332% CPU
time.16.tcmalloc.256:6.62 user 0.03 system 1.73 elapsed 382% CPU
time.16.ptmalloc.512:31.65 user 18.71 system 14.71 elapsed 342% CPU
time.16.tcmalloc.512:6.63 user 0.04 system 1.73 elapsed 383% CPU
time.16.ptmalloc.1024:31.99 user 21.22 system 14.87 elapsed 357% CPU
time.16.tcmalloc.1024:6.81 user 0.04 system 1.79 elapsed 382% CPU
time.16.ptmalloc.2048:30.35 user 21.36 system 13.30 elapsed 388% CPU
time.16.tcmalloc.2048:6.91 user 0.50 system 2.01 elapsed 367% CPU
time.16.ptmalloc.4096:18.85 user 7.18 system 6.61 elapsed 393% CPU
time.16.tcmalloc.4096:6.70 user 0.10 system 2.62 elapsed 259% CPU
time.16.ptmalloc.8192:22.19 user 14.30 system 9.37 elapsed 389% CPU
time.16.tcmalloc.8192:6.18 user 0.19 system 3.58 elapsed 177% CPU
time.16.ptmalloc.16384:31.22 user 46.78 system 22.92 elapsed 340% CPU
time.16.tcmalloc.16384:6.79 user 0.07 system 6.86 elapsed 99% CPU
time.16.ptmalloc.32768:27.31 user 87.32 system 52.00 elapsed 220% CPU
time.16.tcmalloc.32768:9.85 user 0.06 system 10.07 elapsed 98% CPU
time.16.ptmalloc.65536:32.83 user 160.62 system 95.67 elapsed 202% CPU
time.16.tcmalloc.65536:10.18 user 0.09 system 10.41 elapsed 98% CPU
time.16.ptmalloc.131072:31.99 user 173.41 system 115.98 elapsed 177% CPU
time.16.tcmalloc.131072:14.52 user 0.05 system 14.67 elapsed 99% CPU
time.17.ptmalloc.64:19.38 user 11.61 system 10.61 elapsed 291% CPU
time.17.tcmalloc.64:7.11 user 0.02 system 1.84 elapsed 386% CPU
time.17.ptmalloc.128:26.25 user 16.15 system 13.53 elapsed 313% CPU
time.17.tcmalloc.128:6.97 user 0.02 system 1.78 elapsed 390% CPU
time.17.ptmalloc.256:30.66 user 18.36 system 14.97 elapsed 327% CPU
time.17.tcmalloc.256:6.94 user 0.04 system 1.80 elapsed 387% CPU
time.17.ptmalloc.512:33.71 user 22.79 system 15.95 elapsed 354% CPU
time.17.tcmalloc.512:7.00 user 0.02 system 1.83 elapsed 381% CPU
time.17.ptmalloc.1024:33.49 user 22.47 system 15.00 elapsed 373% CPU
time.17.tcmalloc.1024:7.20 user 0.03 system 1.90 elapsed 380% CPU
time.17.ptmalloc.2048:23.87 user 11.92 system 9.26 elapsed 386% CPU
time.17.tcmalloc.2048:6.01 user 1.83 system 2.15 elapsed 363% CPU
time.17.ptmalloc.4096:14.69 user 0.95 system 3.98 elapsed 392% CPU
time.17.tcmalloc.4096:7.25 user 0.10 system 2.62 elapsed 279% CPU
time.17.ptmalloc.8192:22.44 user 13.52 system 9.39 elapsed 382% CPU
time.17.tcmalloc.8192:7.21 user 0.24 system 3.95 elapsed 188% CPU
time.17.ptmalloc.16384:23.33 user 33.67 system 21.89 elapsed 260% CPU
time.17.tcmalloc.16384:7.28 user 0.06 system 7.10 elapsed 103% CPU
time.17.ptmalloc.32768:29.35 user 103.11 system 60.36 elapsed 219% CPU
time.17.tcmalloc.32768:10.53 user 0.07 system 10.71 elapsed 98% CPU
time.17.ptmalloc.65536:33.21 user 170.89 system 100.84 elapsed 202% CPU
time.17.tcmalloc.65536:10.85 user 0.05 system 11.04 elapsed 98% CPU
time.17.ptmalloc.131072:34.98 user 182.87 system 122.05 elapsed 178% CPU
time.17.tcmalloc.131072:15.27 user 0.09 system 15.49 elapsed 99% CPU
time.18.ptmalloc.64:21.08 user 12.15 system 11.43 elapsed 290% CPU
time.18.tcmalloc.64:7.45 user 0.03 system 1.95 elapsed 383% CPU
time.18.ptmalloc.128:27.65 user 17.26 system 14.03 elapsed 320% CPU
time.18.tcmalloc.128:7.46 user 0.03 system 1.92 elapsed 389% CPU
time.18.ptmalloc.256:32.78 user 20.55 system 15.70 elapsed 339% CPU
time.18.tcmalloc.256:7.31 user 0.02 system 1.88 elapsed 389% CPU
time.18.ptmalloc.512:33.31 user 20.06 system 15.05 elapsed 354% CPU
time.18.tcmalloc.512:7.33 user 0.02 system 1.91 elapsed 383% CPU
time.18.ptmalloc.1024:35.46 user 24.83 system 16.30 elapsed 369% CPU
time.18.tcmalloc.1024:7.60 user 0.06 system 2.05 elapsed 373% CPU
time.18.ptmalloc.2048:19.98 user 6.80 system 6.76 elapsed 395% CPU
time.18.tcmalloc.2048:6.89 user 1.29 system 2.28 elapsed 357% CPU
time.18.ptmalloc.4096:15.99 user 0.93 system 4.32 elapsed 391% CPU
time.18.tcmalloc.4096:7.70 user 0.10 system 2.77 elapsed 280% CPU
time.18.ptmalloc.8192:23.51 user 14.84 system 9.97 elapsed 384% CPU
time.18.tcmalloc.8192:8.16 user 0.27 system 4.25 elapsed 197% CPU
time.18.ptmalloc.16384:35.79 user 52.41 system 26.47 elapsed 333% CPU
time.18.tcmalloc.16384:7.81 user 0.07 system 7.61 elapsed 103% CPU
time.18.ptmalloc.32768:33.17 user 116.07 system 68.64 elapsed 217% CPU
time.18.tcmalloc.32768:11.34 user 0.13 system 11.57 elapsed 99% CPU
time.18.ptmalloc.65536:35.91 user 177.82 system 106.75 elapsed 200% CPU
time.18.tcmalloc.65536:11.54 user 0.06 system 11.74 elapsed 98% CPU
time.18.ptmalloc.131072:36.38 user 187.18 system 126.91 elapsed 176% CPU
time.18.tcmalloc.131072:16.34 user 0.05 system 16.43 elapsed 99% CPU
time.19.ptmalloc.64:22.90 user 13.23 system 11.82 elapsed 305% CPU
time.19.tcmalloc.64:7.81 user 0.02 system 2.01 elapsed 388% CPU
time.19.ptmalloc.128:30.13 user 18.58 system 14.77 elapsed 329% CPU
time.19.tcmalloc.128:7.74 user 0.02 system 2.01 elapsed 386% CPU
time.19.ptmalloc.256:35.33 user 21.41 system 16.35 elapsed 347% CPU
time.19.tcmalloc.256:7.79 user 0.04 system 2.04 elapsed 382% CPU
time.19.ptmalloc.512:39.30 user 26.22 system 17.84 elapsed 367% CPU
time.19.tcmalloc.512:7.80 user 0.06 system 2.05 elapsed 381% CPU
time.19.ptmalloc.1024:35.70 user 23.90 system 15.66 elapsed 380% CPU
time.19.tcmalloc.1024:8.08 user 0.06 system 2.16 elapsed 376% CPU
time.19.ptmalloc.2048:18.33 user 3.28 system 5.47 elapsed 394% CPU
time.19.tcmalloc.2048:8.71 user 0.05 system 2.40 elapsed 363% CPU
time.19.ptmalloc.4096:16.94 user 0.89 system 4.64 elapsed 383% CPU
time.19.tcmalloc.4096:8.21 user 0.07 system 2.85 elapsed 289% CPU
time.19.ptmalloc.8192:25.61 user 17.15 system 11.33 elapsed 377% CPU
time.19.tcmalloc.8192:8.79 user 0.30 system 4.58 elapsed 198% CPU
time.19.ptmalloc.16384:27.11 user 46.66 system 29.67 elapsed 248% CPU
time.19.tcmalloc.16384:8.64 user 0.05 system 8.58 elapsed 101% CPU
time.19.ptmalloc.32768:33.80 user 117.69 system 70.65 elapsed 214% CPU
time.19.tcmalloc.32768:11.88 user 0.07 system 12.04 elapsed 99% CPU
time.19.ptmalloc.65536:36.90 user 180.21 system 109.01 elapsed 199% CPU
time.19.tcmalloc.65536:12.17 user 0.07 system 12.40 elapsed 98% CPU
time.19.ptmalloc.131072:38.50 user 195.15 system 132.81 elapsed 175% CPU
time.19.tcmalloc.131072:17.44 user 0.10 system 17.65 elapsed 99% CPU
time.20.ptmalloc.64:23.37 user 13.74 system 11.86 elapsed 312% CPU
time.20.tcmalloc.64:8.18 user 0.02 system 2.10 elapsed 389% CPU
time.20.ptmalloc.128:31.29 user 19.97 system 15.53 elapsed 329% CPU
time.20.tcmalloc.128:8.03 user 0.02 system 2.12 elapsed 378% CPU
time.20.ptmalloc.256:38.40 user 25.65 system 18.25 elapsed 350% CPU
time.20.tcmalloc.256:8.05 user 0.05 system 2.12 elapsed 380% CPU
time.20.ptmalloc.512:40.60 user 27.70 system 18.46 elapsed 369% CPU
time.20.tcmalloc.512:8.22 user 0.08 system 2.20 elapsed 375% CPU
time.20.ptmalloc.1024:40.02 user 28.52 system 17.56 elapsed 390% CPU
time.20.tcmalloc.1024:8.50 user 0.07 system 2.19 elapsed 391% CPU
time.20.ptmalloc.2048:16.13 user 0.23 system 4.23 elapsed 386% CPU
time.20.tcmalloc.2048:8.98 user 0.03 system 2.45 elapsed 367% CPU
time.20.ptmalloc.4096:17.14 user 0.87 system 4.60 elapsed 391% CPU
time.20.tcmalloc.4096:8.93 user 0.20 system 2.97 elapsed 306% CPU
time.20.ptmalloc.8192:25.24 user 17.16 system 11.14 elapsed 380% CPU
time.20.tcmalloc.8192:9.78 user 0.30 system 5.14 elapsed 195% CPU
time.20.ptmalloc.16384:39.93 user 60.36 system 30.24 elapsed 331% CPU
time.20.tcmalloc.16384:9.57 user 0.09 system 9.43 elapsed 102% CPU
time.20.ptmalloc.32768:36.44 user 130.23 system 76.79 elapsed 217% CPU
time.20.tcmalloc.32768:12.71 user 0.09 system 12.97 elapsed 98% CPU
time.20.ptmalloc.65536:39.79 user 202.09 system 120.34 elapsed 200% CPU
time.20.tcmalloc.65536:12.93 user 0.06 system 13.15 elapsed 98% CPU
time.20.ptmalloc.131072:41.91 user 202.76 system 138.51 elapsed 176% CPU
time.20.tcmalloc.131072:18.23 user 0.07 system 18.42 elapsed 99% CPU

597
docs/tcmalloc.adoc Normal file
View File

@@ -0,0 +1,597 @@
= TCMalloc : Thread-Caching Malloc
:reproducible:
== [#motivation]#Motivation#
+[alk: Update from Dec 2024]+ This document dates back to the early
2000s. Rather than rewrite it, I am keeping the original motivation
write-up just below. Do keep in mind that the referenced glibc
versions are long obsolete, the quoted time per malloc call is no
longer accurate, and the description assumes 32-bit machines. Still,
the text is kept intact to help people see where tcmalloc came
from. "I" below refers to the original author: Sanjay. See the note
after the horizontal rule below for commentary relevant today.
'''''
TCMalloc is faster than the glibc 2.3 malloc (available as a separate
library called ptmalloc2) and other mallocs that I have tested.
ptmalloc2 takes approximately 300 nanoseconds to execute a malloc/free
pair on a 2.8 GHz P4 (for small objects). The TCMalloc implementation
takes approximately 50 nanoseconds for the same operation pair. Speed is
important for a malloc implementation because if malloc is not fast
enough, application writers are inclined to write their own custom free
lists on top of malloc. This can lead to extra complexity, and more
memory usage unless the application writer is very careful to
appropriately size the free lists and scavenge idle objects out of the
free list.
TCMalloc also reduces lock contention for multi-threaded programs. For
small objects, there is virtually zero contention. For large objects,
TCMalloc tries to use fine grained and efficient spinlocks. ptmalloc2
also reduces lock contention by using per-thread arenas but there is a
big problem with ptmalloc2's use of per-thread arenas. In ptmalloc2
memory can never move from one arena to another. This can lead to huge
amounts of wasted space. For example, in one Google application, the
first phase would allocate approximately 300MB of memory for its URL
canonicalization data structures. When the first phase finished, a
second phase would be started in the same address space. If this second
phase was assigned a different arena than the one used by the first
phase, this phase would not reuse any of the memory left after the first
phase and would add another 300MB to the address space. Similar memory
blowup problems were also noticed in other applications.
Another benefit of TCMalloc is space-efficient representation of small
objects. For example, N 8-byte objects can be allocated while using
space approximately `+8N * 1.01+` bytes. I.e., a one-percent space
overhead. ptmalloc2 uses a four-byte header for each object and (I
think) rounds up the size to a multiple of 8 bytes and ends up using
`+16N+` bytes.
'''''
+[alk: Update from Dec 2024]+ tcmalloc (now gperftools) has evolved a
lot over the last 20-ish years. Back then it was one of the first
production-grade mallocs to use per-thread caching. These days
per-thread (or even per-cpu) caching is widespread. Typical C++
programs tend to allocate and free memory fairly frequently, and
those small allocations are generally kept fast and, in most cases,
avoid any locks. Most of gperftools' evolution has been about making
those common cases even cheaper. Others improved too: glibc, while
still somewhat slower than gperftools, is a lot faster than it was
and also avoids locks in many of those common-case allocations.
On modern systems with efficient "native" thread-local storage access
(e.g. GNU/Linux, most BSDs, even Windows, but, notably, not OSX),
gperftools takes just a couple dozen cheap instructions for an
allocation or deallocation, which is better than most of the
competition. On modern fast out-of-order CPUs, in this fast-path case
(all caches hot, etc.), we're talking in the ballpark of a couple of
nanoseconds per operation. Compare that to mid-tens of nanoseconds
per malloc/free pair 20 years ago (!)
Also, the reader should be aware that another descendant of the
original tcmalloc is now available at
https://github.com/google/tcmalloc (I call it "abseil tcmalloc" due to
its hard dependency on abseil). Its main feature is efficient per-cpu
caches (but it needs RSEQ support from fairly recent Linux kernels).
Another direction of evolution, particularly at Google, has been an
increasing focus on helping diagnose or prevent production problems
related to dynamic memory allocation. So there is a debug version of
tcmalloc with relatively lightweight checking against common bugs
(like double-free). There is heap sampling with low enough overhead
to be always enabled. There are relatively comprehensive statistics
available, and more. "abseil tcmalloc" does even better than
gperftools in this regard.
== [#Usage]#Usage#
To use TCMalloc, just link TCMalloc into your application via the
"-ltcmalloc" linker flag.
You can use TCMalloc in applications you didn't compile yourself, by
using LD_PRELOAD:
....
% LD_PRELOAD="/usr/lib/libtcmalloc.so"
....
TCMalloc includes a link:heapprofile.html[heap profiler] as well.
If you'd rather link in a version of TCMalloc that does not include
the heap profiler (perhaps to reduce binary size for a static binary),
you can link in `+libtcmalloc_minimal+` instead.
== [#Overview]#Overview#
TCMalloc assigns each thread a thread-local cache. Small allocations are
satisfied from the thread-local cache. Objects are moved from central
data structures into a thread-local cache as needed, and periodic
garbage collections are used to migrate memory back from a thread-local
cache into the central data structures.
image:overview.gif[overview]
TCMalloc treats objects with size +<=+ 256K ("small" objects) differently
from larger objects. Large objects are allocated directly from the
central heap using a page-level allocator (a page is a 8K aligned region
of memory). I.e., a large object is always page-aligned and occupies an
integral number of pages.
A run of pages can be carved up into a sequence of small objects, each
equally sized. For example a run of one page (4K) can be carved up into
32 objects of size 128 bytes each.
== [#Small_Object_Allocation]#Small Object Allocation#
Each small object size maps to one of approximately 88 allocatable
size-classes. For example, all allocations in the range 961 to 1024
bytes are rounded up to 1024. The size-classes are spaced so that small
sizes are separated by 8 bytes, larger sizes by 16 bytes, even larger
sizes by 32 bytes, and so forth. The maximal spacing is controlled so
that not too much space is wasted when an allocation request falls just
past the end of a size class and has to be rounded up to the next class.
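As a rough illustration of this rounding (not the exact table tcmalloc
computes internally; the breakpoints and spacings below are invented for
the example), mapping a request onto a size-class could look like this:
....
#include <cstdio>
#include <cstddef>

// Illustration only: round a request up following the "8-byte spacing for
// small sizes, wider spacing for larger sizes" idea. The breakpoints here
// are made up; the real class table is generated so that per-class waste
// stays bounded and the total number of classes is roughly 88.
static size_t RoundToSizeClass(size_t n) {
  if (n <= 64)   return (n + 7) & ~size_t{7};     // 8-byte spacing
  if (n <= 128)  return (n + 15) & ~size_t{15};   // 16-byte spacing
  if (n <= 256)  return (n + 31) & ~size_t{31};   // 32-byte spacing
  return (n + 63) & ~size_t{63};                  // 64-byte spacing, and so on
}

int main() {
  // With 64-byte spacing in this range, 961..1024 all round up to 1024,
  // matching the example in the text.
  std::printf("%zu\n", RoundToSizeClass(961));  // prints 1024
}
....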
A thread cache contains a singly linked list of free objects per
size-class.
image:threadheap.gif[threadheap]
When allocating a small object: (1) We map its size to the corresponding
size-class. (2) Look in the corresponding free list in the thread cache
for the current thread. (3) If the free list is not empty, we remove the
first object from the list and return it. When following this fast path,
TCMalloc acquires no locks at all.
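The fast path is essentially a pop from a singly linked list whose links
are stored inside the free objects themselves. A minimal sketch (names
invented; the real code keeps one such list per size-class in each
thread's cache):
....
#include <cstddef>

// One per-size-class free list inside a thread cache. Freed objects double
// as list nodes: their first word stores the pointer to the next free
// object, so the list itself costs no extra memory.
struct FreeList {
  void*  head = nullptr;
  size_t length = 0;

  // Deallocation fast path: push the object back, no locks taken.
  void Push(void* obj) {
    *static_cast<void**>(obj) = head;
    head = obj;
    ++length;
  }

  // Allocation fast path: pop the first object, no locks taken. Returns
  // nullptr on a miss, in which case the caller refills the list from the
  // central free list for this size-class.
  void* TryPop() {
    if (head == nullptr) return nullptr;
    void* obj = head;
    head = *static_cast<void**>(obj);
    --length;
    return obj;
  }
};
....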
If the free list is empty: (1) We fetch a bunch of objects from a
central free list for this size-class (the central free list is shared
by all threads). (2) Place them in the thread-local free list. (3)
Return one of the newly fetched objects to the application.
If the central free list is also empty: (1) We allocate a run of pages
from the central page allocator. (2) Split the run into a set of objects
of this size-class. (3) Place the new objects on the central free list.
(4) As before, move some of these objects to the thread-local free list.
=== [#Sizing_Thread_Cache_Free_Lists]#Sizing Thread Cache Free Lists#
It is important to size the thread cache free lists correctly. If the
free list is too small, we'll need to go to the central free list too
often. If the free list is too big, we'll waste memory as objects sit
idle in the free list.
Note that the thread caches are just as important for deallocation as
they are for allocation. Without a cache, each deallocation would
require moving the memory to the central free list. Also, some threads
have asymmetric alloc/free behavior (e.g. producer and consumer
threads), so sizing the free list correctly gets trickier.
To size the free lists appropriately, we use a slow-start algorithm to
determine the maximum length of each individual free list. As the free
list is used more frequently, its maximum length grows. However, if a
free list is used more for deallocation than allocation, its maximum
length will grow only up to a point where the whole list can be
efficiently moved to the central free list at once.
The pseudo-code below illustrates this slow-start algorithm. Note that
`+num_objects_to_move+` is specific to each size class. By moving a list
of objects with a well-known length, the central cache can efficiently
pass these lists between thread caches. If a thread cache wants fewer
than `+num_objects_to_move+`, the operation on the central free list has
linear time complexity. The downside of always using
`+num_objects_to_move+` as the number of objects to transfer to and from
the central cache is that it wastes memory in threads that don't need
all of those objects.
....
Start each freelist max_length at 1.
Allocation
if freelist empty {
fetch min(max_length, num_objects_to_move) from central list;
if max_length < num_objects_to_move { // slow-start
max_length++;
} else {
max_length += num_objects_to_move;
}
}
Deallocation
if length > max_length {
// Don't try to release num_objects_to_move if we don't have that many.
release min(max_length, num_objects_to_move) objects to central list
if max_length < num_objects_to_move {
// Slow-start up to num_objects_to_move.
max_length++;
} else if max_length > num_objects_to_move {
// If we consistently go over max_length, shrink max_length.
overages++;
if overages > kMaxOverages {
max_length -= num_objects_to_move;
overages = 0;
}
}
}
....
See also the section on link:#Garbage_Collection[Garbage Collection] to
see how it affects the `+max_length+`.
== [#Medium_Object_Allocation]#Medium Object Allocation#
A medium object (256K < size ≤ 1MB) is rounded up to a whole number of
pages (8K each) and is handled by a central page heap. The central page heap
includes an array of 128 free lists. The `k`-th entry is a free list of
runs that consist of `k + 1` pages:
image:pageheap.gif[pageheap]
An allocation for `k` pages is satisfied by looking in the `k`-th
free list. If that free list is empty, we look in the next free list,
and so forth. If no medium-object free list can satisfy the allocation,
the allocation is treated as a large object.
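Put differently, a medium request is first converted to a whole number of
8K pages and then served from the smallest non-empty run list that is big
enough. A hedged sketch of just the index arithmetic (constants as
described above; the scan over the lists is left out):
....
#include <cstddef>

constexpr size_t kPageSize = 8 * 1024;  // 8K pages, as described above
constexpr size_t kNumLists = 128;       // list k holds runs of k + 1 pages

// Round a medium request up to whole pages and report which free list to
// search first. The caller scans upward from this index and falls back to
// the large-object path if all 128 lists are empty.
inline size_t FirstListToSearch(size_t bytes) {
  size_t pages = (bytes + kPageSize - 1) / kPageSize;  // ceil(bytes / 8K)
  return pages - 1;                                    // runs of `pages` pages
}
// Example: a 300K request needs 38 pages, so the search starts at list 37.
....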
== [#Large_Object_Allocation]#Large Object Allocation#
Allocations of 1MB or more are considered large allocations. Spans of
free memory which can satisfy these allocations are tracked in a
red-black tree sorted by size. Allocations follow the _best-fit_
algorithm: the tree is searched to find the smallest span of free space
which is larger than the requested allocation. The allocation is carved
out of that span, and the remaining space is reinserted either into the
large object tree or possibly into one of the smaller free-lists as
appropriate. If no span of free memory is located that can fit the
requested allocation, we fetch memory from the system (using `+sbrk+`,
or `+mmap+`).
If an allocation for `+k+` pages is satisfied by a run of pages of
length > `+k+`, the remainder of the run is re-inserted back into the
appropriate free list in the page heap.
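The best-fit search behaves like a `lower_bound` on an ordered container
keyed by span length. A toy sketch using `std::multimap` in place of the
real tree of `Span` objects (the leftover handling is reduced to returning
a page count):
....
#include <cstdio>
#include <map>

// Free spans keyed by their length in pages; a stand-in for the real
// red-black tree. A multimap allows several free spans of the same size.
using FreeSpans = std::multimap<size_t, int /* span id */>;

// Best fit: take the smallest free span that is >= the requested number of
// pages. The remainder would be re-inserted into the tree or into one of
// the smaller page-heap free lists; here we only report its size.
bool AllocateBestFit(FreeSpans& spans, size_t pages, size_t* leftover) {
  auto it = spans.lower_bound(pages);
  if (it == spans.end()) return false;  // nothing fits: grow via sbrk/mmap
  *leftover = it->first - pages;
  spans.erase(it);
  return true;
}

int main() {
  FreeSpans spans{{130, 1}, {200, 2}, {512, 3}};
  size_t leftover = 0;
  if (AllocateBestFit(spans, 140, &leftover))        // picks the 200-page span
    std::printf("leftover pages: %zu\n", leftover);  // prints 60
}
....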
== [#Spans]#Spans#
The heap managed by TCMalloc consists of a set of pages. A run of
contiguous pages is represented by a `+Span+` object. A span can either
be _allocated_, or _free_. If free, the span is one of the entries in a
page heap linked-list. If allocated, it is either a large object that
has been handed off to the application, or a run of pages that have been
split up into a sequence of small objects. If split into small objects,
the size-class of the objects is recorded in the span.
A central array indexed by page number can be used to find the span to
which a page belongs. For example, span _a_ below occupies 2 pages, span
_b_ occupies 1 page, span _c_ occupies 5 pages and span _d_ occupies 3
pages.
image:spanmap.gif[spanmap]
In a 32-bit address space, the central array is represented by a
2-level radix tree where the root contains 32 entries and each leaf
contains 2^14 entries (a 32-bit address space has 2^19 8K pages, and the
first level of the tree divides the 2^19 pages by 2^5). This leads to a
starting memory usage of 64KB of space (2^14*4 bytes) for the central
array, which seems acceptable.
On 64-bit machines, we use a 3-level radix tree. Note that many
common 64-bit machines limit the actual address space size, so on
x86-64 we use 48 bits of address and handle them with a slightly
faster 2-level radix tree.
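For the 32-bit layout just described, going from an address to its span is
two array indexing steps. A hedged sketch of only the index arithmetic
(the `Span` type and the code that populates the tree are omitted):
....
#include <cstdint>

struct Span;  // holds the page run / size-class metadata (defined elsewhere)

constexpr int kPageShift = 13;  // 8K pages
constexpr int kLeafBits  = 14;  // 2^14 entries per leaf
constexpr int kRootBits  = 5;   // 32 root entries (5 + 14 = 19 page-number bits)

struct PageMap32 {
  Span** root[1 << kRootBits] = {};  // each slot points to a leaf array of Span*

  Span* Lookup(uintptr_t addr) const {
    uintptr_t page = addr >> kPageShift;             // 19-bit page number
    uintptr_t i1   = page >> kLeafBits;              // top 5 bits: root slot
    uintptr_t i2   = page & ((1u << kLeafBits) - 1); // low 14 bits: leaf slot
    Span** leaf = root[i1];
    return leaf ? leaf[i2] : nullptr;                // nullptr: unmanaged page
  }
};
....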
== [#Deallocation]#Deallocation#
When an object is deallocated, we compute its page number and look it
up in the central array to find the corresponding span object. The
span tells us whether or not the object is small, and its size-class
if it is small. If the object is small, we insert it into the
appropriate free list in the current thread's thread cache. If the
thread cache now exceeds its `+max_size+`, we run a garbage
collector that moves unused objects from the thread cache into central
free lists.
If the object is large, the span tells us the range of pages covered by
the object. Suppose this range is `+[p,q]+`. We also lookup the spans
for pages `+p-1+` and `+q+1+`. If either of these neighboring spans are
free, we coalesce them with the `+[p,q]+` span. The resulting span is
inserted into the appropriate free list in the page heap.
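As a toy sketch of the coalescing step (an ordered map stands in for the
central-array lookup and free-list bookkeeping is reduced to a flag; this
is an illustration, not the real data structures):
....
#include <cstdint>
#include <map>

struct Span { uintptr_t first_page; uintptr_t num_pages; bool free_; };
// Toy replacement for the central array: spans keyed by their first page.
using PageMap = std::map<uintptr_t, Span>;

// Find the span covering `page`, if any.
static Span* SpanOfPage(PageMap& pm, uintptr_t page) {
  auto it = pm.upper_bound(page);
  if (it == pm.begin()) return nullptr;
  --it;
  Span* s = &it->second;
  return (page < s->first_page + s->num_pages) ? s : nullptr;
}

// Return span `s`, covering pages [p,q], to the page heap, merging with a
// free neighbour at p-1 and/or q+1 before filing it under its new length.
static void ReturnLargeSpan(PageMap& pm, Span* s) {
  Span* prev = SpanOfPage(pm, s->first_page - 1);
  if (prev != nullptr && prev->free_) {
    prev->num_pages += s->num_pages;   // [prev][s] becomes one span
    pm.erase(s->first_page);
    s = prev;
  }
  Span* next = SpanOfPage(pm, s->first_page + s->num_pages);
  if (next != nullptr && next->free_) {
    s->num_pages += next->num_pages;   // [s][next] becomes one span
    pm.erase(next->first_page);
  }
  s->free_ = true;  // now sits in the appropriate page-heap free list
}
....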
== Central Free Lists for Small Objects
As mentioned before, we keep a central free list for each size-class.
Each central free list is organized as a two-level data structure: a set
of spans, and a linked list of free objects per span.
An object is allocated from a central free list by removing the first
entry from the linked list of some span. (If all spans have empty linked
lists, a suitably sized span is first allocated from the central page
heap.)
An object is returned to a central free list by adding it to the linked
list of its containing span. If the linked list length now equals the
total number of small objects in the span, this span is now completely
free and is returned to the page heap.
== [#Garbage_Collection]#Garbage Collection of Thread Caches#
Garbage collecting objects from a thread cache keeps the size of the
cache under control and returns unused objects to the central free
lists. Some threads need large caches to perform well while others can
get by with little or no cache at all. When a thread cache goes over its
`+max_size+`, garbage collection kicks in and then the thread competes
with the other threads for a larger cache.
Garbage collection is run only during a deallocation. We walk over all
free lists in the cache and move some number of objects from the free
list to the corresponding central list.
The number of objects to be moved from a free list is determined using a
per-list low-water-mark `+L+`. `+L+` records the minimum length of the
list since the last garbage collection. Note that we could have
shortened the list by `+L+` objects at the last garbage collection
without requiring any extra accesses to the central list. We use this
past history as a predictor of future accesses and move `+L/2+` objects
from the thread cache free list to the corresponding central free list.
This algorithm has the nice property that if a thread stops using a
particular size, all objects of that size will quickly move from the
thread cache to the central free list where they can be used by other
threads.
If a thread consistently deallocates more objects of a certain size than
it allocates, this `+L/2+` behavior will cause at least `+L/2+` objects
to always sit in the free list. To avoid wasting memory this way, we
shrink the maximum length of the freelist to converge on
`+num_objects_to_move+` (see also
link:#Sizing_Thread_Cache_Free_Lists[Sizing Thread Cache Free Lists]).
....
Garbage Collection
if (L != 0 && max_length > num_objects_to_move) {
max_length = max(max_length - num_objects_to_move, num_objects_to_move)
}
....
The fact that the thread cache went over its `+max_size+` is an
indication that the thread would benefit from a larger cache. Simply
increasing `+max_size+` would use an inordinate amount of memory in
programs that have lots of active threads. Developers can bound the
memory used with the parameter
`TCMALLOC_MAX_TOTAL_THREAD_CACHE_BYTES`.
Each thread cache starts with a small `+max_size+` (e.g. 64KB) so that
idle threads won't pre-allocate memory they don't need. Each time the
cache runs a garbage collection, it will also try to grow its
`+max_size+`. If the sum of the thread cache sizes is less than
`TCMALLOC_MAX_TOTAL_THREAD_CACHE_BYTES`, `+max_size+` grows easily. If
not, thread cache 1 will try to steal from thread cache 2 (picked
round-robin) by decreasing thread cache 2's `+max_size+`. In this way,
threads that are more active will steal memory from other threads more
often than they have memory stolen from themselves. Mostly idle
threads end up with small caches and active threads end up with big
caches. Note that this stealing can cause the sum of the thread cache
sizes to be greater than `TCMALLOC_MAX_TOTAL_THREAD_CACHE_BYTES` until
thread cache 2 deallocates some memory to trigger a garbage
collection.
== [#performance]#Performance Notes#
gperftools' area of relative strength is workloads where per-thread
caches are effective. This is typical of C++ code that allocates
fairly often and whose object lifetimes tend to be short.
Both "abseil tcmalloc" and gperftools still have un-sharded central
free lists and page heaps, which means that cache misses tend to
scale less well than in some of the competition.
This means that in some cases you may want to tune thread caches to
be larger. Also, if your workload has many threads that tend to be
idle for long stretches, consider using
`MallocExtension::MarkThread{Idle,Busy}`.
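For example, a worker thread that is about to block for a long time can
hand its cache back to the central lists before sleeping. A hedged sketch
(the `DoWork`/`WaitForMoreWork` hooks are placeholders for application
code; the header path assumes a normal gperftools install):
....
#include <gperftools/malloc_extension.h>

static void DoWork() { /* application-specific */ }
static bool WaitForMoreWork() { return false; /* application-specific */ }

void WorkerLoop() {
  for (;;) {
    DoWork();
    // About to block for a while: give this thread's cached objects back to
    // the central free lists so other threads can reuse that memory.
    MallocExtension::instance()->MarkThreadIdle();
    if (!WaitForMoreWork()) break;
    // Running again: let tcmalloc rebuild this thread's cache as needed.
    MallocExtension::instance()->MarkThreadBusy();
  }
}
....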
== [#runtime]#Modifying Runtime Behavior#
You can more finely control the behavior of the tcmalloc via environment
variables.
Generally useful flags:
[cols=",,",]
|===
|`TCMALLOC_SAMPLE_PARAMETER` |default: 0 |The approximate gap between
sampling actions. That is, we take one sample approximately once every
`TCMALLOC_SAMPLE_PARAMETER` bytes of allocation. This sampled heap
information is available via `MallocExtension::GetHeapSample()` or
`MallocExtension::ReadStackTraces()`. A reasonable value is 524288.
|`TCMALLOC_RELEASE_RATE` |default: 1.0 |Rate at which we release
unused memory to the system, via `+madvise(MADV_DONTNEED)+`, on systems
that support it. Zero means we never release memory back to the system.
Increase this flag to return memory faster; decrease it to return memory
slower. Reasonable rates are in the range [0,10].
|`TCMALLOC_LARGE_ALLOC_REPORT_THRESHOLD` |default: 1073741824
|Allocations larger than this value cause a stack trace to be dumped
to stderr. The threshold for dumping stack traces is increased by a
factor of 1.125 every time we print a message so that the threshold
automatically goes up by a factor of ~1000 every 60 messages. This
bounds the amount of extra logging generated by this flag. Default
value of this flag is very large and therefore you should see no extra
logging unless the flag is overridden.
|`TCMALLOC_MAX_TOTAL_THREAD_CACHE_BYTES` |default: 33554432 |Bound
on the total amount of bytes allocated to thread caches. This bound is
not strict, so it is possible for the cache to go over this bound in
certain circumstances. For applications
with many threads, this may not be a large enough cache, which can
affect performance. If you suspect your application is not scaling to
many threads due to lock contention in TCMalloc, you can try
increasing this value. This may improve performance, at a cost of
extra memory use by TCMalloc. See link:#Garbage_Collection[Garbage
Collection] for more details.
|`TCMALLOC_AGGRESSIVE_DECOMMIT` | default: false |Enables "aggressive
decommit mode", which makes all tcmalloc to return all free spans to
kernel. This reduces total phsycical memory usage at cost of some
performance (about 2% cpu hit in Chrome was measured at some point).
|`TCMALLOC_OVERRIDE_PAGESIZE` | default: getpagesize() | Sometimes we
run on systems with a larger than anticipated hardware page size;
e.g. ARM (and soon RISC-V) systems can run in 64K-pages mode. We
detect the actual page size at run time and adjust our span sizing so
that memory-management syscalls are issued with the correct
granularity. Larger pages generally cause somewhat higher memory
fragmentation, so this parameter exists to let us measure the
fragmentation impact of larger pages.
|`TCMALLOC_HEAP_LIMIT_MB` | default: no limit | Sets a limit on the
total size of the page heap (in-use spans plus "free but not
returned" spans). When tcmalloc hits this limit it tries to return
free spans to the kernel, and if that is not enough to keep the page
heap under the limit, it OOMs. "abseil tcmalloc" has an equivalent
"hard limit".
|===
Advanced "tweaking" flags, that control more precisely how tcmalloc
tries to allocate memory from the kernel.
[cols=",,",]
|===
|`TCMALLOC_SKIP_MMAP` |default: false |If true, do not try to use
`+mmap+` to obtain memory from the kernel.
|`TCMALLOC_SKIP_SBRK` |default: false |If true, do not try to use
`+sbrk+` to obtain memory from the kernel.
|`TCMALLOC_MEMFS_MALLOC_PATH` |default: "" |If set, specify a path
where hugetlbfs or tmpfs is mounted. This may allow for speedier
allocations.
|`TCMALLOC_MEMFS_LIMIT_MB` |default: 0 |Limit total memfs allocation
size to specified number of MB. 0 means "no limit".
|`TCMALLOC_MEMFS_ABORT_ON_FAIL` |default: false |If true, abort()
whenever memfs_malloc fails to satisfy an allocation.
|`TCMALLOC_MEMFS_IGNORE_MMAP_FAIL` |default: false |If true, ignore
failures from mmap.
|`TCMALLOC_MEMFS_MAP_PRIVATE` |default: false |If true, use
MAP_PRIVATE when mapping via memfs, not MAP_SHARED.
|`TCMALLOC_MEMFS_DISABLE_FALLBACK` |default: false |If true, OOM on
failing to allocate from memfs instead of falling back to anonymous
memory (sbrk/mmap).
|===
== [#compiletime]#Modifying Behavior In Code#
The `+MallocExtension+` class, in `+malloc_extension.h+`, provides a few
knobs that you can tweak in your program to affect tcmalloc's behavior.
=== Releasing Memory Back to the System
By default, tcmalloc will release no-longer-used memory back to the
kernel gradually, over time. The link:#runtime[TCMALLOC_RELEASE_RATE]
flag controls how quickly this happens. You can also force a release at
a given point in the program's execution like so:
....
MallocExtension::instance()->ReleaseFreeMemory();
....
You can also call `+SetMemoryReleaseRate()+` to change the
`+tcmalloc_release_rate+` value at runtime, or `+GetMemoryReleaseRate+`
to see what the current release rate is.
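Putting those together, a minimal sketch (assumes linking against a full
tcmalloc and the usual installed header path):
....
#include <cstdio>
#include <gperftools/malloc_extension.h>

int main() {
  MallocExtension* ext = MallocExtension::instance();
  std::printf("current release rate: %f\n", ext->GetMemoryReleaseRate());

  // Release unused memory to the kernel more eagerly from now on...
  ext->SetMemoryReleaseRate(10.0);

  // ...and also force an immediate release right now, e.g. after tearing
  // down a large data structure.
  ext->ReleaseFreeMemory();
}
....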
=== Memory Introspection
There are several routines for getting a human-readable form of the
current memory usage:
....
MallocExtension::instance()->GetStats(buffer, buffer_length);
MallocExtension::instance()->GetHeapSample(&string);
MallocExtension::instance()->GetHeapGrowthStacks(&string);
....
The last two create files in the same format as the heap-profiler, and
can be passed as data files to pprof. The first is human-readable and is
meant for debugging.
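For example, a hedged sketch that dumps both (the buffer size is
arbitrary; the heap sample is in the heap-profiler format and can be
written to a file for pprof):
....
#include <cstdio>
#include <string>
#include <gperftools/malloc_extension.h>

int main() {
  // Human-readable summary of tcmalloc's internal state.
  char buffer[1 << 14];
  MallocExtension::instance()->GetStats(buffer, static_cast<int>(sizeof(buffer)));
  std::printf("%s\n", buffer);

  // Sampled heap profile, in the same format the heap profiler emits.
  std::string sample;
  MallocExtension::instance()->GetHeapSample(&sample);
  std::fwrite(sample.data(), 1, sample.size(), stdout);
}
....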
=== Generic Tcmalloc Status
TCMalloc has support for setting and retrieving arbitrary 'properties':
....
MallocExtension::instance()->SetNumericProperty(property_name, value);
MallocExtension::instance()->GetNumericProperty(property_name, &value);
....
It is possible for an application to set and get these properties, but
the most useful is when a library sets the properties so the application
can read them. Here are the properties TCMalloc defines; you can access
them with a call like
`MallocExtension::instance()->GetNumericProperty("generic.heap_size", &value);`:
[cols=",",]
|===
|`generic.current_allocated_bytes` |Number of bytes used by the
application. This will not typically match the memory use reported by
the OS, because it does not include TCMalloc overhead or memory
fragmentation.
|`generic.heap_size` |Bytes of system memory reserved by TCMalloc.
|`tcmalloc.pageheap_free_bytes` |Number of bytes in free, mapped pages
in page heap. These bytes can be used to fulfill allocation requests.
They always count towards virtual memory usage, and unless the
underlying memory is swapped out by the OS, they also count towards
physical memory usage.
|`tcmalloc.pageheap_unmapped_bytes` |Number of bytes in free, unmapped
pages in page heap. These are bytes that have been released back to the
OS, possibly by one of the MallocExtension "Release" calls. They can be
used to fulfill allocation requests, but typically incur a page fault.
They always count towards virtual memory usage, and depending on the OS,
typically do not count towards physical memory usage.
|`tcmalloc.slack_bytes` |Sum of pageheap_free_bytes and
pageheap_unmapped_bytes. Provided for backwards compatibility only. Do
not use.
|`tcmalloc.max_total_thread_cache_bytes` |A limit to how much memory
TCMalloc dedicates for small objects. Higher numbers trade off more
memory use for -- in some situations -- improved efficiency.
|`tcmalloc.current_total_thread_cache_bytes` |A measure of some of the
memory TCMalloc is using (for small objects).
|`tcmalloc.min_per_thread_cache_bytes` |A lower limit to how much
memory TCMalloc dedicates to small objects per thread. Note that this
property only has an effect if the per-thread cache size computed from
tcmalloc.max_total_thread_cache_bytes would otherwise be smaller than
tcmalloc.min_per_thread_cache_bytes.
|===
=== [#caveats]#Caveats#
TCMalloc may be somewhat more memory-hungry than other mallocs (but
tends not to have the huge blowups that can happen with other mallocs).
In particular, at startup TCMalloc allocates approximately 240KB of
internal memory.
Don't try to load TCMalloc into a running binary (e.g., using JNI in
Java programs). The binary will have allocated some objects using the
system malloc, and may try to pass them to TCMalloc for deallocation.
TCMalloc will not be able to handle such objects.
'''''
Original author: Sanjay Ghemawat +
Last updated by: Aliaksei Kandratsenka (Dec 2024)

docs/tcmalloc.html
View File

@@ -1,788 +0,0 @@
<!doctype html public "-//w3c//dtd html 4.01 transitional//en">
<!-- $Id: $ -->
<html>
<head>
<title>TCMalloc : Thread-Caching Malloc</title>
<link rel="stylesheet" href="designstyle.css">
<style type="text/css">
em {
color: red;
font-style: normal;
}
</style>
</head>
<body>
<h1>TCMalloc : Thread-Caching Malloc</h1>
<address>Sanjay Ghemawat</address>
<h2><A name=motivation>Motivation</A></h2>
<p>TCMalloc is faster than the glibc 2.3 malloc (available as a
separate library called ptmalloc2) and other mallocs that I have
tested. ptmalloc2 takes approximately 300 nanoseconds to execute a
malloc/free pair on a 2.8 GHz P4 (for small objects). The TCMalloc
implementation takes approximately 50 nanoseconds for the same
operation pair. Speed is important for a malloc implementation
because if malloc is not fast enough, application writers are inclined
to write their own custom free lists on top of malloc. This can lead
to extra complexity, and more memory usage unless the application
writer is very careful to appropriately size the free lists and
scavenge idle objects out of the free list.</p>
<p>TCMalloc also reduces lock contention for multi-threaded programs.
For small objects, there is virtually zero contention. For large
objects, TCMalloc tries to use fine grained and efficient spinlocks.
ptmalloc2 also reduces lock contention by using per-thread arenas but
there is a big problem with ptmalloc2's use of per-thread arenas. In
ptmalloc2 memory can never move from one arena to another. This can
lead to huge amounts of wasted space. For example, in one Google
application, the first phase would allocate approximately 300MB of
memory for its URL canonicalization data structures. When the first
phase finished, a second phase would be started in the same address
space. If this second phase was assigned a different arena than the
one used by the first phase, this phase would not reuse any of the
memory left after the first phase and would add another 300MB to the
address space. Similar memory blowup problems were also noticed in
other applications.</p>
<p>Another benefit of TCMalloc is space-efficient representation of
small objects. For example, N 8-byte objects can be allocated while
using space approximately <code>8N * 1.01</code> bytes. I.e., a
one-percent space overhead. ptmalloc2 uses a four-byte header for
each object and (I think) rounds up the size to a multiple of 8 bytes
and ends up using <code>16N</code> bytes.</p>
<h2><A NAME="Usage">Usage</A></h2>
<p>To use TCMalloc, just link TCMalloc into your application via the
"-ltcmalloc" linker flag.</p>
<p>You can use TCMalloc in applications you didn't compile yourself,
by using LD_PRELOAD:</p>
<pre>
$ LD_PRELOAD="/usr/lib/libtcmalloc.so" <binary>
</pre>
<p>LD_PRELOAD is tricky, and we don't necessarily recommend this mode
of usage.</p>
<p>TCMalloc includes a <A HREF="heap_checker.html">heap checker</A>
and <A HREF="heapprofile.html">heap profiler</A> as well.</p>
<p>If you'd rather link in a version of TCMalloc that does not include
the heap profiler and checker (perhaps to reduce binary size for a
static binary), you can link in <code>libtcmalloc_minimal</code>
instead.</p>
<h2><A NAME="Overview">Overview</A></h2>
<p>TCMalloc assigns each thread a thread-local cache. Small
allocations are satisfied from the thread-local cache. Objects are
moved from central data structures into a thread-local cache as
needed, and periodic garbage collections are used to migrate memory
back from a thread-local cache into the central data structures.</p>
<center><img src="overview.gif"></center>
<p>TCMalloc treats objects with size &lt;= 256K ("small" objects)
differently from larger objects. Large objects are allocated directly
from the central heap using a page-level allocator (a page is a 8K
aligned region of memory). I.e., a large object is always
page-aligned and occupies an integral number of pages.</p>
<p>A run of pages can be carved up into a sequence of small objects,
each equally sized. For example a run of one page (4K) can be carved
up into 32 objects of size 128 bytes each.</p>
<h2><A NAME="Small_Object_Allocation">Small Object Allocation</A></h2>
<p>Each small object size maps to one of approximately 88 allocatable
size-classes. For example, all allocations in the range 961 to 1024
bytes are rounded up to 1024. The size-classes are spaced so that
small sizes are separated by 8 bytes, larger sizes by 16 bytes, even
larger sizes by 32 bytes, and so forth. The maximal spacing is
controlled so that not too much space is wasted when an allocation
request falls just past the end of a size class and has to be rounded
up to the next class.</p>
<p>A thread cache contains a singly linked list of free objects per
size-class.</p>
<center><img src="threadheap.gif"></center>
<p>When allocating a small object: (1) We map its size to the
corresponding size-class. (2) Look in the corresponding free list in
the thread cache for the current thread. (3) If the free list is not
empty, we remove the first object from the list and return it. When
following this fast path, TCMalloc acquires no locks at all. This
helps speed-up allocation significantly because a lock/unlock pair
takes approximately 100 nanoseconds on a 2.8 GHz Xeon.</p>
<p>If the free list is empty: (1) We fetch a bunch of objects from a
central free list for this size-class (the central free list is shared
by all threads). (2) Place them in the thread-local free list. (3)
Return one of the newly fetched objects to the applications.</p>
<p>If the central free list is also empty: (1) We allocate a run of
pages from the central page allocator. (2) Split the run into a set
of objects of this size-class. (3) Place the new objects on the
central free list. (4) As before, move some of these objects to the
thread-local free list.</p>
<h3><A NAME="Sizing_Thread_Cache_Free_Lists">
Sizing Thread Cache Free Lists</A></h3>
<p>It is important to size the thread cache free lists correctly. If
the free list is too small, we'll need to go to the central free list
too often. If the free list is too big, we'll waste memory as objects
sit idle in the free list.</p>
<p>Note that the thread caches are just as important for deallocation
as they are for allocation. Without a cache, each deallocation would
require moving the memory to the central free list. Also, some threads
have asymmetric alloc/free behavior (e.g. producer and consumer threads),
so sizing the free list correctly gets trickier.</p>
<p>To size the free lists appropriately, we use a slow-start algorithm
to determine the maximum length of each individual free list. As the
free list is used more frequently, its maximum length grows. However,
if a free list is used more for deallocation than allocation, its
maximum length will grow only up to a point where the whole list can
be efficiently moved to the central free list at once.</p>
<p>The psuedo-code below illustrates this slow-start algorithm. Note
that <code>num_objects_to_move</code> is specific to each size class.
By moving a list of objects with a well-known length, the central
cache can efficiently pass these lists between thread caches. If
a thread cache wants fewer than <code>num_objects_to_move</code>,
the operation on the central free list has linear time complexity.
The downside of always using <code>num_objects_to_move</code> as
the number of objects to transfer to and from the central cache is
that it wastes memory in threads that don't need all of those objects.
<pre>
Start each freelist max_length at 1.
Allocation
if freelist empty {
fetch min(max_length, num_objects_to_move) from central list;
if max_length < num_objects_to_move { // slow-start
max_length++;
} else {
max_length += num_objects_to_move;
}
}
Deallocation
if length > max_length {
// Don't try to release num_objects_to_move if we don't have that many.
release min(max_length, num_objects_to_move) objects to central list
if max_length < num_objects_to_move {
// Slow-start up to num_objects_to_move.
max_length++;
} else if max_length > num_objects_to_move {
// If we consistently go over max_length, shrink max_length.
overages++;
if overages > kMaxOverages {
max_length -= num_objects_to_move;
overages = 0;
}
}
}
</pre>
See also the section on <a href="#Garbage_Collection">Garbage Collection</a>
to see how it affects the <code>max_length</code>.
<h2><A NAME="Medium_Object_Allocation">Medium Object Allocation</A></h2>
<p>A medium object size (256K &le; size &le; 1MB) is rounded up to a page
size (8K) and is handled by a central page heap. The central page heap
includes an array of 128 free lists. The <code>k</code>th entry is a
free list of runs that consist of <code>k + 1</code> pages:</p>
<center><img src="pageheap.gif"></center>
<p>An allocation for <code>k</code> pages is satisfied by looking in
the <code>k</code>th free list. If that free list is empty, we look
in the next free list, and so forth. If no medium-object free list
can satisfy the allocation, the allocation is treated as a large object.
<h2><A NAME="Large_Object_Allocation">Large Object Allocation</A></h2>
Allocations of 1MB or more are considered large allocations. Spans
of free memory which can satisfy these allocations are tracked in
a red-black tree sorted by size. Allocations follow the <em>best-fit</em>
algorithm: the tree is searched to find the smallest span of free
space which is larger than the requested allocation. The allocation
is carved out of that span, and the remaining space is reinserted
either into the large object tree or possibly into one of the smaller
free-lists as appropriate.
If no span of free memory is located that can fit the requested
allocation, we fetch memory from the system (using <code>sbrk</code>,
<code>mmap</code>, or by mapping in portions of
<code>/dev/mem</code>).</p>
<p>If an allocation for <code>k</code> pages is satisfied by a run
of pages of length &gt; <code>k</code>, the remainder of the
run is re-inserted back into the appropriate free list in the
page heap.</p>
<h2><A NAME="Spans">Spans</A></h2>
<p>The heap managed by TCMalloc consists of a set of pages. A run of
contiguous pages is represented by a <code>Span</code> object. A span
can either be <em>allocated</em>, or <em>free</em>. If free, the span
is one of the entries in a page heap linked-list. If allocated, it is
either a large object that has been handed off to the application, or
a run of pages that have been split up into a sequence of small
objects. If split into small objects, the size-class of the objects
is recorded in the span.</p>
<p>A central array indexed by page number can be used to find the span to
which a page belongs. For example, span <em>a</em> below occupies 2
pages, span <em>b</em> occupies 1 page, span <em>c</em> occupies 5
pages and span <em>d</em> occupies 3 pages.</p>
<center><img src="spanmap.gif"></center>
<p>In a 32-bit address space, the central array is represented by a a
2-level radix tree where the root contains 32 entries and each leaf
contains 2^14 entries (a 32-bit address space has 2^19 8K pages, and
the first level of tree divides the 2^19 pages by 2^5). This leads to
a starting memory usage of 64KB of space (2^14*4 bytes) for the
central array, which seems acceptable.</p>
<p>On 64-bit machines, we use a 3-level radix tree.</p>
<h2><A NAME="Deallocation">Deallocation</A></h2>
<p>When an object is deallocated, we compute its page number and look
it up in the central array to find the corresponding span object. The
span tells us whether or not the object is small, and its size-class
if it is small. If the object is small, we insert it into the
appropriate free list in the current thread's thread cache. If the
thread cache now exceeds a predetermined size (2MB by default), we run
a garbage collector that moves unused objects from the thread cache
into central free lists.</p>
<p>If the object is large, the span tells us the range of pages covered
by the object. Suppose this range is <code>[p,q]</code>. We also
lookup the spans for pages <code>p-1</code> and <code>q+1</code>. If
either of these neighboring spans are free, we coalesce them with the
<code>[p,q]</code> span. The resulting span is inserted into the
appropriate free list in the page heap.</p>
<h2>Central Free Lists for Small Objects</h2>
<p>As mentioned before, we keep a central free list for each
size-class. Each central free list is organized as a two-level data
structure: a set of spans, and a linked list of free objects per
span.</p>
<p>An object is allocated from a central free list by removing the
first entry from the linked list of some span. (If all spans have
empty linked lists, a suitably sized span is first allocated from the
central page heap.)</p>
<p>An object is returned to a central free list by adding it to the
linked list of its containing span. If the linked list length now
equals the total number of small objects in the span, this span is now
completely free and is returned to the page heap.</p>
<h2><A NAME="Garbage_Collection">Garbage Collection of Thread Caches</A></h2>
<p>Garbage collecting objects from a thread cache keeps the size of
the cache under control and returns unused objects to the central free
lists. Some threads need large caches to perform well while others
can get by with little or no cache at all. When a thread cache goes
over its <code>max_size</code>, garbage collection kicks in and then the
thread competes with the other threads for a larger cache.</p>
<p>Garbage collection is run only during a deallocation. We walk over
all free lists in the cache and move some number of objects from the
free list to the corresponding central list.</p>
<p>The number of objects to be moved from a free list is determined
using a per-list low-water-mark <code>L</code>. <code>L</code>
records the minimum length of the list since the last garbage
collection. Note that we could have shortened the list by
<code>L</code> objects at the last garbage collection without
requiring any extra accesses to the central list. We use this past
history as a predictor of future accesses and move <code>L/2</code>
objects from the thread cache free list to the corresponding central
free list. This algorithm has the nice property that if a thread
stops using a particular size, all objects of that size will quickly
move from the thread cache to the central free list where they can be
used by other threads.</p>
<p>If a thread consistently deallocates more objects of a certain size
than it allocates, this <code>L/2</code> behavior will cause at least
<code>L/2</code> objects to always sit in the free list. To avoid
wasting memory this way, we shrink the maximum length of the freelist
to converge on <code>num_objects_to_move</code> (see also
<a href="#Sizing_Thread_Cache_Free_Lists">Sizing Thread Cache Free Lists</a>).
<pre>
Garbage Collection
if (L != 0 && max_length > num_objects_to_move) {
max_length = max(max_length - num_objects_to_move, num_objects_to_move)
}
</pre>
<p>The fact that the thread cache went over its <code>max_size</code> is
an indication that the thread would benefit from a larger cache. Simply
increasing <code>max_size</code> would use an inordinate amount of memory
in programs that have lots of active threads. Developers can bound the
memory used with the flag --tcmalloc_max_total_thread_cache_bytes.</p>
<p>Each thread cache starts with a small <code>max_size</code>
(e.g. 64KB) so that idle threads won't pre-allocate memory they don't
need. Each time the cache runs a garbage collection, it will also try
to grow its <code>max_size</code>. If the sum of the thread cache
sizes is less than --tcmalloc_max_total_thread_cache_bytes,
<code>max_size</code> grows easily. If not, thread cache 1 will try
to steal from thread cache 2 (picked round-robin) by decreasing thread
cache 2's <code>max_size</code>. In this way, threads that are more
active will steal memory from other threads more often than they are
have memory stolen from themselves. Mostly idle threads end up with
small caches and active threads end up with big caches. Note that
this stealing can cause the sum of the thread cache sizes to be
greater than --tcmalloc_max_total_thread_cache_bytes until thread
cache 2 deallocates some memory to trigger a garbage collection.</p>
<h2><A NAME="performance">Performance Notes</A></h2>
<h3>PTMalloc2 unittest</h3>
<p>The PTMalloc2 package (now part of glibc) contains a unittest
program <code>t-test1.c</code>. This forks a number of threads and
performs a series of allocations and deallocations in each thread; the
threads do not communicate other than by synchronization in the memory
allocator.</p>
<p><code>t-test1</code> (included in
<code>tests/tcmalloc/</code>, and compiled as
<code>ptmalloc_unittest1</code>) was run with varying numbers of
threads (1-20) and maximum allocation sizes (64 bytes -
32Kbytes). These tests were run on a 2.4GHz dual Xeon system with
hyper-threading enabled, using Linux glibc-2.3.2 from RedHat 9, with
one million operations per thread in each test. In each case, the test
was run once normally, and once with
<code>LD_PRELOAD=libtcmalloc.so</code>.
<p>The graphs below show the performance of TCMalloc vs PTMalloc2 for
several different metrics. Firstly, total operations (millions) per
elapsed second vs max allocation size, for varying numbers of
threads. The raw data used to generate these graphs (the output of the
<code>time</code> utility) is available in
<code>t-test1.times.txt</code>.</p>
<table>
<tr>
<td><img src="tcmalloc-opspersec.vs.size.1.threads.png"></td>
<td><img src="tcmalloc-opspersec.vs.size.2.threads.png"></td>
<td><img src="tcmalloc-opspersec.vs.size.3.threads.png"></td>
</tr>
<tr>
<td><img src="tcmalloc-opspersec.vs.size.4.threads.png"></td>
<td><img src="tcmalloc-opspersec.vs.size.5.threads.png"></td>
<td><img src="tcmalloc-opspersec.vs.size.8.threads.png"></td>
</tr>
<tr>
<td><img src="tcmalloc-opspersec.vs.size.12.threads.png"></td>
<td><img src="tcmalloc-opspersec.vs.size.16.threads.png"></td>
<td><img src="tcmalloc-opspersec.vs.size.20.threads.png"></td>
</tr>
</table>
<ul>
<li> TCMalloc is much more consistently scalable than PTMalloc2 - for
all thread counts &gt;1 it achieves ~7-9 million ops/sec for small
allocations, falling to ~2 million ops/sec for larger
allocations. The single-thread case is an obvious outlier,
since it is only able to keep a single processor busy and hence
can achieve fewer ops/sec. PTMalloc2 has a much higher variance
on operations/sec - peaking somewhere around 4 million ops/sec
for small allocations and falling to &lt;1 million ops/sec for
larger allocations.
<li> TCMalloc is faster than PTMalloc2 in the vast majority of
cases, and particularly for small allocations. Contention
between threads is less of a problem in TCMalloc.
<li> TCMalloc's performance drops off as the allocation size
increases. This is because the per-thread cache is
garbage-collected when it hits a threshold (defaulting to
2MB). With larger allocation sizes, fewer objects can be stored
in the cache before it is garbage-collected.
<li> There is a noticeable drop in TCMalloc's performance at ~32K
maximum allocation size; at larger sizes performance drops less
quickly. This is due to the 32K maximum size of objects in the
per-thread caches; for objects larger than this TCMalloc
allocates from the central page heap.
</ul>
<p>Next, operations (millions) per second of CPU time vs number of
threads, for max allocation size 64 bytes - 128 Kbytes.</p>
<table>
<tr>
<td><img src="tcmalloc-opspercpusec.vs.threads.64.bytes.png"></td>
<td><img src="tcmalloc-opspercpusec.vs.threads.256.bytes.png"></td>
<td><img src="tcmalloc-opspercpusec.vs.threads.1024.bytes.png"></td>
</tr>
<tr>
<td><img src="tcmalloc-opspercpusec.vs.threads.4096.bytes.png"></td>
<td><img src="tcmalloc-opspercpusec.vs.threads.8192.bytes.png"></td>
<td><img src="tcmalloc-opspercpusec.vs.threads.16384.bytes.png"></td>
</tr>
<tr>
<td><img src="tcmalloc-opspercpusec.vs.threads.32768.bytes.png"></td>
<td><img src="tcmalloc-opspercpusec.vs.threads.65536.bytes.png"></td>
<td><img src="tcmalloc-opspercpusec.vs.threads.131072.bytes.png"></td>
</tr>
</table>
<p>Here we see again that TCMalloc is both more consistent and more
efficient than PTMalloc2. For max allocation sizes &lt;32K, TCMalloc
typically achieves ~2-2.5 million ops per second of CPU time with a
large number of threads, whereas PTMalloc generally achieves 0.5-1
million ops per second of CPU time, with many cases achieving much
less than this figure. Above 32K max allocation size, TCMalloc drops
to 1-1.5 million ops per second of CPU time, and PTMalloc drops almost
to zero for large numbers of threads (i.e. with PTMalloc, lots of CPU
time is being burned spinning waiting for locks in the heavily
multi-threaded case).</p>
<H2><A NAME="runtime">Modifying Runtime Behavior</A></H2>
<p>You can more finely control the behavior of tcmalloc via
environment variables.</p>
<p>Generally useful flags:</p>
<table frame=box rules=sides cellpadding=5 width=100%>
<tr valign=top>
<td><code>TCMALLOC_SAMPLE_PARAMETER</code></td>
<td>default: 0</td>
<td>
The approximate gap between sampling actions. That is, we
take one sample approximately once every
<code>tcmalloc_sample_parameter</code> bytes of allocation.
This sampled heap information is available via
<code>MallocExtension::GetHeapSample()</code> or
<code>MallocExtension::ReadStackTraces()</code>. A reasonable
value is 524288.
</td>
</tr>
<tr valign=top>
<td><code>TCMALLOC_RELEASE_RATE</code></td>
<td>default: 1.0</td>
<td>
Rate at which we release unused memory to the system, via
<code>madvise(MADV_DONTNEED)</code>, on systems that support
it. Zero means we never release memory back to the system.
Increase this flag to return memory faster; decrease it
to return memory slower. Reasonable rates are in the
range [0,10].
</td>
</tr>
<tr valign=top>
<td><code>TCMALLOC_LARGE_ALLOC_REPORT_THRESHOLD</code></td>
<td>default: 1073741824</td>
<td>
Allocations larger than this value cause a stack trace to be
dumped to stderr. The threshold for dumping stack traces is
increased by a factor of 1.125 every time we print a message so
that the threshold automatically goes up by a factor of ~1000
every 60 messages. This bounds the amount of extra logging
generated by this flag. The default value is very large, so you
should see no extra logging unless the flag is overridden.
</td>
</tr>
<tr valign=top>
<td><code>TCMALLOC_MAX_TOTAL_THREAD_CACHE_BYTES</code></td>
<td>default: 33554432</td>
<td>
Bound on the total number of bytes allocated to thread caches. This
bound is not strict, so it is possible for the caches to go over this
bound in certain circumstances. This value defaults to 32MB (33554432 bytes). For
applications with many threads, this may not be a large enough cache,
which can affect performance. If you suspect your application is not
scaling to many threads due to lock contention in TCMalloc, you can
try increasing this value. This may improve performance, at a cost
of extra memory use by TCMalloc. See <a href="#Garbage_Collection">
Garbage Collection</a> for more details.
</td>
</tr>
</table>
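<p>For example, a program run with
<code>TCMALLOC_SAMPLE_PARAMETER=524288</code> can pull the sampled
heap at runtime and feed it to pprof. A minimal sketch (the output
handling is just for illustration):</p>
<pre>
#include &lt;cstdio&gt;
#include &lt;string&gt;
#include &lt;gperftools/malloc_extension.h&gt;

int main() {
  // Run with: TCMALLOC_SAMPLE_PARAMETER=524288 LD_PRELOAD=libtcmalloc.so ./a.out
  std::string sample;
  MallocExtension::instance()->GetHeapSample(&sample);
  // The string is in the heap-profile format understood by pprof.
  fwrite(sample.data(), 1, sample.size(), stdout);
  return 0;
}
</pre>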
<p>Advanced "tweaking" flags, that control more precisely how tcmalloc
tries to allocate memory from the kernel.</p>
<table frame=box rules=sides cellpadding=5 width=100%>
<tr valign=top>
<td><code>TCMALLOC_SKIP_MMAP</code></td>
<td>default: false</td>
<td>
If true, do not try to use <code>mmap</code> to obtain memory
from the kernel.
</td>
</tr>
<tr valign=top>
<td><code>TCMALLOC_SKIP_SBRK</code></td>
<td>default: false</td>
<td>
If true, do not try to use <code>sbrk</code> to obtain memory
from the kernel.
</td>
</tr>
<tr valign=top>
<td><code>TCMALLOC_DEVMEM_START</code></td>
<td>default: 0</td>
<td>
Physical memory starting location in MB for <code>/dev/mem</code>
allocation. Setting this to 0 disables <code>/dev/mem</code>
allocation.
</td>
</tr>
<tr valign=top>
<td><code>TCMALLOC_DEVMEM_LIMIT</code></td>
<td>default: 0</td>
<td>
Physical memory limit location in MB for <code>/dev/mem</code>
allocation. Setting this to 0 means no limit.
</td>
</tr>
<tr valign=top>
<td><code>TCMALLOC_DEVMEM_DEVICE</code></td>
<td>default: /dev/mem</td>
<td>
Device to use for allocating unmanaged memory.
</td>
</tr>
<tr valign=top>
<td><code>TCMALLOC_MEMFS_MALLOC_PATH</code></td>
<td>default: ""</td>
<td>
If set, specify a path where hugetlbfs or tmpfs is mounted.
This may allow for speedier allocations.
</td>
</tr>
<tr valign=top>
<td><code>TCMALLOC_MEMFS_LIMIT_MB</code></td>
<td>default: 0</td>
<td>
Limit total memfs allocation size to specified number of MB.
0 means "no limit".
</td>
</tr>
<tr valign=top>
<td><code>TCMALLOC_MEMFS_ABORT_ON_FAIL</code></td>
<td>default: false</td>
<td>
If true, abort() whenever memfs_malloc fails to satisfy an allocation.
</td>
</tr>
<tr valign=top>
<td><code>TCMALLOC_MEMFS_IGNORE_MMAP_FAIL</code></td>
<td>default: false</td>
<td>
If true, ignore failures from mmap.
</td>
</tr>
<tr valign=top>
<td><code>TCMALLOC_MEMFS_MAP_PRIVATE</code></td>
<td>default: false</td>
<td>
If true, use MAP_PRIVATE when mapping via memfs, not MAP_SHARED.
</td>
</tr>
</table>
<H2><A NAME="compiletime">Modifying Behavior In Code</A></H2>
<p>The <code>MallocExtension</code> class, in
<code>malloc_extension.h</code>, provides a few knobs that you can
tweak in your program to affect tcmalloc's behavior.</p>
<h3>Releasing Memory Back to the System</h3>
<p>By default, tcmalloc will release no-longer-used memory back to the
kernel gradually, over time. The <a
href="#runtime">tcmalloc_release_rate</a> flag controls how quickly
this happens. You can also force a release at a given point in the
program's execution like so:</p>
<pre>
MallocExtension::instance()->ReleaseFreeMemory();
</pre>
<p>You can also call <code>SetMemoryReleaseRate()</code> to change the
<code>tcmalloc_release_rate</code> value at runtime, or
<code>GetMemoryReleaseRate</code> to see what the current release rate
is.</p>
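<p>For instance, a small sketch that adjusts the release rate and then
forces an immediate release (the +1.0 bump is arbitrary, chosen only
for illustration):</p>
<pre>
#include &lt;gperftools/malloc_extension.h&gt;

void ReturnMemoryAggressively() {
  // Read the current rate (normally whatever TCMALLOC_RELEASE_RATE set).
  double rate = MallocExtension::instance()->GetMemoryReleaseRate();
  // Return memory to the kernel more eagerly for this process.
  MallocExtension::instance()->SetMemoryReleaseRate(rate + 1.0);
  // Or force all currently free memory back to the system right now.
  MallocExtension::instance()->ReleaseFreeMemory();
}
</pre>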
<h3>Memory Introspection</h3>
<p>There are several routines for getting a human-readable form of the
current memory usage:</p>
<pre>
MallocExtension::instance()->GetStats(buffer, buffer_length);
MallocExtension::instance()->GetHeapSample(&string);
MallocExtension::instance()->GetHeapGrowthStacks(&string);
</pre>
<p>The last two create files in the same format as the heap-profiler,
and can be passed as data files to pprof. The first is human-readable
and is meant for debugging.</p>
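<p>As a usage sketch, <code>GetStats</code> fills a caller-supplied
buffer; the 10000-byte size below is an arbitrary choice for this
example:</p>
<pre>
#include &lt;cstdio&gt;
#include &lt;gperftools/malloc_extension.h&gt;

void DumpMallocStats() {
  char buffer[10000];  // arbitrary size for this example
  MallocExtension::instance()->GetStats(buffer, sizeof(buffer));
  fputs(buffer, stderr);
}
</pre>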
<h3>Generic Tcmalloc Status</h3>
<p>TCMalloc has support for setting and retrieving arbitrary
'properties':</p>
<pre>
MallocExtension::instance()->SetNumericProperty(property_name, value);
MallocExtension::instance()->GetNumericProperty(property_name, &value);
</pre>
<p>It is possible for an application to set and get these properties,
but they are most useful when a library sets them so the
application can read them. Here are the properties TCMalloc defines;
you can access them with a call like
<code>MallocExtension::instance()->GetNumericProperty("generic.heap_size",
&value);</code>:</p>
<table frame=box rules=sides cellpadding=5 width=100%>
<tr valign=top>
<td><code>generic.current_allocated_bytes</code></td>
<td>
Number of bytes used by the application. This will not typically
match the memory use reported by the OS, because it does not
include TCMalloc overhead or memory fragmentation.
</td>
</tr>
<tr valign=top>
<td><code>generic.heap_size</code></td>
<td>
Bytes of system memory reserved by TCMalloc.
</td>
</tr>
<tr valign=top>
<td><code>tcmalloc.pageheap_free_bytes</code></td>
<td>
Number of bytes in free, mapped pages in page heap. These bytes
can be used to fulfill allocation requests. They always count
towards virtual memory usage, and unless the underlying memory is
swapped out by the OS, they also count towards physical memory
usage.
</td>
</tr>
<tr valign=top>
<td><code>tcmalloc.pageheap_unmapped_bytes</code></td>
<td>
Number of bytes in free, unmapped pages in page heap. These are
bytes that have been released back to the OS, possibly by one of
the MallocExtension "Release" calls. They can be used to fulfill
allocation requests, but typically incur a page fault. They
always count towards virtual memory usage, and depending on the
OS, typically do not count towards physical memory usage.
</td>
</tr>
<tr valign=top>
<td><code>tcmalloc.slack_bytes</code></td>
<td>
Sum of pageheap_free_bytes and pageheap_unmapped_bytes. Provided
for backwards compatibility only. Do not use.
</td>
</tr>
<tr valign=top>
<td><code>tcmalloc.max_total_thread_cache_bytes</code></td>
<td>
A limit to how much memory TCMalloc dedicates for small objects.
Higher numbers trade off more memory use for -- in some situations
-- improved efficiency.
</td>
</tr>
<tr valign=top>
<td><code>tcmalloc.current_total_thread_cache_bytes</code></td>
<td>
The current total size of all thread caches, i.e. memory TCMalloc is
holding for fast allocation of small objects.
</td>
</tr>
<tr valign=top>
<td><code>tcmalloc.min_per_thread_cache_bytes</code></td>
<td>
A lower limit on how much memory TCMalloc dedicates to small objects per
thread. Note that this property only takes effect if the per-thread cache
size computed from tcmalloc.max_total_thread_cache_bytes would otherwise
be smaller than tcmalloc.min_per_thread_cache_bytes.
</td>
</tr>
</table>
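<p>As a concrete (illustrative) sketch of reading these properties,
the following compares the application's live bytes with the bytes
TCMalloc has reserved from the system; the difference is roughly
TCMalloc overhead plus fragmentation:</p>
<pre>
#include &lt;cstdio&gt;
#include &lt;gperftools/malloc_extension.h&gt;

void LogHeapNumbers() {
  size_t allocated = 0, heap_size = 0;
  MallocExtension* ext = MallocExtension::instance();
  if (ext->GetNumericProperty("generic.current_allocated_bytes", &allocated) &&
      ext->GetNumericProperty("generic.heap_size", &heap_size)) {
    fprintf(stderr, "allocated=%zu heap=%zu\n", allocated, heap_size);
  }
}
</pre>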
<h2><A NAME="caveats">Caveats</A></h2>
<p>For some systems, TCMalloc may not work correctly with
applications that aren't linked against <code>libpthread.so</code> (or
the equivalent on your OS). It should work on Linux using glibc 2.3,
but other OS/libc combinations have not been tested.</p>
<p>TCMalloc may be somewhat more memory hungry than other mallocs
(though it tends not to have the huge blowups that can happen with other
mallocs). In particular, at startup TCMalloc allocates approximately
240KB of internal memory.</p>
<p>Don't try to load TCMalloc into a running binary (e.g., using JNI
in Java programs). The binary will have allocated some objects using
the system malloc, and may try to pass them to TCMalloc for
deallocation. TCMalloc will not be able to handle such objects.</p>
<hr>
<address>Sanjay Ghemawat, Paul Menage<br>
<!-- Created: Tue Dec 19 10:43:14 PST 2000 -->
Last modified: Sat Feb 24 13:11:38 PST 2007 (csilvers)
</address>
</body>
</html>