re-organize docs and convert htmls to asciidoc
1
.gitignore
vendored
@ -38,6 +38,7 @@
|
||||
/debugallocation_test
|
||||
/debugallocation_test.sh
|
||||
/depcomp
|
||||
/docs/*.html
|
||||
/frag_unittest
|
||||
/frag_unittest.exe
|
||||
/function_ref_test
|
||||
|
67
Makefile.am
@ -99,9 +99,6 @@ EXTRA_INSTALL =
|
||||
|
||||
## vvvv RULES TO MAKE THE LIBRARIES, BINARIES, AND UNITTESTS
|
||||
|
||||
dist_doc_DATA += docs/index.html docs/designstyle.css
|
||||
|
||||
|
||||
### ------- various support library routines
|
||||
|
||||
# Having set of common helpers helps with unit testing various "guts"
|
||||
@ -223,9 +220,6 @@ check_address_test_SOURCES = src/tests/check_address_test.cc
|
||||
check_address_test_CPPFLAGS = $(gtest_CPPFLAGS)
|
||||
check_address_test_LDADD = libcommon.la libgtest.la
|
||||
|
||||
### Documentation
|
||||
dist_doc_DATA +=
|
||||
|
||||
endif WITH_STACK_TRACE
|
||||
|
||||
### ------- tcmalloc_minimal (thread-caching malloc)
|
||||
@ -436,43 +430,19 @@ min_per_thread_cache_size_test_CPPFLAGS = $(gtest_CPPFLAGS)
|
||||
min_per_thread_cache_size_test_LDADD = libtcmalloc_minimal.la libgtest.la
|
||||
|
||||
### Documentation
|
||||
dist_doc_DATA += docs/tcmalloc.html \
|
||||
docs/overview.gif \
|
||||
docs/pageheap.gif \
|
||||
docs/spanmap.gif \
|
||||
docs/threadheap.gif \
|
||||
docs/t-test1.times.txt \
|
||||
docs/tcmalloc-opspercpusec.vs.threads.1024.bytes.png \
|
||||
docs/tcmalloc-opspercpusec.vs.threads.128.bytes.png \
|
||||
docs/tcmalloc-opspercpusec.vs.threads.131072.bytes.png \
|
||||
docs/tcmalloc-opspercpusec.vs.threads.16384.bytes.png \
|
||||
docs/tcmalloc-opspercpusec.vs.threads.2048.bytes.png \
|
||||
docs/tcmalloc-opspercpusec.vs.threads.256.bytes.png \
|
||||
docs/tcmalloc-opspercpusec.vs.threads.32768.bytes.png \
|
||||
docs/tcmalloc-opspercpusec.vs.threads.4096.bytes.png \
|
||||
docs/tcmalloc-opspercpusec.vs.threads.512.bytes.png \
|
||||
docs/tcmalloc-opspercpusec.vs.threads.64.bytes.png \
|
||||
docs/tcmalloc-opspercpusec.vs.threads.65536.bytes.png \
|
||||
docs/tcmalloc-opspercpusec.vs.threads.8192.bytes.png \
|
||||
docs/tcmalloc-opspersec.vs.size.1.threads.png \
|
||||
docs/tcmalloc-opspersec.vs.size.12.threads.png \
|
||||
docs/tcmalloc-opspersec.vs.size.16.threads.png \
|
||||
docs/tcmalloc-opspersec.vs.size.2.threads.png \
|
||||
docs/tcmalloc-opspersec.vs.size.20.threads.png \
|
||||
docs/tcmalloc-opspersec.vs.size.3.threads.png \
|
||||
docs/tcmalloc-opspersec.vs.size.4.threads.png \
|
||||
docs/tcmalloc-opspersec.vs.size.5.threads.png \
|
||||
docs/tcmalloc-opspersec.vs.size.8.threads.png
|
||||
dist_doc_DATA += $(top_srcdir)/docs/*adoc $(top_srcdir)/docs/*gif $(top_srcdir)/docs/*png $(top_srcdir)/docs/dots/*dot
|
||||
|
||||
# I don't know how to say "distribute the .dot files but don't install them";
|
||||
# noinst doesn't seem to work with data. I separate them out anyway, in case
|
||||
# one day we figure it out. Regardless, installing the dot files isn't the
|
||||
# end of the world.
|
||||
dist_doc_DATA += docs/overview.dot \
|
||||
docs/pageheap.dot \
|
||||
docs/spanmap.dot \
|
||||
docs/threadheap.dot
|
||||
gperftools_HTMLDOCS = docs/tcmalloc.html docs/heapprofile.html \
|
||||
docs/cpuprofile.html docs/cpuprofile-fileformat.html \
|
||||
docs/pprof_integration.html
|
||||
|
||||
if !MISSING_ASCIIDOCTOR
|
||||
doc_DATA = $(gperftools_HTMLDOCS)
|
||||
MOSTLYCLEANFILES = $(gperftools_HTMLDOCS)
|
||||
|
||||
.adoc.html:
|
||||
$(ASCIIDOCTOR) $(ASCIIDOCTOR_FLAGS) -o $@ $<
|
||||
endif !MISSING_ASCIIDOCTOR
|
||||
|
||||
### ------- tcmalloc_minimal_debug (thread-caching malloc with debugallocation)
|
||||
|
||||
@ -715,11 +685,6 @@ endif !SKIP_PPROF_TESTS
|
||||
|
||||
endif WITH_HEAP_PROFILER
|
||||
|
||||
### Documentation (above and beyond tcmalloc_minimal documentation)
|
||||
if WITH_HEAP_PROFILER
|
||||
dist_doc_DATA += docs/heapprofile.html docs/heap-example1.png
|
||||
endif WITH_HEAP_PROFILER
|
||||
|
||||
### ------- tcmalloc with debugallocation
|
||||
|
||||
if WITH_DEBUGALLOC
|
||||
@ -837,14 +802,6 @@ profiler4_unittest_LDADD = -lstacktrace -lprofiler
|
||||
profiler4_unittest_DEPENDENCIES = libprofiler.la
|
||||
endif !SKIP_PPROF_TESTS
|
||||
|
||||
### Documentation
|
||||
dist_doc_DATA += docs/cpuprofile.html \
|
||||
docs/cpuprofile-fileformat.html \
|
||||
docs/pprof-test-big.gif \
|
||||
docs/pprof-test.gif \
|
||||
docs/pprof-vsnprintf-big.gif \
|
||||
docs/pprof-vsnprintf.gif
|
||||
|
||||
endif WITH_CPU_PROFILER
|
||||
|
||||
|
||||
@ -954,7 +911,7 @@ $(top_distdir)/ChangeLog:
|
||||
EXTRA_DIST = $(SCRIPTS) \
|
||||
src/windows/get_mangled_names.cc src/windows/override_functions.cc \
|
||||
src/windows/CMakeLists.txt \
|
||||
docs/pprof.see_also $(WINDOWS_EXTRA) \
|
||||
$(WINDOWS_EXTRA) \
|
||||
gperftools.sln vsprojects vendor \
|
||||
$(top_srcdir)/src/*h $(top_srcdir)/src/base/*h \
|
||||
$(top_srcdir)/benchmark/*h \
|
||||
|
@ -519,6 +519,12 @@ AM_CONDITIONAL(SKIP_PPROF_TESTS, [test "x$PPROF_PATH" = "x"])
|
||||
AS_IF([test "x$PPROF_PATH" = "x"],
|
||||
[AC_MSG_WARN([pprof tool not found. Will skip several unit tests that need it. Install via go install github.com/google/pprof@latest then add \$HOME/go/bin to PATH])])
|
||||
|
||||
AC_PATH_PROG([ASCIIDOCTOR], [asciidoctor])
|
||||
AM_CONDITIONAL([MISSING_ASCIIDOCTOR], [test "x$ASCIIDOCTOR" = "x"])
|
||||
AS_IF([test "x$ASCIIDOCTOR" = "x"],
|
||||
[AC_MSG_WARN([asciidoctor tool not found. Will skip building .html documentation from .adoc])])
|
||||
AC_ARG_VAR(ASCIIDOCTOR_FLAGS, [flags to pass to asciidoctor])
|
||||
|
||||
# Write generated configuration file
|
||||
AC_CONFIG_FILES([Makefile])
|
||||
AC_OUTPUT
|
||||
|
161
docs/cpuprofile-fileformat.adoc
Normal file
@ -0,0 +1,161 @@
|
||||
= Gperftools CPU Profiler Binary Data File Format
|
||||
:reproducible:
|
||||
|
||||
[.normal]
|
||||
This file documents the binary data file format produced by the
|
||||
gperftools CPU Profiler. It is one of the "legacy" formats supported by
|
||||
the pprof tool. For information about using the CPU Profiler, see
|
||||
link:cpuprofile.html[its user guide].
|
||||
|
||||
The profiler source code, which generates files using this format, is at
|
||||
`src/profiler.cc`.
|
||||
|
||||
== CPU Profile Data File Structure
|
||||
|
||||
CPU profile data files each consist of four parts, in order:
|
||||
|
||||
* Binary header
|
||||
* Binary profile records
|
||||
* Binary trailer
|
||||
* Text list of mapped objects
|
||||
|
||||
The binary data is expressed in terms of "slots." These are words large
|
||||
enough to hold the program's pointer type, i.e., for 32-bit programs
|
||||
they are 4 bytes in size, and for 64-bit programs they are 8 bytes. They
|
||||
are stored in the profile data file in the native byte order (i.e.,
|
||||
little-endian for x86 and x86_64).
|
||||
|
||||
== Binary Header
|
||||
|
||||
The binary header format is shown below. Values written by the profiler,
|
||||
along with requirements currently enforced by the analysis tools, are
|
||||
shown in parentheses.
|
||||
|
||||
[cols=",",options="header",]
|
||||
|===
|
||||
|slot |data
|
||||
|0 |header count (0; must be 0)
|
||||
|1 |header slots after this one (3; must be >= 3)
|
||||
|2 |format version (0; must be 0)
|
||||
|3 |sampling period, in microseconds
|
||||
|4 |padding (0)
|
||||
|===
|
||||
|
||||
The headers currently generated for 32-bit and 64-bit little-endian (x86
|
||||
and x86_64) profiles are shown below, for comparison.
|
||||
|
||||
[cols=",,,,,",options="header",]
|
||||
|===
|
||||
| |hdr count |hdr words |version |sampling period |pad
|
||||
|32-bit or 64-bit (slots) |0 |3 |0 |10000 |0
|
||||
|
||||
|32-bit (4-byte words in file) |`0x00000` |`0x00003` |`0x00000`
|
||||
|`0x02710` |`0x00000`
|
||||
|
||||
|64-bit LE (4-byte words in file) |`0x00000 0x00000`
|
||||
|`0x00003 0x00000` |`0x00000 0x00000` |`0x02710 0x00000`
|
||||
|`0x00000 0x00000`
|
||||
|===
|
||||
|
||||
The contents are shown in terms of slots, and in terms of 4-byte words
|
||||
in the profile data file. The slot contents for 32-bit and 64-bit
|
||||
headers are identical. For 32-bit profiles, the 4-byte word view matches
|
||||
the slot view. For 64-bit profiles, each (8-byte) slot is shown as two
|
||||
4-byte words, ordered as they would appear in the file.
|
||||
|
||||
The profiling tools examine the contents of the file and use the
|
||||
expected locations and values of the header words field to detect
|
||||
whether the file is 32-bit or 64-bit.
|
||||
|
||||
== Binary Profile Records
|
||||
|
||||
The binary profile record format is shown below.
|
||||
|
||||
[cols=2*]
|
||||
|===
|
||||
|slot
|
||||
|data
|
||||
|
||||
|0
|
||||
|sample count, must be >= 1
|
||||
|
||||
|1
|
||||
|number of call chain PCs (num_pcs), must be >= 1
|
||||
|
||||
|2 .. (num_pcs + 1)
|
||||
|call chain PCs, most-recently-called function first.
|
||||
|===
|
||||
|
||||
The total length of a given record is 2 + num_pcs.
|
||||
|
||||
Note that multiple profile records can be emitted by the profiler having
|
||||
an identical call chain. In that case, analysis tools should sum the
|
||||
counts of all records having identical call chains.
|
||||
|
||||
*Note:* Some profile analysis tools terminate if they see _any_ profile
|
||||
record with a call chain with its first entry having the address 0.
|
||||
(This is similar to the binary trailer.)
|
||||
|
||||
=== Example
|
||||
|
||||
This example shows the slots contained in a sample profile record.
|
||||
|
||||
[cols=",,,,",]
|
||||
|===
|
||||
|5 |3 |0xa0000 |0xc0000 |0xe0000
|
||||
|===
|
||||
|
||||
In this example, 5 ticks were received at PC 0xa0000, whose function had
|
||||
been called by the function containing 0xc0000, which had been called
|
||||
from the function containing 0xe0000.
|
||||
|
||||
== Binary Trailer
|
||||
|
||||
The binary trailer consists of three slots of data with fixed values,
|
||||
shown below.
|
||||
|
||||
[cols=",",options="header",]
|
||||
|===
|
||||
|slot |value
|
||||
|0 |0
|
||||
|1 |1
|
||||
|2 |0
|
||||
|===
|
||||
|
||||
Note that this is the same data that would be contained in a profile record
|
||||
with sample count = 0, num_pcs = 1, and a one-element call chain
|
||||
containing the address 0.
|
||||
|
||||
== Text List of Mapped Objects
|
||||
|
||||
The binary data in the file is followed immediately by a list of mapped
|
||||
objects. This list consists of lines of text separated by newline
|
||||
characters.
|
||||
|
||||
Each line describes one mapping as produced by SaveProcSelfMaps. For
|
||||
example:
|
||||
|
||||
....
|
||||
40000000-40015000 r-xp 00000000 03:01 12845071 /lib/ld-2.3.2.so
|
||||
....
|
||||
|
||||
The first address must start at the beginning of the line. This is
|
||||
essentially the same format as Linux's `/proc/<pid>/maps` file.
|
||||
Recent Linux systems have this format documented in
|
||||
link:https://man7.org/linux/man-pages/man5/proc_pid_maps.5.html[`man 5
|
||||
proc_pid_maps`]. Less recent systems document it under `man 5 proc`.
|
||||
|
||||
Tools ignore minor:major number and inode number. And only executable
|
||||
mappings really need to be present. See
|
||||
`src/base/proc_maps_iterator.{h,cc}` for how it is produced.
|
||||
|
||||
Unrecognized lines should be ignored by analysis tools.
|
||||
|
||||
Note, original pprof tool also supported processing `$build`
|
||||
"variable" when processing mappings, but we never produced such
|
||||
mappings. So we don't document this anymore.
|
||||
|
||||
'''''
|
||||
|
||||
Original author: Chris Demetriou (cgd) +
|
||||
Last update by: Aliaksei Kandratsenka +
|
@ -1,264 +0,0 @@
|
||||
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
|
||||
<HTML>
|
||||
|
||||
<HEAD>
|
||||
<link rel="stylesheet" href="designstyle.css">
|
||||
<title>Google CPU Profiler Binary Data File Format</title>
|
||||
</HEAD>
|
||||
|
||||
<BODY>
|
||||
|
||||
<h1>Google CPU Profiler Binary Data File Format</h1>
|
||||
|
||||
<p align=right>
|
||||
<i>Last modified
|
||||
<script type=text/javascript>
|
||||
var lm = new Date(document.lastModified);
|
||||
document.write(lm.toDateString());
|
||||
</script></i>
|
||||
</p>
|
||||
|
||||
<p>This file documents the binary data file format produced by the
|
||||
Google CPU Profiler. For information about using the CPU Profiler,
|
||||
see <a href="cpuprofile.html">its user guide</a>.
|
||||
|
||||
<p>The profiler source code, which generates files using this format, is at
|
||||
<code>src/profiler.cc</code></a>.
|
||||
|
||||
|
||||
<h2>CPU Profile Data File Structure</h2>
|
||||
|
||||
<p>CPU profile data files each consist of four parts, in order:
|
||||
|
||||
<ul>
|
||||
<li> Binary header
|
||||
<li> Binary profile records
|
||||
<li> Binary trailer
|
||||
<li> Text list of mapped objects
|
||||
</ul>
|
||||
|
||||
<p>The binary data is expressed in terms of "slots." These are words
|
||||
large enough to hold the program's pointer type, i.e., for 32-bit
|
||||
programs they are 4 bytes in size, and for 64-bit programs they are 8
|
||||
bytes. They are stored in the profile data file in the native byte
|
||||
order (i.e., little-endian for x86 and x86_64).
|
||||
|
||||
|
||||
<h2>Binary Header</h2>
|
||||
|
||||
<p>The binary header format is show below. Values written by the
|
||||
profiler, along with requirements currently enforced by the analysis
|
||||
tools, are shown in parentheses.
|
||||
|
||||
<p>
|
||||
<table summary="Header Format"
|
||||
frame="box" rules="sides" cellpadding="5" width="50%">
|
||||
<tr>
|
||||
<th width="30%">slot</th>
|
||||
<th width="70%">data</th>
|
||||
</tr>
|
||||
|
||||
<tr>
|
||||
<td>0</td>
|
||||
<td>header count (0; must be 0)</td>
|
||||
</tr>
|
||||
|
||||
<tr>
|
||||
<td>1</td>
|
||||
<td>header slots after this one (3; must be >= 3)</td>
|
||||
</tr>
|
||||
|
||||
<tr>
|
||||
<td>2</td>
|
||||
<td>format version (0; must be 0)</td>
|
||||
</tr>
|
||||
|
||||
<tr>
|
||||
<td>3</td>
|
||||
<td>sampling period, in microseconds</td>
|
||||
</tr>
|
||||
|
||||
<tr>
|
||||
<td>4</td>
|
||||
<td>padding (0)</td>
|
||||
</tr>
|
||||
</table>
|
||||
|
||||
<p>The headers currently generated for 32-bit and 64-bit little-endian
|
||||
(x86 and x86_64) profiles are shown below, for comparison.
|
||||
|
||||
<p>
|
||||
<table summary="Header Example" frame="box" rules="sides" cellpadding="5">
|
||||
<tr>
|
||||
<th></th>
|
||||
<th>hdr count</th>
|
||||
<th>hdr words</th>
|
||||
<th>version</th>
|
||||
<th>sampling period</th>
|
||||
<th>pad</th>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>32-bit or 64-bit (slots)</td>
|
||||
<td>0</td>
|
||||
<td>3</td>
|
||||
<td>0</td>
|
||||
<td>10000</td>
|
||||
<td>0</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>32-bit (4-byte words in file)</td>
|
||||
<td><tt>0x00000</tt></td>
|
||||
<td><tt>0x00003</tt></td>
|
||||
<td><tt>0x00000</tt></td>
|
||||
<td><tt>0x02710</tt></td>
|
||||
<td><tt>0x00000</tt></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>64-bit LE (4-byte words in file)</td>
|
||||
<td><tt>0x00000 0x00000</tt></td>
|
||||
<td><tt>0x00003 0x00000</tt></td>
|
||||
<td><tt>0x00000 0x00000</tt></td>
|
||||
<td><tt>0x02710 0x00000</tt></td>
|
||||
<td><tt>0x00000 0x00000</tt></td>
|
||||
</tr>
|
||||
</table>
|
||||
|
||||
<p>The contents are shown in terms of slots, and in terms of 4-byte
|
||||
words in the profile data file. The slot contents for 32-bit and
|
||||
64-bit headers are identical. For 32-bit profiles, the 4-byte word
|
||||
view matches the slot view. For 64-bit profiles, each (8-byte) slot
|
||||
is shown as two 4-byte words, ordered as they would appear in the
|
||||
file.
|
||||
|
||||
<p>The profiling tools examine the contents of the file and use the
|
||||
expected locations and values of the header words field to detect
|
||||
whether the file is 32-bit or 64-bit.
|
||||
|
||||
|
||||
<h2>Binary Profile Records</h2>
|
||||
|
||||
<p>The binary profile record format is shown below.
|
||||
|
||||
<p>
|
||||
<table summary="Profile Record Format"
|
||||
frame="box" rules="sides" cellpadding="5" width="50%">
|
||||
<tr>
|
||||
<th width="30%">slot</th>
|
||||
<th width="70%">data</th>
|
||||
</tr>
|
||||
|
||||
<tr>
|
||||
<td>0</td>
|
||||
<td>sample count, must be >= 1</td>
|
||||
</tr>
|
||||
|
||||
<tr>
|
||||
<td>1</td>
|
||||
<td>number of call chain PCs (num_pcs), must be >= 1</td>
|
||||
</tr>
|
||||
|
||||
<tr>
|
||||
<td>2 .. (num_pcs + 1)</td>
|
||||
<td>call chain PCs, most-recently-called function first.
|
||||
</tr>
|
||||
</table>
|
||||
|
||||
<p>The total length of a given record is 2 + num_pcs.
|
||||
|
||||
<p>Note that multiple profile records can be emitted by the profiler
|
||||
having an identical call chain. In that case, analysis tools should
|
||||
sum the counts of all records having identical call chains.
|
||||
|
||||
<p><b>Note:</b> Some profile analysis tools terminate if they see
|
||||
<em>any</em> profile record with a call chain with its first entry
|
||||
having the address 0. (This is similar to the binary trailer.)
|
||||
|
||||
<h3>Example</h3>
|
||||
|
||||
This example shows the slots contained in a sample profile record.
|
||||
|
||||
<p>
|
||||
<table summary="Profile Record Example"
|
||||
frame="box" rules="sides" cellpadding="5">
|
||||
<tr>
|
||||
<td>5</td>
|
||||
<td>3</td>
|
||||
<td>0xa0000</td>
|
||||
<td>0xc0000</td>
|
||||
<td>0xe0000</td>
|
||||
</tr>
|
||||
</table>
|
||||
|
||||
<p>In this example, 5 ticks were received at PC 0xa0000, whose
|
||||
function had been called by the function containing 0xc0000, which had
|
||||
been called from the function containing 0xe0000.
|
||||
|
||||
|
||||
<h2>Binary Trailer</h2>
|
||||
|
||||
<p>The binary trailer consists of three slots of data with fixed
|
||||
values, shown below.
|
||||
|
||||
<p>
|
||||
<table summary="Trailer Format"
|
||||
frame="box" rules="sides" cellpadding="5" width="50%">
|
||||
<tr>
|
||||
<th width="30%">slot</th>
|
||||
<th width="70%">value</th>
|
||||
</tr>
|
||||
|
||||
<tr>
|
||||
<td>0</td>
|
||||
<td>0</td>
|
||||
</tr>
|
||||
|
||||
<tr>
|
||||
<td>1</td>
|
||||
<td>1</td>
|
||||
</tr>
|
||||
|
||||
<tr>
|
||||
<td>2</td>
|
||||
<td>0</td>
|
||||
</tr>
|
||||
</table>
|
||||
|
||||
<p>Note that this is the same data that would contained in a profile
|
||||
record with sample count = 0, num_pcs = 1, and a one-element call
|
||||
chain containing the address 0.
|
||||
|
||||
|
||||
<h2>Text List of Mapped Objects</h2>
|
||||
|
||||
<p>The binary data in the file is followed immediately by a list of
|
||||
mapped objects. This list consists of lines of text separated by
|
||||
newline characters.
|
||||
|
||||
<p>Each line is one of the following types:
|
||||
|
||||
<ul>
|
||||
<li>Build specifier, starting with "<tt>build=</tt>". For example:
|
||||
<pre> build=/path/to/binary</pre>
|
||||
Leading spaces on the line are ignored.
|
||||
|
||||
<li>Mapping line from ProcMapsIterator::FormatLine. For example:
|
||||
<pre> 40000000-40015000 r-xp 00000000 03:01 12845071 /lib/ld-2.3.2.so</pre>
|
||||
The first address must start at the beginning of the line.
|
||||
</ul>
|
||||
|
||||
<p>Unrecognized lines should be ignored by analysis tools.
|
||||
|
||||
<p>When processing the paths see in mapping lines, occurrences of
|
||||
<tt>$build</tt> followed by a non-word character (i.e., characters
|
||||
other than underscore or alphanumeric characters), should be replaced
|
||||
by the path given on the last build specifier line.
|
||||
|
||||
<hr>
|
||||
<address>Chris Demetriou<br>
|
||||
<!-- Created: Mon Aug 27 12:18:26 PDT 2007 -->
|
||||
<!-- hhmts start -->
|
||||
Last modified: Mon Aug 27 12:18:26 PDT 2007 (cgd)
|
||||
<!-- hhmts end -->
|
||||
</address>
|
||||
</BODY>
|
||||
</HTML>
|
422
docs/cpuprofile.adoc
Normal file
@ -0,0 +1,422 @@
|
||||
= Using CPU Profiler
|
||||
|
||||
:reproducible:
|
||||
|
||||
[.normal]
|
||||
This is the CPU profiler originally developed at Google. There are
|
||||
three parts to using it: linking the library into an application,
|
||||
running the code, and analyzing the output.
|
||||
|
||||
On the off-chance that you should need to understand it, the CPU
|
||||
profiler data file format is documented separately,
|
||||
link:cpuprofile-fileformat.html[here].
|
||||
|
||||
== Linking in the Library
|
||||
|
||||
To install the CPU profiler into your executable, add `-lprofiler` to
|
||||
the link-time step for your executable. (It's also possible to
|
||||
add in the profiler at run-time using `+LD_PRELOAD+`, e.g.
|
||||
|
||||
% LD_PRELOAD="/usr/lib/libprofiler.so" <binary>
|
||||
|
||||
This does _not_ turn on CPU profiling; it just inserts the code. For
|
||||
that reason, it's practical to just always link `+-lprofiler+` into a
|
||||
binary while developing; that's what we do at Google. (However, since
|
||||
any user can turn on the profiler by setting an environment variable,
|
||||
it's not necessarily recommended to install profiler-linked binaries
|
||||
into a production, running system.)
|
||||
|
||||
== Running the Code
|
||||
|
||||
There are several alternatives to actually turn on CPU profiling for a
|
||||
given run of an executable:
|
||||
|
||||
. Define the environment variable CPUPROFILE to the filename to dump the
|
||||
profile to. For instance, if you had a version of `+/bin/ls+` that had
|
||||
been linked against libprofiler, you could run:
|
||||
+
|
||||
....
|
||||
% env CPUPROFILE=ls.prof /bin/ls
|
||||
....
|
||||
. In addition to defining the environment variable CPUPROFILE you can
|
||||
also define CPUPROFILESIGNAL. This allows profiling to be controlled via
|
||||
the signal number that you specify. The signal number must be unused by
|
||||
the program under normal operation. Internally it acts as a switch,
|
||||
triggered by the signal, which is off by default. For instance, if you
|
||||
had a copy of `+/bin/chrome+` that had been linked against
|
||||
libprofiler, you could run:
|
||||
+
|
||||
....
|
||||
% env CPUPROFILE=chrome.prof CPUPROFILESIGNAL=12 /bin/chrome &
|
||||
....
|
||||
+
|
||||
You can then trigger profiling to start:
|
||||
+
|
||||
....
|
||||
% killall -12 chrome
|
||||
....
|
||||
+
|
||||
Then after a period of time you can tell it to stop which will generate
|
||||
the profile:
|
||||
+
|
||||
....
|
||||
% killall -12 chrome
|
||||
....
|
||||
. In your code, bracket the code you want profiled in calls to
|
||||
`+ProfilerStart()+` and `+ProfilerStop()+`. (These functions are
|
||||
declared in `+<gperftools/profiler.h>+`.) `+ProfilerStart()+` will take
|
||||
the profile-filename as an argument.
|
||||
|
||||
Profiling works correctly with sub-processes: each child process gets
|
||||
its own profile with its own name (generated by combining CPUPROFILE
|
||||
with the child's process id).
|
||||
|
||||
For security reasons, CPU profiling will not write to a file -- and is
|
||||
thus not usable -- for setuid programs.
|
||||
|
||||
See the include-file `+gperftools/profiler.h+` for advanced-use
|
||||
functions, including `+ProfilerFlush()+` and
|
||||
`+ProfilerStartWithOptions()+`.
|
||||
|
||||
=== Modifying Runtime Behavior
|
||||
|
||||
You can more finely control the behavior of the CPU profiler via
|
||||
environment variables.
|
||||
|
||||
[cols=",,",]
|
||||
|===
|
||||
|`CPUPROFILE_FREQUENCY=__x__` |default: 100 |How many
|
||||
interrupts/second the cpu-profiler samples.
|
||||
|
||||
|`+CPUPROFILE_REALTIME=1+` |default: [not set] |If set to any value
|
||||
(including 0 or the empty string), use ITIMER_REAL instead of
|
||||
ITIMER_PROF to gather profiles. In general, ITIMER_REAL is not as
|
||||
accurate as ITIMER_PROF, and also interacts badly with use of alarm(),
|
||||
so prefer ITIMER_PROF unless you have a reason to prefer ITIMER_REAL.
|
||||
|===
|
||||
|
||||
== [#pprof]#Analyzing the Output#
|
||||
|
||||
`+pprof+` is the program used to analyze profiles. Get it from
|
||||
link:https://github.com/google/pprof[]. For example by running:
|
||||
|
||||
% go install github.com/google/pprof@latest
|
||||
|
||||
You can then add `$HOME/go/bin` to your `$PATH`. Also, note, that they
|
||||
have their own documentation as well. So check it out
|
||||
link:https://github.com/google/pprof/blob/main/doc/README.md[here].
|
||||
|
||||
It has many output modes, both textual and graphical. Some give just
|
||||
raw numbers, much like the `+-pg+` output of `+gcc+`, and others show
|
||||
the data in the form of a dependency graph.
|
||||
|
||||
Here are some ways to call pprof. These are described in more detail
|
||||
below.
|
||||
|
||||
....
|
||||
% pprof /bin/ls ls.prof
|
||||
Enters "interactive" mode
|
||||
% pprof --text /bin/ls ls.prof
|
||||
Outputs one line per procedure
|
||||
% pprof --gv /bin/ls ls.prof
|
||||
Displays annotated call-graph via 'gv'
|
||||
% pprof --gv --focus=Mutex /bin/ls ls.prof
|
||||
Restricts to code paths including a .*Mutex.* entry
|
||||
% pprof --gv --focus=Mutex --ignore=string /bin/ls ls.prof
|
||||
Code paths including Mutex but not string
|
||||
% pprof --list=getdir /bin/ls ls.prof
|
||||
(Per-line) annotated source listing for getdir()
|
||||
% pprof --disasm=getdir /bin/ls ls.prof
|
||||
(Per-PC) annotated disassembly for getdir()
|
||||
% pprof --text localhost:1234
|
||||
Outputs one line per procedure for localhost:1234
|
||||
% pprof --callgrind /bin/ls ls.prof
|
||||
Outputs the call information in callgrind format
|
||||
|
||||
% pprof --http=:<port> /bin/ls ls.prof
|
||||
Starts Web UI and launches web browser
|
||||
for interactive profile inspection
|
||||
....
|
||||
|
||||
=== Analyzing Text Output
|
||||
|
||||
Text mode has lines of output that look like this:
|
||||
|
||||
....
|
||||
14 2.1% 17.2% 58 8.7% std::_Rb_tree::find
|
||||
....
|
||||
|
||||
Here is how to interpret the columns:
|
||||
|
||||
. Number of profiling samples in this function
|
||||
. Percentage of profiling samples in this function
|
||||
. Percentage of profiling samples in the functions printed so far
|
||||
. Number of profiling samples in this function and its callees
|
||||
. Percentage of profiling samples in this function and its callees
|
||||
. Function name
|
||||
|
||||
=== Analyzing Callgrind Output
|
||||
|
||||
Use http://kcachegrind.sourceforge.net[kcachegrind] to analyze your
|
||||
callgrind output:
|
||||
|
||||
....
|
||||
% pprof --callgrind /bin/ls ls.prof > ls.callgrind
|
||||
% kcachegrind ls.callgrind
|
||||
....
|
||||
|
||||
The cost is specified in 'hits', i.e. how many times a function appears
|
||||
in the recorded call stack information. The 'calls' from function a to b
|
||||
record how many times function b was found in the stack traces directly
|
||||
below function a.
|
||||
|
||||
Tip: if you use a debug build the output will include file and line
|
||||
number information and kcachegrind will show an annotated source code
|
||||
view.
|
||||
|
||||
=== Node Information
|
||||
|
||||
In the various graphical modes of pprof, the output is a call graph
|
||||
annotated with timing information, like so:
|
||||
|
||||
link:pprof-test-big.gif[]
|
||||
|
||||
image:pprof-test.gif[pprof-test]
|
||||
|
||||
Each node represents a procedure. The directed edges indicate caller to
|
||||
callee relations. Each node is formatted as follows:
|
||||
|
||||
....
|
||||
Class Name
|
||||
Method Name
|
||||
local (percentage)
|
||||
of cumulative (percentage)
|
||||
....
|
||||
|
||||
The last one or two lines contains the timing information. (The
|
||||
profiling is done via a sampling method, where by default we take 100
|
||||
samples a second. Therefore one unit of time in the output corresponds to
|
||||
about 10 milliseconds of execution time.) The "local" time is the time
|
||||
spent executing the instructions directly contained in the procedure
|
||||
(and in any other procedures that were inlined into the procedure). The
|
||||
"cumulative" time is the sum of the "local" time and the time spent in
|
||||
any callees. If the cumulative time is the same as the local time, it is
|
||||
not printed.
|
||||
|
||||
For instance, the timing information for test_main_thread() indicates
|
||||
that 155 units (about 1.55 seconds) were spent executing the code in
|
||||
`+test_main_thread()+` and 200 units were spent while executing
|
||||
`+test_main_thread()+` and its callees such as `+snprintf()+`.
|
||||
|
||||
The size of the node is proportional to the local count. The percentage
|
||||
displayed in the node corresponds to the count divided by the total run
|
||||
time of the program (that is, the cumulative count for `+main()+`).
|
||||
|
||||
=== Edge Information
|
||||
|
||||
An edge from one node to another indicates a caller to callee
|
||||
relationship. Each edge is labelled with the time spent by the callee on
|
||||
behalf of the caller. E.g, the edge from `+test_main_thread()+` to
|
||||
`+snprintf()+` indicates that of the 200 samples in
|
||||
`+test_main_thread()+`, 37 are because of calls to `+snprintf()+`.
|
||||
|
||||
Note that `+test_main_thread()+` has an edge to `+vsnprintf()+`, even
|
||||
though `+test_main_thread()+` doesn't call that function directly. This
|
||||
is because the code was compiled with `+-O2+`; the profile reflects the
|
||||
optimized control flow.
|
||||
|
||||
=== Meta Information
|
||||
|
||||
The top of the display should contain some meta information like:
|
||||
|
||||
....
|
||||
/tmp/profiler2_unittest
|
||||
Total samples: 202
|
||||
Focusing on: 202
|
||||
Dropped nodes with <= 1 abs(samples)
|
||||
Dropped edges with <= 0 samples
|
||||
....
|
||||
|
||||
This section contains the name of the program, and the total samples
|
||||
collected during the profiling run. If the `+--focus+` option is on (see
|
||||
the link:#focus[Focus] section below), the legend also contains the
|
||||
number of samples being shown in the focused display. Furthermore, some
|
||||
unimportant nodes and edges are dropped to reduce clutter. The
|
||||
characteristics of the dropped nodes and edges are also displayed in the
|
||||
legend.
|
||||
|
||||
=== [#focus]#Focus and Ignore#
|
||||
|
||||
You can ask pprof to generate a display focused on a particular piece of
|
||||
the program. You specify a regular expression. Any portion of the
|
||||
call-graph that is on a path which contains at least one node matching
|
||||
the regular expression is preserved. The rest of the call-graph is
|
||||
dropped on the floor. For example, you can focus on the `+vsnprintf()+`
|
||||
libc call in `+profiler2_unittest+` as follows:
|
||||
|
||||
....
|
||||
% pprof --gv --focus=vsnprintf /tmp/profiler2_unittest test.prof
|
||||
....
|
||||
|
||||
link:pprof-vsnprintf-big.gif[]
|
||||
|
||||
[cols="",]
|
||||
|===
|
||||
|image:pprof-vsnprintf.gif[pprof-vsnprintf]
|
||||
|===
|
||||
|
||||
Similarly, you can supply the `+--ignore+` option to ignore samples that
|
||||
match a specified regular expression. E.g., if you are interested in
|
||||
everything except calls to `+snprintf()+`, you can say:
|
||||
|
||||
....
|
||||
% pprof --gv --ignore=snprintf /tmp/profiler2_unittest test.prof
|
||||
....
|
||||
|
||||
=== Text interactive mode
|
||||
|
||||
By default -- if you don't specify any flags to the contrary -- pprof
|
||||
runs in interactive mode. At the `+(pprof)+` prompt, you can run many of
|
||||
the commands described above. You can type `+help+` for a list of what
|
||||
commands are available in interactive mode.
|
||||
|
||||
=== [#options]#pprof Options#
|
||||
|
||||
For a complete list of pprof options, you can run `+pprof --help+`.
|
||||
|
||||
==== Output Type
|
||||
|
||||
[width="100%",cols="50%,50%",]
|
||||
|===
|
||||
|`+--text+` |Produces a textual listing. (Note: If you have an X
|
||||
display, and `+dot+` and `+gv+` installed, you will probably be happier
|
||||
with the `+--gv+` output.)
|
||||
|
||||
|`+--gv+` |Generates annotated call-graph, converts to postscript, and
|
||||
displays via gv (requires `+dot+` and `+gv+` be installed).
|
||||
|
||||
|`+--dot+` |Generates the annotated call-graph in dot format and emits
|
||||
to stdout (requires `+dot+` be installed).
|
||||
|
||||
|`+--ps+` |Generates the annotated call-graph in Postscript format and
|
||||
emits to stdout (requires `+dot+` be installed).
|
||||
|
||||
|`+--pdf+` |Generates the annotated call-graph in PDF format and emits
|
||||
to stdout (requires `+dot+` and `+ps2pdf+` be installed).
|
||||
|
||||
|`+--gif+` |Generates the annotated call-graph in GIF format and emits
|
||||
to stdout (requires `+dot+` be installed).
|
||||
|
||||
|`--list=<__regexp__>` |
|
||||
Outputs source-code listing of routines whose name matches <regexp>.
|
||||
Each line in the listing is annotated with flat and cumulative sample
|
||||
counts.
|
||||
|
||||
In the presence of inlined calls, the samples associated with inlined
|
||||
code tend to get assigned to a line that follows the location of the
|
||||
inlined call. A more precise accounting can be obtained by disassembling
|
||||
the routine using the --disasm flag.
|
||||
|
||||
|`--disasm=<__regexp__>` |Generates disassembly of routines that
|
||||
match <regexp>, annotated with flat and cumulative sample counts and
|
||||
emits to stdout.
|
||||
|===
|
||||
|
||||
==== Reporting Granularity
|
||||
|
||||
By default, pprof produces one entry per procedure. However you can use
|
||||
one of the following options to change the granularity of the output.
|
||||
|
||||
[cols=2*]
|
||||
|===
|
||||
|`+--addresses+`
|
||||
|Produce one node per program address.
|
||||
|
||||
|`+--lines+`
|
||||
|Produce one node per source line.
|
||||
|
||||
|`+--functions+`
|
||||
|Produce one node per function (this is the default).
|
||||
|
||||
|`+--files+`
|
||||
|Produce one node per source file.
|
||||
|===
|
||||
|
||||
==== Controlling the Call Graph Display
|
||||
|
||||
Some nodes and edges are dropped to reduce clutter in the output
|
||||
display. The following options control this effect:
|
||||
|
||||
[cols=",",]
|
||||
|===
|
||||
|`+--nodecount=<n>+` |This option controls the number of displayed
|
||||
nodes. The nodes are first sorted by decreasing cumulative count, and
|
||||
then only the top N nodes are kept. The default value is 80.
|
||||
|
||||
|`+--nodefraction=<f>+` |This option provides another mechanism for
|
||||
discarding nodes from the display. If the cumulative count for a node is
|
||||
less than this option's value multiplied by the total count for the
|
||||
profile, the node is dropped. The default value is 0.005; i.e. nodes
|
||||
that account for less than half a percent of the total time are dropped.
|
||||
A node is dropped if either this condition is satisfied, or the
|
||||
--nodecount condition is satisfied.
|
||||
|
||||
|`+--edgefraction=<f>+` |This option controls the number of displayed
|
||||
edges. First of all, an edge is dropped if either its source or
|
||||
destination node is dropped. Otherwise, the edge is dropped if the
|
||||
sample count along the edge is less than this option's value multiplied
|
||||
by the total count for the profile. The default value is 0.001; i.e.,
|
||||
edges that account for less than 0.1% of the total time are dropped.
|
||||
|
||||
|`+--focus=<re>+` |This option controls what region of the graph is
|
||||
displayed based on the regular expression supplied with the option. For
|
||||
any path in the callgraph, we check all nodes in the path against the
|
||||
supplied regular expression. If none of the nodes match, the path is
|
||||
dropped from the output.
|
||||
|
||||
|`+--ignore=<re>+` |This option controls what region of the graph is
|
||||
displayed based on the regular expression supplied with the option. For
|
||||
any path in the callgraph, we check all nodes in the path against the
|
||||
supplied regular expression. If any of the nodes match, the path is
|
||||
dropped from the output.
|
||||
|===
|
||||
|
||||
The dropped edges and nodes account for some count mismatches in the
|
||||
display. For example, the cumulative count for `+snprintf()+` in the
|
||||
first diagram above was 41. However the local count (1) and the count
|
||||
along the outgoing edges (12+1+20+6) add up to only 40.
|
||||
|
||||
== Caveats
|
||||
|
||||
* If the program exits because of a signal, the generated profile will
|
||||
be incomplete, and may perhaps be completely empty.
|
||||
|
||||
* The displayed graph may have disconnected regions because of the
|
||||
edge-dropping heuristics described above.
|
||||
|
||||
* If the program linked in a library that was not compiled with enough
|
||||
symbolic information, all samples associated with the library may be
|
||||
charged to the last symbol found in the program before the
|
||||
library. This will artificially inflate the count for that symbol.
|
||||
|
||||
* If you run the program on one machine, and profile it on another,
|
||||
and the shared libraries are different on the two machines, the
|
||||
profiling output may be confusing: samples that fall within shared
|
||||
libraries may be assigned to arbitrary procedures.
|
||||
|
||||
* If your program forks, the children will also be profiled (since
|
||||
they inherit the same CPUPROFILE setting). Each process is profiled
|
||||
separately; to distinguish the child profiles from the parent profile
|
||||
and from each other, all children will have their process-id appended
|
||||
to the CPUPROFILE name.
|
||||
|
||||
* Due to a hack we use to trigger appending of pid in child processes,
|
||||
your profiles may end up named strangely if the first character of
|
||||
your CPUPROFILE variable has ascii value greater than 127. This should
|
||||
be exceedingly rare, but if you need to use such a name, just
|
||||
prepend `+./+` to your filename: `+CPUPROFILE=./Ägypten+`.
|
||||
|
||||
'''''
|
||||
|
||||
Original author: Sanjay Ghemawat +
|
||||
Last updated by: Aliaksei Kandratsenka
|
@ -1,536 +0,0 @@
|
||||
<!DOCTYPE HTML PUBLIC "-//IETF//DTD HTML//EN">
|
||||
<HTML>
|
||||
|
||||
<HEAD>
|
||||
<link rel="stylesheet" href="designstyle.css">
|
||||
<title>Gperftools CPU Profiler</title>
|
||||
</HEAD>
|
||||
|
||||
<BODY>
|
||||
|
||||
<p align=right>
|
||||
<i>Last modified
|
||||
<script type=text/javascript>
|
||||
var lm = new Date(document.lastModified);
|
||||
document.write(lm.toDateString());
|
||||
</script></i>
|
||||
</p>
|
||||
|
||||
<p>This is the CPU profiler we use at Google. There are three parts
|
||||
to using it: linking the library into an application, running the
|
||||
code, and analyzing the output.</p>
|
||||
|
||||
<p>On the off-chance that you should need to understand it, the CPU
|
||||
profiler data file format is documented separately,
|
||||
<a href="cpuprofile-fileformat.html">here</a>.
|
||||
|
||||
|
||||
<H1>Linking in the Library</H1>
|
||||
|
||||
<p>To install the CPU profiler into your executable, add
|
||||
<code>-lprofiler</code> to the link-time step for your executable.
|
||||
(It's also probably possible to add in the profiler at run-time using
|
||||
<code>LD_PRELOAD</code>, e.g.
|
||||
<code>% env LD_PRELOAD="/usr/lib/libprofiler.so" <binary></code>,
|
||||
but this isn't necessarily recommended.)</p>
|
||||
|
||||
<p>This does <i>not</i> turn on CPU profiling; it just inserts the
|
||||
code. For that reason, it's practical to just always link
|
||||
<code>-lprofiler</code> into a binary while developing; that's what we
|
||||
do at Google. (However, since any user can turn on the profiler by
|
||||
setting an environment variable, it's not necessarily recommended to
|
||||
install profiler-linked binaries into a production, running
|
||||
system.)</p>
|
||||
|
||||
|
||||
<H1>Running the Code</H1>
|
||||
|
||||
<p>There are several alternatives to actually turn on CPU profiling
|
||||
for a given run of an executable:</p>
|
||||
|
||||
<ol>
|
||||
<li> <p>Define the environment variable CPUPROFILE to the filename
|
||||
to dump the profile to. For instance, if you had a version of
|
||||
<code>/bin/ls</code> that had been linked against libprofiler,
|
||||
you could run:</p>
|
||||
<pre>% env CPUPROFILE=ls.prof /bin/ls</pre>
|
||||
</li>
|
||||
<li> <p>In addition to defining the environment variable CPUPROFILE
|
||||
you can also define CPUPROFILESIGNAL. This allows profiling to be
|
||||
controlled via the signal number that you specify. The signal number
|
||||
must be unused by the program under normal operation. Internally it
|
||||
acts as a switch, triggered by the signal, which is off by default.
|
||||
For instance, if you had a copy of <code>/bin/chrome</code> that had been
|
||||
been linked against libprofiler, you could run:</p>
|
||||
<pre>% env CPUPROFILE=chrome.prof CPUPROFILESIGNAL=12 /bin/chrome &</pre>
|
||||
<p>You can then trigger profiling to start:</p>
|
||||
<pre>% killall -12 chrome</pre>
|
||||
<p>Then after a period of time you can tell it to stop which will
|
||||
generate the profile:</p>
|
||||
<pre>% killall -12 chrome</pre>
|
||||
</li>
|
||||
<li> <p>In your code, bracket the code you want profiled in calls to
|
||||
<code>ProfilerStart()</code> and <code>ProfilerStop()</code>.
|
||||
(These functions are declared in <code><gperftools/profiler.h></code>.)
|
||||
<code>ProfilerStart()</code> will take
|
||||
the profile-filename as an argument.</p>
|
||||
</li>
|
||||
</ol>
|
||||
|
||||
<p>In Linux 2.6 and above, profiling works correctly with threads,
|
||||
automatically profiling all threads. In Linux 2.4, profiling only
|
||||
profiles the main thread (due to a kernel bug involving itimers and
|
||||
threads). Profiling works correctly with sub-processes: each child
|
||||
process gets its own profile with its own name (generated by combining
|
||||
CPUPROFILE with the child's process id).</p>
|
||||
|
||||
<p>For security reasons, CPU profiling will not write to a file -- and
|
||||
is thus not usable -- for setuid programs.</p>
|
||||
|
||||
<p>See the include-file <code>gperftools/profiler.h</code> for
|
||||
advanced-use functions, including <code>ProfilerFlush()</code> and
|
||||
<code>ProfilerStartWithOptions()</code>.</p>
|
||||
|
||||
|
||||
<H2>Modifying Runtime Behavior</H2>
|
||||
|
||||
<p>You can more finely control the behavior of the CPU profiler via
|
||||
environment variables.</p>
|
||||
|
||||
<table frame=box rules=sides cellpadding=5 width=100%>
|
||||
|
||||
<tr valign=top>
|
||||
<td><code>CPUPROFILE_FREQUENCY=<i>x</i></code></td>
|
||||
<td>default: 100</td>
|
||||
<td>
|
||||
How many interrupts/second the cpu-profiler samples.
|
||||
</td>
|
||||
</tr>
|
||||
|
||||
<tr valign=top>
|
||||
<td><code>CPUPROFILE_REALTIME=1</code></td>
|
||||
<td>default: [not set]</td>
|
||||
<td>
|
||||
If set to any value (including 0 or the empty string), use
|
||||
ITIMER_REAL instead of ITIMER_PROF to gather profiles. In
|
||||
general, ITIMER_REAL is not as accurate as ITIMER_PROF, and also
|
||||
interacts badly with use of alarm(), so prefer ITIMER_PROF unless
|
||||
you have a reason prefer ITIMER_REAL.
|
||||
</td>
|
||||
</tr>
|
||||
|
||||
</table>
|
||||
|
||||
|
||||
<h1><a name="pprof">Analyzing the Output</a></h1>
|
||||
|
||||
<p><code>pprof</code> is the script used to analyze a profile. It has
|
||||
many output modes, both textual and graphical. Some give just raw
|
||||
numbers, much like the <code>-pg</code> output of <code>gcc</code>,
|
||||
and others show the data in the form of a dependency graph.</p>
|
||||
|
||||
<p>pprof <b>requires</b> <code>perl5</code> to be installed to run.
|
||||
It also requires <code>dot</code> to be installed for any of the
|
||||
graphical output routines, and <code>gv</code> to be installed for
|
||||
<code>--gv</code> mode (described below).
|
||||
</p>
|
||||
|
||||
<p>Here are some ways to call pprof. These are described in more
|
||||
detail below.</p>
|
||||
|
||||
<pre>
|
||||
% pprof /bin/ls ls.prof
|
||||
Enters "interactive" mode
|
||||
% pprof --text /bin/ls ls.prof
|
||||
Outputs one line per procedure
|
||||
% pprof --gv /bin/ls ls.prof
|
||||
Displays annotated call-graph via 'gv'
|
||||
% pprof --gv --focus=Mutex /bin/ls ls.prof
|
||||
Restricts to code paths including a .*Mutex.* entry
|
||||
% pprof --gv --focus=Mutex --ignore=string /bin/ls ls.prof
|
||||
Code paths including Mutex but not string
|
||||
% pprof --list=getdir /bin/ls ls.prof
|
||||
(Per-line) annotated source listing for getdir()
|
||||
% pprof --disasm=getdir /bin/ls ls.prof
|
||||
(Per-PC) annotated disassembly for getdir()
|
||||
% pprof --text localhost:1234
|
||||
Outputs one line per procedure for localhost:1234
|
||||
% pprof --callgrind /bin/ls ls.prof
|
||||
Outputs the call information in callgrind format
|
||||
</pre>
|
||||
|
||||
|
||||
<h3>Analyzing Text Output</h3>
|
||||
|
||||
<p>Text mode has lines of output that look like this:</p>
|
||||
<pre>
|
||||
14 2.1% 17.2% 58 8.7% std::_Rb_tree::find
|
||||
</pre>
|
||||
|
||||
<p>Here is how to interpret the columns:</p>
|
||||
<ol>
|
||||
<li> Number of profiling samples in this function
|
||||
<li> Percentage of profiling samples in this function
|
||||
<li> Percentage of profiling samples in the functions printed so far
|
||||
<li> Number of profiling samples in this function and its callees
|
||||
<li> Percentage of profiling samples in this function and its callees
|
||||
<li> Function name
|
||||
</ol>
|
||||
|
||||
<h3>Analyzing Callgrind Output</h3>
|
||||
|
||||
<p>Use <a href="http://kcachegrind.sourceforge.net">kcachegrind</a> to
|
||||
analyze your callgrind output:</p>
|
||||
<pre>
|
||||
% pprof --callgrind /bin/ls ls.prof > ls.callgrind
|
||||
% kcachegrind ls.callgrind
|
||||
</pre>
|
||||
|
||||
<p>The cost is specified in 'hits', i.e. how many times a function
|
||||
appears in the recorded call stack information. The 'calls' from
|
||||
function a to b record how many times function b was found in the
|
||||
stack traces directly below function a.</p>
|
||||
|
||||
<p>Tip: if you use a debug build the output will include file and line
|
||||
number information and kcachegrind will show an annotated source
|
||||
code view.</p>
|
||||
|
||||
<h3>Node Information</h3>
|
||||
|
||||
<p>In the various graphical modes of pprof, the output is a call graph
|
||||
annotated with timing information, like so:</p>
|
||||
|
||||
<A HREF="pprof-test-big.gif">
|
||||
<center><table><tr><td>
|
||||
<img src="pprof-test.gif">
|
||||
</td></tr></table></center>
|
||||
</A>
|
||||
|
||||
<p>Each node represents a procedure. The directed edges indicate
|
||||
caller to callee relations. Each node is formatted as follows:</p>
|
||||
|
||||
<center><pre>
|
||||
Class Name
|
||||
Method Name
|
||||
local (percentage)
|
||||
<b>of</b> cumulative (percentage)
|
||||
</pre></center>
|
||||
|
||||
<p>The last one or two lines contains the timing information. (The
|
||||
profiling is done via a sampling method, where by default we take 100
|
||||
samples a second. Therefor one unit of time in the output corresponds
|
||||
to about 10 milliseconds of execution time.) The "local" time is the
|
||||
time spent executing the instructions directly contained in the
|
||||
procedure (and in any other procedures that were inlined into the
|
||||
procedure). The "cumulative" time is the sum of the "local" time and
|
||||
the time spent in any callees. If the cumulative time is the same as
|
||||
the local time, it is not printed.</p>
|
||||
|
||||
<p>For instance, the timing information for test_main_thread()
|
||||
indicates that 155 units (about 1.55 seconds) were spent executing the
|
||||
code in <code>test_main_thread()</code> and 200 units were spent while
|
||||
executing <code>test_main_thread()</code> and its callees such as
|
||||
<code>snprintf()</code>.</p>
|
||||
|
||||
<p>The size of the node is proportional to the local count. The
|
||||
percentage displayed in the node corresponds to the count divided by
|
||||
the total run time of the program (that is, the cumulative count for
|
||||
<code>main()</code>).</p>
|
||||
|
||||
<h3>Edge Information</h3>
|
||||
|
||||
<p>An edge from one node to another indicates a caller to callee
|
||||
relationship. Each edge is labelled with the time spent by the callee
|
||||
on behalf of the caller. E.g, the edge from
|
||||
<code>test_main_thread()</code> to <code>snprintf()</code> indicates
|
||||
that of the 200 samples in <code>test_main_thread()</code>, 37 are
|
||||
because of calls to <code>snprintf()</code>.</p>
|
||||
|
||||
<p>Note that <code>test_main_thread()</code> has an edge to
|
||||
<code>vsnprintf()</code>, even though <code>test_main_thread()</code>
|
||||
doesn't call that function directly. This is because the code was
|
||||
compiled with <code>-O2</code>; the profile reflects the optimized
|
||||
control flow.</p>
|
||||
|
||||
<h3>Meta Information</h3>
|
||||
|
||||
<p>The top of the display should contain some meta information
|
||||
like:</p>
|
||||
<pre>
|
||||
/tmp/profiler2_unittest
|
||||
Total samples: 202
|
||||
Focusing on: 202
|
||||
Dropped nodes with <= 1 abs(samples)
|
||||
Dropped edges with <= 0 samples
|
||||
</pre>
|
||||
|
||||
<p>This section contains the name of the program, and the total
|
||||
samples collected during the profiling run. If the
|
||||
<code>--focus</code> option is on (see the <a href="#focus">Focus</a>
|
||||
section below), the legend also contains the number of samples being
|
||||
shown in the focused display. Furthermore, some unimportant nodes and
|
||||
edges are dropped to reduce clutter. The characteristics of the
|
||||
dropped nodes and edges are also displayed in the legend.</p>
|
||||
|
||||
<h3><a name=focus>Focus and Ignore</a></h3>
|
||||
|
||||
<p>You can ask pprof to generate a display focused on a particular
|
||||
piece of the program. You specify a regular expression. Any portion
|
||||
of the call-graph that is on a path which contains at least one node
|
||||
matching the regular expression is preserved. The rest of the
|
||||
call-graph is dropped on the floor. For example, you can focus on the
|
||||
<code>vsnprintf()</code> libc call in <code>profiler2_unittest</code>
|
||||
as follows:</p>
|
||||
|
||||
<pre>
|
||||
% pprof --gv --focus=vsnprintf /tmp/profiler2_unittest test.prof
|
||||
</pre>
|
||||
<A HREF="pprof-vsnprintf-big.gif">
|
||||
<center><table><tr><td>
|
||||
<img src="pprof-vsnprintf.gif">
|
||||
</td></tr></table></center>
|
||||
</A>
|
||||
|
||||
<p>Similarly, you can supply the <code>--ignore</code> option to
|
||||
ignore samples that match a specified regular expression. E.g., if
|
||||
you are interested in everything except calls to
|
||||
<code>snprintf()</code>, you can say:</p>
|
||||
<pre>
|
||||
% pprof --gv --ignore=snprintf /tmp/profiler2_unittest test.prof
|
||||
</pre>
|
||||
|
||||
|
||||
<h3>Interactive mode</a></h3>
|
||||
|
||||
<p>By default -- if you don't specify any flags to the contrary --
|
||||
pprof runs in interactive mode. At the <code>(pprof)</code> prompt,
|
||||
you can run many of the commands described above. You can type
|
||||
<code>help</code> for a list of what commands are available in
|
||||
interactive mode.</p>
|
||||
|
||||
<h3><a name=options>pprof Options</a></h3>
|
||||
|
||||
For a complete list of pprof options, you can run <code>pprof
|
||||
--help</code>.
|
||||
|
||||
<h4>Output Type</h4>
|
||||
|
||||
<p>
|
||||
<center>
|
||||
<table frame=box rules=sides cellpadding=5 width=100%>
|
||||
<tr valign=top>
|
||||
<td><code>--text</code></td>
|
||||
<td>
|
||||
Produces a textual listing. (Note: If you have an X display, and
|
||||
<code>dot</code> and <code>gv</code> installed, you will probably
|
||||
be happier with the <code>--gv</code> output.)
|
||||
</td>
|
||||
</tr>
|
||||
<tr valign=top>
|
||||
<td><code>--gv</code></td>
|
||||
<td>
|
||||
Generates annotated call-graph, converts to postscript, and
|
||||
displays via gv (requres <code>dot</code> and <code>gv</code> be
|
||||
installed).
|
||||
</td>
|
||||
</tr>
|
||||
<tr valign=top>
|
||||
<td><code>--dot</code></td>
|
||||
<td>
|
||||
Generates the annotated call-graph in dot format and
|
||||
emits to stdout (requres <code>dot</code> be installed).
|
||||
</td>
|
||||
</tr>
|
||||
<tr valign=top>
|
||||
<td><code>--ps</code></td>
|
||||
<td>
|
||||
Generates the annotated call-graph in Postscript format and
|
||||
emits to stdout (requres <code>dot</code> be installed).
|
||||
</td>
|
||||
</tr>
|
||||
<tr valign=top>
|
||||
<td><code>--pdf</code></td>
|
||||
<td>
|
||||
Generates the annotated call-graph in PDF format and emits to
|
||||
stdout (requires <code>dot</code> and <code>ps2pdf</code> be
|
||||
installed).
|
||||
</td>
|
||||
</tr>
|
||||
<tr valign=top>
|
||||
<td><code>--gif</code></td>
|
||||
<td>
|
||||
Generates the annotated call-graph in GIF format and
|
||||
emits to stdout (requres <code>dot</code> be installed).
|
||||
</td>
|
||||
</tr>
|
||||
<tr valign=top>
|
||||
<td><code>--list=<<i>regexp</i>></code></td>
|
||||
<td>
|
||||
<p>Outputs source-code listing of routines whose
|
||||
name matches <regexp>. Each line
|
||||
in the listing is annotated with flat and cumulative
|
||||
sample counts.</p>
|
||||
|
||||
<p>In the presence of inlined calls, the samples
|
||||
associated with inlined code tend to get assigned
|
||||
to a line that follows the location of the
|
||||
inlined call. A more precise accounting can be
|
||||
obtained by disassembling the routine using the
|
||||
--disasm flag.</p>
|
||||
</td>
|
||||
</tr>
|
||||
<tr valign=top>
|
||||
<td><code>--disasm=<<i>regexp</i>></code></td>
|
||||
<td>
|
||||
Generates disassembly of routines that match
|
||||
<regexp>, annotated with flat and
|
||||
cumulative sample counts and emits to stdout.
|
||||
</td>
|
||||
</tr>
|
||||
</table>
|
||||
</center>
|
||||
|
||||
<h4>Reporting Granularity</h4>
|
||||
|
||||
<p>By default, pprof produces one entry per procedure. However you can
|
||||
use one of the following options to change the granularity of the
|
||||
output. The <code>--files</code> option seems to be particularly
|
||||
useless, and may be removed eventually.</p>
|
||||
|
||||
<center>
|
||||
<table frame=box rules=sides cellpadding=5 width=100%>
|
||||
<tr valign=top>
|
||||
<td><code>--addresses</code></td>
|
||||
<td>
|
||||
Produce one node per program address.
|
||||
</td>
|
||||
</tr>
|
||||
<td><code>--lines</code></td>
|
||||
<td>
|
||||
Produce one node per source line.
|
||||
</td>
|
||||
</tr>
|
||||
<td><code>--functions</code></td>
|
||||
<td>
|
||||
Produce one node per function (this is the default).
|
||||
</td>
|
||||
</tr>
|
||||
<td><code>--files</code></td>
|
||||
<td>
|
||||
Produce one node per source file.
|
||||
</td>
|
||||
</tr>
|
||||
</table>
|
||||
</center>
|
||||
|
||||
<h4>Controlling the Call Graph Display</h4>
|
||||
|
||||
<p>Some nodes and edges are dropped to reduce clutter in the output
|
||||
display. The following options control this effect:</p>
|
||||
|
||||
<center>
|
||||
<table frame=box rules=sides cellpadding=5 width=100%>
|
||||
<tr valign=top>
|
||||
<td><code>--nodecount=<n></code></td>
|
||||
<td>
|
||||
This option controls the number of displayed nodes. The nodes
|
||||
are first sorted by decreasing cumulative count, and then only
|
||||
the top N nodes are kept. The default value is 80.
|
||||
</td>
|
||||
</tr>
|
||||
<tr valign=top>
|
||||
<td><code>--nodefraction=<f></code></td>
|
||||
<td>
|
||||
This option provides another mechanism for discarding nodes
|
||||
from the display. If the cumulative count for a node is
|
||||
less than this option's value multiplied by the total count
|
||||
for the profile, the node is dropped. The default value
|
||||
is 0.005; i.e. nodes that account for less than
|
||||
half a percent of the total time are dropped. A node
|
||||
is dropped if either this condition is satisfied, or the
|
||||
--nodecount condition is satisfied.
|
||||
</td>
|
||||
</tr>
|
||||
<tr valign=top>
|
||||
<td><code>--edgefraction=<f></code></td>
|
||||
<td>
|
||||
This option controls the number of displayed edges. First of all,
|
||||
an edge is dropped if either its source or destination node is
|
||||
dropped. Otherwise, the edge is dropped if the sample
|
||||
count along the edge is less than this option's value multiplied
|
||||
by the total count for the profile. The default value is
|
||||
0.001; i.e., edges that account for less than
|
||||
0.1% of the total time are dropped.
|
||||
</td>
|
||||
</tr>
|
||||
<tr valign=top>
|
||||
<td><code>--focus=<re></code></td>
|
||||
<td>
|
||||
This option controls what region of the graph is displayed
|
||||
based on the regular expression supplied with the option.
|
||||
For any path in the callgraph, we check all nodes in the path
|
||||
against the supplied regular expression. If none of the nodes
|
||||
match, the path is dropped from the output.
|
||||
</td>
|
||||
</tr>
|
||||
<tr valign=top>
|
||||
<td><code>--ignore=<re></code></td>
|
||||
<td>
|
||||
This option controls what region of the graph is displayed
|
||||
based on the regular expression supplied with the option.
|
||||
For any path in the callgraph, we check all nodes in the path
|
||||
against the supplied regular expression. If any of the nodes
|
||||
match, the path is dropped from the output.
|
||||
</td>
|
||||
</tr>
|
||||
</table>
|
||||
</center>
|
||||
|
||||
<p>The dropped edges and nodes account for some count mismatches in
|
||||
the display. For example, the cumulative count for
|
||||
<code>snprintf()</code> in the first diagram above was 41. However
|
||||
the local count (1) and the count along the outgoing edges (12+1+20+6)
|
||||
add up to only 40.</p>
|
||||
|
||||
|
||||
<h1>Caveats</h1>
|
||||
|
||||
<ul>
|
||||
<li> If the program exits because of a signal, the generated profile
|
||||
will be <font color=red>incomplete, and may perhaps be
|
||||
completely empty</font>.
|
||||
<li> The displayed graph may have disconnected regions because
|
||||
of the edge-dropping heuristics described above.
|
||||
<li> If the program linked in a library that was not compiled
|
||||
with enough symbolic information, all samples associated
|
||||
with the library may be charged to the last symbol found
|
||||
in the program before the library. This will artificially
|
||||
inflate the count for that symbol.
|
||||
<li> If you run the program on one machine, and profile it on
|
||||
another, and the shared libraries are different on the two
|
||||
machines, the profiling output may be confusing: samples that
|
||||
fall within shared libaries may be assigned to arbitrary
|
||||
procedures.
|
||||
<li> If your program forks, the children will also be profiled
|
||||
(since they inherit the same CPUPROFILE setting). Each process
|
||||
is profiled separately; to distinguish the child profiles from
|
||||
the parent profile and from each other, all children will have
|
||||
their process-id appended to the CPUPROFILE name.
|
||||
<li> Due to a hack we make to work around a possible gcc bug, your
|
||||
profiles may end up named strangely if the first character of
|
||||
your CPUPROFILE variable has ascii value greater than 127.
|
||||
This should be exceedingly rare, but if you need to use such a
|
||||
name, just set prepend <code>./</code> to your filename:
|
||||
<code>CPUPROFILE=./Ägypten</code>.
|
||||
</ul>
|
||||
|
||||
|
||||
<hr>
|
||||
<address>Sanjay Ghemawat<br>
|
||||
<!-- Created: Tue Dec 19 10:43:14 PST 2000 -->
|
||||
<!-- hhmts start -->
|
||||
Last modified: Fri May 9 14:41:29 PDT 2008
|
||||
<!-- hhmts end -->
|
||||
</address>
|
||||
</BODY>
|
||||
</HTML>
|
@ -1,109 +0,0 @@
|
||||
body {
|
||||
background-color: #ffffff;
|
||||
color: black;
|
||||
margin-right: 1in;
|
||||
margin-left: 1in;
|
||||
}
|
||||
|
||||
|
||||
h1, h2, h3, h4, h5, h6 {
|
||||
color: #3366ff;
|
||||
font-family: sans-serif;
|
||||
}
|
||||
@media print {
|
||||
/* Darker version for printing */
|
||||
h1, h2, h3, h4, h5, h6 {
|
||||
color: #000080;
|
||||
font-family: helvetica, sans-serif;
|
||||
}
|
||||
}
|
||||
|
||||
h1 {
|
||||
text-align: center;
|
||||
font-size: 18pt;
|
||||
}
|
||||
h2 {
|
||||
margin-left: -0.5in;
|
||||
}
|
||||
h3 {
|
||||
margin-left: -0.25in;
|
||||
}
|
||||
h4 {
|
||||
margin-left: -0.125in;
|
||||
}
|
||||
hr {
|
||||
margin-left: -1in;
|
||||
}
|
||||
|
||||
/* Definition lists: definition term bold */
|
||||
dt {
|
||||
font-weight: bold;
|
||||
}
|
||||
|
||||
address {
|
||||
text-align: right;
|
||||
}
|
||||
/* Use the <code> tag for bits of code and <var> for variables and objects. */
|
||||
code,pre,samp,var {
|
||||
color: #006000;
|
||||
}
|
||||
/* Use the <file> tag for file and directory paths and names. */
|
||||
file {
|
||||
color: #905050;
|
||||
font-family: monospace;
|
||||
}
|
||||
/* Use the <kbd> tag for stuff the user should type. */
|
||||
kbd {
|
||||
color: #600000;
|
||||
}
|
||||
div.note p {
|
||||
float: right;
|
||||
width: 3in;
|
||||
margin-right: 0%;
|
||||
padding: 1px;
|
||||
border: 2px solid #6060a0;
|
||||
background-color: #fffff0;
|
||||
}
|
||||
|
||||
UL.nobullets {
|
||||
list-style-type: none;
|
||||
list-style-image: none;
|
||||
margin-left: -1em;
|
||||
}
|
||||
|
||||
/* pretty printing styles. See prettify.js */
|
||||
.str { color: #080; }
|
||||
.kwd { color: #008; }
|
||||
.com { color: #800; }
|
||||
.typ { color: #606; }
|
||||
.lit { color: #066; }
|
||||
.pun { color: #660; }
|
||||
.pln { color: #000; }
|
||||
.tag { color: #008; }
|
||||
.atn { color: #606; }
|
||||
.atv { color: #080; }
|
||||
pre.prettyprint { padding: 2px; border: 1px solid #888; }
|
||||
|
||||
.embsrc { background: #eee; }
|
||||
|
||||
@media print {
|
||||
.str { color: #060; }
|
||||
.kwd { color: #006; font-weight: bold; }
|
||||
.com { color: #600; font-style: italic; }
|
||||
.typ { color: #404; font-weight: bold; }
|
||||
.lit { color: #044; }
|
||||
.pun { color: #440; }
|
||||
.pln { color: #000; }
|
||||
.tag { color: #006; font-weight: bold; }
|
||||
.atn { color: #404; }
|
||||
.atv { color: #060; }
|
||||
}
|
||||
|
||||
/* Table Column Headers */
|
||||
.hdr {
|
||||
color: #006;
|
||||
font-weight: bold;
|
||||
background-color: #dddddd; }
|
||||
.hdr2 {
|
||||
color: #006;
|
||||
background-color: #eeeeee; }
|
2
docs/dots/README
Normal file
@ -0,0 +1,2 @@
|
||||
This directory contains original graphviz sources of diagrams used in
|
||||
gperftools docs.
|
293
docs/heapprofile.adoc
Normal file
@ -0,0 +1,293 @@
|
||||
= Gperftools Heap Profiler
|
||||
Original author: Sanjay Ghemawat
|
||||
|
||||
:reproducible:
|
||||
|
||||
[.normal]
|
||||
|
||||
This is the heap profiler originally developed at Google, to explore
|
||||
how C++ programs manage memory. This facility can be useful for
|
||||
|
||||
* Figuring out what is in the program heap at any given time
|
||||
* Locating memory leaks
|
||||
* Finding places that do a lot of allocation
|
||||
|
||||
The profiling system instruments all allocations and frees. It
|
||||
keeps track of various pieces of information per allocation site. An
|
||||
allocation site is defined as the active stack trace at the call to
|
||||
`malloc`, `calloc`, `realloc`, or, `new`.
|
||||
|
||||
There are three parts to using it: linking the library into an
|
||||
application, running the code, and analyzing the output.
|
||||
|
||||
== Linking in the Library
|
||||
|
||||
To install the heap profiler into your executable, add
|
||||
`-ltcmalloc` to the link-time step for your executable.
|
||||
Also, while we don't necessarily recommend this form of usage, it's
|
||||
possible to add in the profiler at run-time using
|
||||
`LD_PRELOAD`:
|
||||
|
||||
% env LD_PRELOAD="/usr/lib/libtcmalloc.so" <binary>
|
||||
|
||||
This does _not_ turn on heap profiling; it just inserts the
|
||||
code. For that reason, it's practical to just always link
|
||||
`-ltcmalloc` into a binary while developing; that's what we
|
||||
do at Google. (However, since any user can turn on the profiler by
|
||||
setting an environment variable, it's not necessarily recommended to
|
||||
install profiler-linked binaries into a production, running
|
||||
system.) Note that if you wish to use the heap profiler, you must
|
||||
also use the tcmalloc memory-allocation library. There is no way
|
||||
currently to use the heap profiler separate from tcmalloc.
|
||||
|
||||
== Running the Code
|
||||
|
||||
There are several alternatives to actually turn on heap profiling for
|
||||
a given run of an executable:
|
||||
|
||||
. Define the environment variable HEAPPROFILE to the filename
|
||||
to dump the profile to. For instance, to profile
|
||||
`/usr/local/bin/my_binary_compiled_with_tcmalloc`:
|
||||
|
||||
% env HEAPPROFILE=/tmp/mybin.hprof /usr/local/bin/my_binary_compiled_with_tcmalloc
|
||||
|
||||
. In your code, bracket the code you want profiled in calls to
|
||||
`HeapProfilerStart()` and `HeapProfilerStop()`. (These functions are
|
||||
declared in `<gperftools/heap-profiler.h>`.) `HeapProfilerStart()`
|
||||
will take the profile-filename-prefix as an argument. Then, as often
|
||||
as you'd like before calling `HeapProfilerStop()`, you can use
|
||||
`HeapProfilerDump()` or `GetHeapProfile()` to examine the profile. In
|
||||
case it's useful, `IsHeapProfilerRunning()` will tell you whether
|
||||
you've already called `HeapProfilerStart()` or not.
|
||||
|
||||
For security reasons, heap profiling will not write to a file -- and
|
||||
is thus not usable -- for setuid programs.
|
||||
|
||||
=== Modifying Runtime Behavior
|
||||
|
||||
You can more finely control the behavior of the heap profiler via
|
||||
environment variables.
|
||||
|
||||
[cols=3*]
|
||||
|===
|
||||
|`HEAP_PROFILE_ALLOCATION_INTERVAL`
|
||||
|default: 1073741824 (1 Gb)
|
||||
|Dump heap profiling information each time the specified number of
|
||||
bytes has been allocated by the program.
|
||||
|
||||
|`HEAP_PROFILE_INUSE_INTERVAL`
|
||||
|default: 104857600 (100 Mb)
|
||||
|Dump heap profiling information whenever the high-water memory
|
||||
usage mark increases by the specified number of bytes.
|
||||
|
||||
|`HEAP_PROFILE_TIME_INTERVAL`
|
||||
|default: 0
|
||||
|Dump heap profiling information each time the specified
|
||||
number of seconds has elapsed.
|
||||
|
||||
|`HEAPPROFILESIGNAL`
|
||||
|default: disabled
|
||||
|Dump heap profiling information whenever the specified signal is sent to the
|
||||
process.
|
||||
|
||||
|`HEAP_PROFILE_MMAP`
|
||||
|default: false
|
||||
|Profile `mmap`, `mremap` and `sbrk`
|
||||
calls in addition
|
||||
to `malloc`, `calloc`, `realloc`,
|
||||
and `new`. *NOTE:* this causes the profiler to
|
||||
profile calls internal to tcmalloc, since tcmalloc and friends use
|
||||
mmap and sbrk internally for allocations. One partial solution is
|
||||
to filter these allocations out when running `pprof`,
|
||||
with something like
|
||||
`pprof --ignore='DoAllocWithArena\|SbrkSysAllocator::Alloc\|MmapSysAllocator::Alloc'`.
|
||||
|
||||
|`HEAP_PROFILE_ONLY_MMAP`
|
||||
|default: false
|
||||
|Only profile `mmap`, `mremap`, and `sbrk`
|
||||
calls; do not profile
|
||||
`malloc`, `calloc`, `realloc`,
|
||||
or `new`.
|
||||
|
||||
|`HEAP_PROFILE_MMAP_LOG`
|
||||
|default: false
|
||||
|Log `mmap`/`munmap` calls.
|
||||
|===
|
||||
|
||||
== Analyzing the Output
|
||||
|
||||
If heap-profiling is turned on in a program, the program will
|
||||
periodically write profiles to the filesystem. The sequence of
|
||||
profiles will be named:
|
||||
|
||||
<prefix>.0000.heap
|
||||
<prefix>.0001.heap
|
||||
<prefix>.0002.heap
|
||||
...
|
||||
|
||||
where `<prefix>` is the filename-prefix supplied
|
||||
when running the code (e.g. via the `HEAPPROFILE`
|
||||
environment variable). Note that if the supplied prefix
|
||||
does not start with a `/`, the profile files will be
|
||||
written to the program's working directory.
|
||||
|
||||
The profile output can be viewed by passing it to the
|
||||
`pprof` tool -- the same tool that's used to analyze
|
||||
link:cpuprofile.html[CPU profiles].
|
||||
|
||||
Here are some examples. These examples assume the binary is named
|
||||
`gfs_master`, and a sequence of heap profile files can be
|
||||
found in files named:
|
||||
|
||||
/tmp/profile.0001.heap
|
||||
/tmp/profile.0002.heap
|
||||
...
|
||||
/tmp/profile.0100.heap
|
||||
|
||||
=== Why is a process so big
|
||||
|
||||
% pprof --gv gfs_master /tmp/profile.0100.heap
|
||||
|
||||
This command will pop-up a `gv` window that displays
|
||||
the profile information as a directed graph. Here is a portion
|
||||
of the resulting output:
|
||||
|
||||
image::heap-example1.png[]
|
||||
|
||||
A few explanations:
|
||||
|
||||
* `GFS_MasterChunk::AddServer` accounts for 255.6 MB
|
||||
of the live memory, which is 25% of the total live memory.
|
||||
* `GFS_MasterChunkTable::UpdateState` is directly
|
||||
accountable for 176.2 MB of the live memory (i.e., it directly
|
||||
allocated 176.2 MB that has not been freed yet). Furthermore,
|
||||
it and its callees are responsible for 729.9 MB. The
|
||||
labels on the outgoing edges give a good indication of the
|
||||
amount allocated by each callee.
|
||||
|
||||
=== Comparing Profiles
|
||||
|
||||
You often want to skip allocations during the initialization phase
|
||||
of a program so you can find gradual memory leaks. One simple way to
|
||||
do this is to compare two profiles -- both collected after the program
|
||||
has been running for a while. Specify the name of the first profile
|
||||
using the `--base` option. For example:
|
||||
|
||||
% pprof --base=/tmp/profile.0004.heap gfs_master /tmp/profile.0100.heap
|
||||
|
||||
The memory-usage in `/tmp/profile.0004.heap` will be
|
||||
subtracted from the memory-usage in
|
||||
`/tmp/profile.0100.heap` and the result will be
|
||||
displayed.
|
||||
|
||||
=== Text display
|
||||
|
||||
% pprof --text gfs_master /tmp/profile.0100.heap
|
||||
255.6 24.7% 24.7% 255.6 24.7% GFS_MasterChunk::AddServer
|
||||
184.6 17.8% 42.5% 298.8 28.8% GFS_MasterChunkTable::Create
|
||||
176.2 17.0% 59.5% 729.9 70.5% GFS_MasterChunkTable::UpdateState
|
||||
169.8 16.4% 75.9% 169.8 16.4% PendingClone::PendingClone
|
||||
76.3 7.4% 83.3% 76.3 7.4% __default_alloc_template::_S_chunk_alloc
|
||||
49.5 4.8% 88.0% 49.5 4.8% hashtable::resize
|
||||
...
|
||||
|
||||
* The first column contains the direct memory use in MB.
|
||||
* The fourth column contains memory use by the procedure
|
||||
and all of its callees.
|
||||
* The second and fifth columns are just percentage
|
||||
representations of the numbers in the first and fourth columns.
|
||||
* The third column is a cumulative sum of the second column
|
||||
(i.e., the `k`-th entry in the third column is the
|
||||
sum of the first `k` entries in the second column.)
|
||||
|
||||
=== Ignoring or focusing on specific regions
|
||||
|
||||
The following command will give a graphical display of a subset of
|
||||
the call-graph. Only paths in the call-graph that match the regular
|
||||
expression `DataBuffer` are included:
|
||||
|
||||
% pprof --gv --focus=DataBuffer gfs_master /tmp/profile.0100.heap
|
||||
|
||||
Similarly, the following command will omit a subset of the
|
||||
call-graph. All paths in the call-graph that match the regular
|
||||
expression `DataBuffer` are discarded:
|
||||
|
||||
% pprof --gv --ignore=DataBuffer gfs_master /tmp/profile.0100.heap
|
||||
|
||||
=== Total allocations + object-level information
|
||||
|
||||
All of the previous examples have displayed the amount of in-use
|
||||
space. I.e., the number of bytes that have been allocated but not
|
||||
freed. You can also get other types of information by supplying a
|
||||
flag to `pprof`:
|
||||
|
||||
[cols=2*]
|
||||
|===
|
||||
|`--inuse_space`
|
||||
|Display the number of in-use megabytes (i.e. space that has
|
||||
been allocated but not freed). This is the default.
|
||||
|
||||
|`--inuse_objects`
|
||||
|Display the number of in-use objects (i.e. number of
|
||||
objects that have been allocated but not freed).
|
||||
|
||||
|`--alloc_space`
|
||||
|Display the number of allocated megabytes. This includes
|
||||
the space that has since been de-allocated. Use this
|
||||
if you want to find the main allocation sites in the
|
||||
program.
|
||||
|
||||
|`--alloc_objects`
|
||||
|Display the number of allocated objects. This includes
|
||||
the objects that have since been de-allocated. Use this
|
||||
if you want to find the main allocation sites in the
|
||||
program.
|
||||
|===
|
||||
|
||||
=== Interactive mode
|
||||
|
||||
By default -- if you don't specify any flags to the contrary --
|
||||
pprof runs in interactive mode. At the `(pprof)` prompt,
|
||||
you can run many of the commands described above. You can type
|
||||
`help` for a list of what commands are available in
|
||||
interactive mode.
|
||||
|
||||
== Caveats
|
||||
|
||||
* Heap profiling requires the use of libtcmalloc. This
|
||||
requirement may be removed in a future version of the heap
|
||||
profiler, and the heap profiler separated out into its own
|
||||
library.
|
||||
|
||||
* If the program linked in a library that was not compiled
|
||||
with enough symbolic information, all samples associated
|
||||
with the library may be charged to the last symbol found
|
||||
in the program before the library. This will artificially
|
||||
inflate the count for that symbol.
|
||||
|
||||
* If you run the program on one machine, and profile it on
|
||||
another, and the shared libraries are different on the two
|
||||
machines, the profiling output may be confusing: samples that
|
||||
fall within the shared libraries may be assigned to arbitrary
|
||||
procedures.
|
||||
|
||||
* Several libraries, such as some STL implementations, do their
|
||||
own memory management. This may cause strange profiling
|
||||
results. We have code in libtcmalloc to cause STL to use
|
||||
tcmalloc for memory management (which in our tests is better
|
||||
than STL's internal management), though it only works for some
|
||||
STL implementations.
|
||||
|
||||
* If your program forks, the children will also be profiled
|
||||
(since they inherit the same HEAPPROFILE setting). Each
|
||||
process is profiled separately; to distinguish the child
|
||||
profiles from the parent profile and from each other, all
|
||||
children will have their process-id attached to the HEAPPROFILE
|
||||
name.
|
||||
|
||||
* Due to a hack we make to work around a possible gcc bug, your
|
||||
profiles may end up named strangely if the first character of
|
||||
your HEAPPROFILE variable has ascii value greater than 127.
|
||||
This should be exceedingly rare, but if you need to use such a
|
||||
name, just prepend `./` to your filename:
|
||||
`HEAPPROFILE=./Ägypten`.
|
@ -1,391 +0,0 @@
|
||||
<!DOCTYPE HTML PUBLIC "-//IETF//DTD HTML//EN">
|
||||
<HTML>
|
||||
|
||||
<HEAD>
|
||||
<link rel="stylesheet" href="designstyle.css">
|
||||
<title>Gperftools Heap Profiler</title>
|
||||
</HEAD>
|
||||
|
||||
<BODY>
|
||||
|
||||
<p align=right>
|
||||
<i>Last modified
|
||||
<script type=text/javascript>
|
||||
var lm = new Date(document.lastModified);
|
||||
document.write(lm.toDateString());
|
||||
</script></i>
|
||||
</p>
|
||||
|
||||
<p>This is the heap profiler we use at Google, to explore how C++
|
||||
programs manage memory. This facility can be useful for</p>
|
||||
<ul>
|
||||
<li> Figuring out what is in the program heap at any given time
|
||||
<li> Locating memory leaks
|
||||
<li> Finding places that do a lot of allocation
|
||||
</ul>
|
||||
|
||||
<p>The profiling system instruments all allocations and frees. It
|
||||
keeps track of various pieces of information per allocation site. An
|
||||
allocation site is defined as the active stack trace at the call to
|
||||
<code>malloc</code>, <code>calloc</code>, <code>realloc</code>, or,
|
||||
<code>new</code>.</p>
|
||||
|
||||
<p>There are three parts to using it: linking the library into an
|
||||
application, running the code, and analyzing the output.</p>
|
||||
|
||||
|
||||
<h1>Linking in the Library</h1>
|
||||
|
||||
<p>To install the heap profiler into your executable, add
|
||||
<code>-ltcmalloc</code> to the link-time step for your executable.
|
||||
Also, while we don't necessarily recommend this form of usage, it's
|
||||
possible to add in the profiler at run-time using
|
||||
<code>LD_PRELOAD</code>:
|
||||
<pre>% env LD_PRELOAD="/usr/lib/libtcmalloc.so" <binary></pre>
|
||||
|
||||
<p>This does <i>not</i> turn on heap profiling; it just inserts the
|
||||
code. For that reason, it's practical to just always link
|
||||
<code>-ltcmalloc</code> into a binary while developing; that's what we
|
||||
do at Google. (However, since any user can turn on the profiler by
|
||||
setting an environment variable, it's not necessarily recommended to
|
||||
install profiler-linked binaries into a production, running
|
||||
system.) Note that if you wish to use the heap profiler, you must
|
||||
also use the tcmalloc memory-allocation library. There is no way
|
||||
currently to use the heap profiler separate from tcmalloc.</p>
|
||||
|
||||
|
||||
<h1>Running the Code</h1>
|
||||
|
||||
<p>There are several alternatives to actually turn on heap profiling
|
||||
for a given run of an executable:</p>
|
||||
|
||||
<ol>
|
||||
<li> <p>Define the environment variable HEAPPROFILE to the filename
|
||||
to dump the profile to. For instance, to profile
|
||||
<code>/usr/local/bin/my_binary_compiled_with_tcmalloc</code>:</p>
|
||||
<pre>% env HEAPPROFILE=/tmp/mybin.hprof /usr/local/bin/my_binary_compiled_with_tcmalloc</pre>
|
||||
<li> <p>In your code, bracket the code you want profiled in calls to
|
||||
<code>HeapProfilerStart()</code> and <code>HeapProfilerStop()</code>.
|
||||
(These functions are declared in <code><gperftools/heap-profiler.h></code>.)
|
||||
<code>HeapProfilerStart()</code> will take the
|
||||
profile-filename-prefix as an argument. Then, as often as
|
||||
you'd like before calling <code>HeapProfilerStop()</code>, you
|
||||
can use <code>HeapProfilerDump()</code> or
|
||||
<code>GetHeapProfile()</code> to examine the profile. In case
|
||||
it's useful, <code>IsHeapProfilerRunning()</code> will tell you
|
||||
whether you've already called HeapProfilerStart() or not.</p>
|
||||
</ol>
|
||||
|
||||
|
||||
<p>For security reasons, heap profiling will not write to a file --
|
||||
and is thus not usable -- for setuid programs.</p>
|
||||
|
||||
<H2>Modifying Runtime Behavior</H2>
|
||||
|
||||
<p>You can more finely control the behavior of the heap profiler via
|
||||
environment variables.</p>
|
||||
|
||||
<table frame=box rules=sides cellpadding=5 width=100%>
|
||||
|
||||
<tr valign=top>
|
||||
<td><code>HEAP_PROFILE_ALLOCATION_INTERVAL</code></td>
|
||||
<td>default: 1073741824 (1 Gb)</td>
|
||||
<td>
|
||||
Dump heap profiling information each time the specified number of
|
||||
bytes has been allocated by the program.
|
||||
</td>
|
||||
</tr>
|
||||
|
||||
<tr valign=top>
|
||||
<td><code>HEAP_PROFILE_INUSE_INTERVAL</code></td>
|
||||
<td>default: 104857600 (100 Mb)</td>
|
||||
<td>
|
||||
Dump heap profiling information whenever the high-water memory
|
||||
usage mark increases by the specified number of bytes.
|
||||
</td>
|
||||
</tr>
|
||||
|
||||
<tr valign=top>
|
||||
<td><code>HEAP_PROFILE_TIME_INTERVAL</code></td>
|
||||
<td>default: 0</td>
|
||||
<td>
|
||||
Dump heap profiling information each time the specified
|
||||
number of seconds has elapsed.
|
||||
</td>
|
||||
</tr>
|
||||
|
||||
<tr valign=top>
|
||||
<td><code>HEAPPROFILESIGNAL</code></td>
|
||||
<td>default: disabled</td>
|
||||
<td>
|
||||
Dump heap profiling information whenever the specified signal is sent to the
|
||||
process.
|
||||
</td>
|
||||
</tr>
|
||||
|
||||
<tr valign=top>
|
||||
<td><code>HEAP_PROFILE_MMAP</code></td>
|
||||
<td>default: false</td>
|
||||
<td>
|
||||
Profile <code>mmap</code>, <code>mremap</code> and <code>sbrk</code>
|
||||
calls in addition
|
||||
to <code>malloc</code>, <code>calloc</code>, <code>realloc</code>,
|
||||
and <code>new</code>. <b>NOTE:</b> this causes the profiler to
|
||||
profile calls internal to tcmalloc, since tcmalloc and friends use
|
||||
mmap and sbrk internally for allocations. One partial solution is
|
||||
to filter these allocations out when running <code>pprof</code>,
|
||||
with something like
|
||||
<code>pprof --ignore='DoAllocWithArena|SbrkSysAllocator::Alloc|MmapSysAllocator::Alloc</code>.
|
||||
</td>
|
||||
</tr>
|
||||
|
||||
<tr valign=top>
|
||||
<td><code>HEAP_PROFILE_ONLY_MMAP</code></td>
|
||||
<td>default: false</td>
|
||||
<td>
|
||||
Only profile <code>mmap</code>, <code>mremap</code>, and <code>sbrk</code>
|
||||
calls; do not profile
|
||||
<code>malloc</code>, <code>calloc</code>, <code>realloc</code>,
|
||||
or <code>new</code>.
|
||||
</td>
|
||||
</tr>
|
||||
|
||||
<tr valign=top>
|
||||
<td><code>HEAP_PROFILE_MMAP_LOG</code></td>
|
||||
<td>default: false</td>
|
||||
<td>
|
||||
Log <code>mmap</code>/<code>munmap</code> calls.
|
||||
</td>
|
||||
</tr>
|
||||
|
||||
</table>
|
||||
|
||||
<H2>Checking for Leaks</H2>
|
||||
|
||||
<p>You can use the heap profiler to manually check for leaks, for
|
||||
instance by reading the profiler output and looking for large
|
||||
allocations. However, for that task, it's easier to use the <A
|
||||
HREF="heap_checker.html">automatic heap-checking facility</A> built
|
||||
into tcmalloc.</p>
|
||||
|
||||
|
||||
<h1><a name="pprof">Analyzing the Output</a></h1>
|
||||
|
||||
<p>If heap-profiling is turned on in a program, the program will
|
||||
periodically write profiles to the filesystem. The sequence of
|
||||
profiles will be named:</p>
|
||||
<pre>
|
||||
<prefix>.0000.heap
|
||||
<prefix>.0001.heap
|
||||
<prefix>.0002.heap
|
||||
...
|
||||
</pre>
|
||||
<p>where <code><prefix></code> is the filename-prefix supplied
|
||||
when running the code (e.g. via the <code>HEAPPROFILE</code>
|
||||
environment variable). Note that if the supplied prefix
|
||||
does not start with a <code>/</code>, the profile files will be
|
||||
written to the program's working directory.</p>
|
||||
|
||||
<p>The profile output can be viewed by passing it to the
|
||||
<code>pprof</code> tool -- the same tool that's used to analyze <A
|
||||
HREF="cpuprofile.html">CPU profiles</A>.
|
||||
|
||||
<p>Here are some examples. These examples assume the binary is named
|
||||
<code>gfs_master</code>, and a sequence of heap profile files can be
|
||||
found in files named:</p>
|
||||
<pre>
|
||||
/tmp/profile.0001.heap
|
||||
/tmp/profile.0002.heap
|
||||
...
|
||||
/tmp/profile.0100.heap
|
||||
</pre>
|
||||
|
||||
<h3>Why is a process so big</h3>
|
||||
|
||||
<pre>
|
||||
% pprof --gv gfs_master /tmp/profile.0100.heap
|
||||
</pre>
|
||||
|
||||
<p>This command will pop-up a <code>gv</code> window that displays
|
||||
the profile information as a directed graph. Here is a portion
|
||||
of the resulting output:</p>
|
||||
|
||||
<p><center>
|
||||
<img src="heap-example1.png">
|
||||
</center></p>
|
||||
|
||||
A few explanations:
|
||||
<ul>
|
||||
<li> <code>GFS_MasterChunk::AddServer</code> accounts for 255.6 MB
|
||||
of the live memory, which is 25% of the total live memory.
|
||||
<li> <code>GFS_MasterChunkTable::UpdateState</code> is directly
|
||||
accountable for 176.2 MB of the live memory (i.e., it directly
|
||||
allocated 176.2 MB that has not been freed yet). Furthermore,
|
||||
it and its callees are responsible for 729.9 MB. The
|
||||
labels on the outgoing edges give a good indication of the
|
||||
amount allocated by each callee.
|
||||
</ul>
|
||||
|
||||
<h3>Comparing Profiles</h3>
|
||||
|
||||
<p>You often want to skip allocations during the initialization phase
|
||||
of a program so you can find gradual memory leaks. One simple way to
|
||||
do this is to compare two profiles -- both collected after the program
|
||||
has been running for a while. Specify the name of the first profile
|
||||
using the <code>--base</code> option. For example:</p>
|
||||
<pre>
|
||||
% pprof --base=/tmp/profile.0004.heap gfs_master /tmp/profile.0100.heap
|
||||
</pre>
|
||||
|
||||
<p>The memory-usage in <code>/tmp/profile.0004.heap</code> will be
|
||||
subtracted from the memory-usage in
|
||||
<code>/tmp/profile.0100.heap</code> and the result will be
|
||||
displayed.</p>
|
||||
|
||||
<h3>Text display</h3>
|
||||
|
||||
<pre>
|
||||
% pprof --text gfs_master /tmp/profile.0100.heap
|
||||
255.6 24.7% 24.7% 255.6 24.7% GFS_MasterChunk::AddServer
|
||||
184.6 17.8% 42.5% 298.8 28.8% GFS_MasterChunkTable::Create
|
||||
176.2 17.0% 59.5% 729.9 70.5% GFS_MasterChunkTable::UpdateState
|
||||
169.8 16.4% 75.9% 169.8 16.4% PendingClone::PendingClone
|
||||
76.3 7.4% 83.3% 76.3 7.4% __default_alloc_template::_S_chunk_alloc
|
||||
49.5 4.8% 88.0% 49.5 4.8% hashtable::resize
|
||||
...
|
||||
</pre>
|
||||
|
||||
<p>
|
||||
<ul>
|
||||
<li> The first column contains the direct memory use in MB.
|
||||
<li> The fourth column contains memory use by the procedure
|
||||
and all of its callees.
|
||||
<li> The second and fifth columns are just percentage
|
||||
representations of the numbers in the first and fourth columns.
|
||||
<li> The third column is a cumulative sum of the second column
|
||||
(i.e., the <code>k</code>th entry in the third column is the
|
||||
sum of the first <code>k</code> entries in the second column.)
|
||||
</ul>
|
||||
|
||||
<h3>Ignoring or focusing on specific regions</h3>
|
||||
|
||||
<p>The following command will give a graphical display of a subset of
|
||||
the call-graph. Only paths in the call-graph that match the regular
|
||||
expression <code>DataBuffer</code> are included:</p>
|
||||
<pre>
|
||||
% pprof --gv --focus=DataBuffer gfs_master /tmp/profile.0100.heap
|
||||
</pre>
|
||||
|
||||
<p>Similarly, the following command will omit all paths subset of the
|
||||
call-graph. All paths in the call-graph that match the regular
|
||||
expression <code>DataBuffer</code> are discarded:</p>
|
||||
<pre>
|
||||
% pprof --gv --ignore=DataBuffer gfs_master /tmp/profile.0100.heap
|
||||
</pre>
|
||||
|
||||
<h3>Total allocations + object-level information</h3>
|
||||
|
||||
<p>All of the previous examples have displayed the amount of in-use
|
||||
space. I.e., the number of bytes that have been allocated but not
|
||||
freed. You can also get other types of information by supplying a
|
||||
flag to <code>pprof</code>:</p>
|
||||
|
||||
<center>
|
||||
<table frame=box rules=sides cellpadding=5 width=100%>
|
||||
|
||||
<tr valign=top>
|
||||
<td><code>--inuse_space</code></td>
|
||||
<td>
|
||||
Display the number of in-use megabytes (i.e. space that has
|
||||
been allocated but not freed). This is the default.
|
||||
</td>
|
||||
</tr>
|
||||
|
||||
<tr valign=top>
|
||||
<td><code>--inuse_objects</code></td>
|
||||
<td>
|
||||
Display the number of in-use objects (i.e. number of
|
||||
objects that have been allocated but not freed).
|
||||
</td>
|
||||
</tr>
|
||||
|
||||
<tr valign=top>
|
||||
<td><code>--alloc_space</code></td>
|
||||
<td>
|
||||
Display the number of allocated megabytes. This includes
|
||||
the space that has since been de-allocated. Use this
|
||||
if you want to find the main allocation sites in the
|
||||
program.
|
||||
</td>
|
||||
</tr>
|
||||
|
||||
<tr valign=top>
|
||||
<td><code>--alloc_objects</code></td>
|
||||
<td>
|
||||
Display the number of allocated objects. This includes
|
||||
the objects that have since been de-allocated. Use this
|
||||
if you want to find the main allocation sites in the
|
||||
program.
|
||||
</td>
|
||||
|
||||
</table>
|
||||
</center>
|
||||
|
||||
|
||||
<h3>Interactive mode</a></h3>
|
||||
|
||||
<p>By default -- if you don't specify any flags to the contrary --
|
||||
pprof runs in interactive mode. At the <code>(pprof)</code> prompt,
|
||||
you can run many of the commands described above. You can type
|
||||
<code>help</code> for a list of what commands are available in
|
||||
interactive mode.</p>
|
||||
|
||||
|
||||
<h1>Caveats</h1>
|
||||
|
||||
<ul>
|
||||
<li> Heap profiling requires the use of libtcmalloc. This
|
||||
requirement may be removed in a future version of the heap
|
||||
profiler, and the heap profiler separated out into its own
|
||||
library.
|
||||
|
||||
<li> If the program linked in a library that was not compiled
|
||||
with enough symbolic information, all samples associated
|
||||
with the library may be charged to the last symbol found
|
||||
in the program before the library. This will artificially
|
||||
inflate the count for that symbol.
|
||||
|
||||
<li> If you run the program on one machine, and profile it on
|
||||
another, and the shared libraries are different on the two
|
||||
machines, the profiling output may be confusing: samples that
|
||||
fall within the shared libaries may be assigned to arbitrary
|
||||
procedures.
|
||||
|
||||
<li> Several libraries, such as some STL implementations, do their
|
||||
own memory management. This may cause strange profiling
|
||||
results. We have code in libtcmalloc to cause STL to use
|
||||
tcmalloc for memory management (which in our tests is better
|
||||
than STL's internal management), though it only works for some
|
||||
STL implementations.
|
||||
|
||||
<li> If your program forks, the children will also be profiled
|
||||
(since they inherit the same HEAPPROFILE setting). Each
|
||||
process is profiled separately; to distinguish the child
|
||||
profiles from the parent profile and from each other, all
|
||||
children will have their process-id attached to the HEAPPROFILE
|
||||
name.
|
||||
|
||||
<li> Due to a hack we make to work around a possible gcc bug, your
|
||||
profiles may end up named strangely if the first character of
|
||||
your HEAPPROFILE variable has ascii value greater than 127.
|
||||
This should be exceedingly rare, but if you need to use such a
|
||||
name, just set prepend <code>./</code> to your filename:
|
||||
<code>HEAPPROFILE=./Ägypten</code>.
|
||||
</ul>
|
||||
|
||||
<hr>
|
||||
<address>Sanjay Ghemawat
|
||||
<!-- Created: Tue Dec 19 10:43:14 PST 2000 -->
|
||||
</address>
|
||||
</body>
|
||||
</html>
|
@ -7,14 +7,10 @@
|
||||
<BODY>
|
||||
<ul>
|
||||
<li> <A HREF="tcmalloc.html">thread-caching malloc</A>
|
||||
<li> <A HREF="heap_checker.html">heap-checking using tcmalloc</A>
|
||||
<li> <A HREF="heapprofile.html">heap-profiling using tcmalloc</A>
|
||||
<li> <A HREF="cpuprofile.html">CPU profiler</A>
|
||||
</ul>
|
||||
|
||||
<hr>
|
||||
Last modified: Thu Feb 2 14:40:47 PST 2012
|
||||
|
||||
</BODY>
|
||||
|
||||
</HTML>
|
||||
|
131
docs/pprof.1
@ -1,131 +0,0 @@
|
||||
.\" DO NOT MODIFY THIS FILE! It was generated by help2man 1.23.
|
||||
.TH PPROF "1" "February 2005" "pprof (part of gperftools)" Google
|
||||
.SH NAME
|
||||
pprof \- manual page for pprof (part of gperftools)
|
||||
.SH SYNOPSIS
|
||||
.B pprof
|
||||
[\fIoptions\fR] \fI<program> <profile>\fR
|
||||
.SH DESCRIPTION
|
||||
.IP
|
||||
Prints specified cpu- or heap-profile
|
||||
.SH OPTIONS
|
||||
.TP
|
||||
\fB\-\-cum\fR
|
||||
Sort by cumulative data
|
||||
.TP
|
||||
\fB\-\-base=\fR<base>
|
||||
Subtract <base> from <profile> before display
|
||||
.SS "Reporting Granularity:"
|
||||
.TP
|
||||
\fB\-\-addresses\fR
|
||||
Report at address level
|
||||
.TP
|
||||
\fB\-\-lines\fR
|
||||
Report at source line level
|
||||
.TP
|
||||
\fB\-\-functions\fR
|
||||
Report at function level [default]
|
||||
.TP
|
||||
\fB\-\-files\fR
|
||||
Report at source file level
|
||||
.SS "Output type:"
|
||||
.TP
|
||||
\fB\-\-text\fR
|
||||
Generate text report [default]
|
||||
.TP
|
||||
\fB\-\-gv\fR
|
||||
Generate Postscript and display
|
||||
.TP
|
||||
\fB\-\-list=\fR<regexp>
|
||||
Generate source listing of matching routines
|
||||
.TP
|
||||
\fB\-\-disasm=\fR<regexp>
|
||||
Generate disassembly of matching routines
|
||||
.TP
|
||||
\fB\-\-dot\fR
|
||||
Generate DOT file to stdout
|
||||
.TP
|
||||
\fB\-\-ps\fR
|
||||
Generate Postscript to stdout
|
||||
.TP
|
||||
\fB\-\-pdf\fR
|
||||
Generate PDF to stdout
|
||||
.TP
|
||||
\fB\-\-gif\fR
|
||||
Generate GIF to stdout
|
||||
.SS "Heap-Profile Options:"
|
||||
.TP
|
||||
\fB\-\-inuse_space\fR
|
||||
Display in-use (mega)bytes [default]
|
||||
.TP
|
||||
\fB\-\-inuse_objects\fR
|
||||
Display in-use objects
|
||||
.TP
|
||||
\fB\-\-alloc_space\fR
|
||||
Display allocated (mega)bytes
|
||||
.TP
|
||||
\fB\-\-alloc_objects\fR
|
||||
Display allocated objects
|
||||
.TP
|
||||
\fB\-\-show_bytes\fR
|
||||
Display space in bytes
|
||||
.TP
|
||||
\fB\-\-drop_negative\fR
|
||||
Ignore negaive differences
|
||||
.SS "Call-graph Options:"
|
||||
.TP
|
||||
\fB\-\-nodecount=\fR<n>
|
||||
Show at most so many nodes [default=80]
|
||||
.TP
|
||||
\fB\-\-nodefraction=\fR<f>
|
||||
Hide nodes below <f>*total [default=.005]
|
||||
.TP
|
||||
\fB\-\-edgefraction=\fR<f>
|
||||
Hide edges below <f>*total [default=.001]
|
||||
.TP
|
||||
\fB\-\-focus=\fR<regexp>
|
||||
Focus on nodes matching <regexp>
|
||||
.TP
|
||||
\fB\-\-ignore=\fR<regexp>
|
||||
Ignore nodes matching <regexp>
|
||||
.TP
|
||||
\fB\-\-scale=\fR<n>
|
||||
Set GV scaling [default=0]
|
||||
.SH EXAMPLES
|
||||
|
||||
pprof /bin/ls ls.prof
|
||||
.IP
|
||||
Outputs one line per procedure
|
||||
.PP
|
||||
pprof \fB\-\-gv\fR /bin/ls ls.prof
|
||||
.IP
|
||||
Displays annotated call-graph via 'gv'
|
||||
.PP
|
||||
pprof \fB\-\-gv\fR \fB\-\-focus\fR=\fIMutex\fR /bin/ls ls.prof
|
||||
.IP
|
||||
Restricts to code paths including a .*Mutex.* entry
|
||||
.PP
|
||||
pprof \fB\-\-gv\fR \fB\-\-focus\fR=\fIMutex\fR \fB\-\-ignore\fR=\fIstring\fR /bin/ls ls.prof
|
||||
.IP
|
||||
Code paths including Mutex but not string
|
||||
.PP
|
||||
pprof \fB\-\-list\fR=\fIgetdir\fR /bin/ls ls.prof
|
||||
.IP
|
||||
Dissassembly (with per-line annotations) for getdir()
|
||||
.PP
|
||||
pprof \fB\-\-disasm\fR=\fIgetdir\fR /bin/ls ls.prof
|
||||
.IP
|
||||
Dissassembly (with per-PC annotations) for getdir()
|
||||
.SH COPYRIGHT
|
||||
Copyright \(co 2005 Google Inc.
|
||||
.SH "SEE ALSO"
|
||||
Further documentation for
|
||||
.B pprof
|
||||
is maintained as a web page called
|
||||
.B cpu_profiler.html
|
||||
and is likely installed at one of the following locations:
|
||||
.IP
|
||||
.B /usr/share/gperftools/cpu_profiler.html
|
||||
.br
|
||||
.B /usr/local/share/gperftools/cpu_profiler.html
|
||||
.PP
|
@ -1,11 +0,0 @@
|
||||
[see also]
|
||||
Further documentation for
|
||||
.B pprof
|
||||
is maintained as a web page called
|
||||
.B cpu_profiler.html
|
||||
and is likely installed at one of the following locations:
|
||||
.IP
|
||||
.B /usr/share/gperftools/cpu_profiler.html
|
||||
.br
|
||||
.B /usr/local/share/gperftools/cpu_profiler.html
|
||||
.PP
|
119
docs/pprof_integration.adoc
Normal file
@ -0,0 +1,119 @@
|
||||
== pprof integration
|
||||
|
||||
:reproducible:
|
||||
|
||||
gperftools was the original home for the pprof program. This program
|
||||
is used to visualize and analyze profiles (CPU profiles, heap
|
||||
profiles, heap samples, set of thread stacks, etc.). The original
|
||||
pprof was written in Perl. As of this writing, the Linux distros are
|
||||
shipping this version of pprof. Meanwhile, pprof was completely
|
||||
modernized and rewritten in Go. The Go version is a much better
|
||||
one. We've been recommending people to switch to the Go version for a
|
||||
number of years and starting gperftools 2.17 we no longer have the
|
||||
original pprof.
|
||||
|
||||
You can get the Go pprof binary by running:
|
||||
|
||||
% go install github.com/google/pprof@latest
|
||||
|
||||
The binary will normally appear in `$HOME/go/bin`. So you may want to
|
||||
add it to your `$PATH`.
|
||||
|
||||
The main documentation of pprof can be found at
|
||||
https://github.com/google/pprof/blob/main/doc/README.md
|
||||
|
||||
On this page, I'll point out some helpful integration aspects.
|
||||
|
||||
Here are the kinds of "profiles" that gperftools can feed into pprof.
|
||||
|
||||
=== CPU profiling
|
||||
|
||||
CPU profiler is provided in a distinct library: libprofiler. Its C++
|
||||
API is in `gperftools/profiler.h`. You can invoke
|
||||
`ProfilerStart()`/`ProfilerStop()` to control it. Or you can have
|
||||
libprofiler automagically profile the full run of your program by setting
|
||||
`CPUPROFILE` environment variable.
|
||||
|
||||
See link:cpuprofile.html[documentation of CPU profiler] for full
|
||||
details.
|
||||
|
||||
A general description of how statistical sampling profilers work can be
|
||||
found in this nice blog post: https://research.swtch.com/pprof.
|
||||
|
||||
We produce a "legacy" CPU profile format. The format is described
|
||||
here: link:cpuprofile-fileformat.html[].
|
||||
|
||||
=== Heap sample
|
||||
|
||||
libtcmalloc supports very low overhead sampling of allocations. If this feature is enabled, you can call:
|
||||
|
||||
std::string sample_profile;
|
||||
MallocExtension::instance()->GetHeapSample(&sample_profile);
|
||||
|
||||
And you'll get a statistical estimate of all currently in-use memory
|
||||
allocations with backtraces of allocations. It will show you where
|
||||
currently in-use memory was allocated. Heap sample can be saved and
|
||||
fed to the pprof program for visualization and analysis.
|
||||
|
||||
At Google, this feature is enabled fleet-wide (and by default), but in
|
||||
gperftools, our default is off. You can turn it on by setting the
|
||||
environment variable `TCMALLOC_SAMPLE_PARAMETER`. However, please note
|
||||
that libtcmalloc_minimal doesn't have this feature. In order to use
|
||||
heap sampling, you need to link to "full" libtcmalloc.
|
||||
|
||||
The reasonable value of the sample parameter is from 524288 (512kb;
|
||||
original default) to a few megs (current default at Google). A lower
|
||||
value gives you more samples, so higher statistical precision. But a
|
||||
lower value also causes higher overhead and lock contention.
|
||||
|
||||
Our sibling project, "abseil" tcmalloc, also supports heap
|
||||
sampling. Implementation has evolved a bit, but this is fundamentally
|
||||
the same logic. In addition to sampling, they also have allocation and
|
||||
deallocation profiling powered by the same sampling facility. Their
|
||||
docs are at:
|
||||
https://github.com/google/tcmalloc/blob/master/docs/sampling.md.
|
||||
|
||||
Go has similar feature called heap profiling. Go's heap profiles
|
||||
combine information about in-use memory and all the allocations ever
|
||||
made. It is similar to gperftools' link:heapprofile.html[heap profiler] but works
|
||||
via sampling, so it is low overhead and runs by default. You can read
|
||||
about it here: https://pkg.go.dev/runtime/pprof. Approximately every
|
||||
512k bytes (value of runtime.MemProfileRate) of memory allocated, Go's
|
||||
runtime triggers heap sampling. Heap sampling grabs backtrace, and
|
||||
then updates per-call-site allocation counters. The heap profile is a
|
||||
collection of call sites (identified by the backtrace chain) and
|
||||
relevant statistics.
|
||||
|
||||
=== Heap Growth stacks
|
||||
|
||||
Every time tcmalloc extends its heap, it grabs stack trace. A
|
||||
collection of those stacks can be obtained by:
|
||||
|
||||
MallocExtension::instance()->GetHeapGrowthStacks(&string);
|
||||
|
||||
and fed to pprof for visualization and analysis. This kind of profile
|
||||
gives you locations in your code that extended heap (either due to
|
||||
regular usage, leaks, or fragmentation).
|
||||
|
||||
Heap growth tracking is always enabled in full libtcmalloc and is cut
|
||||
off from libtcmalloc_minimal.
|
||||
|
||||
=== Heap Profiler
|
||||
|
||||
See link:heapprofile.html[Heap Profiler documentation]. Note that the
|
||||
heap profiler intercepts every allocation and deallocation call, so it
|
||||
runs with a much higher overhead than normal malloc and is not
|
||||
suitable for production.
|
||||
|
||||
=== HTTP interfaces
|
||||
|
||||
The more commonly used pprof integration point used at Google is via
|
||||
HTTP endpoints. Go standard library provides a great example of how it
|
||||
is done and how to use it. https://pkg.go.dev/net/http/pprof documents
|
||||
it.
|
||||
|
||||
gperftools doesn't provide any HTTP handlers, but we do give you raw
|
||||
profiling data, which you can serve by whatever HTTP-serving APIs you
|
||||
like. Each profile kind (with the partial exception of heap profiler)
|
||||
has an API to obtain profile data, which can be returned from an HTTP
|
||||
handler.
|
@ -1,260 +0,0 @@
|
||||
<HTML>
|
||||
|
||||
<HEAD>
|
||||
<title>pprof and Remote Servers</title>
|
||||
</HEAD>
|
||||
|
||||
<BODY>
|
||||
|
||||
<h1><code>pprof</code> and Remote Servers</h1>
|
||||
|
||||
<p>In mid-2006, we added an experimental facility to <A
|
||||
HREF="cpu_profiler.html">pprof</A>, the tool that analyzes CPU and
|
||||
heap profiles. This facility allows you to collect profile
|
||||
information from running applications. It makes it easy to collect
|
||||
profile information without having to stop the program first, and
|
||||
without having to log into the machine where the application is
|
||||
running. This is meant to be used on webservers, but will work on any
|
||||
application that can be modified to accept TCP connections on a port
|
||||
of its choosing, and to respond to HTTP requests on that port.</p>
|
||||
|
||||
<p>We do not currently have infrastructure, such as apache modules,
|
||||
that you can pop into a webserver or other application to get the
|
||||
necessary functionality "for free." However, it's easy to generate
|
||||
the necessary data, which should allow the interested developer to add
|
||||
the necessary support into his or her applications.</p>
|
||||
|
||||
<p>To use <code>pprof</code> in this experimental "server" mode, you
|
||||
give the script a host and port it should query, replacing the normal
|
||||
commandline arguments of application + profile file:</p>
|
||||
<pre>
|
||||
% pprof internalweb.mycompany.com:80
|
||||
</pre>
|
||||
|
||||
<p>The host must be listening on that port, and be able to accept HTTP/1.0
|
||||
requests -- sent via <code>wget</code> and <code>curl</code> -- for
|
||||
several urls. The following sections list the urls that
|
||||
<code>pprof</code> can send, and the responses it expects in
|
||||
return.</p>
|
||||
|
||||
<p>Here are examples that pprof will recognize, when you give them
|
||||
on the commandline, are urls. In general, you
|
||||
specify the host and a port (the port-number is required), and put
|
||||
the service-name at the end of the url.:</p>
|
||||
<blockquote><pre>
|
||||
http://myhost:80/pprof/heap # retrieves a heap profile
|
||||
http://myhost:8008/pprof/profile # retrieves a CPU profile
|
||||
http://myhost:80 # retrieves a CPU profile (the default)
|
||||
http://myhost:8080/ # retrieves a CPU profile (the default)
|
||||
myhost:8088/pprof/growth # "http://" is optional, but port is not
|
||||
http://myhost:80/myservice/pprof/heap # /pprof/heap just has to come at the end
|
||||
http://myhost:80/pprof/pmuprofile # CPU profile using performance counters
|
||||
</pre></blockquote>
|
||||
|
||||
<h2> <code><b>/pprof/heap</b></code> </h2>
|
||||
|
||||
<p><code>pprof</code> asks for the url <code>/pprof/heap</code> to
|
||||
get heap information. The actual url is controlled via the variable
|
||||
<code>HEAP_PAGE</code> in the <code>pprof</code> script, so you
|
||||
can change it if you'd like.</p>
|
||||
|
||||
<p>There are two ways to get this data. The first is to call</p>
|
||||
<pre>
|
||||
MallocExtension::instance()->GetHeapSample(&output);
|
||||
</pre>
|
||||
<p>and have the server send <code>output</code> back as an HTTP
|
||||
response to <code>pprof</code>. <code>MallocExtension</code> is
|
||||
defined in the header file <code>gperftools/malloc_extension.h</code>.</p>
|
||||
|
||||
<p>Note this will only only work if the binary is being run with
|
||||
sampling turned on (which is not the default). To do this, set the
|
||||
environment variable <code>TCMALLOC_SAMPLE_PARAMETER</code> to a
|
||||
positive value, such as 524288, before running.</p>
|
||||
|
||||
<p>The other way is to call <code>HeapProfileStart(filename)</code>
|
||||
(from <code>heap-profiler.h</code>), continue to do work, and then,
|
||||
some number of seconds later, call <code>GetHeapProfile()</code>
|
||||
(followed by <code>HeapProfilerStop()</code>). The server can send
|
||||
the output of <code>GetHeapProfile</code> back as the HTTP response to
|
||||
pprof. (Note you must <code>free()</code> this data after using it.)
|
||||
This is similar to how <A HREF="#profile">profile requests</A> are
|
||||
handled, below. This technique does not require the application to
|
||||
run with sampling turned on.</p>
|
||||
|
||||
<p>Here's an example of what the output should look like:</p>
|
||||
<pre>
|
||||
heap profile: 1923: 127923432 [ 1923: 127923432] @ heap_v2/524288
|
||||
1: 312 [ 1: 312] @ 0x2aaaabaf5ccc 0x2aaaaba4cd2c 0x2aaaac08c09a
|
||||
928: 122586016 [ 928: 122586016] @ 0x2aaaabaf682c 0x400680 0x400bdd 0x2aaaab1c368a 0x2aaaab1c8f77 0x2aaaab1c0396 0x2aaaab1c86ed 0x4007ff 0x2aaaaca62afa
|
||||
1: 16 [ 1: 16] @ 0x2aaaabaf5ccc 0x2aaaabb04bac 0x2aaaabc1b262 0x2aaaabc21496 0x2aaaabc214bb
|
||||
[...]
|
||||
</pre>
|
||||
|
||||
|
||||
<p> Older code may produce "version 1" heap profiles which look like this:<p/>
|
||||
<pre>
|
||||
heap profile: 14933: 791700132 [ 14933: 791700132] @ heap
|
||||
1: 848688 [ 1: 848688] @ 0xa4b142 0x7f5bfc 0x87065e 0x4056e9 0x4125f8 0x42b4f1 0x45b1ba 0x463248 0x460871 0x45cb7c 0x5f1744 0x607cee 0x5f4a5e 0x40080f 0x2aaaabad7afa
|
||||
1: 1048576 [ 1: 1048576] @ 0xa4a9b2 0x7fd025 0x4ca6d8 0x4ca814 0x4caa88 0x2aaaab104cf0 0x404e20 0x4125f8 0x42b4f1 0x45b1ba 0x463248 0x460871 0x45cb7c 0x5f1744 0x607cee 0x5f4a5e 0x40080f 0x2aaaabad7afa
|
||||
2942: 388629374 [ 2942: 388629374] @ 0xa4b142 0x4006a0 0x400bed 0x5f0cfa 0x5f1744 0x607cee 0x5f4a5e 0x40080f 0x2aaaabad7afa
|
||||
[...]
|
||||
</pre>
|
||||
<p>pprof accepts both old and new heap profiles and automatically
|
||||
detects which one you are using.</p>
|
||||
|
||||
<h2> <code><b>/pprof/growth</b></code> </h2>
|
||||
|
||||
<p><code>pprof</code> asks for the url <code>/pprof/growth</code> to
|
||||
get heap-profiling delta (growth) information. The actual url is
|
||||
controlled via the variable <code>GROWTH_PAGE</code> in the
|
||||
<code>pprof</code> script, so you can change it if you'd like.</p>
|
||||
|
||||
<p>The server should respond by calling</p>
|
||||
<pre>
|
||||
MallocExtension::instance()->GetHeapGrowthStacks(&output);
|
||||
</pre>
|
||||
<p>and sending <code>output</code> back as an HTTP response to
|
||||
<code>pprof</code>. <code>MallocExtension</code> is defined in the
|
||||
header file <code>gperftools/malloc_extension.h</code>.</p>
|
||||
|
||||
<p>Here's an example, from an actual Google webserver, of what the
|
||||
output should look like:</p>
|
||||
<pre>
|
||||
heap profile: 741: 812122112 [ 741: 812122112] @ growth
|
||||
1: 1572864 [ 1: 1572864] @ 0x87da564 0x87db8a3 0x84787a4 0x846e851 0x836d12f 0x834cd1c 0x8349ba5 0x10a3177 0x8349961
|
||||
1: 1048576 [ 1: 1048576] @ 0x87d92e8 0x87d9213 0x87d9178 0x87d94d3 0x87da9da 0x8a364ff 0x8a437e7 0x8ab7d23 0x8ab7da9 0x8ac7454 0x8348465 0x10a3161 0x8349961
|
||||
[...]
|
||||
</pre>
|
||||
|
||||
|
||||
<h2> <A NAME="profile"><code><b>/pprof/profile</b></code></A> </h2>
|
||||
|
||||
<p><code>pprof</code> asks for the url
|
||||
<code>/pprof/profile?seconds=XX</code> to get cpu-profiling
|
||||
information. The actual url is controlled via the variable
|
||||
<code>PROFILE_PAGE</code> in the <code>pprof</code> script, so you can
|
||||
change it if you'd like.</p>
|
||||
|
||||
<p>The server should respond by calling
|
||||
<code>ProfilerStart(filename)</code>, continuing to do its work, and
|
||||
then, XX seconds later, calling <code>ProfilerStop()</code>. (These
|
||||
functions are declared in <code>gperftools/profiler.h</code>.) The
|
||||
application is responsible for picking a unique filename for
|
||||
<code>ProfilerStart()</code>. After calling
|
||||
<code>ProfilerStop()</code>, the server should read the contents of
|
||||
<code>filename</code> and send them back as an HTTP response to
|
||||
<code>pprof</code>.</p>
|
||||
|
||||
<p>Obviously, to get useful profile information the application must
|
||||
continue to run in the XX seconds that the profiler is running. Thus,
|
||||
the profile start-stop calls should be done in a separate thread, or
|
||||
be otherwise non-blocking.</p>
|
||||
|
||||
<p>The profiler output file is binary, but near the end of it, it
|
||||
should have lines of text somewhat like this:</p>
|
||||
<pre>
|
||||
01016000-01017000 rw-p 00015000 03:01 59314 /lib/ld-2.2.2.so
|
||||
</pre>
|
||||
|
||||
<h2> <code><b>/pprof/pmuprofile</b></code> </h2>
|
||||
|
||||
<code>pprof</code> asks for a url of the form
|
||||
<code>/pprof/pmuprofile?event=hw_event:unit_mask&period=nnn&seconds=xxx</code>
|
||||
to get cpu-profiling information. The actual url is controlled via the variable
|
||||
<code>PMUPROFILE_PAGE</code> in the <code>pprof</code> script, so you can
|
||||
change it if you'd like.</p>
|
||||
|
||||
<p>
|
||||
This is similar to pprof, but is meant to be used with your CPU's hardware
|
||||
performance counters. The server could be implemented on top of a library
|
||||
such as <a href="http://perfmon2.sourceforge.net/">
|
||||
<code>libpfm</code></a>. It should collect a sample every nnn occurrences
|
||||
of the event and stop the sampling after xxx seconds. Much of the code
|
||||
for <code>/pprof/profile</code> can be reused for this purpose.
|
||||
</p>
|
||||
|
||||
<p>The server side routines (the equivalent of
|
||||
ProfilerStart/ProfilerStart) are not available as part of perftools,
|
||||
so this URL is unlikely to be that useful.</p>
|
||||
|
||||
<h2> <code><b>/pprof/contention</b></code> </h2>
|
||||
|
||||
<p>This is intended to be able to profile (thread) lock contention in
|
||||
addition to CPU and memory use. It's not yet usable.</p>
|
||||
|
||||
|
||||
<h2> <code><b>/pprof/cmdline</b></code> </h2>
|
||||
|
||||
<p><code>pprof</code> asks for the url <code>/pprof/cmdline</code> to
|
||||
figure out what application it's profiling. The actual url is
|
||||
controlled via the variable <code>PROGRAM_NAME_PAGE</code> in the
|
||||
<code>pprof</code> script, so you can change it if you'd like.</p>
|
||||
|
||||
<p>The server should respond by reading the contents of
|
||||
<code>/proc/self/cmdline</code>, converting all internal NUL (\0)
|
||||
characters to newlines, and sending the result back as an HTTP
|
||||
response to <code>pprof</code>.</p>
|
||||
|
||||
<p>Here's an example return value:<p>
|
||||
<pre>
|
||||
/root/server/custom_webserver
|
||||
80
|
||||
--configfile=/root/server/ws.config
|
||||
</pre>
|
||||
|
||||
|
||||
<h2> <code><b>/pprof/symbol</b></code> </h2>
|
||||
|
||||
<p><code>pprof</code> asks for the url <code>/pprof/symbol</code> to
|
||||
map from hex addresses to variable names. The actual url is
|
||||
controlled via the variable <code>SYMBOL_PAGE</code> in the
|
||||
<code>pprof</code> script, so you can change it if you'd like.</p>
|
||||
|
||||
<p>When the server receives a GET request for
|
||||
<code>/pprof/symbol</code>, it should return a line formatted like
|
||||
so:</p>
|
||||
<pre>
|
||||
num_symbols: ###
|
||||
</pre>
|
||||
<p>where <code>###</code> is the number of symbols found in the
|
||||
binary. (For now, the only important distinction is whether the value
|
||||
is 0, which it is for executables that lack debug information, or
|
||||
not-0).</p>
|
||||
|
||||
<p>This is perhaps the hardest request to write code for, because in
|
||||
addition to the GET request for this url, the server must accept POST
|
||||
requests. This means that after the HTTP headers, pprof will pass in
|
||||
a list of hex addresses connected by <code>+</code>, like so:</p>
|
||||
<pre>
|
||||
curl -d '0x0824d061+0x0824d1cf' http://remote_host:80/pprof/symbol
|
||||
</pre>
|
||||
|
||||
<p>The server should read the POST data, which will be in one line,
|
||||
and for each hex value, should write one line of output to the output
|
||||
stream, like so:</p>
|
||||
<pre>
|
||||
<hex address><tab><function name>
|
||||
</pre>
|
||||
<p>For instance:</p>
|
||||
<pre>
|
||||
0x08b2dabd _Update
|
||||
</pre>
|
||||
|
||||
<p>The other reason this is the most difficult request to implement,
|
||||
is that the application will have to figure out for itself how to map
|
||||
from address to function name. One possibility is to run <code>nm -C
|
||||
-n <program name></code> to get the mappings at
|
||||
program-compile-time. Another, at least on Linux, is to call out to
|
||||
addr2line for every <code>pprof/symbol</code> call, for instance
|
||||
<code>addr2line -Cfse /proc/<getpid>/exe 0x12345678 0x876543210</code>
|
||||
(presumably with some caching!)</p>
|
||||
|
||||
<p><code>pprof</code> itself does just this for local profiles (not
|
||||
ones that talk to remote servers); look at the subroutine
|
||||
<code>GetProcedureBoundaries</code>.</p>
|
||||
|
||||
|
||||
<hr>
|
||||
Last modified: Mon Jun 12 21:30:14 PDT 2006
|
||||
</body>
|
||||
</html>
|
@ -1,480 +0,0 @@
|
||||
time.1.ptmalloc.64:0.56 user 0.02 system 0.57 elapsed 100% CPU
|
||||
time.1.tcmalloc.64:0.38 user 0.02 system 0.40 elapsed 98% CPU
|
||||
time.1.ptmalloc.128:0.61 user 0.01 system 0.61 elapsed 101% CPU
|
||||
time.1.tcmalloc.128:0.35 user 0.00 system 0.35 elapsed 99% CPU
|
||||
time.1.ptmalloc.256:0.59 user 0.01 system 0.60 elapsed 100% CPU
|
||||
time.1.tcmalloc.256:0.27 user 0.02 system 0.28 elapsed 102% CPU
|
||||
time.1.ptmalloc.512:0.57 user 0.00 system 0.57 elapsed 100% CPU
|
||||
time.1.tcmalloc.512:0.25 user 0.01 system 0.25 elapsed 101% CPU
|
||||
time.1.ptmalloc.1024:0.52 user 0.00 system 0.52 elapsed 99% CPU
|
||||
time.1.tcmalloc.1024:0.22 user 0.02 system 0.24 elapsed 97% CPU
|
||||
time.1.ptmalloc.2048:0.47 user 0.00 system 0.47 elapsed 99% CPU
|
||||
time.1.tcmalloc.2048:0.22 user 0.02 system 0.25 elapsed 95% CPU
|
||||
time.1.ptmalloc.4096:0.48 user 0.01 system 0.48 elapsed 100% CPU
|
||||
time.1.tcmalloc.4096:0.25 user 0.01 system 0.25 elapsed 100% CPU
|
||||
time.1.ptmalloc.8192:0.49 user 0.02 system 0.49 elapsed 102% CPU
|
||||
time.1.tcmalloc.8192:0.27 user 0.02 system 0.28 elapsed 101% CPU
|
||||
time.1.ptmalloc.16384:0.51 user 0.04 system 0.55 elapsed 99% CPU
|
||||
time.1.tcmalloc.16384:0.35 user 0.02 system 0.37 elapsed 100% CPU
|
||||
time.1.ptmalloc.32768:0.53 user 0.14 system 0.66 elapsed 100% CPU
|
||||
time.1.tcmalloc.32768:0.67 user 0.02 system 0.69 elapsed 99% CPU
|
||||
time.1.ptmalloc.65536:0.68 user 0.31 system 0.98 elapsed 100% CPU
|
||||
time.1.tcmalloc.65536:0.71 user 0.01 system 0.72 elapsed 99% CPU
|
||||
time.1.ptmalloc.131072:0.90 user 0.72 system 1.62 elapsed 99% CPU
|
||||
time.1.tcmalloc.131072:0.94 user 0.03 system 0.97 elapsed 99% CPU
|
||||
time.2.ptmalloc.64:1.05 user 0.00 system 0.53 elapsed 196% CPU
|
||||
time.2.tcmalloc.64:0.66 user 0.03 system 0.37 elapsed 185% CPU
|
||||
time.2.ptmalloc.128:1.77 user 0.01 system 0.89 elapsed 198% CPU
|
||||
time.2.tcmalloc.128:0.53 user 0.01 system 0.29 elapsed 184% CPU
|
||||
time.2.ptmalloc.256:1.14 user 0.01 system 0.62 elapsed 182% CPU
|
||||
time.2.tcmalloc.256:0.45 user 0.02 system 0.26 elapsed 180% CPU
|
||||
time.2.ptmalloc.512:1.26 user 0.40 system 1.79 elapsed 92% CPU
|
||||
time.2.tcmalloc.512:0.43 user 0.02 system 0.27 elapsed 166% CPU
|
||||
time.2.ptmalloc.1024:0.98 user 0.03 system 0.56 elapsed 179% CPU
|
||||
time.2.tcmalloc.1024:0.44 user 0.02 system 0.34 elapsed 134% CPU
|
||||
time.2.ptmalloc.2048:0.87 user 0.02 system 0.44 elapsed 199% CPU
|
||||
time.2.tcmalloc.2048:0.49 user 0.02 system 0.34 elapsed 148% CPU
|
||||
time.2.ptmalloc.4096:0.92 user 0.03 system 0.48 elapsed 196% CPU
|
||||
time.2.tcmalloc.4096:0.50 user 0.02 system 0.49 elapsed 105% CPU
|
||||
time.2.ptmalloc.8192:1.05 user 0.04 system 0.55 elapsed 196% CPU
|
||||
time.2.tcmalloc.8192:0.59 user 0.01 system 0.51 elapsed 116% CPU
|
||||
time.2.ptmalloc.16384:1.30 user 0.14 system 0.72 elapsed 198% CPU
|
||||
time.2.tcmalloc.16384:0.63 user 0.03 system 0.68 elapsed 96% CPU
|
||||
time.2.ptmalloc.32768:1.33 user 0.56 system 1.00 elapsed 189% CPU
|
||||
time.2.tcmalloc.32768:1.16 user 0.01 system 1.17 elapsed 99% CPU
|
||||
time.2.ptmalloc.65536:1.86 user 1.79 system 2.01 elapsed 181% CPU
|
||||
time.2.tcmalloc.65536:1.35 user 0.01 system 1.35 elapsed 100% CPU
|
||||
time.2.ptmalloc.131072:2.61 user 5.19 system 4.81 elapsed 162% CPU
|
||||
time.2.tcmalloc.131072:1.86 user 0.04 system 1.90 elapsed 100% CPU
|
||||
time.3.ptmalloc.64:1.79 user 0.03 system 0.67 elapsed 268% CPU
|
||||
time.3.tcmalloc.64:1.58 user 0.04 system 0.62 elapsed 260% CPU
|
||||
time.3.ptmalloc.128:2.77 user 1.34 system 3.07 elapsed 133% CPU
|
||||
time.3.tcmalloc.128:1.19 user 0.01 system 0.50 elapsed 236% CPU
|
||||
time.3.ptmalloc.256:2.14 user 0.02 system 0.85 elapsed 252% CPU
|
||||
time.3.tcmalloc.256:0.96 user 0.01 system 0.41 elapsed 236% CPU
|
||||
time.3.ptmalloc.512:3.37 user 1.31 system 3.33 elapsed 140% CPU
|
||||
time.3.tcmalloc.512:0.93 user 0.04 system 0.39 elapsed 243% CPU
|
||||
time.3.ptmalloc.1024:1.66 user 0.01 system 0.64 elapsed 260% CPU
|
||||
time.3.tcmalloc.1024:0.81 user 0.02 system 0.44 elapsed 187% CPU
|
||||
time.3.ptmalloc.2048:2.07 user 0.01 system 0.82 elapsed 252% CPU
|
||||
time.3.tcmalloc.2048:1.10 user 0.04 system 0.59 elapsed 191% CPU
|
||||
time.3.ptmalloc.4096:2.01 user 0.03 system 0.79 elapsed 258% CPU
|
||||
time.3.tcmalloc.4096:0.87 user 0.03 system 0.65 elapsed 137% CPU
|
||||
time.3.ptmalloc.8192:2.22 user 0.11 system 0.83 elapsed 280% CPU
|
||||
time.3.tcmalloc.8192:0.96 user 0.06 system 0.75 elapsed 135% CPU
|
||||
time.3.ptmalloc.16384:2.56 user 0.47 system 1.02 elapsed 295% CPU
|
||||
time.3.tcmalloc.16384:0.99 user 0.04 system 1.03 elapsed 99% CPU
|
||||
time.3.ptmalloc.32768:3.29 user 1.75 system 1.96 elapsed 256% CPU
|
||||
time.3.tcmalloc.32768:1.67 user 0.02 system 1.69 elapsed 99% CPU
|
||||
time.3.ptmalloc.65536:4.04 user 6.62 system 4.92 elapsed 216% CPU
|
||||
time.3.tcmalloc.65536:1.91 user 0.02 system 1.98 elapsed 97% CPU
|
||||
time.3.ptmalloc.131072:5.55 user 17.86 system 12.44 elapsed 188% CPU
|
||||
time.3.tcmalloc.131072:2.78 user 0.02 system 2.82 elapsed 99% CPU
|
||||
time.4.ptmalloc.64:3.42 user 1.36 system 3.20 elapsed 149% CPU
|
||||
time.4.tcmalloc.64:2.42 user 0.02 system 0.71 elapsed 341% CPU
|
||||
time.4.ptmalloc.128:3.98 user 1.79 system 3.89 elapsed 148% CPU
|
||||
time.4.tcmalloc.128:1.87 user 0.02 system 0.58 elapsed 325% CPU
|
||||
time.4.ptmalloc.256:4.06 user 2.14 system 4.12 elapsed 150% CPU
|
||||
time.4.tcmalloc.256:1.69 user 0.02 system 0.51 elapsed 331% CPU
|
||||
time.4.ptmalloc.512:4.48 user 2.15 system 4.39 elapsed 150% CPU
|
||||
time.4.tcmalloc.512:1.62 user 0.03 system 0.52 elapsed 314% CPU
|
||||
time.4.ptmalloc.1024:3.18 user 0.03 system 0.84 elapsed 381% CPU
|
||||
time.4.tcmalloc.1024:1.53 user 0.02 system 0.56 elapsed 274% CPU
|
||||
time.4.ptmalloc.2048:3.24 user 0.02 system 0.84 elapsed 384% CPU
|
||||
time.4.tcmalloc.2048:1.44 user 0.04 system 0.66 elapsed 221% CPU
|
||||
time.4.ptmalloc.4096:3.50 user 0.04 system 0.91 elapsed 389% CPU
|
||||
time.4.tcmalloc.4096:1.31 user 0.01 system 0.89 elapsed 148% CPU
|
||||
time.4.ptmalloc.8192:6.77 user 3.85 system 4.14 elapsed 256% CPU
|
||||
time.4.tcmalloc.8192:1.20 user 0.05 system 0.97 elapsed 127% CPU
|
||||
time.4.ptmalloc.16384:7.08 user 5.06 system 4.63 elapsed 262% CPU
|
||||
time.4.tcmalloc.16384:1.27 user 0.03 system 1.25 elapsed 103% CPU
|
||||
time.4.ptmalloc.32768:5.57 user 4.22 system 3.31 elapsed 295% CPU
|
||||
time.4.tcmalloc.32768:2.17 user 0.03 system 2.25 elapsed 97% CPU
|
||||
time.4.ptmalloc.65536:6.11 user 15.05 system 9.19 elapsed 230% CPU
|
||||
time.4.tcmalloc.65536:2.51 user 0.02 system 2.57 elapsed 98% CPU
|
||||
time.4.ptmalloc.131072:7.58 user 33.15 system 21.28 elapsed 191% CPU
|
||||
time.4.tcmalloc.131072:3.57 user 0.07 system 3.66 elapsed 99% CPU
|
||||
time.5.ptmalloc.64:4.44 user 2.08 system 4.37 elapsed 148% CPU
|
||||
time.5.tcmalloc.64:2.87 user 0.02 system 0.79 elapsed 361% CPU
|
||||
time.5.ptmalloc.128:4.77 user 2.77 system 5.14 elapsed 146% CPU
|
||||
time.5.tcmalloc.128:2.65 user 0.03 system 0.72 elapsed 367% CPU
|
||||
time.5.ptmalloc.256:5.82 user 2.88 system 5.49 elapsed 158% CPU
|
||||
time.5.tcmalloc.256:2.33 user 0.01 system 0.66 elapsed 352% CPU
|
||||
time.5.ptmalloc.512:6.27 user 3.11 system 5.34 elapsed 175% CPU
|
||||
time.5.tcmalloc.512:2.14 user 0.03 system 0.70 elapsed 307% CPU
|
||||
time.5.ptmalloc.1024:6.82 user 3.18 system 5.23 elapsed 191% CPU
|
||||
time.5.tcmalloc.1024:2.20 user 0.02 system 0.70 elapsed 313% CPU
|
||||
time.5.ptmalloc.2048:6.57 user 3.46 system 5.22 elapsed 192% CPU
|
||||
time.5.tcmalloc.2048:2.15 user 0.03 system 0.82 elapsed 264% CPU
|
||||
time.5.ptmalloc.4096:8.75 user 5.09 system 5.26 elapsed 263% CPU
|
||||
time.5.tcmalloc.4096:1.68 user 0.03 system 1.08 elapsed 158% CPU
|
||||
time.5.ptmalloc.8192:4.48 user 0.61 system 1.51 elapsed 335% CPU
|
||||
time.5.tcmalloc.8192:1.47 user 0.07 system 1.18 elapsed 129% CPU
|
||||
time.5.ptmalloc.16384:5.71 user 1.98 system 2.14 elapsed 358% CPU
|
||||
time.5.tcmalloc.16384:1.58 user 0.03 system 1.52 elapsed 105% CPU
|
||||
time.5.ptmalloc.32768:7.19 user 7.81 system 5.53 elapsed 270% CPU
|
||||
time.5.tcmalloc.32768:2.63 user 0.05 system 2.72 elapsed 98% CPU
|
||||
time.5.ptmalloc.65536:8.45 user 23.51 system 14.30 elapsed 223% CPU
|
||||
time.5.tcmalloc.65536:3.12 user 0.05 system 3.21 elapsed 98% CPU
|
||||
time.5.ptmalloc.131072:10.22 user 43.63 system 27.84 elapsed 193% CPU
|
||||
time.5.tcmalloc.131072:4.42 user 0.07 system 4.51 elapsed 99% CPU
|
||||
time.6.ptmalloc.64:5.57 user 2.56 system 5.08 elapsed 159% CPU
|
||||
time.6.tcmalloc.64:3.20 user 0.01 system 0.89 elapsed 360% CPU
|
||||
time.6.ptmalloc.128:5.98 user 3.52 system 5.71 elapsed 166% CPU
|
||||
time.6.tcmalloc.128:2.76 user 0.02 system 0.78 elapsed 355% CPU
|
||||
time.6.ptmalloc.256:4.61 user 0.02 system 1.19 elapsed 389% CPU
|
||||
time.6.tcmalloc.256:2.65 user 0.02 system 0.74 elapsed 356% CPU
|
||||
time.6.ptmalloc.512:8.28 user 3.88 system 6.61 elapsed 183% CPU
|
||||
time.6.tcmalloc.512:2.60 user 0.02 system 0.72 elapsed 362% CPU
|
||||
time.6.ptmalloc.1024:4.75 user 0.00 system 1.22 elapsed 387% CPU
|
||||
time.6.tcmalloc.1024:2.56 user 0.02 system 0.79 elapsed 325% CPU
|
||||
time.6.ptmalloc.2048:8.90 user 4.59 system 6.15 elapsed 219% CPU
|
||||
time.6.tcmalloc.2048:2.37 user 0.06 system 0.96 elapsed 250% CPU
|
||||
time.6.ptmalloc.4096:11.41 user 7.02 system 6.31 elapsed 291% CPU
|
||||
time.6.tcmalloc.4096:1.82 user 0.03 system 1.19 elapsed 154% CPU
|
||||
time.6.ptmalloc.8192:11.64 user 8.25 system 5.97 elapsed 332% CPU
|
||||
time.6.tcmalloc.8192:1.83 user 0.07 system 1.38 elapsed 136% CPU
|
||||
time.6.ptmalloc.16384:7.44 user 2.98 system 3.01 elapsed 345% CPU
|
||||
time.6.tcmalloc.16384:1.83 user 0.08 system 1.80 elapsed 105% CPU
|
||||
time.6.ptmalloc.32768:8.69 user 12.35 system 8.04 elapsed 261% CPU
|
||||
time.6.tcmalloc.32768:3.14 user 0.06 system 3.24 elapsed 98% CPU
|
||||
time.6.ptmalloc.65536:10.52 user 35.43 system 20.75 elapsed 221% CPU
|
||||
time.6.tcmalloc.65536:3.62 user 0.03 system 3.72 elapsed 98% CPU
|
||||
time.6.ptmalloc.131072:11.74 user 59.00 system 36.93 elapsed 191% CPU
|
||||
time.6.tcmalloc.131072:5.33 user 0.04 system 5.42 elapsed 98% CPU
|
||||
time.7.ptmalloc.64:6.60 user 3.45 system 6.01 elapsed 167% CPU
|
||||
time.7.tcmalloc.64:3.50 user 0.04 system 0.94 elapsed 376% CPU
|
||||
time.7.ptmalloc.128:7.09 user 4.25 system 6.69 elapsed 169% CPU
|
||||
time.7.tcmalloc.128:3.13 user 0.03 system 0.84 elapsed 374% CPU
|
||||
time.7.ptmalloc.256:9.28 user 4.85 system 7.20 elapsed 196% CPU
|
||||
time.7.tcmalloc.256:3.06 user 0.02 system 0.82 elapsed 375% CPU
|
||||
time.7.ptmalloc.512:9.13 user 4.78 system 6.79 elapsed 204% CPU
|
||||
time.7.tcmalloc.512:2.99 user 0.03 system 0.83 elapsed 359% CPU
|
||||
time.7.ptmalloc.1024:10.85 user 6.41 system 7.52 elapsed 229% CPU
|
||||
time.7.tcmalloc.1024:3.05 user 0.04 system 0.89 elapsed 345% CPU
|
||||
time.7.ptmalloc.2048:5.65 user 0.08 system 1.47 elapsed 388% CPU
|
||||
time.7.tcmalloc.2048:3.01 user 0.01 system 0.98 elapsed 306% CPU
|
||||
time.7.ptmalloc.4096:6.09 user 0.08 system 1.58 elapsed 389% CPU
|
||||
time.7.tcmalloc.4096:2.25 user 0.03 system 1.32 elapsed 171% CPU
|
||||
time.7.ptmalloc.8192:6.73 user 0.85 system 1.99 elapsed 379% CPU
|
||||
time.7.tcmalloc.8192:2.22 user 0.08 system 1.61 elapsed 142% CPU
|
||||
time.7.ptmalloc.16384:8.87 user 4.66 system 4.04 elapsed 334% CPU
|
||||
time.7.tcmalloc.16384:2.07 user 0.07 system 2.07 elapsed 103% CPU
|
||||
time.7.ptmalloc.32768:10.61 user 17.85 system 11.22 elapsed 253% CPU
|
||||
time.7.tcmalloc.32768:3.68 user 0.06 system 3.79 elapsed 98% CPU
|
||||
time.7.ptmalloc.65536:13.05 user 45.97 system 27.28 elapsed 216% CPU
|
||||
time.7.tcmalloc.65536:4.16 user 0.07 system 4.31 elapsed 98% CPU
|
||||
time.7.ptmalloc.131072:13.22 user 62.67 system 41.33 elapsed 183% CPU
|
||||
time.7.tcmalloc.131072:6.10 user 0.06 system 6.25 elapsed 98% CPU
|
||||
time.8.ptmalloc.64:7.31 user 3.92 system 6.39 elapsed 175% CPU
|
||||
time.8.tcmalloc.64:4.00 user 0.01 system 1.04 elapsed 383% CPU
|
||||
time.8.ptmalloc.128:9.40 user 5.41 system 7.67 elapsed 192% CPU
|
||||
time.8.tcmalloc.128:3.61 user 0.02 system 0.94 elapsed 386% CPU
|
||||
time.8.ptmalloc.256:10.61 user 6.35 system 7.96 elapsed 212% CPU
|
||||
time.8.tcmalloc.256:3.30 user 0.02 system 0.99 elapsed 335% CPU
|
||||
time.8.ptmalloc.512:12.42 user 7.10 system 8.79 elapsed 221% CPU
|
||||
time.8.tcmalloc.512:3.35 user 0.04 system 0.94 elapsed 358% CPU
|
||||
time.8.ptmalloc.1024:13.63 user 8.54 system 8.95 elapsed 247% CPU
|
||||
time.8.tcmalloc.1024:3.44 user 0.02 system 0.96 elapsed 359% CPU
|
||||
time.8.ptmalloc.2048:6.45 user 0.03 system 1.67 elapsed 386% CPU
|
||||
time.8.tcmalloc.2048:3.55 user 0.05 system 1.09 elapsed 328% CPU
|
||||
time.8.ptmalloc.4096:6.83 user 0.26 system 1.80 elapsed 393% CPU
|
||||
time.8.tcmalloc.4096:2.78 user 0.06 system 1.53 elapsed 185% CPU
|
||||
time.8.ptmalloc.8192:7.59 user 1.29 system 2.36 elapsed 376% CPU
|
||||
time.8.tcmalloc.8192:2.57 user 0.07 system 1.84 elapsed 142% CPU
|
||||
time.8.ptmalloc.16384:10.15 user 6.20 system 5.20 elapsed 314% CPU
|
||||
time.8.tcmalloc.16384:2.40 user 0.05 system 2.42 elapsed 101% CPU
|
||||
time.8.ptmalloc.32768:11.82 user 24.48 system 14.60 elapsed 248% CPU
|
||||
time.8.tcmalloc.32768:4.37 user 0.05 system 4.47 elapsed 98% CPU
|
||||
time.8.ptmalloc.65536:15.41 user 58.94 system 34.42 elapsed 215% CPU
|
||||
time.8.tcmalloc.65536:4.90 user 0.04 system 4.96 elapsed 99% CPU
|
||||
time.8.ptmalloc.131072:16.07 user 82.93 system 52.51 elapsed 188% CPU
|
||||
time.8.tcmalloc.131072:7.13 user 0.04 system 7.19 elapsed 99% CPU
|
||||
time.9.ptmalloc.64:8.44 user 4.59 system 6.92 elapsed 188% CPU
|
||||
time.9.tcmalloc.64:4.00 user 0.02 system 1.05 elapsed 382% CPU
|
||||
time.9.ptmalloc.128:10.92 user 6.14 system 8.31 elapsed 205% CPU
|
||||
time.9.tcmalloc.128:3.88 user 0.02 system 1.01 elapsed 382% CPU
|
||||
time.9.ptmalloc.256:13.01 user 7.75 system 9.12 elapsed 227% CPU
|
||||
time.9.tcmalloc.256:3.89 user 0.01 system 1.00 elapsed 386% CPU
|
||||
time.9.ptmalloc.512:14.96 user 8.89 system 9.73 elapsed 244% CPU
|
||||
time.9.tcmalloc.512:3.80 user 0.03 system 1.01 elapsed 377% CPU
|
||||
time.9.ptmalloc.1024:15.42 user 10.20 system 9.80 elapsed 261% CPU
|
||||
time.9.tcmalloc.1024:3.86 user 0.03 system 1.19 elapsed 325% CPU
|
||||
time.9.ptmalloc.2048:7.24 user 0.02 system 1.87 elapsed 388% CPU
|
||||
time.9.tcmalloc.2048:3.98 user 0.05 system 1.26 elapsed 319% CPU
|
||||
time.9.ptmalloc.4096:7.96 user 0.18 system 2.06 elapsed 394% CPU
|
||||
time.9.tcmalloc.4096:3.27 user 0.04 system 1.69 elapsed 195% CPU
|
||||
time.9.ptmalloc.8192:9.00 user 1.63 system 2.79 elapsed 380% CPU
|
||||
time.9.tcmalloc.8192:3.00 user 0.06 system 2.05 elapsed 148% CPU
|
||||
time.9.ptmalloc.16384:12.07 user 8.13 system 6.55 elapsed 308% CPU
|
||||
time.9.tcmalloc.16384:2.85 user 0.05 system 2.75 elapsed 105% CPU
|
||||
time.9.ptmalloc.32768:13.99 user 29.65 system 18.02 elapsed 242% CPU
|
||||
time.9.tcmalloc.32768:4.98 user 0.06 system 5.13 elapsed 98% CPU
|
||||
time.9.ptmalloc.65536:16.89 user 70.42 system 42.11 elapsed 207% CPU
|
||||
time.9.tcmalloc.65536:5.55 user 0.04 system 5.65 elapsed 98% CPU
|
||||
time.9.ptmalloc.131072:18.53 user 94.11 system 61.17 elapsed 184% CPU
|
||||
time.9.tcmalloc.131072:8.06 user 0.04 system 8.16 elapsed 99% CPU
|
||||
time.10.ptmalloc.64:9.81 user 5.70 system 7.42 elapsed 208% CPU
|
||||
time.10.tcmalloc.64:4.43 user 0.03 system 1.20 elapsed 370% CPU
|
||||
time.10.ptmalloc.128:12.69 user 7.81 system 9.02 elapsed 227% CPU
|
||||
time.10.tcmalloc.128:4.27 user 0.02 system 1.13 elapsed 378% CPU
|
||||
time.10.ptmalloc.256:15.04 user 9.53 system 9.92 elapsed 247% CPU
|
||||
time.10.tcmalloc.256:4.23 user 0.02 system 1.09 elapsed 388% CPU
|
||||
time.10.ptmalloc.512:17.30 user 10.46 system 10.61 elapsed 261% CPU
|
||||
time.10.tcmalloc.512:4.14 user 0.05 system 1.10 elapsed 379% CPU
|
||||
time.10.ptmalloc.1024:16.96 user 9.38 system 9.30 elapsed 283% CPU
|
||||
time.10.tcmalloc.1024:4.27 user 0.06 system 1.18 elapsed 366% CPU
|
||||
time.10.ptmalloc.2048:8.07 user 0.03 system 2.06 elapsed 393% CPU
|
||||
time.10.tcmalloc.2048:4.49 user 0.07 system 1.33 elapsed 342% CPU
|
||||
time.10.ptmalloc.4096:8.66 user 0.25 system 2.25 elapsed 394% CPU
|
||||
time.10.tcmalloc.4096:3.61 user 0.05 system 1.78 elapsed 205% CPU
|
||||
time.10.ptmalloc.8192:21.52 user 17.43 system 10.41 elapsed 374% CPU
|
||||
time.10.tcmalloc.8192:3.59 user 0.10 system 2.33 elapsed 158% CPU
|
||||
time.10.ptmalloc.16384:20.55 user 24.85 system 12.55 elapsed 361% CPU
|
||||
time.10.tcmalloc.16384:3.29 user 0.04 system 3.22 elapsed 103% CPU
|
||||
time.10.ptmalloc.32768:15.23 user 38.13 system 22.49 elapsed 237% CPU
|
||||
time.10.tcmalloc.32768:5.62 user 0.05 system 5.72 elapsed 99% CPU
|
||||
time.10.ptmalloc.65536:19.80 user 85.42 system 49.98 elapsed 210% CPU
|
||||
time.10.tcmalloc.65536:6.23 user 0.09 system 6.36 elapsed 99% CPU
|
||||
time.10.ptmalloc.131072:20.91 user 106.97 system 69.08 elapsed 185% CPU
|
||||
time.10.tcmalloc.131072:8.94 user 0.09 system 9.09 elapsed 99% CPU
|
||||
time.11.ptmalloc.64:10.82 user 6.34 system 7.92 elapsed 216% CPU
|
||||
time.11.tcmalloc.64:4.80 user 0.03 system 1.24 elapsed 387% CPU
|
||||
time.11.ptmalloc.128:14.58 user 8.61 system 9.81 elapsed 236% CPU
|
||||
time.11.tcmalloc.128:4.65 user 0.03 system 1.21 elapsed 384% CPU
|
||||
time.11.ptmalloc.256:17.38 user 10.98 system 10.75 elapsed 263% CPU
|
||||
time.11.tcmalloc.256:4.51 user 0.03 system 1.18 elapsed 384% CPU
|
||||
time.11.ptmalloc.512:19.18 user 11.71 system 10.95 elapsed 282% CPU
|
||||
time.11.tcmalloc.512:4.57 user 0.02 system 1.19 elapsed 384% CPU
|
||||
time.11.ptmalloc.1024:19.94 user 12.41 system 10.48 elapsed 308% CPU
|
||||
time.11.tcmalloc.1024:4.71 user 0.05 system 1.29 elapsed 367% CPU
|
||||
time.11.ptmalloc.2048:8.70 user 0.04 system 2.35 elapsed 371% CPU
|
||||
time.11.tcmalloc.2048:4.97 user 0.07 system 1.43 elapsed 350% CPU
|
||||
time.11.ptmalloc.4096:22.47 user 18.43 system 10.82 elapsed 377% CPU
|
||||
time.11.tcmalloc.4096:4.22 user 0.03 system 1.91 elapsed 221% CPU
|
||||
time.11.ptmalloc.8192:11.61 user 2.38 system 3.73 elapsed 374% CPU
|
||||
time.11.tcmalloc.8192:3.74 user 0.09 system 2.46 elapsed 155% CPU
|
||||
time.11.ptmalloc.16384:14.13 user 13.38 system 9.60 elapsed 286% CPU
|
||||
time.11.tcmalloc.16384:3.61 user 0.03 system 3.63 elapsed 100% CPU
|
||||
time.11.ptmalloc.32768:17.92 user 43.84 system 26.74 elapsed 230% CPU
|
||||
time.11.tcmalloc.32768:6.31 user 0.03 system 6.45 elapsed 98% CPU
|
||||
time.11.ptmalloc.65536:22.40 user 96.38 system 58.30 elapsed 203% CPU
|
||||
time.11.tcmalloc.65536:6.92 user 0.12 system 6.98 elapsed 100% CPU
|
||||
time.11.ptmalloc.131072:21.03 user 108.04 system 72.78 elapsed 177% CPU
|
||||
time.11.tcmalloc.131072:9.79 user 0.08 system 9.94 elapsed 99% CPU
|
||||
time.12.ptmalloc.64:12.23 user 7.16 system 8.38 elapsed 231% CPU
|
||||
time.12.tcmalloc.64:5.21 user 0.05 system 1.41 elapsed 371% CPU
|
||||
time.12.ptmalloc.128:16.97 user 10.19 system 10.47 elapsed 259% CPU
|
||||
time.12.tcmalloc.128:5.10 user 0.02 system 1.31 elapsed 390% CPU
|
||||
time.12.ptmalloc.256:19.99 user 12.10 system 11.57 elapsed 277% CPU
|
||||
time.12.tcmalloc.256:5.01 user 0.03 system 1.29 elapsed 390% CPU
|
||||
time.12.ptmalloc.512:21.85 user 12.66 system 11.46 elapsed 300% CPU
|
||||
time.12.tcmalloc.512:5.05 user 0.00 system 1.32 elapsed 379% CPU
|
||||
time.12.ptmalloc.1024:9.40 user 0.04 system 2.40 elapsed 393% CPU
|
||||
time.12.tcmalloc.1024:5.14 user 0.02 system 1.39 elapsed 369% CPU
|
||||
time.12.ptmalloc.2048:9.72 user 0.04 system 2.49 elapsed 391% CPU
|
||||
time.12.tcmalloc.2048:5.74 user 0.05 system 1.62 elapsed 355% CPU
|
||||
time.12.ptmalloc.4096:10.64 user 0.20 system 2.75 elapsed 393% CPU
|
||||
time.12.tcmalloc.4096:4.45 user 0.03 system 2.04 elapsed 218% CPU
|
||||
time.12.ptmalloc.8192:12.66 user 3.30 system 4.30 elapsed 371% CPU
|
||||
time.12.tcmalloc.8192:4.21 user 0.13 system 2.65 elapsed 163% CPU
|
||||
time.12.ptmalloc.16384:15.73 user 15.68 system 11.14 elapsed 281% CPU
|
||||
time.12.tcmalloc.16384:4.17 user 0.06 system 4.10 elapsed 102% CPU
|
||||
time.12.ptmalloc.32768:19.45 user 56.00 system 32.74 elapsed 230% CPU
|
||||
time.12.tcmalloc.32768:6.96 user 0.08 system 7.14 elapsed 98% CPU
|
||||
time.12.ptmalloc.65536:23.33 user 110.45 system 65.06 elapsed 205% CPU
|
||||
time.12.tcmalloc.65536:7.77 user 0.15 system 7.72 elapsed 102% CPU
|
||||
time.12.ptmalloc.131072:24.03 user 124.74 system 82.94 elapsed 179% CPU
|
||||
time.12.tcmalloc.131072:10.81 user 0.06 system 10.94 elapsed 99% CPU
|
||||
time.13.ptmalloc.64:14.08 user 7.60 system 8.85 elapsed 244% CPU
|
||||
time.13.tcmalloc.64:5.51 user 0.01 system 1.47 elapsed 375% CPU
|
||||
time.13.ptmalloc.128:18.20 user 10.98 system 10.99 elapsed 265% CPU
|
||||
time.13.tcmalloc.128:5.34 user 0.01 system 1.39 elapsed 382% CPU
|
||||
time.13.ptmalloc.256:21.48 user 13.94 system 12.25 elapsed 289% CPU
|
||||
time.13.tcmalloc.256:5.33 user 0.01 system 1.39 elapsed 381% CPU
|
||||
time.13.ptmalloc.512:24.22 user 14.84 system 12.97 elapsed 301% CPU
|
||||
time.13.tcmalloc.512:5.49 user 0.02 system 1.41 elapsed 389% CPU
|
||||
time.13.ptmalloc.1024:25.26 user 17.03 system 12.85 elapsed 328% CPU
|
||||
time.13.tcmalloc.1024:5.65 user 0.04 system 1.50 elapsed 378% CPU
|
||||
time.13.ptmalloc.2048:10.41 user 0.03 system 2.69 elapsed 387% CPU
|
||||
time.13.tcmalloc.2048:5.93 user 0.10 system 1.77 elapsed 339% CPU
|
||||
time.13.ptmalloc.4096:11.37 user 0.52 system 3.04 elapsed 391% CPU
|
||||
time.13.tcmalloc.4096:5.08 user 0.11 system 2.22 elapsed 233% CPU
|
||||
time.13.ptmalloc.8192:21.76 user 18.54 system 10.58 elapsed 380% CPU
|
||||
time.13.tcmalloc.8192:5.04 user 0.16 system 2.93 elapsed 177% CPU
|
||||
time.13.ptmalloc.16384:26.35 user 34.47 system 17.01 elapsed 357% CPU
|
||||
time.13.tcmalloc.16384:4.66 user 0.04 system 4.66 elapsed 100% CPU
|
||||
time.13.ptmalloc.32768:21.41 user 63.59 system 38.14 elapsed 222% CPU
|
||||
time.13.tcmalloc.32768:7.71 user 0.03 system 7.83 elapsed 98% CPU
|
||||
time.13.ptmalloc.65536:24.99 user 120.80 system 71.59 elapsed 203% CPU
|
||||
time.13.tcmalloc.65536:8.87 user 0.64 system 8.37 elapsed 113% CPU
|
||||
time.13.ptmalloc.131072:25.97 user 142.27 system 96.00 elapsed 175% CPU
|
||||
time.13.tcmalloc.131072:11.48 user 0.06 system 11.67 elapsed 98% CPU
|
||||
time.14.ptmalloc.64:15.01 user 9.11 system 9.41 elapsed 256% CPU
|
||||
time.14.tcmalloc.64:5.98 user 0.02 system 1.58 elapsed 378% CPU
|
||||
time.14.ptmalloc.128:20.34 user 12.72 system 11.62 elapsed 284% CPU
|
||||
time.14.tcmalloc.128:5.88 user 0.04 system 1.51 elapsed 392% CPU
|
||||
time.14.ptmalloc.256:24.26 user 14.95 system 12.92 elapsed 303% CPU
|
||||
time.14.tcmalloc.256:5.72 user 0.02 system 1.50 elapsed 381% CPU
|
||||
time.14.ptmalloc.512:27.28 user 16.45 system 13.89 elapsed 314% CPU
|
||||
time.14.tcmalloc.512:5.99 user 0.02 system 1.54 elapsed 388% CPU
|
||||
time.14.ptmalloc.1024:25.84 user 16.99 system 12.61 elapsed 339% CPU
|
||||
time.14.tcmalloc.1024:5.94 user 0.06 system 1.59 elapsed 375% CPU
|
||||
time.14.ptmalloc.2048:11.96 user 0.01 system 3.12 elapsed 382% CPU
|
||||
time.14.tcmalloc.2048:6.39 user 0.07 system 1.79 elapsed 359% CPU
|
||||
time.14.ptmalloc.4096:20.19 user 11.77 system 8.26 elapsed 386% CPU
|
||||
time.14.tcmalloc.4096:5.65 user 0.05 system 2.32 elapsed 244% CPU
|
||||
time.14.ptmalloc.8192:22.01 user 16.39 system 9.89 elapsed 387% CPU
|
||||
time.14.tcmalloc.8192:5.44 user 0.11 system 3.07 elapsed 180% CPU
|
||||
time.14.ptmalloc.16384:18.15 user 22.40 system 15.02 elapsed 269% CPU
|
||||
time.14.tcmalloc.16384:5.29 user 0.08 system 5.34 elapsed 100% CPU
|
||||
time.14.ptmalloc.32768:24.29 user 72.07 system 42.63 elapsed 225% CPU
|
||||
time.14.tcmalloc.32768:8.47 user 0.02 system 8.62 elapsed 98% CPU
|
||||
time.14.ptmalloc.65536:27.63 user 130.56 system 78.64 elapsed 201% CPU
|
||||
time.14.tcmalloc.65536:9.85 user 1.61 system 9.04 elapsed 126% CPU
|
||||
time.14.ptmalloc.131072:28.87 user 146.38 system 100.54 elapsed 174% CPU
|
||||
time.14.tcmalloc.131072:12.46 user 0.11 system 12.71 elapsed 98% CPU
|
||||
time.15.ptmalloc.64:16.25 user 10.05 system 9.82 elapsed 267% CPU
|
||||
time.15.tcmalloc.64:6.30 user 0.02 system 1.64 elapsed 385% CPU
|
||||
time.15.ptmalloc.128:22.33 user 13.23 system 12.24 elapsed 290% CPU
|
||||
time.15.tcmalloc.128:6.08 user 0.03 system 1.59 elapsed 384% CPU
|
||||
time.15.ptmalloc.256:26.56 user 16.57 system 13.70 elapsed 314% CPU
|
||||
time.15.tcmalloc.256:6.14 user 0.03 system 1.61 elapsed 382% CPU
|
||||
time.15.ptmalloc.512:29.68 user 18.08 system 14.56 elapsed 327% CPU
|
||||
time.15.tcmalloc.512:6.12 user 0.04 system 1.68 elapsed 364% CPU
|
||||
time.15.ptmalloc.1024:17.07 user 6.22 system 6.26 elapsed 371% CPU
|
||||
time.15.tcmalloc.1024:6.38 user 0.02 system 1.75 elapsed 364% CPU
|
||||
time.15.ptmalloc.2048:26.64 user 17.25 system 11.51 elapsed 381% CPU
|
||||
time.15.tcmalloc.2048:6.77 user 0.18 system 1.92 elapsed 361% CPU
|
||||
time.15.ptmalloc.4096:13.21 user 0.74 system 3.57 elapsed 390% CPU
|
||||
time.15.tcmalloc.4096:6.03 user 0.09 system 2.36 elapsed 258% CPU
|
||||
time.15.ptmalloc.8192:22.92 user 17.51 system 10.50 elapsed 385% CPU
|
||||
time.15.tcmalloc.8192:5.96 user 0.12 system 3.36 elapsed 180% CPU
|
||||
time.15.ptmalloc.16384:19.37 user 24.87 system 16.69 elapsed 264% CPU
|
||||
time.15.tcmalloc.16384:5.88 user 0.07 system 5.84 elapsed 101% CPU
|
||||
time.15.ptmalloc.32768:25.43 user 82.30 system 48.98 elapsed 219% CPU
|
||||
time.15.tcmalloc.32768:9.11 user 0.05 system 9.30 elapsed 98% CPU
|
||||
time.15.ptmalloc.65536:29.31 user 140.07 system 83.78 elapsed 202% CPU
|
||||
time.15.tcmalloc.65536:8.51 user 1.59 system 9.75 elapsed 103% CPU
|
||||
time.15.ptmalloc.131072:30.22 user 163.15 system 109.50 elapsed 176% CPU
|
||||
time.15.tcmalloc.131072:13.35 user 0.10 system 13.54 elapsed 99% CPU
|
||||
time.16.ptmalloc.64:17.69 user 10.11 system 10.11 elapsed 274% CPU
|
||||
time.16.tcmalloc.64:6.63 user 0.04 system 1.72 elapsed 387% CPU
|
||||
time.16.ptmalloc.128:23.05 user 14.37 system 12.75 elapsed 293% CPU
|
||||
time.16.tcmalloc.128:6.61 user 0.02 system 1.71 elapsed 387% CPU
|
||||
time.16.ptmalloc.256:29.11 user 19.35 system 14.57 elapsed 332% CPU
|
||||
time.16.tcmalloc.256:6.62 user 0.03 system 1.73 elapsed 382% CPU
|
||||
time.16.ptmalloc.512:31.65 user 18.71 system 14.71 elapsed 342% CPU
|
||||
time.16.tcmalloc.512:6.63 user 0.04 system 1.73 elapsed 383% CPU
|
||||
time.16.ptmalloc.1024:31.99 user 21.22 system 14.87 elapsed 357% CPU
|
||||
time.16.tcmalloc.1024:6.81 user 0.04 system 1.79 elapsed 382% CPU
|
||||
time.16.ptmalloc.2048:30.35 user 21.36 system 13.30 elapsed 388% CPU
|
||||
time.16.tcmalloc.2048:6.91 user 0.50 system 2.01 elapsed 367% CPU
|
||||
time.16.ptmalloc.4096:18.85 user 7.18 system 6.61 elapsed 393% CPU
|
||||
time.16.tcmalloc.4096:6.70 user 0.10 system 2.62 elapsed 259% CPU
|
||||
time.16.ptmalloc.8192:22.19 user 14.30 system 9.37 elapsed 389% CPU
|
||||
time.16.tcmalloc.8192:6.18 user 0.19 system 3.58 elapsed 177% CPU
|
||||
time.16.ptmalloc.16384:31.22 user 46.78 system 22.92 elapsed 340% CPU
|
||||
time.16.tcmalloc.16384:6.79 user 0.07 system 6.86 elapsed 99% CPU
|
||||
time.16.ptmalloc.32768:27.31 user 87.32 system 52.00 elapsed 220% CPU
|
||||
time.16.tcmalloc.32768:9.85 user 0.06 system 10.07 elapsed 98% CPU
|
||||
time.16.ptmalloc.65536:32.83 user 160.62 system 95.67 elapsed 202% CPU
|
||||
time.16.tcmalloc.65536:10.18 user 0.09 system 10.41 elapsed 98% CPU
|
||||
time.16.ptmalloc.131072:31.99 user 173.41 system 115.98 elapsed 177% CPU
|
||||
time.16.tcmalloc.131072:14.52 user 0.05 system 14.67 elapsed 99% CPU
|
||||
time.17.ptmalloc.64:19.38 user 11.61 system 10.61 elapsed 291% CPU
|
||||
time.17.tcmalloc.64:7.11 user 0.02 system 1.84 elapsed 386% CPU
|
||||
time.17.ptmalloc.128:26.25 user 16.15 system 13.53 elapsed 313% CPU
|
||||
time.17.tcmalloc.128:6.97 user 0.02 system 1.78 elapsed 390% CPU
|
||||
time.17.ptmalloc.256:30.66 user 18.36 system 14.97 elapsed 327% CPU
|
||||
time.17.tcmalloc.256:6.94 user 0.04 system 1.80 elapsed 387% CPU
|
||||
time.17.ptmalloc.512:33.71 user 22.79 system 15.95 elapsed 354% CPU
|
||||
time.17.tcmalloc.512:7.00 user 0.02 system 1.83 elapsed 381% CPU
|
||||
time.17.ptmalloc.1024:33.49 user 22.47 system 15.00 elapsed 373% CPU
|
||||
time.17.tcmalloc.1024:7.20 user 0.03 system 1.90 elapsed 380% CPU
|
||||
time.17.ptmalloc.2048:23.87 user 11.92 system 9.26 elapsed 386% CPU
|
||||
time.17.tcmalloc.2048:6.01 user 1.83 system 2.15 elapsed 363% CPU
|
||||
time.17.ptmalloc.4096:14.69 user 0.95 system 3.98 elapsed 392% CPU
|
||||
time.17.tcmalloc.4096:7.25 user 0.10 system 2.62 elapsed 279% CPU
|
||||
time.17.ptmalloc.8192:22.44 user 13.52 system 9.39 elapsed 382% CPU
|
||||
time.17.tcmalloc.8192:7.21 user 0.24 system 3.95 elapsed 188% CPU
|
||||
time.17.ptmalloc.16384:23.33 user 33.67 system 21.89 elapsed 260% CPU
|
||||
time.17.tcmalloc.16384:7.28 user 0.06 system 7.10 elapsed 103% CPU
|
||||
time.17.ptmalloc.32768:29.35 user 103.11 system 60.36 elapsed 219% CPU
|
||||
time.17.tcmalloc.32768:10.53 user 0.07 system 10.71 elapsed 98% CPU
|
||||
time.17.ptmalloc.65536:33.21 user 170.89 system 100.84 elapsed 202% CPU
|
||||
time.17.tcmalloc.65536:10.85 user 0.05 system 11.04 elapsed 98% CPU
|
||||
time.17.ptmalloc.131072:34.98 user 182.87 system 122.05 elapsed 178% CPU
|
||||
time.17.tcmalloc.131072:15.27 user 0.09 system 15.49 elapsed 99% CPU
|
||||
time.18.ptmalloc.64:21.08 user 12.15 system 11.43 elapsed 290% CPU
|
||||
time.18.tcmalloc.64:7.45 user 0.03 system 1.95 elapsed 383% CPU
|
||||
time.18.ptmalloc.128:27.65 user 17.26 system 14.03 elapsed 320% CPU
|
||||
time.18.tcmalloc.128:7.46 user 0.03 system 1.92 elapsed 389% CPU
|
||||
time.18.ptmalloc.256:32.78 user 20.55 system 15.70 elapsed 339% CPU
|
||||
time.18.tcmalloc.256:7.31 user 0.02 system 1.88 elapsed 389% CPU
|
||||
time.18.ptmalloc.512:33.31 user 20.06 system 15.05 elapsed 354% CPU
|
||||
time.18.tcmalloc.512:7.33 user 0.02 system 1.91 elapsed 383% CPU
|
||||
time.18.ptmalloc.1024:35.46 user 24.83 system 16.30 elapsed 369% CPU
|
||||
time.18.tcmalloc.1024:7.60 user 0.06 system 2.05 elapsed 373% CPU
|
||||
time.18.ptmalloc.2048:19.98 user 6.80 system 6.76 elapsed 395% CPU
|
||||
time.18.tcmalloc.2048:6.89 user 1.29 system 2.28 elapsed 357% CPU
|
||||
time.18.ptmalloc.4096:15.99 user 0.93 system 4.32 elapsed 391% CPU
|
||||
time.18.tcmalloc.4096:7.70 user 0.10 system 2.77 elapsed 280% CPU
|
||||
time.18.ptmalloc.8192:23.51 user 14.84 system 9.97 elapsed 384% CPU
|
||||
time.18.tcmalloc.8192:8.16 user 0.27 system 4.25 elapsed 197% CPU
|
||||
time.18.ptmalloc.16384:35.79 user 52.41 system 26.47 elapsed 333% CPU
|
||||
time.18.tcmalloc.16384:7.81 user 0.07 system 7.61 elapsed 103% CPU
|
||||
time.18.ptmalloc.32768:33.17 user 116.07 system 68.64 elapsed 217% CPU
|
||||
time.18.tcmalloc.32768:11.34 user 0.13 system 11.57 elapsed 99% CPU
|
||||
time.18.ptmalloc.65536:35.91 user 177.82 system 106.75 elapsed 200% CPU
|
||||
time.18.tcmalloc.65536:11.54 user 0.06 system 11.74 elapsed 98% CPU
|
||||
time.18.ptmalloc.131072:36.38 user 187.18 system 126.91 elapsed 176% CPU
|
||||
time.18.tcmalloc.131072:16.34 user 0.05 system 16.43 elapsed 99% CPU
|
||||
time.19.ptmalloc.64:22.90 user 13.23 system 11.82 elapsed 305% CPU
|
||||
time.19.tcmalloc.64:7.81 user 0.02 system 2.01 elapsed 388% CPU
|
||||
time.19.ptmalloc.128:30.13 user 18.58 system 14.77 elapsed 329% CPU
|
||||
time.19.tcmalloc.128:7.74 user 0.02 system 2.01 elapsed 386% CPU
|
||||
time.19.ptmalloc.256:35.33 user 21.41 system 16.35 elapsed 347% CPU
|
||||
time.19.tcmalloc.256:7.79 user 0.04 system 2.04 elapsed 382% CPU
|
||||
time.19.ptmalloc.512:39.30 user 26.22 system 17.84 elapsed 367% CPU
|
||||
time.19.tcmalloc.512:7.80 user 0.06 system 2.05 elapsed 381% CPU
|
||||
time.19.ptmalloc.1024:35.70 user 23.90 system 15.66 elapsed 380% CPU
|
||||
time.19.tcmalloc.1024:8.08 user 0.06 system 2.16 elapsed 376% CPU
|
||||
time.19.ptmalloc.2048:18.33 user 3.28 system 5.47 elapsed 394% CPU
|
||||
time.19.tcmalloc.2048:8.71 user 0.05 system 2.40 elapsed 363% CPU
|
||||
time.19.ptmalloc.4096:16.94 user 0.89 system 4.64 elapsed 383% CPU
|
||||
time.19.tcmalloc.4096:8.21 user 0.07 system 2.85 elapsed 289% CPU
|
||||
time.19.ptmalloc.8192:25.61 user 17.15 system 11.33 elapsed 377% CPU
|
||||
time.19.tcmalloc.8192:8.79 user 0.30 system 4.58 elapsed 198% CPU
|
||||
time.19.ptmalloc.16384:27.11 user 46.66 system 29.67 elapsed 248% CPU
|
||||
time.19.tcmalloc.16384:8.64 user 0.05 system 8.58 elapsed 101% CPU
|
||||
time.19.ptmalloc.32768:33.80 user 117.69 system 70.65 elapsed 214% CPU
|
||||
time.19.tcmalloc.32768:11.88 user 0.07 system 12.04 elapsed 99% CPU
|
||||
time.19.ptmalloc.65536:36.90 user 180.21 system 109.01 elapsed 199% CPU
|
||||
time.19.tcmalloc.65536:12.17 user 0.07 system 12.40 elapsed 98% CPU
|
||||
time.19.ptmalloc.131072:38.50 user 195.15 system 132.81 elapsed 175% CPU
|
||||
time.19.tcmalloc.131072:17.44 user 0.10 system 17.65 elapsed 99% CPU
|
||||
time.20.ptmalloc.64:23.37 user 13.74 system 11.86 elapsed 312% CPU
|
||||
time.20.tcmalloc.64:8.18 user 0.02 system 2.10 elapsed 389% CPU
|
||||
time.20.ptmalloc.128:31.29 user 19.97 system 15.53 elapsed 329% CPU
|
||||
time.20.tcmalloc.128:8.03 user 0.02 system 2.12 elapsed 378% CPU
|
||||
time.20.ptmalloc.256:38.40 user 25.65 system 18.25 elapsed 350% CPU
|
||||
time.20.tcmalloc.256:8.05 user 0.05 system 2.12 elapsed 380% CPU
|
||||
time.20.ptmalloc.512:40.60 user 27.70 system 18.46 elapsed 369% CPU
|
||||
time.20.tcmalloc.512:8.22 user 0.08 system 2.20 elapsed 375% CPU
|
||||
time.20.ptmalloc.1024:40.02 user 28.52 system 17.56 elapsed 390% CPU
|
||||
time.20.tcmalloc.1024:8.50 user 0.07 system 2.19 elapsed 391% CPU
|
||||
time.20.ptmalloc.2048:16.13 user 0.23 system 4.23 elapsed 386% CPU
|
||||
time.20.tcmalloc.2048:8.98 user 0.03 system 2.45 elapsed 367% CPU
|
||||
time.20.ptmalloc.4096:17.14 user 0.87 system 4.60 elapsed 391% CPU
|
||||
time.20.tcmalloc.4096:8.93 user 0.20 system 2.97 elapsed 306% CPU
|
||||
time.20.ptmalloc.8192:25.24 user 17.16 system 11.14 elapsed 380% CPU
|
||||
time.20.tcmalloc.8192:9.78 user 0.30 system 5.14 elapsed 195% CPU
|
||||
time.20.ptmalloc.16384:39.93 user 60.36 system 30.24 elapsed 331% CPU
|
||||
time.20.tcmalloc.16384:9.57 user 0.09 system 9.43 elapsed 102% CPU
|
||||
time.20.ptmalloc.32768:36.44 user 130.23 system 76.79 elapsed 217% CPU
|
||||
time.20.tcmalloc.32768:12.71 user 0.09 system 12.97 elapsed 98% CPU
|
||||
time.20.ptmalloc.65536:39.79 user 202.09 system 120.34 elapsed 200% CPU
|
||||
time.20.tcmalloc.65536:12.93 user 0.06 system 13.15 elapsed 98% CPU
|
||||
time.20.ptmalloc.131072:41.91 user 202.76 system 138.51 elapsed 176% CPU
|
||||
time.20.tcmalloc.131072:18.23 user 0.07 system 18.42 elapsed 99% CPU
|
Before Width: | Height: | Size: 1.8 KiB |
Before Width: | Height: | Size: 1.7 KiB |
Before Width: | Height: | Size: 1.3 KiB |
Before Width: | Height: | Size: 1.8 KiB |
Before Width: | Height: | Size: 1.8 KiB |
Before Width: | Height: | Size: 1.8 KiB |
Before Width: | Height: | Size: 1.5 KiB |
Before Width: | Height: | Size: 2.0 KiB |
Before Width: | Height: | Size: 1.6 KiB |
Before Width: | Height: | Size: 1.6 KiB |
Before Width: | Height: | Size: 1.5 KiB |
Before Width: | Height: | Size: 1.9 KiB |
Before Width: | Height: | Size: 1.6 KiB |
Before Width: | Height: | Size: 2.2 KiB |
Before Width: | Height: | Size: 2.0 KiB |
Before Width: | Height: | Size: 2.1 KiB |
Before Width: | Height: | Size: 2.1 KiB |
Before Width: | Height: | Size: 2.2 KiB |
Before Width: | Height: | Size: 2.1 KiB |
Before Width: | Height: | Size: 1.9 KiB |
Before Width: | Height: | Size: 2.1 KiB |
597
docs/tcmalloc.adoc
Normal file
@ -0,0 +1,597 @@
|
||||
= TCMalloc : Thread-Caching Malloc
|
||||
|
||||
:reproducible:
|
||||
|
||||
== [#motivation]#Motivation#
|
||||
|
||||
+[alk: Update from Dec 2024]+ Wondering how to update this document
|
||||
from the beginning of the 2000s, I am choosing to keep this original
|
||||
motivation writeup just below. Do keep in mind that referenced glibc
|
||||
versions are now long obsolete. And amount of time per malloc call is
|
||||
far from correct anymore. And the description is for 32-bit
|
||||
computers. Still, I am choosing to keep this text intact, to help
|
||||
people see where tcmalloc came from. "I" below refers to original
|
||||
author: Sanjay. See at the end of this paragraph for some commentary
|
||||
relevant for today.
|
||||
|
||||
'''''
|
||||
|
||||
TCMalloc is faster than the glibc 2.3 malloc (available as a separate
|
||||
library called ptmalloc2) and other mallocs that I have tested.
|
||||
ptmalloc2 takes approximately 300 nanoseconds to execute a malloc/free
|
||||
pair on a 2.8 GHz P4 (for small objects). The TCMalloc implementation
|
||||
takes approximately 50 nanoseconds for the same operation pair. Speed is
|
||||
important for a malloc implementation because if malloc is not fast
|
||||
enough, application writers are inclined to write their own custom free
|
||||
lists on top of malloc. This can lead to extra complexity, and more
|
||||
memory usage unless the application writer is very careful to
|
||||
appropriately size the free lists and scavenge idle objects out of the
|
||||
free list.
|
||||
|
||||
TCMalloc also reduces lock contention for multi-threaded programs. For
|
||||
small objects, there is virtually zero contention. For large objects,
|
||||
TCMalloc tries to use fine grained and efficient spinlocks. ptmalloc2
|
||||
also reduces lock contention by using per-thread arenas but there is a
|
||||
big problem with ptmalloc2's use of per-thread arenas. In ptmalloc2
|
||||
memory can never move from one arena to another. This can lead to huge
|
||||
amounts of wasted space. For example, in one Google application, the
|
||||
first phase would allocate approximately 300MB of memory for its URL
|
||||
canonicalization data structures. When the first phase finished, a
|
||||
second phase would be started in the same address space. If this second
|
||||
phase was assigned a different arena than the one used by the first
|
||||
phase, this phase would not reuse any of the memory left after the first
|
||||
phase and would add another 300MB to the address space. Similar memory
|
||||
blowup problems were also noticed in other applications.
|
||||
|
||||
Another benefit of TCMalloc is space-efficient representation of small
|
||||
objects. For example, N 8-byte objects can be allocated while using
|
||||
space approximately `+8N * 1.01+` bytes. I.e., a one-percent space
|
||||
overhead. ptmalloc2 uses a four-byte header for each object and (I
|
||||
think) rounds up the size to a multiple of 8 bytes and ends up using
|
||||
`+16N+` bytes.
|
||||
|
||||
'''''
|
||||
|
||||
+[alk: Update from Dec 2024]+ tcmalloc (now gperftools) has evolved a
|
||||
lot over the last 20-ish years. Back then it was one of the first
|
||||
production-grade mallocs that used per-thread caching. These days
|
||||
per-thread (or even per-cpu) caching is widespread. Typical C++
|
||||
programs tend to allocate and free memory somewhat frequently and
|
||||
those small allocations are generally kept fast and avoid any locks
|
||||
(in most cases). Most of gperftools evolution was on getting those
|
||||
common cases even cheaper. Others improved too. glibc, while still
|
||||
being somewhat slower than gperftools, is a lot faster than it was and
|
||||
also avoids locks in many of those common case allocations.
|
||||
|
||||
gperftools on modern systems with efficient "native" thread-local
|
||||
storage access (i.e. GNU/Linux, most BSDs, even Windows, but, notably,
|
||||
not OSX) takes just a couple dozen cheap instructions for allocation
|
||||
or deallocation, which is better than most competition. We're talking
|
||||
in the ballpark of just a couple nanoseconds per operation on modern
|
||||
fast out-of-order CPUs in this fast-path case (all caches are hot
|
||||
etc). I.e. compare to mid-tens of nanos per malloc/free pair 20 years
|
||||
ago (!)
|
||||
|
||||
Also, the reader should be aware that another descendant of the
|
||||
original tcmalloc is now available at
|
||||
https://github.com/google/tcmalloc (I call it "abseil tcmalloc" due to
|
||||
its hard dependency on abseil). Its main feature is efficient per-cpu
|
||||
caches (but it needs RSEQ support from fairly recent Linux kernels).
|
||||
|
||||
Another direction of evolution, particularly at Google, was increasing
|
||||
focus on helping diagnose or prevent production problems related to
|
||||
dynamic memory allocation. So there is debug version of tcmalloc with
|
||||
some relatively lightweight checking against common bugs (like
|
||||
double-free). So there is heap sampling that has low enough overhead
|
||||
to be always enabled. There are relatively comprehensive statistics
|
||||
available and more. "abseil tcmalloc" is doing even better than
|
||||
gperftools in this regard.
|
||||
|
||||
== [#Usage]#Usage#
|
||||
|
||||
To use TCMalloc, just link TCMalloc into your application via the
|
||||
"-ltcmalloc" linker flag.
|
||||
|
||||
You can use TCMalloc in applications you didn't compile yourself, by
|
||||
using LD_PRELOAD:
|
||||
|
||||
....
|
||||
% LD_PRELOAD="/usr/lib/libtcmalloc.so"
|
||||
....
|
||||
|
||||
TCMalloc includes a link:heapprofile.html[heap profiler] as well.
|
||||
|
||||
If you'd rather link in a version of TCMalloc that does not include
|
||||
the heap profiler (perhaps to reduce binary size for a static binary),
|
||||
you can link in `+libtcmalloc_minimal+` instead.
|
||||
|
||||
== [#Overview]#Overview#
|
||||
|
||||
TCMalloc assigns each thread a thread-local cache. Small allocations are
|
||||
satisfied from the thread-local cache. Objects are moved from central
|
||||
data structures into a thread-local cache as needed, and periodic
|
||||
garbage collections are used to migrate memory back from a thread-local
|
||||
cache into the central data structures.
|
||||
|
||||
image:overview.gif[overview]
|
||||
|
||||
TCMalloc treats objects with size +<=+ 256K ("small" objects) differently
|
||||
from larger objects. Large objects are allocated directly from the
|
||||
central heap using a page-level allocator (a page is an 8K-aligned region
|
||||
of memory). I.e., a large object is always page-aligned and occupies an
|
||||
integral number of pages.
|
||||
|
||||
A run of pages can be carved up into a sequence of small objects, each
|
||||
equally sized. For example, a run of one page (4K) can be carved up into
|
||||
32 objects of size 128 bytes each.
|
||||
|
||||
== [#Small_Object_Allocation]#Small Object Allocation#
|
||||
|
||||
Each small object size maps to one of approximately 88 allocatable
|
||||
size-classes. For example, all allocations in the range 961 to 1024
|
||||
bytes are rounded up to 1024. The size-classes are spaced so that small
|
||||
sizes are separated by 8 bytes, larger sizes by 16 bytes, even larger
|
||||
sizes by 32 bytes, and so forth. The maximal spacing is controlled so
|
||||
that not too much space is wasted when an allocation request falls just
|
||||
past the end of a size class and has to be rounded up to the next class.
|
||||
|
||||
A thread cache contains a singly linked list of free objects per
|
||||
size-class.
|
||||
|
||||
image:threadheap.gif[threadheap]
|
||||
|
||||
When allocating a small object: (1) We map its size to the corresponding
|
||||
size-class. (2) Look in the corresponding free list in the thread cache
|
||||
for the current thread. (3) If the free list is not empty, we remove the
|
||||
first object from the list and return it. When following this fast path,
|
||||
TCMalloc acquires no locks at all.
|
||||
|
||||
If the free list is empty: (1) We fetch a bunch of objects from a
|
||||
central free list for this size-class (the central free list is shared
|
||||
by all threads). (2) Place them in the thread-local free list. (3)
|
||||
Return one of the newly fetched objects to the application.
|
||||
|
||||
If the central free list is also empty: (1) We allocate a run of pages
|
||||
from the central page allocator. (2) Split the run into a set of objects
|
||||
of this size-class. (3) Place the new objects on the central free list.
|
||||
(4) As before, move some of these objects to the thread-local free list.
|
||||
|
||||
=== [#Sizing_Thread_Cache_Free_Lists]#Sizing Thread Cache Free Lists#
|
||||
|
||||
It is important to size the thread cache free lists correctly. If the
|
||||
free list is too small, we'll need to go to the central free list too
|
||||
often. If the free list is too big, we'll waste memory as objects sit
|
||||
idle in the free list.
|
||||
|
||||
Note that the thread caches are just as important for deallocation as
|
||||
they are for allocation. Without a cache, each deallocation would
|
||||
require moving the memory to the central free list. Also, some threads
|
||||
have asymmetric alloc/free behavior (e.g. producer and consumer
|
||||
threads), so sizing the free list correctly gets trickier.
|
||||
|
||||
To size the free lists appropriately, we use a slow-start algorithm to
|
||||
determine the maximum length of each individual free list. As the free
|
||||
list is used more frequently, its maximum length grows. However, if a
|
||||
free list is used more for deallocation than allocation, its maximum
|
||||
length will grow only up to a point where the whole list can be
|
||||
efficiently moved to the central free list at once.
|
||||
|
||||
The pseudo-code below illustrates this slow-start algorithm. Note that
|
||||
`+num_objects_to_move+` is specific to each size class. By moving a list
|
||||
of objects with a well-known length, the central cache can efficiently
|
||||
pass these lists between thread caches. If a thread cache wants fewer
|
||||
than `+num_objects_to_move+`, the operation on the central free list has
|
||||
linear time complexity. The downside of always using
|
||||
`+num_objects_to_move+` as the number of objects to transfer to and from
|
||||
the central cache is that it wastes memory in threads that don't need
|
||||
all of those objects.
|
||||
|
||||
....
|
||||
Start each freelist max_length at 1.
|
||||
|
||||
Allocation
|
||||
if freelist empty {
|
||||
fetch min(max_length, num_objects_to_move) from central list;
|
||||
if max_length < num_objects_to_move { // slow-start
|
||||
max_length++;
|
||||
} else {
|
||||
max_length += num_objects_to_move;
|
||||
}
|
||||
}
|
||||
|
||||
Deallocation
|
||||
if length > max_length {
|
||||
// Don't try to release num_objects_to_move if we don't have that many.
|
||||
release min(max_length, num_objects_to_move) objects to central list
|
||||
if max_length < num_objects_to_move {
|
||||
// Slow-start up to num_objects_to_move.
|
||||
max_length++;
|
||||
} else if max_length > num_objects_to_move {
|
||||
// If we consistently go over max_length, shrink max_length.
|
||||
overages++;
|
||||
if overages > kMaxOverages {
|
||||
max_length -= num_objects_to_move;
|
||||
overages = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
....
|
||||
|
||||
See also the section on link:#Garbage_Collection[Garbage Collection] to
|
||||
see how it affects the `+max_length+`.
|
||||
|
||||
== [#Medium_Object_Allocation]#Medium Object Allocation#
|
||||
|
||||
A medium object size (256K ≤ size ≤ 1MB) is rounded up to a page size
|
||||
(8K) and is handled by a central page heap. The central page heap
|
||||
includes an array of 128 free lists. The `k`-th entry is a free list of
|
||||
runs that consist of `k + 1` pages:
|
||||
|
||||
image:pageheap.gif[pageheap]
|
||||
|
||||
An allocation for `k` pages is satisfied by looking in the `k`-th
|
||||
free list. If that free list is empty, we look in the next free list,
|
||||
and so forth. If no medium-object free list can satisfy the allocation,
|
||||
the allocation is treated as a large object.
|
||||
|
||||
== [#Large_Object_Allocation]#Large Object Allocation#
|
||||
|
||||
Allocations of 1MB or more are considered large allocations. Spans of
|
||||
free memory which can satisfy these allocations are tracked in a
|
||||
red-black tree sorted by size. Allocations follow the _best-fit_
|
||||
algorithm: the tree is searched to find the smallest span of free space
|
||||
which is larger than the requested allocation. The allocation is carved
|
||||
out of that span, and the remaining space is reinserted either into the
|
||||
large object tree or possibly into one of the smaller free-lists as
|
||||
appropriate. If no span of free memory is located that can fit the
|
||||
requested allocation, we fetch memory from the system (using `+sbrk+`,
|
||||
or `+mmap+`).
|
||||
|
||||
If an allocation for `+k+` pages is satisfied by a run of pages of
|
||||
length > `+k+`, the remainder of the run is re-inserted back into the
|
||||
appropriate free list in the page heap.
|
||||
|
||||
== [#Spans]#Spans#
|
||||
|
||||
The heap managed by TCMalloc consists of a set of pages. A run of
|
||||
contiguous pages is represented by a `+Span+` object. A span can either
|
||||
be _allocated_, or _free_. If free, the span is one of the entries in a
|
||||
page heap linked-list. If allocated, it is either a large object that
|
||||
has been handed off to the application, or a run of pages that have been
|
||||
split up into a sequence of small objects. If split into small objects,
|
||||
the size-class of the objects is recorded in the span.
|
||||
|
||||
A central array indexed by page number can be used to find the span to
|
||||
which a page belongs. For example, span _a_ below occupies 2 pages, span
|
||||
_b_ occupies 1 page, span _c_ occupies 5 pages and span _d_ occupies 3
|
||||
pages.
|
||||
|
||||
image:spanmap.gif[spanmap]
|
||||
|
||||
In a 32-bit address space, the central array is represented by a
|
||||
2-level radix tree where the root contains 32 entries and each leaf
|
||||
contains 2^14 entries (a 32-bit address space has 2^19 8K pages, and the
|
||||
first level of tree divides the 2^19 pages by 2^5). This leads to a
|
||||
starting memory usage of 64KB of space (2^14*4 bytes) for the central
|
||||
array, which seems acceptable.
|
||||
|
||||
On 64-bit machines, we use a 3-level radix tree. Note that many
|
||||
common 64-bit machines have limits on actual address space size. So on
|
||||
x86 we use 48 bits of address and handle it with a slightly faster 2-level
|
||||
radix tree.
|
||||
|
||||
== [#Deallocation]#Deallocation#
|
||||
|
||||
When an object is deallocated, we compute its page number and look it
|
||||
up in the central array to find the corresponding span object. The
|
||||
span tells us whether or not the object is small, and its size-class
|
||||
if it is small. If the object is small, we insert it into the
|
||||
appropriate free list in the current thread's thread cache. If the
|
||||
thread cache now exceeds its max_size_ amount, we run a garbage
|
||||
collector that moves unused objects from the thread cache into central
|
||||
free lists.
|
||||
|
||||
If the object is large, the span tells us the range of pages covered by
|
||||
the object. Suppose this range is `+[p,q]+`. We also lookup the spans
|
||||
for pages `+p-1+` and `+q+1+`. If either of these neighboring spans are
|
||||
free, we coalesce them with the `+[p,q]+` span. The resulting span is
|
||||
inserted into the appropriate free list in the page heap.
|
||||
|
||||
== Central Free Lists for Small Objects
|
||||
|
||||
As mentioned before, we keep a central free list for each size-class.
|
||||
Each central free list is organized as a two-level data structure: a set
|
||||
of spans, and a linked list of free objects per span.
|
||||
|
||||
An object is allocated from a central free list by removing the first
|
||||
entry from the linked list of some span. (If all spans have empty linked
|
||||
lists, a suitably sized span is first allocated from the central page
|
||||
heap.)
|
||||
|
||||
An object is returned to a central free list by adding it to the linked
|
||||
list of its containing span. If the linked list length now equals the
|
||||
total number of small objects in the span, this span is now completely
|
||||
free and is returned to the page heap.
|
||||
|
||||
== [#Garbage_Collection]#Garbage Collection of Thread Caches#
|
||||
|
||||
Garbage collecting objects from a thread cache keeps the size of the
|
||||
cache under control and returns unused objects to the central free
|
||||
lists. Some threads need large caches to perform well while others can
|
||||
get by with little or no cache at all. When a thread cache goes over its
|
||||
`+max_size+`, garbage collection kicks in and then the thread competes
|
||||
with the other threads for a larger cache.
|
||||
|
||||
Garbage collection is run only during a deallocation. We walk over all
|
||||
free lists in the cache and move some number of objects from the free
|
||||
list to the corresponding central list.
|
||||
|
||||
The number of objects to be moved from a free list is determined using a
|
||||
per-list low-water-mark `+L+`. `+L+` records the minimum length of the
|
||||
list since the last garbage collection. Note that we could have
|
||||
shortened the list by `+L+` objects at the last garbage collection
|
||||
without requiring any extra accesses to the central list. We use this
|
||||
past history as a predictor of future accesses and move `+L/2+` objects
|
||||
from the thread cache free list to the corresponding central free list.
|
||||
This algorithm has the nice property that if a thread stops using a
|
||||
particular size, all objects of that size will quickly move from the
|
||||
thread cache to the central free list where they can be used by other
|
||||
threads.
|
||||
|
||||
If a thread consistently deallocates more objects of a certain size than
|
||||
it allocates, this `+L/2+` behavior will cause at least `+L/2+` objects
|
||||
to always sit in the free list. To avoid wasting memory this way, we
|
||||
shrink the maximum length of the freelist to converge on
|
||||
`+num_objects_to_move+` (see also
|
||||
link:#Sizing_Thread_Cache_Free_Lists[Sizing Thread Cache Free Lists]).
|
||||
|
||||
....
|
||||
Garbage Collection
|
||||
if (L != 0 && max_length > num_objects_to_move) {
|
||||
max_length = max(max_length - num_objects_to_move, num_objects_to_move)
|
||||
}
|
||||
....
|
||||
|
||||
The fact that the thread cache went over its `+max_size+` is an
|
||||
indication that the thread would benefit from a larger cache. Simply
|
||||
increasing `+max_size+` would use an inordinate amount of memory in
|
||||
programs that have lots of active threads. Developers can bound the
|
||||
memory used with the parameter
|
||||
`TCMALLOC_MAX_TOTAL_THREAD_CACHE_BYTES`.
|
||||
|
||||
Each thread cache starts with a small `+max_size+` (e.g. 64KB) so that
|
||||
idle threads won't pre-allocate memory they don't need. Each time the
|
||||
cache runs a garbage collection, it will also try to grow its
|
||||
`+max_size+`. If the sum of the thread cache sizes is less than
|
||||
`TCMALLOC_MAX_TOTAL_THREAD_CACHE_BYTES`, `+max_size+` grows easily. If
|
||||
not, thread cache 1 will try to steal from thread cache 2 (picked
|
||||
round-robin) by decreasing thread cache 2's `+max_size+`. In this way,
|
||||
threads that are more active will steal memory from other threads more
|
||||
often than they have memory stolen from themselves. Mostly idle
|
||||
threads end up with small caches and active threads end up with big
|
||||
caches. Note that this stealing can cause the sum of the thread cache
|
||||
sizes to be greater than `TCMALLOC_MAX_TOTAL_THREAD_CACHE_BYTES` until
|
||||
thread cache 2 deallocates some memory to trigger a garbage
|
||||
collection.
|
||||
|
||||
== [#performance]#Performance Notes#
|
||||
|
||||
gperftools' area of relative strength is cases where per-thread caches
|
||||
are effective. This is typically exercised by fairly typical C++ codes
|
||||
that allocate relatively often and where object lifetimes tend to be
|
||||
small-ish.
|
||||
|
||||
Both "abseil tcmalloc" and gperftools continue to have un-sharded
|
||||
central free lists and page heaps. Which means that misses to caches
|
||||
tend to be not so scalable compared to some competition.
|
||||
|
||||
This means that in some cases you may want to tweak thread caches
|
||||
higher. Also if your workload has many threads that tend to be idle
|
||||
for longer durations, consider using
|
||||
`MallocExtension::MarkThread{Idle,Busy}`.
|
||||
|
||||
== [#runtime]#Modifying Runtime Behavior#
|
||||
|
||||
You can more finely control the behavior of the tcmalloc via environment
|
||||
variables.
|
||||
|
||||
Generally useful flags:
|
||||
|
||||
[cols=",,",]
|
||||
|===
|
||||
|
||||
|`TCMALLOC_SAMPLE_PARAMETER` |default: 0 |The approximate gap between
|
||||
sampling actions. That is, we take one sample approximately once every
|
||||
`tcmalloc_sample_parameter` bytes of allocation. This sampled heap
|
||||
information is available via `MallocExtension::GetHeapSample()` or
|
||||
`MallocExtension::ReadStackTraces()`. A reasonable value is 524288.
|
||||
|
||||
|`TCMALLOC_RELEASE_RATE` |default: 1.0 |Rate at which we release
|
||||
unused memory to the system, via `+madvise(MADV_DONTNEED)+`, on systems
|
||||
that support it. Zero means we never release memory back to the system.
|
||||
Increase this flag to return memory faster; decrease it to return memory
|
||||
slower. Reasonable rates are in the range [0,10].
|
||||
|
||||
|`TCMALLOC_LARGE_ALLOC_REPORT_THRESHOLD` |default: 1073741824
|
||||
|Allocations larger than this value cause a stack trace to be dumped
|
||||
to stderr. The threshold for dumping stack traces is increased by a
|
||||
factor of 1.125 every time we print a message so that the threshold
|
||||
automatically goes up by a factor of ~1000 every 60 messages. This
|
||||
bounds the amount of extra logging generated by this flag. Default
|
||||
value of this flag is very large and therefore you should see no extra
|
||||
logging unless the flag is overridden.
|
||||
|
||||
|`TCMALLOC_MAX_TOTAL_THREAD_CACHE_BYTES` |default: 33554432 |Bound
|
||||
on the total amount of bytes allocated to thread caches. This bound is
|
||||
not strict, so it is possible for the cache to go over this bound in
|
||||
certain circumstances. This value defaults to 32MB. For applications
|
||||
with many threads, this may not be a large enough cache, which can
|
||||
affect performance. If you suspect your application is not scaling to
|
||||
many threads due to lock contention in TCMalloc, you can try
|
||||
increasing this value. This may improve performance, at a cost of
|
||||
extra memory use by TCMalloc. See link:#Garbage_Collection[Garbage
|
||||
Collection] for more details.
|
||||
|
||||
|`TCMALLOC_AGGRESSIVE_DECOMMIT` | default: false |Enables "aggressive
|
||||
decommit mode", which makes tcmalloc return all free spans to the
|
||||
kernel. This reduces total physical memory usage at cost of some
|
||||
performance (about 2% cpu hit in Chrome was measured at some point).
|
||||
|
||||
|`TCMALLOC_OVERRIDE_PAGESIZE` | default: getpagesize() | Sometimes we
|
||||
run on systems with larger-than-anticipated hardware page
|
||||
size. I.e. ARMs (and soon RISC-Vs) can run in 64k pages mode. We detect
|
||||
actual page size at run-time and adjust our span sizings to do memory
|
||||
management syscalls with correct granularity. Larger pages generally
|
||||
cause somewhat higher memory fragmentation, so we have this parameter
|
||||
to be able to measure the fragmentation impact of larger pages.
|
||||
|
||||
|`TCMALLOC_HEAP_LIMIT_MB` | default: No limit | Sets limit on total
|
||||
size of page heap (in-use spans and "free but not returned"
|
||||
spans). When tcmalloc hits this limit it tries to return some free
|
||||
spans to kernel. And if that isn't enough to keep page heap size under
|
||||
limit it OOMs. "abseil tcmalloc" has equivalent "hard limit".
|
||||
|
||||
|===
|
||||
|
||||
Advanced "tweaking" flags, that control more precisely how tcmalloc
|
||||
tries to allocate memory from the kernel.
|
||||
|
||||
[cols=",,",]
|
||||
|===
|
||||
|
||||
|`TCMALLOC_SKIP_MMAP` |default: false |If true, do not try to use
|
||||
`+mmap+` to obtain memory from the kernel.
|
||||
|
||||
|`TCMALLOC_SKIP_SBRK` |default: false |If true, do not try to use
|
||||
`+sbrk+` to obtain memory from the kernel.
|
||||
|
||||
|`TCMALLOC_MEMFS_MALLOC_PATH` |default: "" |If set, specify a path
|
||||
where hugetlbfs or tmpfs is mounted. This may allow for speedier
|
||||
allocations.
|
||||
|
||||
|`TCMALLOC_MEMFS_LIMIT_MB` |default: 0 |Limit total memfs allocation
|
||||
size to specified number of MB. 0 means "no limit".
|
||||
|
||||
|`TCMALLOC_MEMFS_ABORT_ON_FAIL` |default: false |If true, abort()
|
||||
whenever memfs_malloc fails to satisfy an allocation.
|
||||
|
||||
|`TCMALLOC_MEMFS_IGNORE_MMAP_FAIL` |default: false |If true, ignore
|
||||
failures from mmap.
|
||||
|
||||
|`TCMALLOC_MEMFS_MAP_PRIVATE` |default: false |If true, use
|
||||
MAP_PRIVATE when mapping via memfs, not MAP_SHARED.
|
||||
|
||||
|`TCMALLOC_MEMFS_DISABLE_FALLBACK` |default: false |If true, OOM on
|
||||
failing to allocate from memfs instead of falling back to anonymous
|
||||
memory (sbrk/mmap).
|
||||
|
||||
|===
|
||||
|
||||
== [#compiletime]#Modifying Behavior In Code#
|
||||
|
||||
The `+MallocExtension+` class, in `+malloc_extension.h+`, provides a few
|
||||
knobs that you can tweak in your program, to affect tcmalloc's behavior.
|
||||
|
||||
=== Releasing Memory Back to the System
|
||||
|
||||
By default, tcmalloc will release no-longer-used memory back to the
|
||||
kernel gradually, over time. The link:#runtime[tcmalloc_release_rate]
|
||||
flag controls how quickly this happens. You can also force a release at
|
||||
a given point in the program execution like so:
|
||||
|
||||
....
|
||||
MallocExtension::instance()->ReleaseFreeMemory();
|
||||
....
|
||||
|
||||
You can also call `+SetMemoryReleaseRate()+` to change the
|
||||
`+tcmalloc_release_rate+` value at runtime, or `+GetMemoryReleaseRate+`
|
||||
to see what the current release rate is.
|
||||
|
||||
=== Memory Introspection
|
||||
|
||||
There are several routines for getting a human-readable form of the
|
||||
current memory usage:
|
||||
|
||||
....
|
||||
MallocExtension::instance()->GetStats(buffer, buffer_length);
|
||||
MallocExtension::instance()->GetHeapSample(&string);
|
||||
MallocExtension::instance()->GetHeapGrowthStacks(&string);
|
||||
....
|
||||
|
||||
The last two create files in the same format as the heap-profiler, and
|
||||
can be passed as data files to pprof. The first is human-readable and is
|
||||
meant for debugging.
|
||||
|
||||
=== Generic Tcmalloc Status
|
||||
|
||||
TCMalloc has support for setting and retrieving arbitrary 'properties':
|
||||
|
||||
....
|
||||
MallocExtension::instance()->SetNumericProperty(property_name, value);
|
||||
MallocExtension::instance()->GetNumericProperty(property_name, &value);
|
||||
....
|
||||
|
||||
It is possible for an application to set and get these properties, but
|
||||
the most useful is when a library sets the properties so the application
|
||||
can read them. Here are the properties TCMalloc defines; you can access
|
||||
them with a call like
|
||||
`MallocExtension::instance()->GetNumericProperty("generic.heap_size", &value);`:
|
||||
|
||||
[cols=",",]
|
||||
|===
|
||||
|
||||
|`generic.current_allocated_bytes` |Number of bytes used by the
|
||||
application. This will not typically match the memory use reported by
|
||||
the OS, because it does not include TCMalloc overhead or memory
|
||||
fragmentation.
|
||||
|
||||
|`generic.heap_size` |Bytes of system memory reserved by TCMalloc.
|
||||
|
||||
|`tcmalloc.pageheap_free_bytes` |Number of bytes in free, mapped pages
|
||||
in page heap. These bytes can be used to fulfill allocation requests.
|
||||
They always count towards virtual memory usage, and unless the
|
||||
underlying memory is swapped out by the OS, they also count towards
|
||||
physical memory usage.
|
||||
|
||||
|`tcmalloc.pageheap_unmapped_bytes` |Number of bytes in free, unmapped
|
||||
pages in page heap. These are bytes that have been released back to the
|
||||
OS, possibly by one of the MallocExtension "Release" calls. They can be
|
||||
used to fulfill allocation requests, but typically incur a page fault.
|
||||
They always count towards virtual memory usage, and depending on the OS,
|
||||
typically do not count towards physical memory usage.
|
||||
|
||||
|`tcmalloc.slack_bytes` |Sum of pageheap_free_bytes and
|
||||
pageheap_unmapped_bytes. Provided for backwards compatibility only. Do
|
||||
not use.
|
||||
|
||||
|`tcmalloc.max_total_thread_cache_bytes` |A limit to how much memory
|
||||
TCMalloc dedicates for small objects. Higher numbers trade off more
|
||||
memory use for -- in some situations -- improved efficiency.
|
||||
|
||||
|`tcmalloc.current_total_thread_cache_bytes` |A measure of some of the
|
||||
memory TCMalloc is using (for small objects).
|
||||
|
||||
|`tcmalloc.min_per_thread_cache_bytes` |A lower limit to how much
|
||||
memory TCMalloc dedicates for small objects per thread. Note that this
|
||||
property only shows effect if per-thread cache calculated using
|
||||
tcmalloc.max_total_thread_cache_bytes ended up being less than
|
||||
tcmalloc.min_per_thread_cache_bytes.
|
||||
|
||||
|===
|
||||
|
||||
=== [#caveats]#Caveats#
|
||||
|
||||
TCMalloc may be somewhat more memory hungry than other mallocs, (but
|
||||
tends not to have the huge blowups that can happen with other mallocs).
|
||||
In particular, at startup TCMalloc allocates approximately 240KB of
|
||||
internal memory.
|
||||
|
||||
Don't try to load TCMalloc into a running binary (e.g., using JNI in
|
||||
Java programs). The binary will have allocated some objects using the
|
||||
system malloc, and may try to pass them to TCMalloc for deallocation.
|
||||
TCMalloc will not be able to handle such objects.
|
||||
|
||||
'''''
|
||||
|
||||
Original author: Sanjay Ghemawat +
|
||||
Last updated by: Aliaksei Kandratsenka (Dec 2024)
|
@ -1,788 +0,0 @@
|
||||
<!doctype html public "-//w3c//dtd html 4.01 transitional//en">
|
||||
<!-- $Id: $ -->
|
||||
<html>
|
||||
<head>
|
||||
<title>TCMalloc : Thread-Caching Malloc</title>
|
||||
<link rel="stylesheet" href="designstyle.css">
|
||||
<style type="text/css">
|
||||
em {
|
||||
color: red;
|
||||
font-style: normal;
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
|
||||
<h1>TCMalloc : Thread-Caching Malloc</h1>
|
||||
|
||||
<address>Sanjay Ghemawat</address>
|
||||
|
||||
<h2><A name=motivation>Motivation</A></h2>
|
||||
|
||||
<p>TCMalloc is faster than the glibc 2.3 malloc (available as a
|
||||
separate library called ptmalloc2) and other mallocs that I have
|
||||
tested. ptmalloc2 takes approximately 300 nanoseconds to execute a
|
||||
malloc/free pair on a 2.8 GHz P4 (for small objects). The TCMalloc
|
||||
implementation takes approximately 50 nanoseconds for the same
|
||||
operation pair. Speed is important for a malloc implementation
|
||||
because if malloc is not fast enough, application writers are inclined
|
||||
to write their own custom free lists on top of malloc. This can lead
|
||||
to extra complexity, and more memory usage unless the application
|
||||
writer is very careful to appropriately size the free lists and
|
||||
scavenge idle objects out of the free list.</p>
|
||||
|
||||
<p>TCMalloc also reduces lock contention for multi-threaded programs.
|
||||
For small objects, there is virtually zero contention. For large
|
||||
objects, TCMalloc tries to use fine grained and efficient spinlocks.
|
||||
ptmalloc2 also reduces lock contention by using per-thread arenas but
|
||||
there is a big problem with ptmalloc2's use of per-thread arenas. In
|
||||
ptmalloc2 memory can never move from one arena to another. This can
|
||||
lead to huge amounts of wasted space. For example, in one Google
|
||||
application, the first phase would allocate approximately 300MB of
|
||||
memory for its URL canonicalization data structures. When the first
|
||||
phase finished, a second phase would be started in the same address
|
||||
space. If this second phase was assigned a different arena than the
|
||||
one used by the first phase, this phase would not reuse any of the
|
||||
memory left after the first phase and would add another 300MB to the
|
||||
address space. Similar memory blowup problems were also noticed in
|
||||
other applications.</p>
|
||||
|
||||
<p>Another benefit of TCMalloc is space-efficient representation of
|
||||
small objects. For example, N 8-byte objects can be allocated while
|
||||
using space approximately <code>8N * 1.01</code> bytes. I.e., a
|
||||
one-percent space overhead. ptmalloc2 uses a four-byte header for
|
||||
each object and (I think) rounds up the size to a multiple of 8 bytes
|
||||
and ends up using <code>16N</code> bytes.</p>
|
||||
|
||||
|
||||
<h2><A NAME="Usage">Usage</A></h2>
|
||||
|
||||
<p>To use TCMalloc, just link TCMalloc into your application via the
|
||||
"-ltcmalloc" linker flag.</p>
|
||||
|
||||
<p>You can use TCMalloc in applications you didn't compile yourself,
|
||||
by using LD_PRELOAD:</p>
|
||||
<pre>
|
||||
$ LD_PRELOAD="/usr/lib/libtcmalloc.so" <binary>
|
||||
</pre>
|
||||
<p>LD_PRELOAD is tricky, and we don't necessarily recommend this mode
|
||||
of usage.</p>
|
||||
|
||||
<p>TCMalloc includes a <A HREF="heap_checker.html">heap checker</A>
|
||||
and <A HREF="heapprofile.html">heap profiler</A> as well.</p>
|
||||
|
||||
<p>If you'd rather link in a version of TCMalloc that does not include
|
||||
the heap profiler and checker (perhaps to reduce binary size for a
|
||||
static binary), you can link in <code>libtcmalloc_minimal</code>
|
||||
instead.</p>
|
||||
|
||||
|
||||
<h2><A NAME="Overview">Overview</A></h2>
|
||||
|
||||
<p>TCMalloc assigns each thread a thread-local cache. Small
|
||||
allocations are satisfied from the thread-local cache. Objects are
|
||||
moved from central data structures into a thread-local cache as
|
||||
needed, and periodic garbage collections are used to migrate memory
|
||||
back from a thread-local cache into the central data structures.</p>
|
||||
<center><img src="overview.gif"></center>
|
||||
|
||||
<p>TCMalloc treats objects with size <= 256K ("small" objects)
|
||||
differently from larger objects. Large objects are allocated directly
|
||||
from the central heap using a page-level allocator (a page is a 8K
|
||||
aligned region of memory). I.e., a large object is always
|
||||
page-aligned and occupies an integral number of pages.</p>
|
||||
|
||||
<p>A run of pages can be carved up into a sequence of small objects,
|
||||
each equally sized. For example a run of one page (4K) can be carved
|
||||
up into 32 objects of size 128 bytes each.</p>
|
||||
|
||||
|
||||
<h2><A NAME="Small_Object_Allocation">Small Object Allocation</A></h2>
|
||||
|
||||
<p>Each small object size maps to one of approximately 88 allocatable
|
||||
size-classes. For example, all allocations in the range 961 to 1024
|
||||
bytes are rounded up to 1024. The size-classes are spaced so that
|
||||
small sizes are separated by 8 bytes, larger sizes by 16 bytes, even
|
||||
larger sizes by 32 bytes, and so forth. The maximal spacing is
|
||||
controlled so that not too much space is wasted when an allocation
|
||||
request falls just past the end of a size class and has to be rounded
|
||||
up to the next class.</p>
|
||||
|
||||
<p>A thread cache contains a singly linked list of free objects per
|
||||
size-class.</p>
|
||||
<center><img src="threadheap.gif"></center>
|
||||
|
||||
<p>When allocating a small object: (1) We map its size to the
|
||||
corresponding size-class. (2) Look in the corresponding free list in
|
||||
the thread cache for the current thread. (3) If the free list is not
|
||||
empty, we remove the first object from the list and return it. When
|
||||
following this fast path, TCMalloc acquires no locks at all. This
|
||||
helps speed-up allocation significantly because a lock/unlock pair
|
||||
takes approximately 100 nanoseconds on a 2.8 GHz Xeon.</p>
|
||||
|
||||
<p>If the free list is empty: (1) We fetch a bunch of objects from a
|
||||
central free list for this size-class (the central free list is shared
|
||||
by all threads). (2) Place them in the thread-local free list. (3)
|
||||
Return one of the newly fetched objects to the applications.</p>
|
||||
|
||||
<p>If the central free list is also empty: (1) We allocate a run of
|
||||
pages from the central page allocator. (2) Split the run into a set
|
||||
of objects of this size-class. (3) Place the new objects on the
|
||||
central free list. (4) As before, move some of these objects to the
|
||||
thread-local free list.</p>
|
||||
|
||||
<h3><A NAME="Sizing_Thread_Cache_Free_Lists">
|
||||
Sizing Thread Cache Free Lists</A></h3>
|
||||
|
||||
<p>It is important to size the thread cache free lists correctly. If
|
||||
the free list is too small, we'll need to go to the central free list
|
||||
too often. If the free list is too big, we'll waste memory as objects
|
||||
sit idle in the free list.</p>
|
||||
|
||||
<p>Note that the thread caches are just as important for deallocation
|
||||
as they are for allocation. Without a cache, each deallocation would
|
||||
require moving the memory to the central free list. Also, some threads
|
||||
have asymmetric alloc/free behavior (e.g. producer and consumer threads),
|
||||
so sizing the free list correctly gets trickier.</p>
|
||||
|
||||
<p>To size the free lists appropriately, we use a slow-start algorithm
|
||||
to determine the maximum length of each individual free list. As the
|
||||
free list is used more frequently, its maximum length grows. However,
|
||||
if a free list is used more for deallocation than allocation, its
|
||||
maximum length will grow only up to a point where the whole list can
|
||||
be efficiently moved to the central free list at once.</p>
|
||||
|
||||
<p>The psuedo-code below illustrates this slow-start algorithm. Note
|
||||
that <code>num_objects_to_move</code> is specific to each size class.
|
||||
By moving a list of objects with a well-known length, the central
|
||||
cache can efficiently pass these lists between thread caches. If
|
||||
a thread cache wants fewer than <code>num_objects_to_move</code>,
|
||||
the operation on the central free list has linear time complexity.
|
||||
The downside of always using <code>num_objects_to_move</code> as
|
||||
the number of objects to transfer to and from the central cache is
|
||||
that it wastes memory in threads that don't need all of those objects.
|
||||
|
||||
<pre>
|
||||
Start each freelist max_length at 1.
|
||||
|
||||
Allocation
|
||||
if freelist empty {
|
||||
fetch min(max_length, num_objects_to_move) from central list;
|
||||
if max_length < num_objects_to_move { // slow-start
|
||||
max_length++;
|
||||
} else {
|
||||
max_length += num_objects_to_move;
|
||||
}
|
||||
}
|
||||
|
||||
Deallocation
|
||||
if length > max_length {
|
||||
// Don't try to release num_objects_to_move if we don't have that many.
|
||||
release min(max_length, num_objects_to_move) objects to central list
|
||||
if max_length < num_objects_to_move {
|
||||
// Slow-start up to num_objects_to_move.
|
||||
max_length++;
|
||||
} else if max_length > num_objects_to_move {
|
||||
// If we consistently go over max_length, shrink max_length.
|
||||
overages++;
|
||||
if overages > kMaxOverages {
|
||||
max_length -= num_objects_to_move;
|
||||
overages = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
</pre>
|
||||
|
||||
See also the section on <a href="#Garbage_Collection">Garbage Collection</a>
|
||||
to see how it affects the <code>max_length</code>.
|
||||
|
||||
<h2><A NAME="Medium_Object_Allocation">Medium Object Allocation</A></h2>
|
||||
|
||||
<p>A medium object size (256K ≤ size ≤ 1MB) is rounded up to a page
|
||||
size (8K) and is handled by a central page heap. The central page heap
|
||||
includes an array of 128 free lists. The <code>k</code>th entry is a
|
||||
free list of runs that consist of <code>k + 1</code> pages:</p>
|
||||
<center><img src="pageheap.gif"></center>
|
||||
|
||||
<p>An allocation for <code>k</code> pages is satisfied by looking in
|
||||
the <code>k</code>th free list. If that free list is empty, we look
|
||||
in the next free list, and so forth. If no medium-object free list
|
||||
can satisfy the allocation, the allocation is treated as a large object.
|
||||
|
||||
|
||||
<h2><A NAME="Large_Object_Allocation">Large Object Allocation</A></h2>
|
||||
|
||||
Allocations of 1MB or more are considered large allocations. Spans
|
||||
of free memory which can satisfy these allocations are tracked in
|
||||
a red-black tree sorted by size. Allocations follow the <em>best-fit</em>
|
||||
algorithm: the tree is searched to find the smallest span of free
|
||||
space which is larger than the requested allocation. The allocation
|
||||
is carved out of that span, and the remaining space is reinserted
|
||||
either into the large object tree or possibly into one of the smaller
|
||||
free-lists as appropriate.
|
||||
|
||||
If no span of free memory is located that can fit the requested
|
||||
allocation, we fetch memory from the system (using <code>sbrk</code>,
|
||||
<code>mmap</code>, or by mapping in portions of
|
||||
<code>/dev/mem</code>).</p>
|
||||
|
||||
<p>If an allocation for <code>k</code> pages is satisfied by a run
|
||||
of pages of length > <code>k</code>, the remainder of the
|
||||
run is re-inserted back into the appropriate free list in the
|
||||
page heap.</p>
|
||||
|
||||
|
||||
<h2><A NAME="Spans">Spans</A></h2>
|
||||
|
||||
<p>The heap managed by TCMalloc consists of a set of pages. A run of
|
||||
contiguous pages is represented by a <code>Span</code> object. A span
|
||||
can either be <em>allocated</em>, or <em>free</em>. If free, the span
|
||||
is one of the entries in a page heap linked-list. If allocated, it is
|
||||
either a large object that has been handed off to the application, or
|
||||
a run of pages that have been split up into a sequence of small
|
||||
objects. If split into small objects, the size-class of the objects
|
||||
is recorded in the span.</p>
|
||||
|
||||
<p>A central array indexed by page number can be used to find the span to
|
||||
which a page belongs. For example, span <em>a</em> below occupies 2
|
||||
pages, span <em>b</em> occupies 1 page, span <em>c</em> occupies 5
|
||||
pages and span <em>d</em> occupies 3 pages.</p>
|
||||
<center><img src="spanmap.gif"></center>
|
||||
|
||||
<p>In a 32-bit address space, the central array is represented by a
|
||||
2-level radix tree where the root contains 32 entries and each leaf
|
||||
contains 2^14 entries (a 32-bit address space has 2^19 8K pages, and
|
||||
the first level of tree divides the 2^19 pages by 2^5). This leads to
|
||||
a starting memory usage of 64KB of space (2^14*4 bytes) for the
|
||||
central array, which seems acceptable.</p>
|
||||
|
||||
<p>On 64-bit machines, we use a 3-level radix tree.</p>
|
||||
|
||||
|
||||
<h2><A NAME="Deallocation">Deallocation</A></h2>
|
||||
|
||||
<p>When an object is deallocated, we compute its page number and look
|
||||
it up in the central array to find the corresponding span object. The
|
||||
span tells us whether or not the object is small, and its size-class
|
||||
if it is small. If the object is small, we insert it into the
|
||||
appropriate free list in the current thread's thread cache. If the
|
||||
thread cache now exceeds a predetermined size (2MB by default), we run
|
||||
a garbage collector that moves unused objects from the thread cache
|
||||
into central free lists.</p>
|
||||
|
||||
<p>If the object is large, the span tells us the range of pages covered
|
||||
by the object. Suppose this range is <code>[p,q]</code>. We also
|
||||
lookup the spans for pages <code>p-1</code> and <code>q+1</code>. If
|
||||
either of these neighboring spans are free, we coalesce them with the
|
||||
<code>[p,q]</code> span. The resulting span is inserted into the
|
||||
appropriate free list in the page heap.</p>
|
||||
|
||||
|
||||
<h2>Central Free Lists for Small Objects</h2>
|
||||
|
||||
<p>As mentioned before, we keep a central free list for each
|
||||
size-class. Each central free list is organized as a two-level data
|
||||
structure: a set of spans, and a linked list of free objects per
|
||||
span.</p>
|
||||
|
||||
<p>An object is allocated from a central free list by removing the
|
||||
first entry from the linked list of some span. (If all spans have
|
||||
empty linked lists, a suitably sized span is first allocated from the
|
||||
central page heap.)</p>
|
||||
|
||||
<p>An object is returned to a central free list by adding it to the
|
||||
linked list of its containing span. If the linked list length now
|
||||
equals the total number of small objects in the span, this span is now
|
||||
completely free and is returned to the page heap.</p>
|
||||
|
||||
|
||||
<h2><A NAME="Garbage_Collection">Garbage Collection of Thread Caches</A></h2>
|
||||
|
||||
<p>Garbage collecting objects from a thread cache keeps the size of
|
||||
the cache under control and returns unused objects to the central free
|
||||
lists. Some threads need large caches to perform well while others
|
||||
can get by with little or no cache at all. When a thread cache goes
|
||||
over its <code>max_size</code>, garbage collection kicks in and then the
|
||||
thread competes with the other threads for a larger cache.</p>
|
||||
|
||||
<p>Garbage collection is run only during a deallocation. We walk over
|
||||
all free lists in the cache and move some number of objects from the
|
||||
free list to the corresponding central list.</p>
|
||||
|
||||
<p>The number of objects to be moved from a free list is determined
|
||||
using a per-list low-water-mark <code>L</code>. <code>L</code>
|
||||
records the minimum length of the list since the last garbage
|
||||
collection. Note that we could have shortened the list by
|
||||
<code>L</code> objects at the last garbage collection without
|
||||
requiring any extra accesses to the central list. We use this past
|
||||
history as a predictor of future accesses and move <code>L/2</code>
|
||||
objects from the thread cache free list to the corresponding central
|
||||
free list. This algorithm has the nice property that if a thread
|
||||
stops using a particular size, all objects of that size will quickly
|
||||
move from the thread cache to the central free list where they can be
|
||||
used by other threads.</p>
|
||||
|
||||
<p>If a thread consistently deallocates more objects of a certain size
|
||||
than it allocates, this <code>L/2</code> behavior will cause at least
|
||||
<code>L/2</code> objects to always sit in the free list. To avoid
|
||||
wasting memory this way, we shrink the maximum length of the freelist
|
||||
to converge on <code>num_objects_to_move</code> (see also
|
||||
<a href="#Sizing_Thread_Cache_Free_Lists">Sizing Thread Cache Free Lists</a>).
|
||||
|
||||
<pre>
|
||||
Garbage Collection
|
||||
if (L != 0 && max_length > num_objects_to_move) {
|
||||
max_length = max(max_length - num_objects_to_move, num_objects_to_move)
|
||||
}
|
||||
</pre>
|
||||
|
||||
<p>The fact that the thread cache went over its <code>max_size</code> is
|
||||
an indication that the thread would benefit from a larger cache. Simply
|
||||
increasing <code>max_size</code> would use an inordinate amount of memory
|
||||
in programs that have lots of active threads. Developers can bound the
|
||||
memory used with the flag --tcmalloc_max_total_thread_cache_bytes.</p>
|
||||
|
||||
<p>Each thread cache starts with a small <code>max_size</code>
|
||||
(e.g. 64KB) so that idle threads won't pre-allocate memory they don't
|
||||
need. Each time the cache runs a garbage collection, it will also try
|
||||
to grow its <code>max_size</code>. If the sum of the thread cache
|
||||
sizes is less than --tcmalloc_max_total_thread_cache_bytes,
|
||||
<code>max_size</code> grows easily. If not, thread cache 1 will try
|
||||
to steal from thread cache 2 (picked round-robin) by decreasing thread
|
||||
cache 2's <code>max_size</code>. In this way, threads that are more
|
||||
active will steal memory from other threads more often than they
|
||||
have memory stolen from themselves. Mostly idle threads end up with
|
||||
small caches and active threads end up with big caches. Note that
|
||||
this stealing can cause the sum of the thread cache sizes to be
|
||||
greater than --tcmalloc_max_total_thread_cache_bytes until thread
|
||||
cache 2 deallocates some memory to trigger a garbage collection.</p>
|
||||
|
||||
<h2><A NAME="performance">Performance Notes</A></h2>
|
||||
|
||||
<h3>PTMalloc2 unittest</h3>
|
||||
|
||||
<p>The PTMalloc2 package (now part of glibc) contains a unittest
|
||||
program <code>t-test1.c</code>. This forks a number of threads and
|
||||
performs a series of allocations and deallocations in each thread; the
|
||||
threads do not communicate other than by synchronization in the memory
|
||||
allocator.</p>
|
||||
|
||||
<p><code>t-test1</code> (included in
|
||||
<code>tests/tcmalloc/</code>, and compiled as
|
||||
<code>ptmalloc_unittest1</code>) was run with varying numbers of
|
||||
threads (1-20) and maximum allocation sizes (64 bytes -
|
||||
32Kbytes). These tests were run on a 2.4GHz dual Xeon system with
|
||||
hyper-threading enabled, using Linux glibc-2.3.2 from RedHat 9, with
|
||||
one million operations per thread in each test. In each case, the test
|
||||
was run once normally, and once with
|
||||
<code>LD_PRELOAD=libtcmalloc.so</code>.
|
||||
|
||||
<p>The graphs below show the performance of TCMalloc vs PTMalloc2 for
|
||||
several different metrics. Firstly, total operations (millions) per
|
||||
elapsed second vs max allocation size, for varying numbers of
|
||||
threads. The raw data used to generate these graphs (the output of the
|
||||
<code>time</code> utility) is available in
|
||||
<code>t-test1.times.txt</code>.</p>
|
||||
|
||||
<table>
|
||||
<tr>
|
||||
<td><img src="tcmalloc-opspersec.vs.size.1.threads.png"></td>
|
||||
<td><img src="tcmalloc-opspersec.vs.size.2.threads.png"></td>
|
||||
<td><img src="tcmalloc-opspersec.vs.size.3.threads.png"></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td><img src="tcmalloc-opspersec.vs.size.4.threads.png"></td>
|
||||
<td><img src="tcmalloc-opspersec.vs.size.5.threads.png"></td>
|
||||
<td><img src="tcmalloc-opspersec.vs.size.8.threads.png"></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td><img src="tcmalloc-opspersec.vs.size.12.threads.png"></td>
|
||||
<td><img src="tcmalloc-opspersec.vs.size.16.threads.png"></td>
|
||||
<td><img src="tcmalloc-opspersec.vs.size.20.threads.png"></td>
|
||||
</tr>
|
||||
</table>
|
||||
|
||||
|
||||
<ul>
|
||||
<li> TCMalloc is much more consistently scalable than PTMalloc2 - for
|
||||
all thread counts >1 it achieves ~7-9 million ops/sec for small
|
||||
allocations, falling to ~2 million ops/sec for larger
|
||||
allocations. The single-thread case is an obvious outlier,
|
||||
since it is only able to keep a single processor busy and hence
|
||||
can achieve fewer ops/sec. PTMalloc2 has a much higher variance
|
||||
on operations/sec - peaking somewhere around 4 million ops/sec
|
||||
for small allocations and falling to <1 million ops/sec for
|
||||
larger allocations.
|
||||
|
||||
<li> TCMalloc is faster than PTMalloc2 in the vast majority of
|
||||
cases, and particularly for small allocations. Contention
|
||||
between threads is less of a problem in TCMalloc.
|
||||
|
||||
<li> TCMalloc's performance drops off as the allocation size
|
||||
increases. This is because the per-thread cache is
|
||||
garbage-collected when it hits a threshold (defaulting to
|
||||
2MB). With larger allocation sizes, fewer objects can be stored
|
||||
in the cache before it is garbage-collected.
|
||||
|
||||
<li> There is a noticeable drop in TCMalloc's performance at ~32K
|
||||
maximum allocation size; at larger sizes performance drops less
|
||||
quickly. This is due to the 32K maximum size of objects in the
|
||||
per-thread caches; for objects larger than this TCMalloc
|
||||
allocates from the central page heap.
|
||||
</ul>
|
||||
|
||||
<p>Next, operations (millions) per second of CPU time vs number of
|
||||
threads, for max allocation size 64 bytes - 128 Kbytes.</p>
|
||||
|
||||
<table>
|
||||
<tr>
|
||||
<td><img src="tcmalloc-opspercpusec.vs.threads.64.bytes.png"></td>
|
||||
<td><img src="tcmalloc-opspercpusec.vs.threads.256.bytes.png"></td>
|
||||
<td><img src="tcmalloc-opspercpusec.vs.threads.1024.bytes.png"></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td><img src="tcmalloc-opspercpusec.vs.threads.4096.bytes.png"></td>
|
||||
<td><img src="tcmalloc-opspercpusec.vs.threads.8192.bytes.png"></td>
|
||||
<td><img src="tcmalloc-opspercpusec.vs.threads.16384.bytes.png"></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td><img src="tcmalloc-opspercpusec.vs.threads.32768.bytes.png"></td>
|
||||
<td><img src="tcmalloc-opspercpusec.vs.threads.65536.bytes.png"></td>
|
||||
<td><img src="tcmalloc-opspercpusec.vs.threads.131072.bytes.png"></td>
|
||||
</tr>
|
||||
</table>
|
||||
|
||||
<p>Here we see again that TCMalloc is both more consistent and more
|
||||
efficient than PTMalloc2. For max allocation sizes <32K, TCMalloc
|
||||
typically achieves ~2-2.5 million ops per second of CPU time with a
|
||||
large number of threads, whereas PTMalloc achieves generally 0.5-1
|
||||
million ops per second of CPU time, with a lot of cases achieving much
|
||||
less than this figure. Above 32K max allocation size, TCMalloc drops
|
||||
to 1-1.5 million ops per second of CPU time, and PTMalloc drops almost
|
||||
to zero for large numbers of threads (i.e. with PTMalloc, lots of CPU
|
||||
time is being burned spinning waiting for locks in the heavily
|
||||
multi-threaded case).</p>
|
||||
|
||||
|
||||
<H2><A NAME="runtime">Modifying Runtime Behavior</A></H2>
|
||||
|
||||
<p>You can more finely control the behavior of the tcmalloc via
|
||||
environment variables.</p>
|
||||
|
||||
<p>Generally useful flags:</p>
|
||||
|
||||
<table frame=box rules=sides cellpadding=5 width=100%>
|
||||
|
||||
<tr valign=top>
|
||||
<td><code>TCMALLOC_SAMPLE_PARAMETER</code></td>
|
||||
<td>default: 0</td>
|
||||
<td>
|
||||
The approximate gap between sampling actions. That is, we
|
||||
take one sample approximately once every
|
||||
<code>tcmalloc_sample_parameter</code> bytes of allocation.
|
||||
This sampled heap information is available via
|
||||
<code>MallocExtension::GetHeapSample()</code> or
|
||||
<code>MallocExtension::ReadStackTraces()</code>. A reasonable
|
||||
value is 524288.
|
||||
</td>
|
||||
</tr>
|
||||
|
||||
<tr valign=top>
|
||||
<td><code>TCMALLOC_RELEASE_RATE</code></td>
|
||||
<td>default: 1.0</td>
|
||||
<td>
|
||||
Rate at which we release unused memory to the system, via
|
||||
<code>madvise(MADV_DONTNEED)</code>, on systems that support
|
||||
it. Zero means we never release memory back to the system.
|
||||
Increase this flag to return memory faster; decrease it
|
||||
to return memory slower. Reasonable rates are in the
|
||||
range [0,10].
|
||||
</td>
|
||||
</tr>
|
||||
|
||||
<tr valign=top>
|
||||
<td><code>TCMALLOC_LARGE_ALLOC_REPORT_THRESHOLD</code></td>
|
||||
<td>default: 1073741824</td>
|
||||
<td>
|
||||
Allocations larger than this value cause a stack trace to be
|
||||
dumped to stderr. The threshold for dumping stack traces is
|
||||
increased by a factor of 1.125 every time we print a message so
|
||||
that the threshold automatically goes up by a factor of ~1000
|
||||
every 60 messages. This bounds the amount of extra logging
|
||||
generated by this flag. Default value of this flag is very large
|
||||
and therefore you should see no extra logging unless the flag is
|
||||
overridden.
|
||||
</td>
|
||||
</tr>
|
||||
|
||||
<tr valign=top>
|
||||
<td><code>TCMALLOC_MAX_TOTAL_THREAD_CACHE_BYTES</code></td>
|
||||
<td>default: 33554432</td>
|
||||
<td>
|
||||
Bound on the total amount of bytes allocated to thread caches. This
|
||||
bound is not strict, so it is possible for the cache to go over this
|
||||
bound in certain circumstances. This value defaults to 32MB. For
|
||||
applications with many threads, this may not be a large enough cache,
|
||||
which can affect performance. If you suspect your application is not
|
||||
scaling to many threads due to lock contention in TCMalloc, you can
|
||||
try increasing this value. This may improve performance, at a cost
|
||||
of extra memory use by TCMalloc. See <a href="#Garbage_Collection">
|
||||
Garbage Collection</a> for more details.
|
||||
</td>
|
||||
</tr>
|
||||
|
||||
</table>
|
||||
|
||||
<p>Advanced "tweaking" flags, that control more precisely how tcmalloc
|
||||
tries to allocate memory from the kernel.</p>
|
||||
|
||||
<table frame=box rules=sides cellpadding=5 width=100%>
|
||||
|
||||
<tr valign=top>
|
||||
<td><code>TCMALLOC_SKIP_MMAP</code></td>
|
||||
<td>default: false</td>
|
||||
<td>
|
||||
If true, do not try to use <code>mmap</code> to obtain memory
|
||||
from the kernel.
|
||||
</td>
|
||||
</tr>
|
||||
|
||||
<tr valign=top>
|
||||
<td><code>TCMALLOC_SKIP_SBRK</code></td>
|
||||
<td>default: false</td>
|
||||
<td>
|
||||
If true, do not try to use <code>sbrk</code> to obtain memory
|
||||
from the kernel.
|
||||
</td>
|
||||
</tr>
|
||||
|
||||
<tr valign=top>
|
||||
<td><code>TCMALLOC_DEVMEM_START</code></td>
|
||||
<td>default: 0</td>
|
||||
<td>
|
||||
Physical memory starting location in MB for <code>/dev/mem</code>
|
||||
allocation. Setting this to 0 disables <code>/dev/mem</code>
|
||||
allocation.
|
||||
</td>
|
||||
</tr>
|
||||
|
||||
<tr valign=top>
|
||||
<td><code>TCMALLOC_DEVMEM_LIMIT</code></td>
|
||||
<td>default: 0</td>
|
||||
<td>
|
||||
Physical memory limit location in MB for <code>/dev/mem</code>
|
||||
allocation. Setting this to 0 means no limit.
|
||||
</td>
|
||||
</tr>
|
||||
|
||||
<tr valign=top>
|
||||
<td><code>TCMALLOC_DEVMEM_DEVICE</code></td>
|
||||
<td>default: /dev/mem</td>
|
||||
<td>
|
||||
Device to use for allocating unmanaged memory.
|
||||
</td>
|
||||
</tr>
|
||||
|
||||
<tr valign=top>
|
||||
<td><code>TCMALLOC_MEMFS_MALLOC_PATH</code></td>
|
||||
<td>default: ""</td>
|
||||
<td>
|
||||
If set, specify a path where hugetlbfs or tmpfs is mounted.
|
||||
This may allow for speedier allocations.
|
||||
</td>
|
||||
</tr>
|
||||
|
||||
<tr valign=top>
|
||||
<td><code>TCMALLOC_MEMFS_LIMIT_MB</code></td>
|
||||
<td>default: 0</td>
|
||||
<td>
|
||||
Limit total memfs allocation size to specified number of MB.
|
||||
0 means "no limit".
|
||||
</td>
|
||||
</tr>
|
||||
|
||||
<tr valign=top>
|
||||
<td><code>TCMALLOC_MEMFS_ABORT_ON_FAIL</code></td>
|
||||
<td>default: false</td>
|
||||
<td>
|
||||
If true, abort() whenever memfs_malloc fails to satisfy an allocation.
|
||||
</td>
|
||||
</tr>
|
||||
|
||||
<tr valign=top>
|
||||
<td><code>TCMALLOC_MEMFS_IGNORE_MMAP_FAIL</code></td>
|
||||
<td>default: false</td>
|
||||
<td>
|
||||
If true, ignore failures from mmap.
|
||||
</td>
|
||||
</tr>
|
||||
|
||||
<tr valign=top>
|
||||
<td><code>TCMALLOC_MEMFS_MAP_PRIVATE</code></td>
|
||||
<td>default: false</td>
|
||||
<td>
|
||||
If true, use MAP_PRIVATE when mapping via memfs, not MAP_SHARED.
|
||||
</td>
|
||||
</tr>
|
||||
|
||||
</table>
|
||||
|
||||
|
||||
<H2><A NAME="compiletime">Modifying Behavior In Code</A></H2>
|
||||
|
||||
<p>The <code>MallocExtension</code> class, in
|
||||
<code>malloc_extension.h</code>, provides a few knobs that you can
|
||||
tweak in your program, to affect tcmalloc's behavior.</p>
|
||||
|
||||
<h3>Releasing Memory Back to the System</h3>
|
||||
|
||||
<p>By default, tcmalloc will release no-longer-used memory back to the
|
||||
kernel gradually, over time. The <a
|
||||
href="#runtime">tcmalloc_release_rate</a> flag controls how quickly
|
||||
this happens. You can also force a release at a given point in the
|
||||
program execution like so:</p>
|
||||
<pre>
|
||||
MallocExtension::instance()->ReleaseFreeMemory();
|
||||
</pre>
|
||||
|
||||
<p>You can also call <code>SetMemoryReleaseRate()</code> to change the
|
||||
<code>tcmalloc_release_rate</code> value at runtime, or
|
||||
<code>GetMemoryReleaseRate</code> to see what the current release rate
|
||||
is.</p>
|
||||
|
||||
<h3>Memory Introspection</h3>
|
||||
|
||||
<p>There are several routines for getting a human-readable form of the
|
||||
current memory usage:</p>
|
||||
<pre>
|
||||
MallocExtension::instance()->GetStats(buffer, buffer_length);
|
||||
MallocExtension::instance()->GetHeapSample(&string);
|
||||
MallocExtension::instance()->GetHeapGrowthStacks(&string);
|
||||
</pre>
|
||||
|
||||
<p>The last two create files in the same format as the heap-profiler,
|
||||
and can be passed as data files to pprof. The first is human-readable
|
||||
and is meant for debugging.</p>
|
||||
|
||||
<h3>Generic Tcmalloc Status</h3>
|
||||
|
||||
<p>TCMalloc has support for setting and retrieving arbitrary
|
||||
'properties':</p>
|
||||
<pre>
|
||||
MallocExtension::instance()->SetNumericProperty(property_name, value);
|
||||
MallocExtension::instance()->GetNumericProperty(property_name, &value);
|
||||
</pre>
|
||||
|
||||
<p>It is possible for an application to set and get these properties,
|
||||
but the most useful is when a library sets the properties so the
|
||||
application can read them. Here are the properties TCMalloc defines;
|
||||
you can access them with a call like
|
||||
<code>MallocExtension::instance()->GetNumericProperty("generic.heap_size",
|
||||
&value);</code>:</p>
|
||||
|
||||
<table frame=box rules=sides cellpadding=5 width=100%>
|
||||
|
||||
<tr valign=top>
|
||||
<td><code>generic.current_allocated_bytes</code></td>
|
||||
<td>
|
||||
Number of bytes used by the application. This will not typically
|
||||
match the memory use reported by the OS, because it does not
|
||||
include TCMalloc overhead or memory fragmentation.
|
||||
</td>
|
||||
</tr>
|
||||
|
||||
<tr valign=top>
|
||||
<td><code>generic.heap_size</code></td>
|
||||
<td>
|
||||
Bytes of system memory reserved by TCMalloc.
|
||||
</td>
|
||||
</tr>
|
||||
|
||||
<tr valign=top>
|
||||
<td><code>tcmalloc.pageheap_free_bytes</code></td>
|
||||
<td>
|
||||
Number of bytes in free, mapped pages in page heap. These bytes
|
||||
can be used to fulfill allocation requests. They always count
|
||||
towards virtual memory usage, and unless the underlying memory is
|
||||
swapped out by the OS, they also count towards physical memory
|
||||
usage.
|
||||
</td>
|
||||
</tr>
|
||||
|
||||
<tr valign=top>
|
||||
<td><code>tcmalloc.pageheap_unmapped_bytes</code></td>
|
||||
<td>
|
||||
Number of bytes in free, unmapped pages in page heap. These are
|
||||
bytes that have been released back to the OS, possibly by one of
|
||||
the MallocExtension "Release" calls. They can be used to fulfill
|
||||
allocation requests, but typically incur a page fault. They
|
||||
always count towards virtual memory usage, and depending on the
|
||||
OS, typically do not count towards physical memory usage.
|
||||
</td>
|
||||
</tr>
|
||||
|
||||
<tr valign=top>
|
||||
<td><code>tcmalloc.slack_bytes</code></td>
|
||||
<td>
|
||||
Sum of pageheap_free_bytes and pageheap_unmapped_bytes. Provided
|
||||
for backwards compatibility only. Do not use.
|
||||
</td>
|
||||
</tr>
|
||||
|
||||
<tr valign=top>
|
||||
<td><code>tcmalloc.max_total_thread_cache_bytes</code></td>
|
||||
<td>
|
||||
A limit to how much memory TCMalloc dedicates for small objects.
|
||||
Higher numbers trade off more memory use for -- in some situations
|
||||
-- improved efficiency.
|
||||
</td>
|
||||
</tr>
|
||||
|
||||
<tr valign=top>
|
||||
<td><code>tcmalloc.current_total_thread_cache_bytes</code></td>
|
||||
<td>
|
||||
A measure of some of the memory TCMalloc is using (for
|
||||
small objects).
|
||||
</td>
|
||||
</tr>
|
||||
|
||||
<tr valign=top>
|
||||
<td><code>tcmalloc.min_per_thread_cache_bytes</code></td>
|
||||
<td>
|
||||
A lower limit to how much memory TCMalloc dedicates for small objects per
|
||||
thread. Note that this property only shows effect if per-thread cache
|
||||
calculated using tcmalloc.max_total_thread_cache_bytes ended up being less
|
||||
than tcmalloc.min_per_thread_cache_bytes.
|
||||
</td>
|
||||
</tr>
|
||||
|
||||
</table>
|
||||
|
||||
<h2><A NAME="caveats">Caveats</A></h2>
|
||||
|
||||
<p>For some systems, TCMalloc may not work correctly with
|
||||
applications that aren't linked against <code>libpthread.so</code> (or
|
||||
the equivalent on your OS). It should work on Linux using glibc 2.3,
|
||||
but other OS/libc combinations have not been tested.</p>
|
||||
|
||||
<p>TCMalloc may be somewhat more memory hungry than other mallocs,
|
||||
(but tends not to have the huge blowups that can happen with other
|
||||
mallocs). In particular, at startup TCMalloc allocates approximately
|
||||
240KB of internal memory.</p>
|
||||
|
||||
<p>Don't try to load TCMalloc into a running binary (e.g., using JNI
|
||||
in Java programs). The binary will have allocated some objects using
|
||||
the system malloc, and may try to pass them to TCMalloc for
|
||||
deallocation. TCMalloc will not be able to handle such objects.</p>
|
||||
|
||||
<hr>
|
||||
|
||||
<address>Sanjay Ghemawat, Paul Menage<br>
|
||||
<!-- Created: Tue Dec 19 10:43:14 PST 2000 -->
|
||||
<!-- hhmts start -->
|
||||
Last modified: Sat Feb 24 13:11:38 PST 2007 (csilvers)
|
||||
<!-- hhmts end -->
|
||||
</address>
|
||||
|
||||
</body>
|
||||
</html>
|