diff --git a/COPYING b/COPYING
index e69de29..e4956cf 100644
--- a/COPYING
+++ b/COPYING
@@ -0,0 +1,28 @@
+Copyright (c) 2005, Google Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+    * Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above
+copyright notice, this list of conditions and the following disclaimer
+in the documentation and/or other materials provided with the
+distribution.
+    * Neither the name of Google Inc. nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/ChangeLog b/ChangeLog
index 22597c3..90bf766 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -85,3 +85,18 @@ Thu Apr 13 20:59:09 2006  Google Inc. <opensource@google.com>
 	* Syscall support for older kernels, including _syscall6 (markus)
 	* Support PIC mode (markus, mbland, iant)
 	* Better support for running in non-threaded contexts (csilvers)
+
+Wed Jun 14 15:11:14 2006  Google Inc. <opensource@google.com>
+
+	* google-perftools: version 0.8 release
+	* Experimental support for remote profiling added to pprof (many)
+	* Fixed race condition in ProfileData::FlushTable (etune)
+	* Better support for weird /proc maps (maxim, mec)
+	* Fix heap-checker interaction with gdb (markus)
+	* Better 64-bit support in pprof (aruns)
+	* Reduce scavenging cost in tcmalloc by capping NumMoveSize (sanjay)
+	* Cast syscall(SYS_mmap); works on more 64-bit systems now (menage)
+	* Document the text output of pprof! (csilvers)
+	* Better compiler support for no-THREADS and for old compilers (csilvers)
+	* Make libunwind the default stack unwinder for x86-64 (aruns)
+	* Somehow the COPYING file got erased.  Regenerate it (csilvers)
diff --git a/Makefile.am b/Makefile.am
index 8d39bbb..83fa966 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -115,21 +115,24 @@ libtcmalloc_minimal_la_LDFLAGS = $(PTHREAD_CFLAGS) -export-symbols-regex $(TCMAL
 libtcmalloc_minimal_la_LIBADD = $(PTHREAD_LIBS) libstacktrace.la
 
 ### Unittests
-TESTS += malloc_unittest
-MALLOC_UNITEST_INCLUDES = src/config.h \
-                          src/google/malloc_extension.h \
-                          src/google/malloc_hook.h \
-                          src/base/basictypes.h \
-                          src/google/perftools/hash_set.h \
-                          src/maybe_threads.h
-malloc_unittest_SOURCES = src/tests/tcmalloc_unittest.cc \
-                          src/malloc_hook.cc \
-                          src/malloc_extension.cc \
-                          src/maybe_threads.cc \
-                          $(MALLOC_UNITTEST_INCLUDES)
-malloc_unittest_CXXFLAGS = $(PTHREAD_CFLAGS)
-malloc_unittest_LDFLAGS = $(PTHREAD_CFLAGS)
-malloc_unittest_LDADD = $(PTHREAD_LIBS)
+
+# Commented out for the moment because malloc(very_big_num) is broken in
+# standard libc!  At least, in some situations, some of the time.
+## TESTS += malloc_unittest
+## MALLOC_UNITTEST_INCLUDES = src/config.h \
+##                            src/google/malloc_extension.h \
+##                            src/google/malloc_hook.h \
+##                            src/base/basictypes.h \
+##                            src/google/perftools/hash_set.h \
+##                            src/maybe_threads.h
+## malloc_unittest_SOURCES = src/tests/tcmalloc_unittest.cc \
+##                           src/malloc_hook.cc \
+##                           src/malloc_extension.cc \
+##                           src/maybe_threads.cc \
+##                           $(MALLOC_UNITTEST_INCLUDES)
+## malloc_unittest_CXXFLAGS = $(PTHREAD_CFLAGS)
+## malloc_unittest_LDFLAGS = $(PTHREAD_CFLAGS)
+## malloc_unittest_LDADD = $(PTHREAD_LIBS)
 
 TESTS += tcmalloc_unittest
 TCMALLOC_UNITTEST_INCLUDES = src/google/malloc_extension.h
diff --git a/aclocal.m4 b/aclocal.m4
index 0b68740..d98f614 100644
--- a/aclocal.m4
+++ b/aclocal.m4
@@ -6751,7 +6751,61 @@ AC_DEFUN([AC_COMPILER_CHARACTERISTICS],
 	       
 
 # This was retrieved from
-#    http://www.gnu.org/software/ac-archive/htmldoc/acx_pthread.html
+#    http://0pointer.de/cgi-bin/viewcvs.cgi/trunk/common/acx_pthread.m4?rev=1220
+# See also (perhaps for new versions?)
+#    http://0pointer.de/cgi-bin/viewcvs.cgi/trunk/common/acx_pthread.m4
+
+dnl @synopsis ACX_PTHREAD([ACTION-IF-FOUND[, ACTION-IF-NOT-FOUND]])
+dnl
+dnl @summary figure out how to build C programs using POSIX threads
+dnl
+dnl This macro figures out how to build C programs using POSIX threads.
+dnl It sets the PTHREAD_LIBS output variable to the threads library and
+dnl linker flags, and the PTHREAD_CFLAGS output variable to any special
+dnl C compiler flags that are needed. (The user can also force certain
+dnl compiler flags/libs to be tested by setting these environment
+dnl variables.)
+dnl
+dnl Also sets PTHREAD_CC to any special C compiler that is needed for
+dnl multi-threaded programs (defaults to the value of CC otherwise).
+dnl (This is necessary on AIX to use the special cc_r compiler alias.)
+dnl
+dnl NOTE: You are assumed to not only compile your program with these
+dnl flags, but also link it with them as well. e.g. you should link
+dnl with $PTHREAD_CC $CFLAGS $PTHREAD_CFLAGS $LDFLAGS ... $PTHREAD_LIBS
+dnl $LIBS
+dnl
+dnl If you are only building threads programs, you may wish to use
+dnl these variables in your default LIBS, CFLAGS, and CC:
+dnl
+dnl        LIBS="$PTHREAD_LIBS $LIBS"
+dnl        CFLAGS="$CFLAGS $PTHREAD_CFLAGS"
+dnl        CC="$PTHREAD_CC"
+dnl
+dnl In addition, if the PTHREAD_CREATE_JOINABLE thread-attribute
+dnl constant has a nonstandard name, defines PTHREAD_CREATE_JOINABLE to
+dnl that name (e.g. PTHREAD_CREATE_UNDETACHED on AIX).
+dnl
+dnl ACTION-IF-FOUND is a list of shell commands to run if a threads
+dnl library is found, and ACTION-IF-NOT-FOUND is a list of commands to
+dnl run it if it is not found. If ACTION-IF-FOUND is not specified, the
+dnl default action will define HAVE_PTHREAD.
+dnl
+dnl Please let the authors know if this macro fails on any platform, or
+dnl if you have any other suggestions or comments. This macro was based
+dnl on work by SGJ on autoconf scripts for FFTW (www.fftw.org) (with
+dnl help from M. Frigo), as well as ac_pthread and hb_pthread macros
+dnl posted by Alejandro Forero Cuervo to the autoconf macro repository.
+dnl We are also grateful for the helpful feedback of numerous users.
+dnl
+dnl @category InstalledPackages
+dnl @author Steven G. Johnson <stevenj@alum.mit.edu>
+dnl @version 2005-06-15
+dnl @license GPLWithACException
+dnl 
+dnl Checks for GCC shared/pthread inconsistency based on work by
+dnl Marcin Owsiany <marcin@owsiany.pl>
+
 
 AC_DEFUN([ACX_PTHREAD], [
 AC_REQUIRE([AC_CANONICAL_HOST])
@@ -6809,6 +6863,7 @@ acx_pthread_flags="pthreads none -Kthread -kthread lthread -pthread -pthreads -m
 # -mt: Sun Workshop C (may only link SunOS threads [-lthread], but it
 #      doesn't hurt to check since this sometimes defines pthreads too;
 #      also defines -D_REENTRANT)
+#      ... -mt is also the pthreads flag for HP/aCC
 # pthread: Linux, etcetera
 # --thread-safe: KAI C++
 # pthread-config: use pthread-config program (for GNU Pth library)
@@ -6818,13 +6873,13 @@ case "${host_cpu}-${host_os}" in
 
         # On Solaris (at least, for some versions), libc contains stubbed
         # (non-functional) versions of the pthreads routines, so link-based
-        # tests will erroneously succeed.  (We need to link with -pthread or
+        # tests will erroneously succeed.  (We need to link with -pthreads/-mt/
         # -lpthread.)  (The stubs are missing pthread_cleanup_push, or rather
         # a function called by this macro, so we could check for that, but
         # who knows whether they'll stub that too in a future libc.)  So,
         # we'll just look for -pthreads and -lpthread first:
 
-        acx_pthread_flags="-pthread -pthreads pthread -mt $acx_pthread_flags"
+        acx_pthread_flags="-pthreads pthread -mt -pthread $acx_pthread_flags"
         ;;
 esac
 
@@ -6841,12 +6896,12 @@ for flag in $acx_pthread_flags; do
                 PTHREAD_CFLAGS="$flag"
                 ;;
 
-                pthread-config)
-                AC_CHECK_PROG(acx_pthread_config, pthread-config, yes, no)
-                if test x"$acx_pthread_config" = xno; then continue; fi
-                PTHREAD_CFLAGS="`pthread-config --cflags`"
-                PTHREAD_LIBS="`pthread-config --ldflags` `pthread-config --libs`"
-                ;;
+		pthread-config)
+		AC_CHECK_PROG(acx_pthread_config, pthread-config, yes, no)
+		if test x"$acx_pthread_config" = xno; then continue; fi
+		PTHREAD_CFLAGS="`pthread-config --cflags`"
+		PTHREAD_LIBS="`pthread-config --ldflags` `pthread-config --libs`"
+		;;
 
                 *)
                 AC_MSG_CHECKING([for the pthreads library -l$flag])
@@ -6895,12 +6950,12 @@ if test "x$acx_pthread_ok" = xyes; then
         CFLAGS="$CFLAGS $PTHREAD_CFLAGS"
 
         # Detect AIX lossage: JOINABLE attribute is called UNDETACHED.
-        AC_MSG_CHECKING([for joinable pthread attribute])
-        attr_name=unknown
-        for attr in PTHREAD_CREATE_JOINABLE PTHREAD_CREATE_UNDETACHED; do
-            AC_TRY_LINK([#include <pthread.h>], [int attr=$attr;],
+	AC_MSG_CHECKING([for joinable pthread attribute])
+	attr_name=unknown
+	for attr in PTHREAD_CREATE_JOINABLE PTHREAD_CREATE_UNDETACHED; do
+	    AC_TRY_LINK([#include <pthread.h>], [int attr=$attr; return attr;],
                         [attr_name=$attr; break])
-        done
+	done
         AC_MSG_RESULT($attr_name)
         if test "$attr_name" != PTHREAD_CREATE_JOINABLE; then
             AC_DEFINE_UNQUOTED(PTHREAD_CREATE_JOINABLE, $attr_name,
@@ -6924,6 +6979,107 @@ if test "x$acx_pthread_ok" = xyes; then
 
         # More AIX lossage: must compile with cc_r
         AC_CHECK_PROG(PTHREAD_CC, cc_r, cc_r, ${CC})
+
+   # The next part tries to detect GCC inconsistency with -shared on some
+   # architectures and systems. The problem is that in certain
+   # configurations, when -shared is specified, GCC "forgets" to
+   # internally use various flags which are still necessary.
+   
+   # First, check whether caller wants us to skip -shared checks
+   # this is useful
+   AC_MSG_CHECKING([whether to check for GCC pthread/shared inconsistencies])
+   if test x"$GCC" != xyes; then
+      AC_MSG_RESULT([no])
+   else
+      AC_MSG_RESULT([yes])
+
+      # In order not to create several levels of indentation, we test
+      # the value of "$ok" until we find out the cure or run out of
+      # ideas.
+      ok="no"
+
+      #
+      # Prepare the flags
+      #
+      save_CFLAGS="$CFLAGS"
+      save_LIBS="$LIBS"
+      save_CC="$CC"
+      # Try with the flags determined by the earlier checks.
+      #
+      # -Wl,-z,defs forces link-time symbol resolution, so that the
+      # linking checks with -shared actually have any value
+      #
+      # FIXME: -fPIC is required for -shared on many architectures,
+      # so we specify it here, but the right way would probably be to
+      # properly detect whether it is actually required.
+      CFLAGS="-shared -fPIC -Wl,-z,defs $CFLAGS $PTHREAD_CFLAGS"
+      LIBS="$PTHREAD_LIBS $LIBS"
+      CC="$PTHREAD_CC"
+
+      AC_MSG_CHECKING([whether -pthread is sufficient with -shared])
+      AC_TRY_LINK([#include <pthread.h>],
+         [pthread_t th; pthread_join(th, 0);
+         pthread_attr_init(0); pthread_cleanup_push(0, 0);
+         pthread_create(0,0,0,0); pthread_cleanup_pop(0); ],
+         [ok=yes])
+      
+      if test "x$ok" = xyes; then
+         AC_MSG_RESULT([yes])
+      else
+         AC_MSG_RESULT([no])
+      fi
+   
+      #
+      # Linux gcc on some architectures such as mips/mipsel forgets
+      # about -lpthread
+      #
+      if test x"$ok" = xno; then
+         AC_MSG_CHECKING([whether -lpthread fixes that])
+         LIBS="-lpthread $PTHREAD_LIBS $save_LIBS"
+         AC_TRY_LINK([#include <pthread.h>],
+            [pthread_t th; pthread_join(th, 0);
+            pthread_attr_init(0); pthread_cleanup_push(0, 0);
+            pthread_create(0,0,0,0); pthread_cleanup_pop(0); ],
+            [ok=yes])
+   
+         if test "x$ok" = xyes; then
+            AC_MSG_RESULT([yes])
+            PTHREAD_LIBS="-lpthread $PTHREAD_LIBS"
+         else
+            AC_MSG_RESULT([no])
+         fi
+      fi
+      #
+      # FreeBSD 4.10 gcc forgets to use -lc_r instead of -lc
+      #
+      if test x"$ok" = xno; then
+         AC_MSG_CHECKING([whether -lc_r fixes that])
+         LIBS="-lc_r $PTHREAD_LIBS $save_LIBS"
+         AC_TRY_LINK([#include <pthread.h>],
+             [pthread_t th; pthread_join(th, 0);
+              pthread_attr_init(0); pthread_cleanup_push(0, 0);
+              pthread_create(0,0,0,0); pthread_cleanup_pop(0); ],
+             [ok=yes])
+   
+         if test "x$ok" = xyes; then
+            AC_MSG_RESULT([yes])
+            PTHREAD_LIBS="-lc_r $PTHREAD_LIBS"
+         else
+            AC_MSG_RESULT([no])
+         fi
+      fi
+      if test x"$ok" = xno; then
+         # OK, we have run out of ideas
+         AC_MSG_WARN([Impossible to determine how to use pthreads with shared libraries])
+
+         # so it's not safe to assume that we may use pthreads
+         acx_pthread_ok=no
+      fi
+
+      CFLAGS="$save_CFLAGS"
+      LIBS="$save_LIBS"
+      CC="$save_CC"
+   fi
 else
         PTHREAD_CC="$CC"
 fi
diff --git a/configure b/configure
index 2a11d9e..9147d97 100755
--- a/configure
+++ b/configure
@@ -1,6 +1,6 @@
 #! /bin/sh
 # Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.57 for google-perftools 0.7.
+# Generated by GNU Autoconf 2.57 for google-perftools 0.8.
 #
 # Report bugs to <opensource@google.com>.
 #
@@ -422,8 +422,8 @@ SHELL=${CONFIG_SHELL-/bin/sh}
 # Identity of this package.
 PACKAGE_NAME='google-perftools'
 PACKAGE_TARNAME='google-perftools'
-PACKAGE_VERSION='0.7'
-PACKAGE_STRING='google-perftools 0.7'
+PACKAGE_VERSION='0.8'
+PACKAGE_STRING='google-perftools 0.8'
 PACKAGE_BUGREPORT='opensource@google.com'
 
 ac_unique_file="README"
@@ -953,7 +953,7 @@ if test "$ac_init_help" = "long"; then
   # Omit some internal or obsolete options to make the list less imposing.
   # This message is too long to be a string in the A/UX 3.1 sh.
   cat <<_ACEOF
-\`configure' configures google-perftools 0.7 to adapt to many kinds of systems.
+\`configure' configures google-perftools 0.8 to adapt to many kinds of systems.
 
 Usage: $0 [OPTION]... [VAR=VALUE]...
 
@@ -1019,7 +1019,7 @@ fi
 
 if test -n "$ac_init_help"; then
   case $ac_init_help in
-     short | recursive ) echo "Configuration of google-perftools 0.7:";;
+     short | recursive ) echo "Configuration of google-perftools 0.8:";;
    esac
   cat <<\_ACEOF
 
@@ -1125,7 +1125,7 @@ fi
 test -n "$ac_init_help" && exit 0
 if $ac_init_version; then
   cat <<\_ACEOF
-google-perftools configure 0.7
+google-perftools configure 0.8
 generated by GNU Autoconf 2.57
 
 Copyright 1992, 1993, 1994, 1995, 1996, 1998, 1999, 2000, 2001, 2002
@@ -1140,7 +1140,7 @@ cat >&5 <<_ACEOF
 This file contains any messages produced by compilers while
 running configure, to aid debugging if configure makes a mistake.
 
-It was created by google-perftools $as_me 0.7, which was
+It was created by google-perftools $as_me 0.8, which was
 generated by GNU Autoconf 2.57.  Invocation command line was
 
   $ $0 $@
@@ -1733,7 +1733,7 @@ fi
 
 # Define the identity of the package.
  PACKAGE=google-perftools
- VERSION=0.7
+ VERSION=0.8
 
 
 cat >>confdefs.h <<_ACEOF
@@ -21171,6 +21171,7 @@ acx_pthread_flags="pthreads none -Kthread -kthread lthread -pthread -pthreads -m
 # -mt: Sun Workshop C (may only link SunOS threads [-lthread], but it
 #      doesn't hurt to check since this sometimes defines pthreads too;
 #      also defines -D_REENTRANT)
+#      ... -mt is also the pthreads flag for HP/aCC
 # pthread: Linux, etcetera
 # --thread-safe: KAI C++
 # pthread-config: use pthread-config program (for GNU Pth library)
@@ -21180,13 +21181,13 @@ case "${host_cpu}-${host_os}" in
 
         # On Solaris (at least, for some versions), libc contains stubbed
         # (non-functional) versions of the pthreads routines, so link-based
-        # tests will erroneously succeed.  (We need to link with -pthread or
+        # tests will erroneously succeed.  (We need to link with -pthreads/-mt/
         # -lpthread.)  (The stubs are missing pthread_cleanup_push, or rather
         # a function called by this macro, so we could check for that, but
         # who knows whether they'll stub that too in a future libc.)  So,
         # we'll just look for -pthreads and -lpthread first:
 
-        acx_pthread_flags="-pthread -pthreads pthread -mt $acx_pthread_flags"
+        acx_pthread_flags="-pthreads pthread -mt -pthread $acx_pthread_flags"
         ;;
 esac
 
@@ -21205,8 +21206,8 @@ echo $ECHO_N "checking whether pthreads work with $flag... $ECHO_C" >&6
                 PTHREAD_CFLAGS="$flag"
                 ;;
 
-                pthread-config)
-                # Extract the first word of "pthread-config", so it can be a program name with args.
+		pthread-config)
+		# Extract the first word of "pthread-config", so it can be a program name with args.
 set dummy pthread-config; ac_word=$2
 echo "$as_me:$LINENO: checking for $ac_word" >&5
 echo $ECHO_N "checking for $ac_word... $ECHO_C" >&6
@@ -21242,10 +21243,10 @@ else
 echo "${ECHO_T}no" >&6
 fi
 
-                if test x"$acx_pthread_config" = xno; then continue; fi
-                PTHREAD_CFLAGS="`pthread-config --cflags`"
-                PTHREAD_LIBS="`pthread-config --ldflags` `pthread-config --libs`"
-                ;;
+		if test x"$acx_pthread_config" = xno; then continue; fi
+		PTHREAD_CFLAGS="`pthread-config --cflags`"
+		PTHREAD_LIBS="`pthread-config --ldflags` `pthread-config --libs`"
+		;;
 
                 *)
                 echo "$as_me:$LINENO: checking for the pthreads library -l$flag" >&5
@@ -21328,11 +21329,11 @@ if test "x$acx_pthread_ok" = xyes; then
         CFLAGS="$CFLAGS $PTHREAD_CFLAGS"
 
         # Detect AIX lossage: JOINABLE attribute is called UNDETACHED.
-        echo "$as_me:$LINENO: checking for joinable pthread attribute" >&5
+	echo "$as_me:$LINENO: checking for joinable pthread attribute" >&5
 echo $ECHO_N "checking for joinable pthread attribute... $ECHO_C" >&6
-        attr_name=unknown
-        for attr in PTHREAD_CREATE_JOINABLE PTHREAD_CREATE_UNDETACHED; do
-            cat >conftest.$ac_ext <<_ACEOF
+	attr_name=unknown
+	for attr in PTHREAD_CREATE_JOINABLE PTHREAD_CREATE_UNDETACHED; do
+	    cat >conftest.$ac_ext <<_ACEOF
 #line $LINENO "configure"
 /* confdefs.h.  */
 _ACEOF
@@ -21343,7 +21344,7 @@ cat >>conftest.$ac_ext <<_ACEOF
 int
 main ()
 {
-int attr=$attr;
+int attr=$attr; return attr;
   ;
   return 0;
 }
@@ -21367,7 +21368,7 @@ sed 's/^/| /' conftest.$ac_ext >&5
 
 fi
 rm -f conftest.$ac_objext conftest$ac_exeext conftest.$ac_ext
-        done
+	done
         echo "$as_me:$LINENO: result: $attr_name" >&5
 echo "${ECHO_T}$attr_name" >&6
         if test "$attr_name" != PTHREAD_CREATE_JOINABLE; then
@@ -21431,6 +21432,216 @@ else
 echo "${ECHO_T}no" >&6
 fi
 
+
+   # The next part tries to detect GCC inconsistency with -shared on some
+   # architectures and systems. The problem is that in certain
+   # configurations, when -shared is specified, GCC "forgets" to
+   # internally use various flags which are still necessary.
+
+   # First, check whether caller wants us to skip -shared checks
+   # this is useful
+   echo "$as_me:$LINENO: checking whether to check for GCC pthread/shared inconsistencies" >&5
+echo $ECHO_N "checking whether to check for GCC pthread/shared inconsistencies... $ECHO_C" >&6
+   if test x"$GCC" != xyes; then
+      echo "$as_me:$LINENO: result: no" >&5
+echo "${ECHO_T}no" >&6
+   else
+      echo "$as_me:$LINENO: result: yes" >&5
+echo "${ECHO_T}yes" >&6
+
+      # In order not to create several levels of indentation, we test
+      # the value of "$ok" until we find out the cure or run out of
+      # ideas.
+      ok="no"
+
+      #
+      # Prepare the flags
+      #
+      save_CFLAGS="$CFLAGS"
+      save_LIBS="$LIBS"
+      save_CC="$CC"
+      # Try with the flags determined by the earlier checks.
+      #
+      # -Wl,-z,defs forces link-time symbol resolution, so that the
+      # linking checks with -shared actually have any value
+      #
+      # FIXME: -fPIC is required for -shared on many architectures,
+      # so we specify it here, but the right way would probably be to
+      # properly detect whether it is actually required.
+      CFLAGS="-shared -fPIC -Wl,-z,defs $CFLAGS $PTHREAD_CFLAGS"
+      LIBS="$PTHREAD_LIBS $LIBS"
+      CC="$PTHREAD_CC"
+
+      echo "$as_me:$LINENO: checking whether -pthread is sufficient with -shared" >&5
+echo $ECHO_N "checking whether -pthread is sufficient with -shared... $ECHO_C" >&6
+      cat >conftest.$ac_ext <<_ACEOF
+#line $LINENO "configure"
+/* confdefs.h.  */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
+/* end confdefs.h.  */
+#include <pthread.h>
+int
+main ()
+{
+pthread_t th; pthread_join(th, 0);
+         pthread_attr_init(0); pthread_cleanup_push(0, 0);
+         pthread_create(0,0,0,0); pthread_cleanup_pop(0);
+  ;
+  return 0;
+}
+_ACEOF
+rm -f conftest.$ac_objext conftest$ac_exeext
+if { (eval echo "$as_me:$LINENO: \"$ac_link\"") >&5
+  (eval $ac_link) 2>&5
+  ac_status=$?
+  echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); } &&
+         { ac_try='test -s conftest$ac_exeext'
+  { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5
+  (eval $ac_try) 2>&5
+  ac_status=$?
+  echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); }; }; then
+  ok=yes
+else
+  echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+fi
+rm -f conftest.$ac_objext conftest$ac_exeext conftest.$ac_ext
+
+      if test "x$ok" = xyes; then
+         echo "$as_me:$LINENO: result: yes" >&5
+echo "${ECHO_T}yes" >&6
+      else
+         echo "$as_me:$LINENO: result: no" >&5
+echo "${ECHO_T}no" >&6
+      fi
+
+      #
+      # Linux gcc on some architectures such as mips/mipsel forgets
+      # about -lpthread
+      #
+      if test x"$ok" = xno; then
+         echo "$as_me:$LINENO: checking whether -lpthread fixes that" >&5
+echo $ECHO_N "checking whether -lpthread fixes that... $ECHO_C" >&6
+         LIBS="-lpthread $PTHREAD_LIBS $save_LIBS"
+         cat >conftest.$ac_ext <<_ACEOF
+#line $LINENO "configure"
+/* confdefs.h.  */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
+/* end confdefs.h.  */
+#include <pthread.h>
+int
+main ()
+{
+pthread_t th; pthread_join(th, 0);
+            pthread_attr_init(0); pthread_cleanup_push(0, 0);
+            pthread_create(0,0,0,0); pthread_cleanup_pop(0);
+  ;
+  return 0;
+}
+_ACEOF
+rm -f conftest.$ac_objext conftest$ac_exeext
+if { (eval echo "$as_me:$LINENO: \"$ac_link\"") >&5
+  (eval $ac_link) 2>&5
+  ac_status=$?
+  echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); } &&
+         { ac_try='test -s conftest$ac_exeext'
+  { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5
+  (eval $ac_try) 2>&5
+  ac_status=$?
+  echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); }; }; then
+  ok=yes
+else
+  echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+fi
+rm -f conftest.$ac_objext conftest$ac_exeext conftest.$ac_ext
+
+         if test "x$ok" = xyes; then
+            echo "$as_me:$LINENO: result: yes" >&5
+echo "${ECHO_T}yes" >&6
+            PTHREAD_LIBS="-lpthread $PTHREAD_LIBS"
+         else
+            echo "$as_me:$LINENO: result: no" >&5
+echo "${ECHO_T}no" >&6
+         fi
+      fi
+      #
+      # FreeBSD 4.10 gcc forgets to use -lc_r instead of -lc
+      #
+      if test x"$ok" = xno; then
+         echo "$as_me:$LINENO: checking whether -lc_r fixes that" >&5
+echo $ECHO_N "checking whether -lc_r fixes that... $ECHO_C" >&6
+         LIBS="-lc_r $PTHREAD_LIBS $save_LIBS"
+         cat >conftest.$ac_ext <<_ACEOF
+#line $LINENO "configure"
+/* confdefs.h.  */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
+/* end confdefs.h.  */
+#include <pthread.h>
+int
+main ()
+{
+pthread_t th; pthread_join(th, 0);
+              pthread_attr_init(0); pthread_cleanup_push(0, 0);
+              pthread_create(0,0,0,0); pthread_cleanup_pop(0);
+  ;
+  return 0;
+}
+_ACEOF
+rm -f conftest.$ac_objext conftest$ac_exeext
+if { (eval echo "$as_me:$LINENO: \"$ac_link\"") >&5
+  (eval $ac_link) 2>&5
+  ac_status=$?
+  echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); } &&
+         { ac_try='test -s conftest$ac_exeext'
+  { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5
+  (eval $ac_try) 2>&5
+  ac_status=$?
+  echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); }; }; then
+  ok=yes
+else
+  echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+fi
+rm -f conftest.$ac_objext conftest$ac_exeext conftest.$ac_ext
+
+         if test "x$ok" = xyes; then
+            echo "$as_me:$LINENO: result: yes" >&5
+echo "${ECHO_T}yes" >&6
+            PTHREAD_LIBS="-lc_r $PTHREAD_LIBS"
+         else
+            echo "$as_me:$LINENO: result: no" >&5
+echo "${ECHO_T}no" >&6
+         fi
+      fi
+      if test x"$ok" = xno; then
+         # OK, we have run out of ideas
+         { echo "$as_me:$LINENO: WARNING: Impossible to determine how to use pthreads with shared libraries" >&5
+echo "$as_me: WARNING: Impossible to determine how to use pthreads with shared libraries" >&2;}
+
+         # so it's not safe to assume that we may use pthreads
+         acx_pthread_ok=no
+      fi
+
+      CFLAGS="$save_CFLAGS"
+      LIBS="$save_LIBS"
+      CC="$save_CC"
+   fi
 else
         PTHREAD_CC="$CC"
 fi
@@ -22393,7 +22604,7 @@ _ASBOX
 } >&5
 cat >&5 <<_CSEOF
 
-This file was extended by google-perftools $as_me 0.7, which was
+This file was extended by google-perftools $as_me 0.8, which was
 generated by GNU Autoconf 2.57.  Invocation command line was
 
   CONFIG_FILES    = $CONFIG_FILES
@@ -22456,7 +22667,7 @@ _ACEOF
 
 cat >>$CONFIG_STATUS <<_ACEOF
 ac_cs_version="\\
-google-perftools config.status 0.7
+google-perftools config.status 0.8
 configured by $0, generated by GNU Autoconf 2.57,
   with options \\"`echo "$ac_configure_args" | sed 's/[\\""\`\$]/\\\\&/g'`\\"
 
diff --git a/configure.ac b/configure.ac
index 2e1ab8b..f72e687 100644
--- a/configure.ac
+++ b/configure.ac
@@ -5,7 +5,7 @@
 # make sure we're interpreted by some minimal autoconf
 AC_PREREQ(2.57)
 
-AC_INIT(google-perftools, 0.7, opensource@google.com)
+AC_INIT(google-perftools, 0.8, opensource@google.com)
 # The argument here is just something that should be in the current directory
 # (for sanity checking)
 AC_CONFIG_SRCDIR(README)
diff --git a/doc/cpu_profiler.html b/doc/cpu_profiler.html
index bc18940..ff98321 100644
--- a/doc/cpu_profiler.html
+++ b/doc/cpu_profiler.html
@@ -109,6 +109,24 @@ detail below.</p>
   annotated with the flat and cumulative sample counts at each PC value.
 </pre>
 
+<h3>Analyzing Text Output</h3>
+
+<p>Text mode has lines of output that look like this:</p>
+<pre>
+       14   2.1%  17.2%       58   8.7% std::_Rb_tree::find
+</pre>
+
+<p>Here is how to interpret the columns:</p>
+<ol>
+  <li> Number of profiling samples in this function
+  <li> Percentage of profiling samples in this function
+  <li> Percentage of profiling samples in the functions printed so far
+  <li> Number of profiling samples in this function and its callees
+  <li> Percentage of profiling samples in this function and its callees
+  <li> Function name
+</ol>
+
+
 <h3>Node Information</h3>
 
 <p>In the various graphical modes of pprof, the output is a call graph
diff --git a/doc/pprof_remote_servers.html b/doc/pprof_remote_servers.html
new file mode 100644
index 0000000..b93ccd3
--- /dev/null
+++ b/doc/pprof_remote_servers.html
@@ -0,0 +1,190 @@
+<HTML>
+
+<HEAD>
+<title>pprof and Remote Servers</title>
+</HEAD>
+
+<BODY>
+
+<h1><code>pprof</code> and Remote Servers</h1>
+
+<p>In mid-2006, we added an experimental facility to <A
+HREF="cpu_profiler.html">pprof</A>, the tool that analyzes CPU and
+heap profiles.  This facility allows you to collect profile
+information from running applications.  It makes it easy to collect
+profile information without having to stop the program first, and
+without having to log into the machine where the application is
+running.  This is meant to be used on webservers, but will work on any
+application that can be modified to accept TCP connections on a port
+of its choosing, and to respond to HTTP requests on that port.</p>
+
+<p>We do not currently have infrastructure, such as apache modules,
+that you can pop into a webserver or other application to get the
+necessary functionality "for free."  However, it's easy to generate
+the necessary data, which should allow the interested developer to add
+the necessary support into his or her applications.</p>
+
+<p>To use <code>pprof</code> in this experimental "server" mode, you
+give the script a host and port it should query, replacing the normal
+commandline arguments of application + profile file:</p>
+<pre>
+   % pprof internalweb.mycompany.com:80
+</pre>
+
+<p>The host must be listening on that port, and be able to accept HTTP/1.0
+requests -- sent via <code>wget</code> and <code>curl</code> -- for
+several urls.  The following sections list the urls that
+<code>pprof</code> can send, and the responses it expects in
+return.</p>
+
+
+<ul><li> <code><b>/pprof/heap</b></code>
+
+<p><code>pprof</code> asks for the url <code>/pprof/heap</code> to
+get heap information.  The actual url is controlled via the variable
+<code>HEAP_PAGE</code> in the <code>pprof</code> script, so you
+can change it if you'd like.</p>
+
+<p>The server should respond by calling</p>
+<pre>
+    MallocExtension::instance()->GetHeapSample(&output);
+</pre>
+<p>and sending <code>output</code> back as an HTTP response to
+<code>pprof</code>.  <code>MallocExtension</code> is defined in the
+header file <code>google/malloc_extension.h</code>.</p>
+
+<p>Here's an example, from an actual Google webserver, of what the
+output should look like:</p>
+<pre>
+heap profile:   9369: 126987529 [  9369: 126987529] @ heap
+     2:     1024 [     2:     1024] @ 0x87da913 0x8923ad4 0x891d4c2 0x892de12 0x8930519 0x83a16c2 0x836cb38 0x834cd1c 0x8349ba5 0x10a3177 0x8349961
+     1:       36 [     1:       36] @ 0x87da913 0x83a0929 0x836cb38 0x834cd1c 0x8349ba5 0x10a3177 0x8349961
+   308: 10092544 [   308: 10092544] @ 0x87da913 0x8970d66 0x8970e64 0x896e8e2 0x88e69d2 0x88e6add 0x88e6dec 0x88e7384 0x88e73fa 0x8838793 0x8838b36 0x88395f8 0x88f5a4b 0x890d03a 0x890d65a 0x8917666 0x890d1f3 0x890e6e4 0x8349c1b 0x10a3177 0x8349961
+[...]
+</pre>
+
+
+</li><li> <code><b>/pprof/growth</b></code>
+
+<p><code>pprof</code> asks for the url <code>/pprof/growth</code> to
+get heap-profiling delta (growth) information.  The actual url is
+controlled via the variable <code>GROWTH_PAGE</code> in the
+<code>pprof</code> script, so you can change it if you'd like.</p>
+
+<p>The server should respond by calling</p>
+<pre>
+    MallocExtension::instance()->GetHeapGrowthStacks(&output);
+</pre>
+<p>and sending <code>output</code> back as an HTTP response to
+<code>pprof</code>.  <code>MallocExtension</code> is defined in the
+header file <code>google/malloc_extension.h</code>.</p>
+
+<p>Here's an example, from an actual Google webserver, of what the
+output should look like:</p>
+<pre>
+heap profile:    741: 812122112 [   741: 812122112] @ growth
+     1:  1572864 [     1:  1572864] @ 0x87da564 0x87db8a3 0x84787a4 0x846e851 0x836d12f 0x834cd1c 0x8349ba5 0x10a3177 0x8349961
+     1:  1048576 [     1:  1048576] @ 0x87d92e8 0x87d9213 0x87d9178 0x87d94d3 0x87da9da 0x8a364ff 0x8a437e7 0x8ab7d23 0x8ab7da9 0x8ac7454 0x8348465 0x10a3161 0x8349961
+[...]
+</pre>
+
+
+</li><li> <code><b>/pprof/profile</b></code>
+
+<p><code>pprof</code> asks for the url
+<code>/pprof/profile?seconds=XX</code> to get cpu-profiling
+information.  The actual url is controlled via the variable
+<code>PROFILE_PAGE</code> in the <code>pprof</code> script, so you can
+change it if you'd like.</p>
+
+<p>The server should respond by calling
+<code>ProfilerStart(filename)</code>, continuing to do its work, and
+then, XX seconds later, calling <code>ProfilerStop()</code>.  (These
+functions are declared in <code>google/profiler.h</code>.)  The
+application is responsible for picking a unique filename for
+<code>ProfilerStart()</code>.  After calling
+<code>ProfilerStop()</code>, the server should read the contents of
+<code>filename</code> and send them back as an HTTP response to
+<code>pprof</code>.</p>
+
+<p>Obviously, to get useful profile information the application must
+continue to run in the XX seconds that the profiler is running.  Thus,
+the profile start-stop calls should be done in a separate thread, or
+be otherwise non-blocking.</p>
+
+<p>The profiler output file is binary, but near the end of it, it
+should have lines of text somewhat like this:</p>
+<pre>
+01016000-01017000 rw-p 00015000 03:01 59314      /lib/ld-2.2.2.so
+</pre>
+
+
+</li><li> <code><b>/pprof/contention</b></code>
+
+<p>This is intended to be able to profile (thread) lock contention in
+addition to CPU and memory use.  It's not yet usable.</p>
+
+
+</li><li> <code><b>/pprof/cmdline</b></code>
+
+<p><code>pprof</code> asks for the url <code>/pprof/cmdline</code> to
+figure out what application it's profiling.  The actual url is
+controlled via the variable <code>PROGRAM_NAME_PAGE</code> in the
+<code>pprof</code> script, so you can change it if you'd like.</p>
+
+<p>The server should respond by reading the contents of
+<code>/proc/self/cmdline</code>, converting all internal NUL (\0)
+characters to newlines, and sending the result back as an HTTP
+response to <code>pprof</code>.</p>
+
+<p>Here's an example return value:</p>
+<pre>
+/root/server/custom_webserver
+80
+--configfile=/root/server/ws.config
+</pre>
+
+
+</li><li> <code><b>/pprof/symbol</b></code>
+
+<p><code>pprof</code> asks for the url <code>/pprof/symbol</code> to
+map from hex addresses to function names.  The actual url is
+controlled via the variable <code>SYMBOL_PAGE</code> in the
+<code>pprof</code> script, so you can change it if you'd like.</p>
+
+<p>This is perhaps the hardest request to write code for, because
+it must accept POST requests.  This means that after the HTTP headers,
+pprof will pass in a list of hex addresses connected by
+<code>+</code>, like so:</p>
+<pre>
+   curl -d '0x0824d061+0x0824d1cf' http://remote_host:80/pprof/symbol
+</pre>
+
+<p>The server should read the POST data, which will be in one line,
+and for each hex value, should write one line of output to the output
+stream, like so:</p>
+<pre>
+&lt;hex address&gt;&lt;tab&gt;&lt;function name&gt;
+</pre>
+<p>For instance:</p>
+<pre>
+0x08b2dabd    _Update
+</pre>
+
+<p>The other reason this is the most difficult request to implement
+is that the application will have to figure out for itself how to map
+from address to function name.  One possibility is to run <code>nm -C
+-n &lt;program name&gt;</code> to get the mappings, either statically
+(say at program-compile time), or dynamically, by having the
+application call out to <code>nm</code> for every
+<code>pprof/symbol</code> call (presumably with some caching!).</p>
+
+<p><code>pprof</code> itself does just this for local profiles (not
+ones that talk to remote servers); look at the subroutine
+<code>GetProcedureBoundaries</code>.</p>
+
+</li></ul>
+<hr>
+Last modified: Mon Jun 12 21:30:14 PDT 2006
+</body>
+</html>
diff --git a/src/base/linux_syscall_support.h b/src/base/linux_syscall_support.h
index 319455e..0dfdd8d 100644
--- a/src/base/linux_syscall_support.h
+++ b/src/base/linux_syscall_support.h
@@ -45,6 +45,14 @@
 #if (defined(__i386__) || defined(__x86_64__) || defined(__ARM_ARCH_3__)) && \
     defined(__linux)
 
+#ifdef __cplusplus
+/* Some system header files in older versions of gcc neglect to properly
+ * handle being included from C++. As it appears to be harmless to have
+ * multiple nested 'extern "C"' blocks, just add another one here.
+ */
+extern "C" {
+#endif
+
 #include <errno.h>
 #include <signal.h>
 #include <stdarg.h>
@@ -79,35 +87,47 @@
 
 #if defined(__i386__)
 #ifndef __NR_getdents64
-#define __NR_getdents64   220
+#define __NR_getdents64         220
 #endif
 #ifndef __NR_gettid
-#define __NR_gettid       224
+#define __NR_gettid             224
 #endif
 #ifndef __NR_futex
-#define __NR_futex        240
+#define __NR_futex              240
+#endif
+#ifndef __NR_sched_setaffinity
+#define __NR_sched_setaffinity  241
+#define __NR_sched_getaffinity  242
 #endif
 /* End of i386 definitions                                                   */
 #elif defined(__ARM_ARCH_3__)
 #ifndef __NR_getdents64
-#define __NR_getdents64   217
+#define __NR_getdents64         (__NR_SYSCALL_BASE + 217)
 #endif
 #ifndef __NR_gettid
-#define __NR_gettid       224
+#define __NR_gettid             (__NR_SYSCALL_BASE + 224)
 #endif
 #ifndef __NR_futex
-#define __NR_futex        240
+#define __NR_futex              (__NR_SYSCALL_BASE + 240)
+#endif
+#ifndef __NR_sched_setaffinity
+#define __NR_sched_setaffinity  (__NR_SYSCALL_BASE + 241)
+#define __NR_sched_getaffinity  (__NR_SYSCALL_BASE + 242)
 #endif
 /* End of ARM 3 definitions                                                  */
 #elif defined(__x86_64__)
 #ifndef __NR_getdents64
-#define __NR_getdents64   217
+#define __NR_getdents64         217
 #endif
 #ifndef __NR_gettid
-#define __NR_gettid       186
+#define __NR_gettid             186
 #endif
 #ifndef __NR_futex
-#define __NR_futex        202
+#define __NR_futex              202
+#endif
+#ifndef __NR_sched_setaffinity
+#define __NR_sched_setaffinity  203
+#define __NR_sched_getaffinity  204
 #endif
 /* End of x86-64 definitions                                                 */
 #endif
@@ -306,9 +326,11 @@ struct dirent64;
   #endif
   #if defined(__x86_64__)
     struct msghdr;
+    struct sockaddr;
     #define __NR_sys_mmap           __NR_mmap
     #define __NR_sys_recvmsg        __NR_recvmsg
     #define __NR_sys_sendmsg        __NR_sendmsg
+    #define __NR_sys_sendto         __NR_sendto
     #define __NR_sys_shutdown       __NR_shutdown
     #define __NR_sys_rt_sigaction   __NR_rt_sigaction
     #define __NR_sys_rt_sigprocmask __NR_rt_sigprocmask
@@ -322,6 +344,10 @@ struct dirent64;
                             struct msghdr*,          m, int, f);
     static inline _syscall3(int, sys_sendmsg,        int,   s,
                             const struct msghdr*,    m, int, f);
+    static inline _syscall6(int, sys_sendto,         int,   s,
+                            const void*,             m, size_t, l,
+                            int,                     f,
+                            const struct sockaddr*,  a, int, t);
     static inline _syscall2(int, sys_shutdown,       int,   s,
                             int,                     h);
     static inline _syscall4(int, sys_rt_sigaction,   int,   s,
@@ -378,6 +404,8 @@ struct dirent64;
     }
     #define sys_recvmsg(s,m,f)      sys_socketcall(17,      (s), (m), (f))
     #define sys_sendmsg(s,m,f)      sys_socketcall(16,      (s), (m), (f))
+    #define sys_sendto(s,m,l,f,a,t) sys_socketcall(11,      (s), (m), (l),(f),\
+                                                            (a), (t))
     #define sys_shutdown(s,h)       sys_socketcall(13,      (s), (h))
     #define sys_socket(d,t,p)       sys_socketcall(1,       (d), (t), (p))
     #define sys_socketpair(d,t,p,s) sys_socketcall(8,       (d), (t), (p),(s))
@@ -387,39 +415,41 @@ struct dirent64;
     static inline _syscall3(pid_t, sys_waitpid,      pid_t, p,
                             int*,              s,    int,   o);
   #endif
-  #define __NR_sys_close        __NR_close
-  #define __NR_sys_dup          __NR_dup
-  #define __NR_sys_dup2         __NR_dup2
-  #define __NR_sys_execve       __NR_execve
-  #define __NR_sys__exit        __NR_exit
-  #define __NR_sys_fcntl        __NR_fcntl
-  #define __NR_sys_fork         __NR_fork
-  #define __NR_sys_fstat        __NR_fstat
-  #define __NR_sys_getdents     __NR_getdents
-  #define __NR_sys_getdents64   __NR_getdents64
-  #define __NR_sys_getegid      __NR_getegid
-  #define __NR_sys_geteuid      __NR_geteuid
-  #define __NR_sys_getpgrp      __NR_getpgrp
-  #define __NR_sys_getpid       __NR_getpid
-  #define __NR_sys_getppid      __NR_getppid
-  #define __NR_sys_getpriority  __NR_getpriority
-  #define __NR_sys_getrlimit    __NR_getrlimit
-  #define __NR_sys_getsid       __NR_getsid
-  #define __NR__gettid          __NR_gettid
-  #define __NR_sys_kill         __NR_kill
-  #define __NR_sys_lseek        __NR_lseek
-  #define __NR_sys_munmap       __NR_munmap
-  #define __NR_sys_open         __NR_open
-  #define __NR_sys_pipe         __NR_pipe
-  #define __NR_sys_prctl        __NR_prctl
-  #define __NR_sys_ptrace       __NR_ptrace
-  #define __NR_sys_read         __NR_read
-  #define __NR_sys_readlink     __NR_readlink
-  #define __NR_sys_sched_yield  __NR_sched_yield
-  #define __NR_sys_sigaltstack  __NR_sigaltstack
-  #define __NR_sys_stat         __NR_stat
-  #define __NR_sys_write        __NR_write
-  #define __NR_sys_futex        __NR_futex
+  #define __NR_sys_close              __NR_close
+  #define __NR_sys_dup                __NR_dup
+  #define __NR_sys_dup2               __NR_dup2
+  #define __NR_sys_execve             __NR_execve
+  #define __NR_sys__exit              __NR_exit
+  #define __NR_sys_fcntl              __NR_fcntl
+  #define __NR_sys_fork               __NR_fork
+  #define __NR_sys_fstat              __NR_fstat
+  #define __NR_sys_futex              __NR_futex
+  #define __NR_sys_getdents           __NR_getdents
+  #define __NR_sys_getdents64         __NR_getdents64
+  #define __NR_sys_getegid            __NR_getegid
+  #define __NR_sys_geteuid            __NR_geteuid
+  #define __NR_sys_getpgrp            __NR_getpgrp
+  #define __NR_sys_getpid             __NR_getpid
+  #define __NR_sys_getppid            __NR_getppid
+  #define __NR_sys_getpriority        __NR_getpriority
+  #define __NR_sys_getrlimit          __NR_getrlimit
+  #define __NR_sys_getsid             __NR_getsid
+  #define __NR__gettid                __NR_gettid
+  #define __NR_sys_kill               __NR_kill
+  #define __NR_sys_lseek              __NR_lseek
+  #define __NR_sys_munmap             __NR_munmap
+  #define __NR_sys_open               __NR_open
+  #define __NR_sys_pipe               __NR_pipe
+  #define __NR_sys_prctl              __NR_prctl
+  #define __NR_sys_ptrace             __NR_ptrace
+  #define __NR_sys_read               __NR_read
+  #define __NR_sys_readlink           __NR_readlink
+  #define __NR_sys_sched_getaffinity  __NR_sched_getaffinity
+  #define __NR_sys_sched_setaffinity  __NR_sched_setaffinity
+  #define __NR_sys_sched_yield        __NR_sched_yield
+  #define __NR_sys_sigaltstack        __NR_sigaltstack
+  #define __NR_sys_stat               __NR_stat
+  #define __NR_sys_write              __NR_write
   static inline _syscall1(int,     sys_close,       int,         f);
   static inline _syscall1(int,     sys_dup,         int,         f);
   static inline _syscall2(int,     sys_dup2,        int,         s,
@@ -432,6 +462,8 @@ struct dirent64;
   static inline _syscall0(pid_t,   sys_fork);
   static inline _syscall2(int,     sys_fstat,       int,         f,
                           struct stat*,   b);
+  static inline _syscall4(int, sys_futex, int*, addrx, int, opx, int, valx,
+                          struct timespec *, timeoutx);
   static inline _syscall3(int,   sys_getdents,      int,         f,
                           struct dirent*, d, int,    c);
   static inline _syscall3(int,   sys_getdents64,    int,         f,
@@ -464,6 +496,10 @@ struct dirent64;
                           void *,         b, size_t, c);
   static inline _syscall3(int,     sys_readlink,    const char*, p,
                           char*,          b, size_t, s);
+  static inline _syscall3(int, sys_sched_getaffinity, pid_t, pid,
+                          unsigned int, len, unsigned long *, mask);
+  static inline _syscall3(int, sys_sched_setaffinity, pid_t, pid,
+                          unsigned int, len, unsigned long *, mask);
   static inline _syscall0(int,     sys_sched_yield);
   static inline _syscall2(int,     sys_sigaltstack, const stack_t*, s,
                           const stack_t*, o);
@@ -471,8 +507,6 @@ struct dirent64;
                           struct stat*,   b);
   static inline _syscall3(ssize_t, sys_write,        int,        f,
                           const void *,   b, size_t, c);
-  static inline _syscall4(int, sys_futex, int*, addrx, int, opx, int, valx,
-                          struct timespec *, timeoutx);
 
   static inline int sys_sysconf(int name) {
     extern int __getpagesize(void);
@@ -517,6 +551,9 @@ struct dirent64;
   #undef RETURN
 #endif
 
+#ifdef __cplusplus
+}
+#endif
 
 #endif
 #endif
diff --git a/src/base/linuxthreads.c b/src/base/linuxthreads.c
index e721582..3696987 100644
--- a/src/base/linuxthreads.c
+++ b/src/base/linuxthreads.c
@@ -51,6 +51,10 @@
 #include "base/linux_syscall_support.h"
 #include "base/thread_lister.h"
 
+#ifndef CLONE_UNTRACED
+#define CLONE_UNTRACED 0x00800000
+#endif
+
 
 /* itoa() is not a standard function, and we cannot safely call printf()
  * after suspending threads. So, we just implement our own copy. A
@@ -97,8 +101,19 @@ static int local_clone (int (*fn)(void *), void *arg, ...) {
    * Leave 4kB of gap between the callers stack and the new clone. This
    * should be more than sufficient for the caller to call waitpid() until
    * the cloned thread terminates.
+   *
+   * It is important that we set the CLONE_UNTRACED flag, because newer
+   * versions of "gdb" otherwise attempt to attach to our thread, and will
+   * attempt to reap its status codes. This subsequently results in the
+   * caller hanging indefinitely in waitpid(), waiting for a change in
+   * status that will never happen. By setting the CLONE_UNTRACED flag, we
+   * prevent "gdb" from stealing events, but we still expect the thread
+   * lister to fail, because it cannot PTRACE_ATTACH to the process that
+   * is being debugged. This is OK and the error code will be reported
+   * correctly.
    */
-  return clone(fn, (char *)&arg - 4096, CLONE_VM|CLONE_FS|CLONE_FILES, arg);
+  return clone(fn, (char *)&arg - 4096,
+               CLONE_VM|CLONE_FS|CLONE_FILES|CLONE_UNTRACED, arg);
 }
 
 
@@ -209,7 +224,8 @@ struct ListerParams {
 static void ListerThread(struct ListerParams *args) {
   static const int  signals[]  = { SIGABRT, SIGILL, SIGFPE, SIGSEGV, SIGBUS,
                                    SIGXCPU, SIGXFSZ };
-  pid_t             clone_pid  = sys_gettid();
+  int               found_parent = 0;
+  pid_t             clone_pid  = sys_gettid(), ppid = sys_getppid();
   char              proc_self_task[80], marker_name[48], *marker_path;
   const char        *proc_paths[3];
   const char *const *proc_path = proc_paths;
@@ -239,8 +255,7 @@ static void ListerThread(struct ListerParams *args) {
   }
 
   /* Compute search paths for finding thread directories in /proc            */
-  local_itoa(strrchr(strcpy(proc_self_task, "/proc/"), '\000'),
-             sys_getppid());
+  local_itoa(strrchr(strcpy(proc_self_task, "/proc/"), '\000'), ppid);
   marker_path = strrchr(strcpy(marker_name, proc_self_task), '\000');
   strcat(proc_self_task, "/task/");
   proc_paths[0] = proc_self_task; /* /proc/$$/task/                          */
@@ -417,6 +432,7 @@ static void ListerThread(struct ListerParams *args) {
                   num_threads--;
                   sig_num_threads = num_threads;
                 } else {
+                  found_parent |= pid == ppid;
                   added_entries++;
                 }
               }
@@ -435,6 +451,16 @@ static void ListerThread(struct ListerParams *args) {
         NO_INTR(sys_close(marker));
         sig_marker = marker = -1;
 
+        /* If we never found the parent process, something is very wrong.
+         * Most likely, we are running in a debugger. Any attempt to operate
+         * on the threads would be very incomplete. Let's just report an
+         * error to the caller.
+         */
+        if (!found_parent) {
+          ResumeAllProcessThreads(num_threads, pids);
+          sys__exit(3);
+        }
+
         /* Now we are ready to call the callback,
          * which takes care of resuming the threads for us.
          */
@@ -530,6 +556,9 @@ int ListAllProcessThreads(void *parameter,
       case 2: args.err = EFAULT; /* Some fault (e.g. SIGSEGV) detected       */
               args.result = -1;
               break;
+      case 3: args.err = EPERM;  /* Process is already being traced          */
+              args.result = -1;
+              break;
       default:args.err = ECHILD; /* Child died unexpectedly                  */
               args.result = -1;
               break;
diff --git a/src/base/thread_lister.c b/src/base/thread_lister.c
index 6def758..f3df16b 100644
--- a/src/base/thread_lister.c
+++ b/src/base/thread_lister.c
@@ -31,7 +31,8 @@
  * Author: Markus Gutschke
  */
 
-#include <stdio.h>         // needed for NULL on some powerpc platforms (?!)
+#include <stdio.h>         /* needed for NULL on some powerpc platforms (?!) */
+#include <sys/prctl.h>
 #include "base/thread_lister.h"
 #include "base/linuxthreads.h"
 /* Include other thread listers here that define THREADS macro
@@ -46,16 +47,23 @@
 
 int ListAllProcessThreads(void *parameter,
                           ListAllProcessThreadsCallBack callback, ...) {
-  int     rc;
+  int rc;
   va_list ap;
 
+  int dumpable = prctl(PR_GET_DUMPABLE, 0);
+  if (!dumpable)
+    prctl(PR_SET_DUMPABLE, 1);
   va_start(ap, callback);
-  rc = callback(parameter, 0, NULL, ap);
+  pid_t pid = getpid();
+  rc = callback(parameter, 1, &pid, ap);
   va_end(ap);
+  if (!dumpable)
+    prctl(PR_SET_DUMPABLE, 0);
   return rc;
 }
 
-void ResumeAllProcessThreads(int num_threads, pid_t *thread_pids) {
+int ResumeAllProcessThreads(int num_threads, pid_t *thread_pids) {
+  return 1;
 }
 
 #endif
diff --git a/src/google/heap-checker.h b/src/google/heap-checker.h
index 66d23de..f888ae0 100644
--- a/src/google/heap-checker.h
+++ b/src/google/heap-checker.h
@@ -255,6 +255,19 @@ class HeapCleaner {
 };
 
 class HeapLeakChecker {
+ public:  // Static functions for working with (whole-program) leak checking.
+ 
+  // Returns true if heap leak checking is currently active in some mode,
+  // e.g. if leak checking was started (and is still active now)
+  // due to any valid non-empty --heap_check flag value
+  // (including "local") on the command-line
+  // or via a dependency on //base:heapcheck.
+  // The return value tells whether HeapLeakChecker objects manually
+  // constructed right now will do leak checking or do nothing.
+  // Note that we can go from active to inactive state during InitGoogle()
+  // if FLAGS_heap_check gets set to "" by some code before/during InitGoogle().
+  static bool IsActive();
+
  public:  // Non-static functions for starting and doing leak checking.
 
   // Start checking and name the leak check performed.
diff --git a/src/heap-checker.cc b/src/heap-checker.cc
index dc9c46d..4e8e2dc 100644
--- a/src/heap-checker.cc
+++ b/src/heap-checker.cc
@@ -468,6 +468,18 @@ static bool RecordGlobalDataLocked(uint64 start_address,
   if (inode == 0)
     return true;
 
+  // Sometimes people mmap their own files read-write.  That would cause
+  // the strict ELF checker later to reject them.  We do not want to loosen
+  // up the ELF checker, because we need to catch freaky files if they
+  // show up.  So, make an exception for common files that we have seen.
+  //
+  // TODO(mec): the longer this gets, the more attractive it is to
+  // check for the ELF header and just accept all non-ELF files.
+  if (inode != 0) {
+    if (filename && strcmp(filename, "/dev/zero") == 0)
+      return true;
+  }
+
   // Grab some ELF types.
 #ifdef _LP64
   typedef Elf64_Ehdr ElfFileHeader;
@@ -692,8 +704,15 @@ HeapLeakChecker::UseProcMaps(ProcMapsTask proc_maps_task) {
                           "Looking at /proc/self/maps line:\n  %s\n",
                           proc_map_line);
 
-    if (start_address >= end_address)
-      abort();
+    if (start_address >= end_address) {
+      // Crash if a line we can be interested in is ill-formed:
+      if (inode != 0)  abort();
+      // Skip other ill-formed lines: some can occur,
+      // probably due to the interplay of how /proc/self/maps is updated
+      // while we read it in chunks in ProcMapsIterator and
+      // do things in this loop.
+      continue;
+    }
 
     // Determine if any shared libraries are present.
     if (inode != 0 && strstr(filename, "lib") && strstr(filename, ".so")) {
@@ -738,6 +757,14 @@ static int64 live_bytes_total = 0;
 // (protected by our lock; IgnoreAllLiveObjectsLocked sets it)
 static pid_t self_thread_pid = 0;
 
+// Status of our thread listing callback execution
+// (protected by our lock; used from within IgnoreAllLiveObjectsLocked)
+static enum {
+  CALLBACK_NOT_STARTED,
+  CALLBACK_STARTED,
+  CALLBACK_COMPLETED,
+} thread_listing_status = CALLBACK_NOT_STARTED;
+
 // Ideally to avoid deadlocks this function should not result in any libc
 // or other function calls that might need to lock a mutex:
 // It is called when all threads of a process are stopped
@@ -774,6 +801,7 @@ int HeapLeakChecker::IgnoreLiveThreads(void* parameter,
                                        int num_threads,
                                        pid_t* thread_pids,
                                        va_list ap) {
+  thread_listing_status = CALLBACK_STARTED;
   if (HeapProfiler::kMaxLogging) {
     HeapProfiler::MESSAGE(2, "HeapChecker: Found %d threads (from pid %d)\n",
                           num_threads, getpid());
@@ -838,6 +866,7 @@ int HeapLeakChecker::IgnoreLiveThreads(void* parameter,
   IgnoreNonThreadLiveObjectsLocked();
   // Can now resume the threads:
   ResumeAllProcessThreads(num_threads, thread_pids);
+  thread_listing_status = CALLBACK_COMPLETED;
   return failures;
 }
 
@@ -928,7 +957,8 @@ IgnoreAllLiveObjectsLocked(const StackExtent& self_stack) {
     UseProcMaps(RECORD_GLOBAL_DATA_LOCKED);
   }
   // Ignore all thread stacks:
-  bool executed_with_threads_stopped = false;
+  thread_listing_status = CALLBACK_NOT_STARTED;
+  bool need_to_ignore_non_thread_objects = true;
   self_thread_pid = getpid();
   self_thread_stack = self_stack;
   if (FLAGS_heap_check_ignore_thread_live) {
@@ -939,10 +969,22 @@ IgnoreAllLiveObjectsLocked(const StackExtent& self_stack) {
     //  if not suspended they could still mess with the pointer
     //  graph while we walk it).
     int r = ListAllProcessThreads(NULL, IgnoreLiveThreads);
-    executed_with_threads_stopped = (r >= 0);
-    if (r == -1) {
-      HeapProfiler::MESSAGE(0, "HeapChecker: Could not find thread stacks; "
-                               "may get false leak reports\n");
+    need_to_ignore_non_thread_objects = r < 0;
+    if (r < 0) {
+      HeapProfiler::MESSAGE(0, "HeapChecker: thread finding failed "
+                               "with %d errno=%d\n", r, errno);
+      if (thread_listing_status == CALLBACK_COMPLETED) {
+        HeapProfiler::MESSAGE(0, "HeapChecker: thread finding callback "
+                                 "finished ok; hopefully everything is fine\n");
+        need_to_ignore_non_thread_objects = false;
+      } else if (thread_listing_status == CALLBACK_STARTED) {
+        HeapProfiler::MESSAGE(0, "HeapChecker: thread finding callback was "
+                                 "interrupted or crashed; can't fix this\n");
+        abort();
+      } else {  // CALLBACK_NOT_STARTED
+        HeapProfiler::MESSAGE(0, "HeapChecker: Could not find thread stacks; "
+                                 "may get false leak reports\n");
+      }
     } else if (r != 0) {
       HeapProfiler::MESSAGE(0, "HeapChecker: Thread stacks not found "
                                "for %d threads; may get false leak reports\n",
@@ -960,7 +1002,7 @@ IgnoreAllLiveObjectsLocked(const StackExtent& self_stack) {
   }
   // Do all other live data ignoring here if we did not do it
   // within thread listing callback with all threads stopped.
-  if (!executed_with_threads_stopped)  IgnoreNonThreadLiveObjectsLocked();
+  if (need_to_ignore_non_thread_objects)  IgnoreNonThreadLiveObjectsLocked();
   if (live_objects_total) {
     HeapProfiler::MESSAGE(0, "HeapChecker: "
                           "Ignoring "LLD" reachable "
@@ -1349,10 +1391,13 @@ bool HeapLeakChecker::DoNoLeaks(bool same_heap,
       (same_heap ? (inuse_bytes_increase_ != 0 || inuse_allocs_increase_ != 0)
                  : (inuse_bytes_increase_ > 0 || inuse_allocs_increase_ > 0));
     if (see_leaks || do_full) {
+      bool pprof_can_ignore = false;
+      const char* command_tail = " --text 2>/dev/null";  // normal command
       const char* gv_command_tail
         = " --edgefraction=1e-10 --nodefraction=1e-10 --gv 2>/dev/null";
       string ignore_re;
       if (disabled_regexp) {
+        pprof_can_ignore = true;
         ignore_re += " --ignore='^";
         ignore_re += *disabled_regexp;
         ignore_re += "$'";
@@ -1361,22 +1406,29 @@ bool HeapLeakChecker::DoNoLeaks(bool same_heap,
       // some STLs can give us spurious leak alerts (since the STL tries to
       // do its own memory pooling), so we avoid it by using STL as little
       // as possible for "big" objects that might require "lots" of memory.
-      char command[6 * PATH_MAX + 200];
+      char base_command[6 * PATH_MAX + 200];
+      char beg_profile[PATH_MAX+1], end_profile[PATH_MAX+1];
       if (use_initial_profile) {
+        snprintf(beg_profile, sizeof(beg_profile), "%s.%s-beg.heap",
+                 profile_prefix->c_str(), name_);
         // compare against initial profile only if need to
         const char* drop_negative = same_heap ? "" : " --drop_negative";
-        snprintf(command, sizeof(command), "%s --base=\"%s.%s-beg.heap\" %s ",
-                 pprof_path(), profile_prefix->c_str(), name_,
-                 drop_negative);
+        snprintf(base_command, sizeof(base_command),
+                 "%s --base=\"%s\" %s ",
+                 pprof_path(), beg_profile, drop_negative);
       } else {
-        snprintf(command, sizeof(command), "%s",
+        beg_profile[0] = '\0';
+        snprintf(base_command, sizeof(base_command), "%s",
                  pprof_path());
       }
-      snprintf(command + strlen(command), sizeof(command) - strlen(command),
-               " %s \"%s.%s-end.heap\" %s --inuse_objects --lines",
-               invocation_path(), profile_prefix->c_str(),
-               name_, ignore_re.c_str());
+      snprintf(end_profile, sizeof(end_profile), "%s.%s-end.heap",
+               profile_prefix->c_str(), name_);
+      snprintf(base_command + strlen(base_command),
+               sizeof(base_command) - strlen(base_command),
+               " %s \"%s\" %s --inuse_objects --lines",
+               invocation_path(), end_profile, ignore_re.c_str());
                    // --lines is important here to catch leaks when !see_leaks
+
       char cwd[PATH_MAX+1];
       if (getcwd(cwd, sizeof(cwd)) != cwd)  abort();
       if (see_leaks) {
@@ -1390,7 +1442,7 @@ bool HeapLeakChecker::DoNoLeaks(bool same_heap,
                               "To investigate leaks manually use e.g.\n"
                               "cd %s; "  // for proper symbol resolution
                               "%s%s\n\n",
-                              cwd, command, gv_command_tail);
+                              cwd, base_command, gv_command_tail);
       }
       string output;
       int checked_leaks = 0;
@@ -1403,14 +1455,18 @@ bool HeapLeakChecker::DoNoLeaks(bool same_heap,
         } else {
           // We don't care about pprof's stderr as long as it
           // succeeds with empty report:
-          checked_leaks = GetStatusOutput(command, &output);
+          char full_command[6 * PATH_MAX + 200];   // needed to concatenate
+          snprintf(full_command, sizeof(full_command), "%s%s",
+                   base_command, command_tail);
+          checked_leaks = GetStatusOutput(full_command, &output);
           if (checked_leaks != 0) {
             HeapProfiler::MESSAGE(-1, "ERROR: Could not run pprof at %s\n",
                                   pprof_path());
             abort();
           }
         }
-        if (see_leaks && output.empty() && checked_leaks == 0) {
+        if (see_leaks && pprof_can_ignore &&
+            output.empty() && checked_leaks == 0) {
           HeapProfiler::MESSAGE(-1, "HeapChecker: "
                                 "These must be leaks that we disabled"
                                 " (pprof succeeded)! This check WILL FAIL"
@@ -1420,7 +1476,24 @@ bool HeapLeakChecker::DoNoLeaks(bool same_heap,
         // do not fail the check just due to us being a stripped binary
         if (!see_leaks  &&  strstr(output.c_str(), "nm: ") != NULL  &&
             strstr(output.c_str(), ": no symbols") != NULL)  output.resize(0);
-        if (!(see_leaks || checked_leaks == 0))  abort();
+      }
+      // Make sure the profiles we created are still there.
+      // They can get deleted e.g. if the program forks/executes itself
+      // and FLAGS_cleanup_old_heap_profiles was kept as true.
+      if (access(end_profile, R_OK) != 0  ||
+          (beg_profile[0]  &&  access(beg_profile, R_OK) != 0)) {
+        HeapProfiler::MESSAGE(-1, "HeapChecker: "
+                              "One of the heap profiles is gone: %s %s\n",
+                              beg_profile, end_profile);
+        abort();
+      }
+      if (!(see_leaks || checked_leaks == 0)) {
+        // Crash if something went wrong with executing pprof
+        // and we rely on pprof to do its work:
+        HeapProfiler::MESSAGE(-1, "HeapChecker: "
+                              "pprof command failed: %s%s\n",
+                              base_command, command_tail);
+        abort();
       }
       if (see_leaks  &&  use_initial_profile) {
         HeapProfiler::MESSAGE(-1, "HeapChecker: "
@@ -1438,7 +1511,7 @@ bool HeapLeakChecker::DoNoLeaks(bool same_heap,
                               "To investigate leaks manually uge e.g.\n"
                               "cd %s; "  // for proper symbol resolution
                               "%s%s\n\n",
-                              name_, cwd, command, gv_command_tail);
+                              name_, cwd, base_command, gv_command_tail);
         if (use_initial_profile) {
           HeapProfiler::MESSAGE(-1, "HeapChecker: "
                                 "CAVEAT: Some of the reported leaks might have "
@@ -1491,6 +1564,10 @@ HeapLeakChecker::~HeapLeakChecker() {
 // HeapLeakChecker overall heap check components
 //----------------------------------------------------------------------
 
+bool HeapLeakChecker::IsActive() {
+  return heap_checker_on;
+}
+
 vector<HeapCleaner::void_function>* HeapCleaner::heap_cleanups_ = NULL;
 
 // When a HeapCleaner object is intialized, add its function to the static list
@@ -1653,7 +1730,7 @@ void HeapLeakChecker::DoMainHeapCheck() {
     HeapProfiler::MESSAGE(0, "HeapChecker: "
                              "Checking for whole-program memory leaks\n");
     if (!main_heap_checker->DoNoLeaks(same_heap, do_full, do_report)) {
-      HeapProfiler::MESSAGE(-1, "ERROR: Leaks found in main heap check, aborting\n");
+      HeapProfiler::MESSAGE(-1, "HeapChecker: crashing because of leaks\n");
       abort();
     }
     delete main_heap_checker;
diff --git a/src/malloc_extension.cc b/src/malloc_extension.cc
index 0260a34..686b4bc 100644
--- a/src/malloc_extension.cc
+++ b/src/malloc_extension.cc
@@ -166,6 +166,21 @@ struct StackTraceHash {
     }
     return h;
   }
+  // Less operator for MSVC's hash containers.
+  bool operator()(void** entry1, void** entry2) const {
+    if (Depth(entry1) != Depth(entry2))
+      return Depth(entry1) < Depth(entry2);
+    for (int i = 0; i < Depth(entry1); i++) {
+      if (PC(entry1, i) != PC(entry2, i)) {
+        return PC(entry1, i) < PC(entry2, i);
+      }
+    }
+    return false;  // entries are equal
+  }
+  // These two public members are required by msvc.  4 and 8 are the
+  // default values.
+  static const size_t bucket_size = 4;
+  static const size_t min_buckets = 8;
 };
 
 struct StackTraceEqual {
diff --git a/src/malloc_hook.cc b/src/malloc_hook.cc
index 8499c73..613e612 100644
--- a/src/malloc_hook.cc
+++ b/src/malloc_hook.cc
@@ -115,7 +115,7 @@ extern "C" void* mmap64(void *start, size_t length,
                         int fd, __off64_t offset) __THROW {
 
   void *result;
-  result = syscall(SYS_mmap, start, length, prot, flags, fd, offset);
+  result = (void *)syscall(SYS_mmap, start, length, prot, flags, fd, offset);
   MallocHook::InvokeMmapHook(result, start, length, prot, flags, fd, offset);
   return result;
 }
diff --git a/src/pprof b/src/pprof
index 5df1798..24b5b74 100755
--- a/src/pprof
+++ b/src/pprof
@@ -41,6 +41,9 @@
 # Examples:
 #
 # % tools/pprof "program" "profile"
+#   Enters "interactive" mode
+#
+# % tools/pprof --text "program" "profile"
 #   Generates one line per procedure
 #
 # % tools/pprof --gv "program" "profile"
@@ -68,6 +71,8 @@
 use strict;
 use Getopt::Long;
 
+my $PPROF_VERSION = "0.8";
+
 # These are the object tools we use, which come from various sources.
 # We want to invoke them directly, rather than via users' aliases and/or
 # search paths, because some people have colorizing versions of them that
@@ -79,9 +84,22 @@ my %obj_tool_map = (
   "objdump" => "objdump",
   "nm" => "nm",
   "addr2line" => "addr2line",
+  "c++filt" => "c++filt",
 );
 my $DOT = "dot";          # leave non-absolute, since it may be in /usr/local
 my $GV = "gv";
+# These are used for dynamic profiles
+my $WGET = "wget";
+my $CURL = "curl";
+
+# These are the web pages that servers need to support for dynamic profiles
+my $HEAP_PAGE = "/pprof/heap";
+my $PROFILE_PAGE = "/pprof/profile";   # must support cgi-param "?seconds=#"
+my $GROWTH_PAGE = "/pprof/growth";
+my $CONTENTION_PAGE = "/pprof/contention";
+my $SYMBOL_PAGE = "/pprof/symbol";     # must support symbol lookup via POST
+my $PROGRAM_NAME_PAGE = "/pprof/cmdline";
+
 
 # There is a pervasive dependency on the length (in hex characters, i.e.,
 # nibbles) of an address, distinguishing between 32-bit and 64-bit profiles:
@@ -90,23 +108,40 @@ my $address_length = 8;   # Hope for 32-bit, reset if 64-bit detected.
 ##### Argument parsing #####
 
 sub usage_string {
-  return <<'EOF';
-Usage: pprof [options] <program> <profile> ...
-   Prints specified cpu- or heap-profile
-   
+  return <<EOF;
+Usage:
+pprof [options] <program> <profiles>
+   <profiles> is a space separated list of profile names.
+pprof [options] <profile>
+   <profile> is a remote form.  Symbols are obtained from host:port$SYMBOL_PAGE
+
+   Each profile name can be:
+   /path/to/profile        - a path to a profile file
+   host:port[/<service>]   - a location of a service to get profile from
+
+   The /<service> can be $HEAP_PAGE, $PROFILE_PAGE, $GROWTH_PAGE, or $CONTENTION_PAGE.
+   For instance: "pprof http://myserver.com:80$HEAP_PAGE".
+   If /<service> is omitted, the service defaults to $PROFILE_PAGE (cpu profiling).
+
+   For more help with querying remote servers, including how to add the
+   necessary server-side support code, see this filename (or one like it):
+
+   /usr/doc/google-perftools-$PPROF_VERSION/pprof_remote_servers.html
+
 Options:
    --cum               Sort by cumulative data
    --base=<base>       Subtract <base> from <profile> before display
-   --interactive       Run in interactive mode (interactive "help" gives help)
-   
+   --interactive       Run in interactive mode (interactive "help" gives help) [default]
+   --seconds=<n>       Length of time for dynamic profiles [default=30 secs]
+
 Reporting Granularity:
    --addresses         Report at address level
    --lines             Report at source line level
    --functions         Report at function level [default]
    --files             Report at source file level
-   
+
 Output type:
-   --text              Generate text report [default]
+   --text              Generate text report
    --gv                Generate Postscript and display
    --list=<regexp>     Generate source listing of matching routines
    --disasm=<regexp>   Generate disassembly of matching routines
@@ -114,7 +149,7 @@ Output type:
    --ps                Generate Postcript to stdout
    --pdf               Generate PDF to stdout
    --gif               Generate GIF to stdout
-   
+
 Heap-Profile Options:
    --inuse_space       Display in-use (mega)bytes [default]
    --inuse_objects     Display in-use objects
@@ -122,7 +157,12 @@ Heap-Profile Options:
    --alloc_objects     Display allocated objects
    --show_bytes        Display space in bytes
    --drop_negative     Ignore negative differences
-   
+
+Contention-profile options:
+   --total_delay       Display total delay at each region [default]
+   --contentions       Display number of delays at each region
+   --mean_delay        Display mean delay at each region
+
 Call-graph Options:
    --nodecount=<n>     Show at most so many nodes [default=80]
    --nodefraction=<f>  Hide nodes below <f>*total [default=.005]
@@ -130,7 +170,7 @@ Call-graph Options:
    --focus=<regexp>    Focus on nodes matching <regexp>
    --ignore=<regexp>   Ignore nodes matching <regexp>
    --scale=<n>         Set GV scaling [default=0]
-   
+
 Miscellaneous:
    --tools=<prefix>    Prefix for object tool pathnames
    --test              Run unit tests
@@ -138,7 +178,7 @@ Miscellaneous:
    --version           Version information
 
 Examples:
-   
+
 pprof /bin/ls ls.prof
                        Outputs one line per procedure
 pprof --gv /bin/ls ls.prof
@@ -151,12 +191,14 @@ pprof --list=getdir /bin/ls ls.prof
                        (Per-line) annotated source listing for getdir()
 pprof --disasm=getdir /bin/ls ls.prof
                        (Per-PC) annotated disassembly for getdir()
+pprof localhost:1234
+                       Outputs one line per procedure for localhost:1234
 EOF
 }
 
 sub version_string {
-  return <<'EOF'
-pprof (part of google-perftools 0.7)
+  return <<EOF
+pprof (part of google-perftools $PPROF_VERSION)
 
 Copyright 1998-2006 Google Inc.
 
@@ -175,301 +217,387 @@ sub usage {
   exit(1);
 }
 
+sub Init() {
+  # Set up the tmp-file names and the handler that cleans them up.
+  # We do this at the very beginning so that the error() and
+  # cleanup() functions can be used anywhere from here on.
+  $main::tmpfile_sym = "/tmp/pprof$$.sym";
+  $main::tmpfile_ps = "/tmp/pprof$$";
+  $main::next_tmpfile = 0;
+  $SIG{'INT'} = \&sighandler;
 
-$main::opt_help = 0;
-$main::opt_version = 0;
 
-$main::opt_cum = 0;
-$main::opt_base = '';
-$main::opt_addresses = 0;
-$main::opt_lines = 0;
-$main::opt_functions = 0;
-$main::opt_files = 0;
+  $main::opt_help = 0;
+  $main::opt_version = 0;
 
-$main::opt_text = 0;
-$main::opt_list = "";
-$main::opt_disasm = "";
-$main::opt_gv = 0;
-$main::opt_dot = 0;
-$main::opt_ps = 0;
-$main::opt_pdf = 0;
-$main::opt_gif = 0;
-
-$main::opt_nodecount = 80;
-$main::opt_nodefraction = 0.005;
-$main::opt_edgefraction = 0.001;
-$main::opt_focus = '';
-$main::opt_ignore = '';
-$main::opt_scale = 0;
-
-$main::opt_inuse_space   = 0;
-$main::opt_inuse_objects = 0;
-$main::opt_alloc_space   = 0;
-$main::opt_alloc_objects = 0;
-$main::opt_show_bytes    = 0;
-$main::opt_drop_negative = 0;
-$main::opt_interactive   = 0;
-
-$main::opt_tools   = "";
-$main::opt_debug   = 0;
-$main::opt_test    = 0;
-
-# Are we printing a heap profile?
-$main::heap_profile = 0;
-
-# Are we printing a lock profile?
-$main::lock_profile = 0;
-
-GetOptions("help!"          => \$main::opt_help,
-	   "version!"       => \$main::opt_version,
-	   "cum!"           => \$main::opt_cum,
-	   "base=s"         => \$main::opt_base,
-	   "functions!"     => \$main::opt_functions,
-	   "lines!"         => \$main::opt_lines,
-	   "addresses!"     => \$main::opt_addresses,
-	   "files!"         => \$main::opt_files,
-	   "text!"          => \$main::opt_text,
-	   "list=s"         => \$main::opt_list,
-	   "disasm=s"       => \$main::opt_disasm,
-	   "gv!"            => \$main::opt_gv,
-	   "dot!"           => \$main::opt_dot,
-	   "ps!"            => \$main::opt_ps,
-	   "pdf!"           => \$main::opt_pdf,
-	   "gif!"           => \$main::opt_gif,
-	   "interactive!"   => \$main::opt_interactive,
-	   "nodecount=i"    => \$main::opt_nodecount,
-	   "nodefraction=f" => \$main::opt_nodefraction,
-	   "edgefraction=f" => \$main::opt_edgefraction,
-	   "focus=s"        => \$main::opt_focus,
-	   "ignore=s"       => \$main::opt_ignore,
-	   "scale=i"        => \$main::opt_scale,
-	   "inuse_space!"   => \$main::opt_inuse_space,
-	   "inuse_objects!" => \$main::opt_inuse_objects,
-	   "alloc_space!"   => \$main::opt_alloc_space,
-	   "alloc_objects!" => \$main::opt_alloc_objects,
-	   "show_bytes!"    => \$main::opt_show_bytes,
-	   "drop_negative!" => \$main::opt_drop_negative,
-           "tools=s"        => \$main::opt_tools,
-           "test!"          => \$main::opt_test,
-           "debug!"         => \$main::opt_debug,
-	   ) || usage("Invalid option(s)");
-
-# Deal with the standard --help and --version
-if ($main::opt_help) {
-  print usage_string();
-  exit(0);
-}
-
-if ($main::opt_version) {
-  print version_string();
-  exit(0);
-}
-
-# Disassembly/listing mode requires address-level info
-if ($main::opt_disasm || $main::opt_list) {
-  $main::opt_functions = 0;
+  $main::opt_cum = 0;
+  $main::opt_base = '';
+  $main::opt_addresses = 0;
   $main::opt_lines = 0;
-  $main::opt_addresses = 1;
+  $main::opt_functions = 0;
   $main::opt_files = 0;
-}
 
-# Check heap-profiling flags
-if ($main::opt_inuse_space +
-    $main::opt_inuse_objects +
-    $main::opt_alloc_space +
-    $main::opt_alloc_objects > 1) {
-  usage("Specify at most on of --inuse/--alloc options");
-}
+  $main::opt_text = 0;
+  $main::opt_list = "";
+  $main::opt_disasm = "";
+  $main::opt_gv = 0;
+  $main::opt_dot = 0;
+  $main::opt_ps = 0;
+  $main::opt_pdf = 0;
+  $main::opt_gif = 0;
 
-# Check output granularities
-my $grains =
-  $main::opt_functions +
-  $main::opt_lines +
-  $main::opt_addresses +
-  $main::opt_files +
-  0;
-if ($grains > 1) {
-  usage("Only specify one output granularity option");
-}
-if ($grains == 0) {
-  $main::opt_functions = 1;
-}
+  $main::opt_nodecount = 80;
+  $main::opt_nodefraction = 0.005;
+  $main::opt_edgefraction = 0.001;
+  $main::opt_focus = '';
+  $main::opt_ignore = '';
+  $main::opt_scale = 0;
+  $main::opt_seconds = 30;
 
-# Check output modes
-my $modes =
-  $main::opt_text +
-  $main::opt_gv +
-  $main::opt_dot +
-  $main::opt_ps +
-  $main::opt_pdf +
-  $main::opt_gif +
-  0;
-if ($modes > 1) {
-  usage("Only specify one output mode");
-}
-if ($modes == 0) {
-  $main::opt_text = 1;
-}
+  $main::opt_inuse_space   = 0;
+  $main::opt_inuse_objects = 0;
+  $main::opt_alloc_space   = 0;
+  $main::opt_alloc_objects = 0;
+  $main::opt_show_bytes    = 0;
+  $main::opt_drop_negative = 0;
+  $main::opt_interactive   = 0;
 
-if ($main::opt_test) {
-  RunUnitTests();
-  # Should not return
-  exit(1);
-}
+  $main::opt_total_delay = 0;
+  $main::opt_contentions = 0;
+  $main::opt_mean_delay = 0;
 
-# Binary name and profile arguments list
-$main::prog = "";
-@main::pfile_args = ();
+  $main::opt_tools   = "";
+  $main::opt_debug   = 0;
+  $main::opt_test    = 0;
 
-$main::prog = shift || usage("Did not specify program");
-scalar(@ARGV) || usage("Did not specify profile file");
+  # Are we using $SYMBOL_PAGE?
+  $main::use_symbol_page = 0;
 
-# Parse profile file/location arguments
-foreach my $farg (@ARGV) {
-  unshift(@main::pfile_args, $farg);
-}
-ConfigureObjTools($main::prog);
+  # Are we printing a heap profile?
+  $main::heap_profile = 0;
 
-##### Main section #####
+  # Are we printing a lock profile?
+  $main::lock_profile = 0;
 
-# Setup tmp-file name and handler to clean it up
-$main::tmpfile_sym = "/tmp/pprof$$.sym";
-$main::tmpfile_ps = "/tmp/pprof$$";
-$main::next_tmpfile = 0;
-$main::collected_profile = undef;
-@main::profile_files = ();
-#$main::op_time = time();
-$SIG{'INT'} = \&sighandler;
+  GetOptions("help!"          => \$main::opt_help,
+             "version!"       => \$main::opt_version,
+             "cum!"           => \$main::opt_cum,
+             "base=s"         => \$main::opt_base,
+             "seconds=i"      => \$main::opt_seconds,
+             "functions!"     => \$main::opt_functions,
+             "lines!"         => \$main::opt_lines,
+             "addresses!"     => \$main::opt_addresses,
+             "files!"         => \$main::opt_files,
+             "text!"          => \$main::opt_text,
+             "list=s"         => \$main::opt_list,
+             "disasm=s"       => \$main::opt_disasm,
+             "gv!"            => \$main::opt_gv,
+             "dot!"           => \$main::opt_dot,
+             "ps!"            => \$main::opt_ps,
+             "pdf!"           => \$main::opt_pdf,
+             "gif!"           => \$main::opt_gif,
+             "interactive!"   => \$main::opt_interactive,
+             "nodecount=i"    => \$main::opt_nodecount,
+             "nodefraction=f" => \$main::opt_nodefraction,
+             "edgefraction=f" => \$main::opt_edgefraction,
+             "focus=s"        => \$main::opt_focus,
+             "ignore=s"       => \$main::opt_ignore,
+             "scale=i"        => \$main::opt_scale,
+             "inuse_space!"   => \$main::opt_inuse_space,
+             "inuse_objects!" => \$main::opt_inuse_objects,
+             "alloc_space!"   => \$main::opt_alloc_space,
+             "alloc_objects!" => \$main::opt_alloc_objects,
+             "show_bytes!"    => \$main::opt_show_bytes,
+             "drop_negative!" => \$main::opt_drop_negative,
+             "total_delay!"   => \$main::opt_total_delay,
+             "contentions!"   => \$main::opt_contentions,
+             "mean_delay!"    => \$main::opt_mean_delay,
+             "tools=s"        => \$main::opt_tools,
+             "test!"          => \$main::opt_test,
+             "debug!"         => \$main::opt_debug,
+      ) || usage("Invalid option(s)");
 
-# Fetch all profile data
-FetchDynamicProfiles();
-
-# Read one profile, pick the last item on the list
-my $data = ReadProfile($main::prog, pop(@main::profile_files));
-my $profile = $data->{profile};
-my $libs = $data->{libs};       # Info about main program and shared libraries
-
-# List of function names to skip
-$main::skip = ();
-$main::skip_regexp = 'NOMATCH';
-if ($main::heap_profile) {
-  foreach my $name ('calloc',
-                    'cfree',
-                    'malloc',
-                    'free',
-                    'memalign',
-                    'pvalloc',
-                    'valloc',
-                    'realloc',
-                    'do_malloc',
-		    'DoSampledAllocation',
-                    '__builtin_delete',
-                    '__builtin_new',
-                    '__builtin_vec_delete',
-                    '__builtin_vec_new') {
-    $main::skip{$name} = 1;
+  # Deal with the standard --help and --version
+  if ($main::opt_help) {
+    print usage_string();
+    exit(0);
   }
-  $main::skip_regexp = "TCMalloc";
-}
-if ($main::lock_profile) {
-  foreach my $vname ('Mutex::Unlock', 'Mutex::UnlockSlow') {
-    $main::skip{$vname} = 1;
+
+  if ($main::opt_version) {
+    print version_string();
+    exit(0);
   }
-}
 
-# Add additional profiles, if available.
-if (scalar(@main::profile_files) > 0) {
-  foreach my $pname (@main::profile_files) {
-    my $p = ReadProfile($main::prog, $pname)->{profile};
-    $profile = AddProfile($profile, $p);
+  # Disassembly/listing mode requires address-level info
+  if ($main::opt_disasm || $main::opt_list) {
+    $main::opt_functions = 0;
+    $main::opt_lines = 0;
+    $main::opt_addresses = 1;
+    $main::opt_files = 0;
   }
-}
 
-# Subtract base from profile, if specified
-if ($main::opt_base ne '') {
-  my $base = ReadProfile($main::prog, $main::opt_base)->{profile};
-  $profile = SubtractProfile($profile, $base);
-}
+  # Check heap-profiling flags
+  if ($main::opt_inuse_space +
+      $main::opt_inuse_objects +
+      $main::opt_alloc_space +
+      $main::opt_alloc_objects > 1) {
+    usage("Specify at most one of --inuse/--alloc options");
+  }
 
-# Get total data in profile
-my $total = TotalProfile($profile);
+  # Check output granularities
+  my $grains =
+      $main::opt_functions +
+      $main::opt_lines +
+      $main::opt_addresses +
+      $main::opt_files +
+      0;
+  if ($grains > 1) {
+    usage("Only specify one output granularity option");
+  }
+  if ($grains == 0) {
+    $main::opt_functions = 1;
+  }
 
-# Extract symbols
-my $symbols = ExtractSymbols($libs, $profile, $data->{pcs});
-
-# Focus?
-if ($main::opt_focus ne '') {
-  $profile = FocusProfile($symbols, $profile, $main::opt_focus);
-}
-
-# Ignore?
-if ($main::opt_ignore ne '') {
-  $profile = IgnoreProfile($symbols, $profile, $main::opt_ignore);
-}
-
-# Reduce profiles to required output granularity, and also clean
-# each stack trace so a given entry exists at most once.
-my $reduced = ReduceProfile($symbols, $profile);
-
-# Get derived profiles
-my $flat = FlatProfile($reduced);
-my $cumulative = CumulativeProfile($reduced);
-
-# Print
-if (!$main::opt_interactive) {
-  if ($main::opt_disasm) {
-    PrintDisassembly($libs, $flat, $cumulative, $main::opt_disasm);
-  } elsif ($main::opt_list) {
-    PrintListing($libs, $flat, $cumulative, $main::opt_list);
-  } elsif ($main::opt_text) {
-    PrintText($symbols, $flat, $cumulative, $total, -1);
-  } else {
-    if (PrintDot($main::prog, $symbols, $profile, $flat, $cumulative, $total)) {
-      if ($main::opt_gv) {
-        if (!system("$GV --version >/dev/null 2>&1")) {
-	  # Options using double dash are supported by this gv version.
-	  system("$GV --scale=$main::opt_scale " .
-		 PsTempName($main::next_tmpfile));
-        } else {
-          # Old gv version - only supports options that use single dash.
-	  system("$GV -scale $main::opt_scale " .
-		 PsTempName($main::next_tmpfile));
-        }
-      }
+  # Check output modes
+  my $modes =
+      $main::opt_text +
+      $main::opt_gv +
+      $main::opt_dot +
+      $main::opt_ps +
+      $main::opt_pdf +
+      $main::opt_gif +
+      $main::opt_interactive +
+      0;
+  if ($modes > 1) {
+    usage("Only specify one output mode");
+  }
+  if ($modes == 0) {
+    if (-t STDOUT) {  # If STDOUT is a tty, activate interactive mode
+      $main::opt_interactive = 1;
     } else {
-      exit(1);
+      $main::opt_text = 1;
     }
   }
-} else {
-  InteractiveMode();
+
+  if ($main::opt_test) {
+    RunUnitTests();
+    # Should not return
+    exit(1);
+  }
+
+  # Binary name and profile arguments list
+  $main::prog = "";
+  @main::pfile_args = ();
+
+  # Remote profiling without a binary (using $SYMBOL_PAGE instead)
+  if (IsProfileURL($ARGV[0])) {
+    $main::use_symbol_page = 1;
+  }
+
+  if ($main::use_symbol_page) {  # We don't need a binary!
+    my %disabled = ('--lines' => $main::opt_lines,
+                    '--disasm' => $main::opt_disasm);
+    for my $option (keys %disabled) {
+      usage("$option cannot be used without a binary") if $disabled{$option};
+    }
+    # Set $main::prog later...
+    scalar(@ARGV) || usage("Did not specify profile file");
+  } else {
+    $main::prog = shift(@ARGV) || usage("Did not specify program");
+    scalar(@ARGV) || usage("Did not specify profile file");
+  }
+
+  # Parse profile file/location arguments
+  foreach my $farg (@ARGV) {
+    if ($farg =~ m/(.*)\@([0-9]+)/ ) {
+      my $machine = $1;
+      my $num_machines = $2;
+      for (my $i = 0; $i < $num_machines; $i++) {
+        unshift(@main::pfile_args, "$i.$machine");
+      }
+    } else {
+      unshift(@main::pfile_args, $farg);
+    }
+  }
+
+  if ($main::use_symbol_page) {
+    unless (IsProfileURL($main::pfile_args[0])) {
+      error("The first profile should be a remote form to use $SYMBOL_PAGE\n");
+    }
+    CheckSymbolPage();
+    $main::prog = FetchProgramName();
+  } else {
+    ConfigureObjTools($main::prog)
+  }
 }
 
-cleanup();
-exit(0);
+sub Main() {
+  Init();
+  $main::collected_profile = undef;
+  @main::profile_files = ();
+  $main::op_time = time();
 
+  # Fetch all profile data
+  FetchDynamicProfiles();
+
+  # Read one profile, pick the last item on the list
+  my $data = ReadProfile($main::prog, pop(@main::profile_files));
+  my $profile = $data->{profile};
+  my $libs = $data->{libs};   # Info about main program and shared libraries
+
+  # List of function names to skip
+  $main::skip = ();
+  $main::skip_regexp = 'NOMATCH';
+  if ($main::heap_profile) {
+    foreach my $name ('calloc',
+                      'cfree',
+                      'malloc',
+                      'free',
+                      'memalign',
+                      'pvalloc',
+                      'valloc',
+                      'realloc',
+                      'do_malloc',
+                      'DoSampledAllocation',
+		      'simple_alloc::allocate',
+		      '__malloc_alloc_template::allocate',
+                      '__builtin_delete',
+                      '__builtin_new',
+                      '__builtin_vec_delete',
+                      '__builtin_vec_new') {
+      $main::skip{$name} = 1;
+    }
+    $main::skip_regexp = "TCMalloc";
+  }
+  if ($main::lock_profile) {
+    foreach my $vname ('Mutex::Unlock', 'Mutex::UnlockSlow') {
+      $main::skip{$vname} = 1;
+    }
+  }
+
+  # Add additional profiles, if available.
+  if (scalar(@main::profile_files) > 0) {
+    foreach my $pname (@main::profile_files) {
+      my $p = ReadProfile($main::prog, $pname)->{profile};
+      $profile = AddProfile($profile, $p);
+    }
+  }
+
+  # Subtract base from profile, if specified
+  if ($main::opt_base ne '') {
+    my $base = ReadProfile($main::prog, $main::opt_base)->{profile};
+    $profile = SubtractProfile($profile, $base);
+  }
+
+  # Get total data in profile
+  my $total = TotalProfile($profile);
+
+  # Collect symbols
+  my $symbols = undef;
+  if ($main::use_symbol_page) {
+    $symbols = FetchSymbols($data->{pcs});
+  } else {
+    $symbols = ExtractSymbols($libs, $profile, $data->{pcs});
+  }
+
+  # Focus?
+  if ($main::opt_focus ne '') {
+    $profile = FocusProfile($symbols, $profile, $main::opt_focus);
+  }
+
+  # Ignore?
+  if ($main::opt_ignore ne '') {
+    $profile = IgnoreProfile($symbols, $profile, $main::opt_ignore);
+  }
+
+  # Reduce profiles to required output granularity, and also clean
+  # each stack trace so a given entry exists at most once.
+  my $reduced = ReduceProfile($symbols, $profile);
+
+  # Get derived profiles
+  my $flat = FlatProfile($reduced);
+  my $cumulative = CumulativeProfile($reduced);
+
+  # Print
+  if (!$main::opt_interactive) {
+    if ($main::opt_disasm) {
+      PrintDisassembly($libs, $flat, $cumulative, $main::opt_disasm);
+    } elsif ($main::opt_list) {
+      PrintListing($libs, $flat, $cumulative, $main::opt_list);
+    } elsif ($main::opt_text) {
+      PrintText($symbols, $flat, $cumulative, $total, -1);
+    } else {
+      if (PrintDot($main::prog, $symbols, $profile, $flat, $cumulative, $total)) {
+        if ($main::opt_gv) {
+          if (!system("$GV --version >/dev/null 2>&1")) {
+            # Options using double dash are supported by this gv version.
+            system("$GV --scale=$main::opt_scale " .
+                   PsTempName($main::next_tmpfile));
+          } else {
+            # Old gv version - only supports options that use single dash.
+            system("$GV -scale $main::opt_scale " .
+                   PsTempName($main::next_tmpfile));
+          }
+        }
+      } else {
+        exit(1);
+      }
+    }
+  } else {
+    InteractiveMode($profile, $symbols, $libs, $total);
+  }
+
+  cleanup();
+  exit(0);
+}
+
+##### Entry Point #####
+
+Main();
+
+# Temporary workaround for Goobuntu systems, which don't have the right
+# stuff installed for the special Readline libraries to work.  We detect
+# them by the absence of /lib/libtermcap.so.2 and, when it is missing,
+# default to the normal stdio code rather than the fancier readline-based
+# code.  Returns 1 if readline might fail, 0 if it should be okay.
+sub ReadlineMightFail {
+  if (-e '/lib/libtermcap.so.2') {
+    return 0;  # libtermcap exists, so readline should be okay
+  } else {
+    return 1;
+  }
+}
 
 ##### Interactive helper routines #####
 
 sub InteractiveMode {
-  $| = 1;	# Make output unbuffered for interactive mode
-  my $orig_profile = $profile;
+  $| = 1;  # Make output unbuffered for interactive mode
+  my ($orig_profile, $symbols, $libs, $total) = @_;
 
   # Use ReadLine if it's installed.
-  if ( defined(eval {require Term::ReadLine}) ) {
+  if ( !ReadlineMightFail() &&
+       defined(eval {require Term::ReadLine}) ) {
     my $term = new Term::ReadLine 'pprof';
     while ( defined ($_ = $term->readline('(pprof) '))) {
       $term->addhistory($_) if /\S/;
-      if (!InteractiveCommand($orig_profile, $_)) {
-	last;    # exit when we get an interactive command to quit
+      if (!InteractiveCommand($orig_profile, $symbols, $libs, $total, $_)) {
+        last;    # exit when we get an interactive command to quit
       }
     }
   } else {       # don't have readline
     while (1) {
       print "(pprof) ";
       $_ = <STDIN>;
-      if (!InteractiveCommand($orig_profile, $_)) {
-	last;    # exit when we get an interactive command to quit
+
+      # Save some flags that might be reset by InteractiveCommand()
+      my $save_opt_lines = $main::opt_lines;
+
+      if (!InteractiveCommand($orig_profile, $symbols, $libs, $total, $_)) {
+        last;    # exit when we get an interactive command to quit
       }
+
+      # Restore flags
+      $main::opt_lines = $save_opt_lines;
     }
   }
 }
@@ -477,7 +605,7 @@ sub InteractiveMode {
 # Takes two args: orig profile, and command to run.
 # Returns 1 if we should keep going, or 0 if we were asked to quit
 sub InteractiveCommand {
-  my($orig_profile, $command) = @_;
+  my($orig_profile, $symbols, $libs, $total, $command) = @_;
   $_ = $command;                # just to make future m//'s easier
   if (!defined($_)) {
     print "\n";
@@ -490,8 +618,7 @@ sub InteractiveCommand {
     InteractiveHelpMessage();
     return 1;
   }
-  # Clear all the options
-  $main::opt_lines = 0;
+  # Clear all the mode options -- mode is controlled by "$command"
   $main::opt_text = 0;
   $main::opt_disasm = 0;
   $main::opt_list = 0;
@@ -507,7 +634,7 @@ sub InteractiveCommand {
     my $ignore;
     ($routine, $ignore) = ParseInteractiveArgs($3);
 
-    my $profile = ProcessProfile($orig_profile, "", $ignore);
+    my $profile = ProcessProfile($orig_profile, $symbols, "", $ignore);
     my $reduced = ReduceProfile($symbols, $profile);
 
     # Get derived profiles
@@ -524,7 +651,7 @@ sub InteractiveCommand {
     my $ignore;
     ($routine, $ignore) = ParseInteractiveArgs($1);
 
-    my $profile = ProcessProfile($orig_profile, "", $ignore);
+    my $profile = ProcessProfile($orig_profile, $symbols, "", $ignore);
     my $reduced = ReduceProfile($symbols, $profile);
 
     # Get derived profiles
@@ -542,7 +669,7 @@ sub InteractiveCommand {
     ($routine, $ignore) = ParseInteractiveArgs($1);
 
     # Process current profile to account for various settings
-    my $profile = ProcessProfile($orig_profile, "", $ignore);
+    my $profile = ProcessProfile($orig_profile, $symbols, "", $ignore);
     my $reduced = ReduceProfile($symbols, $profile);
 
     # Get derived profiles
@@ -560,7 +687,7 @@ sub InteractiveCommand {
     ($focus, $ignore) = ParseInteractiveArgs($1);
 
     # Process current profile to account for various settings
-    my $profile = ProcessProfile($orig_profile, $focus, $ignore);
+    my $profile = ProcessProfile($orig_profile, $symbols, $focus, $ignore);
     my $reduced = ReduceProfile($symbols, $profile);
 
     # Get derived profiles
@@ -587,6 +714,7 @@ sub InteractiveCommand {
 
 sub ProcessProfile {
   my $orig_profile = shift;
+  my $symbols = shift;
   my $focus = shift;
   my $ignore = shift;
 
@@ -598,18 +726,18 @@ sub ProcessProfile {
     $profile = FocusProfile($symbols, $profile, $focus);
     my $focus_count = TotalProfile($profile);
     printf("After focusing on '%s': %s %s of %s (%0.1f%%)\n",
-	   $focus,
-	   Unparse($focus_count), Units(),
-	   Unparse($total_count), ($focus_count*100.0) / $total_count);
+           $focus,
+           Unparse($focus_count), Units(),
+           Unparse($total_count), ($focus_count*100.0) / $total_count);
   }
   if ($ignore ne '') {
     $profile = IgnoreProfile($symbols, $profile, $ignore);
     my $ignore_count = TotalProfile($profile);
     printf("After ignoring '%s': %s %s of %s (%0.1f%%)\n",
-	   $ignore,
-	   Unparse($ignore_count), Units(),
-	   Unparse($total_count),
-	   ($ignore_count*100.0) / $total_count);
+           $ignore,
+           Unparse($ignore_count), Units(),
+           Unparse($total_count),
+           ($ignore_count*100.0) / $total_count);
   }
 
   return $profile;
@@ -637,7 +765,7 @@ Commands:
       Show top lines ordered by flat profile count, or cumulative count
       if --cum is specified.  If a number is present after 'top', the
       top K routines will be shown (defaults to showing the top 10)
-  
+
   disasm [routine_regexp] [-ignore1] [-ignore2]
       Show disassembly of routines whose names match "routine_regexp",
       annotated with sample counts.
@@ -649,6 +777,10 @@ For commands that accept optional -ignore tags, samples where any routine in
 the stack trace matches the regular expression in any of the -ignore
 parameters will be ignored.
 
+Further pprof details are available at this location (or one similar):
+
+ /usr/doc/google-perftools-$PPROF_VERSION/cpu_profiler.html
+
 ENDOFHELP
 }
 sub ParseInteractiveArgs {
@@ -1023,8 +1155,8 @@ sub PrintDot {
 
   if ($nodelimit > 0 || $edgelimit > 0) {
     printf STDERR ("Dropping nodes with <= %s %s; edges with <= %s abs(%s)\n",
-		   Unparse($nodelimit), Units(),
-		   Unparse($edgelimit), Units());
+                   Unparse($nodelimit), Units(),
+                   Unparse($edgelimit), Units());
   }
 
   # Open DOT output file
@@ -1160,8 +1292,6 @@ sub OutputKey {
   # Skip large addresses since they sometimes show up as fake entries on RH9
   if (length($a) > 8) {
     if ($a gt "7fffffffffffffff") { return ''; }
-  } else {
-    if (hex($a) > 0x7fffffff) { return ''; }
   }
 
   # Extract symbolic info for address
@@ -1220,7 +1350,7 @@ sub Unparse {
         return sprintf("%.1f", $num / 1048576.0);
       }
     }
-  } elsif ($main::lock_profile) {
+  } elsif ($main::lock_profile && !$main::opt_contentions) {
     return sprintf("%.3f", $num / 1e9);	# Convert nanoseconds to seconds
   } else {
     return sprintf("%d", $num);
@@ -1249,7 +1379,7 @@ sub Units {
         return "MB";
       }
     }
-  } elsif ($main::lock_profile) {
+  } elsif ($main::lock_profile && !$main::opt_contentions) {
     return "seconds";
   } else {
     return "samples";
@@ -1267,7 +1397,9 @@ sub FlatProfile {
   foreach my $k (keys(%{$profile})) {
     my $count = $profile->{$k};
     my @addrs = split(/\n/, $k);
-    AddEntry($result, $addrs[0], $count);
+    if ($#addrs >= 0) {
+      AddEntry($result, $addrs[0], $count);
+    }
   }
   return $result;
 }
@@ -1458,14 +1590,191 @@ sub AddEntries {
 
 ##### Code to profile a server dynamically #####
 
+sub CheckSymbolPage {  # Abort via error() unless the symbol page reports a non-zero symbol count
+  my $url = SymbolPageURL();
+  open(SYMBOL, "$WGET -qO- '$url' |");  # read the page through a $WGET pipe
+  my $line = <SYMBOL>;  # only the first line of the page is examined
+  close(SYMBOL);
+  unless (defined($line)) {
+    error("$url doesn't exist\n");
+  }
+
+  if ($line =~ /^num_symbols:\s+(\d+)$/) {
+    if ($1 == 0) {  # a zero count is reported as a stripped binary
+      error("Stripped binary. No symbols available.\n");
+    }
+  } else {
+    error("Failed to get the number of symbols from $url\n");
+  }
+}
+
+sub IsProfileURL {  # True when $profile_name parses as [http://]host:port[/page]
+  my $profile_name = shift;  # profile argument: a file name or a URL
+  my ($host, $port, $type) = ParseProfileURL($profile_name);
+  return defined($host) && defined($port) && defined($type);  # "and" binds looser than return
+}
+
+sub ParseProfileURL {  # Split "[http://]host:port[page]" into (host, port, page); () if no match
+  my $profile_name = shift;
+  if ($profile_name =~ m,^(http://|)([^/:]+):(\d+)(|/|$PROFILE_PAGE|$HEAP_PAGE|$GROWTH_PAGE|$CONTENTION_PAGE)$,o) {
+    return ($2, $3, $4);  # $1 (the optional scheme) is discarded; page may be "" or "/"
+  }
+  return ();
+}
+
+# We fetch symbols from the first profile argument.
+sub SymbolPageURL {  # URL of $SYMBOL_PAGE on the host:port of the first profile argument
+  my ($host, $port, $type) = ParseProfileURL($main::pfile_args[0]);  # $type is unused here
+  return "http://$host:$port$SYMBOL_PAGE";
+}
+
+sub FetchProgramName() {  # Return the first line of $PROGRAM_NAME_PAGE, cut at the first NUL
+  my ($host, $port, $type) = ParseProfileURL($main::pfile_args[0]);
+  my $url = "http://$host:$port$PROGRAM_NAME_PAGE";
+  my $command_line = "$WGET -qO- '$url'";
+  open(CMDLINE, "$command_line |") or error($command_line);
+  my $cmdline = <CMDLINE>;
+  close(CMDLINE);
+  error("Failed to get program name from $url\n") unless defined($cmdline);
+  $cmdline =~ s/\x00.*//s;  # Remove argv[1] and later arguments (also a bare trailing NUL).
+  $cmdline =~ s!\n!!g;  # Remove LFs.
+  return $cmdline;
+}
+
+# Gee, curl's -L (--location) option isn't reliable at least
+# with its 7.12.3 version.  Curl will forget to post data if
+# there is a redirection.  This function is a workaround for
+# curl.  Redirection happens on borg hosts.
+sub ResolveRedirectionForCurl {  # Return the last "Location:" target of a HEAD request, else $url
+  my $url = shift;
+  my $command_line = "$CURL -s --head '$url'";
+  open(CMDLINE, "$command_line |") or error($command_line);
+  while (<CMDLINE>) {
+    if (/^Location: (.*?)\r?$/) {  # header lines end in CRLF; keep the CR out of the URL
+      $url = $1;
+    }
+  }
+  close(CMDLINE);
+  return $url;
+}
+
+# Fetch symbols from $SYMBOL_PAGE for all PC values found in profile
+sub FetchSymbols {  # Returns { pc => [short name, "?", full name] } for every PC in $pcset
+  my $pcset = shift;
+
+  my %seen = ();
+  my @pcs = grep { !$seen{$_}++ } keys(%$pcset);  # uniq
+  my $post_data = join("+", sort((map {"0x" . "$_"} @pcs)));
+  open(POSTFILE, ">$main::tmpfile_sym") or error("Unable to write $main::tmpfile_sym ($!)\n");
+  print POSTFILE $post_data;
+  close(POSTFILE);
+
+  my $url = SymbolPageURL();
+  # Here we use curl for sending data via POST since old
+  # wgets don't have a --post-file option.
+  $url = ResolveRedirectionForCurl($url);
+  my $command_line = "$CURL -sd '\@$main::tmpfile_sym' '$url'";
+  # We use c++filt in case $SYMBOL_PAGE gives us mangled symbols.
+  my $cppfilt = $obj_tool_map{"c++filt"};
+  open(SYMBOL, "$command_line | $cppfilt |") or error($command_line);
+
+  my %map;
+  while (<SYMBOL>) {
+    if (m/^0x([0-9a-f]+)\s+(.+)/) {  # reply lines look like "0x<hex pc> <symbol name>"
+      $map{$1} = $2;
+    }
+  }
+  close(SYMBOL);
+
+  my $symbols = {};
+  for my $pc (@pcs) {
+    my $fullname;
+    if (defined($map{$pc})) {
+      $fullname = $map{$pc};
+    } else {
+      $fullname = "0x" . $pc;  # Just use addresses
+    }
+    my $name = ShortFunctionName($fullname);
+    $symbols->{$pc} = [$name, "?", $fullname];
+  }
+  return $symbols;
+}
+
+sub BaseName {  # Strip everything up to and including the last "/" of a file name
+  my $file_name = shift;
+  $file_name =~ s!^.*/!!;  # Remove directory name
+  return $file_name;
+}
+
+sub MakeProfileBaseName {  # Build "<binary>.<op_time>.<host>-port<port>" for a fetched profile
+  my ($binary_name, $profile_name) = @_;
+  my ($host, $port, $type) = ParseProfileURL($profile_name);  # $type is not used in the name
+  my $binary_shortname = BaseName($binary_name);
+  return sprintf("%s.%s.%s-port%s",
+                 $binary_shortname, $main::op_time, $host, $port);
+}
+
 sub FetchDynamicProfile {
   my $binary_name = shift;
   my $profile_name = shift;
   my $fetch_name_only = shift;
   my $encourage_patience = shift;
 
-  # TODO: Add support for fetching profiles dynamically from a server
-  return $profile_name;
+  if (!IsProfileURL($profile_name)) {
+    return $profile_name;  # a local profile file: nothing to fetch, no directory needed
+  } else {
+    my ($host, $port, $type) = ParseProfileURL($profile_name);
+    if ($type eq "" || $type eq "/") {
+      # Missing type specifier defaults to cpu-profile
+      $type = $PROFILE_PAGE;
+    }
+
+    my $profile_file = MakeProfileBaseName($binary_name, $profile_name);
+
+    my $url;
+    my $wget_timeout;
+    if ($type eq $PROFILE_PAGE) {
+      $url = sprintf("http://$host:$port$PROFILE_PAGE?seconds=%d",
+                     $main::opt_seconds);
+      $wget_timeout = sprintf("--timeout=%d",
+                              int($main::opt_seconds * 1.01 + 60));
+    } else {
+      # For non-CPU profiles, we add a type-extension to
+      # the target profile file name.
+      my $suffix = $type;
+      $suffix =~ s,/,.,g;
+      $profile_file .= "$suffix";
+      $url = "http://$host:$port$type";
+      $wget_timeout = "";
+    }
+    my $user_dir = $ENV{HOME};
+    my $profile_dir = $user_dir . "/pprof";  # fetched profiles are stored under $HOME/pprof
+    my $tmp_profile = "$profile_dir/.tmp.$profile_file";
+    my $real_profile = "$profile_dir/$profile_file";
+
+    if ($fetch_name_only > 0) {
+      return $real_profile;  # caller only wants the destination name; touch nothing on disk
+    }
+
+    if (!(-d $profile_dir)) {  # create the directory only when a download really happens
+      mkdir($profile_dir) || die("Unable to create profile directory $profile_dir: $!\n");
+    }
+    my $cmd = "$WGET $wget_timeout -q -O '$tmp_profile' '$url'";
+    if ($type eq $PROFILE_PAGE) {
+      print STDERR "Gathering CPU profile from $host:$port for $main::opt_seconds seconds to\n  ${real_profile}\n";
+      if ($encourage_patience) {
+        print STDERR "Be patient...\n";
+      }
+    } else {
+      print STDERR "Fetching $type profile from $host:$port to\n  ${real_profile}\n";
+    }
+
+    (system($cmd) == 0) || error("Failed to get profile: $cmd: " . ($? == -1 ? $! : "wait status $?") . "\n");
+    rename($tmp_profile, $real_profile) || error("Unable to rename profile\n");  # no shell involved
+    print STDERR "Wrote profile to $real_profile\n";
+    $main::collected_profile = $real_profile;
+    return $main::collected_profile;
+  }
 }
 
 # Collect profiles in parallel
@@ -1543,10 +1852,11 @@ sub ReadProfile {
   open(PROFILE, "<$fname") || error("$fname: $!\n");
   binmode PROFILE;      # New perls do UTF-8 processing
   my $header = <PROFILE>;
+  my $contention_marker = substr($CONTENTION_PAGE, 1);   # remove leading /
   if ($header =~ m/^heap profile:/) {
     $main::heap_profile = 1;
     return ReadHeapProfile($prog, $fname, $header);
-  } elsif ($header =~ m/^--- *contentionz/ ) {
+  } elsif ($header =~ m/^--- *$contention_marker/o ) {
     $main::lock_profile = 1;
     return ReadSynchProfile($prog, $fname);
   } elsif ($header =~ m/^--- *Stacks:/ ) {
@@ -1581,17 +1891,11 @@ sub ReadCPUProfile {
   my $pcs = {};
 
   # Parse string into array of slots.
-  # L! is needed for 64-bit # platforms, but not supported on 5.005
-  # (despite the manpage claims)
+  # L! cannot be used because with a native 64-bit build, it will cause
+  # 1) a valid 64-bit profile to use the 32-bit codepath, and
+  # 2) a valid 32-bit profile to be unrecognized.
 
-  my $format;
-  if ($] >= 5.008) {
-      $format = "L!*";
-  } else {
-      $format = "L*";
-  }
-
-  my @slots = unpack($format, $str);
+  my @slots = unpack("L*", $str);
 
   # Read header.  The current header version is a 5-element structure
   # containing:
@@ -1713,15 +2017,55 @@ sub ReadHeapProfile {
     $index = 2;
   }
 
-  # Find the type of this profile
+  # Find the type of this profile.  The header line looks like:
+  #    heap profile:   1246:  8800744 [  1246:  8800744] @ <heap-url>/266053
+  # There are two pairs <count: size>, the first inuse objects/space, and the
+  # second allocated objects/space.  This is followed optionally by a profile
+  # type, and if that is present, optionally by a sampling frequency.  The
+  # interpretation of the sampling frequency is that the profiler, for each
+  # sample, calculates a uniformly distributed random integer less than the
+  # given value, and records the next sample after that many bytes have been
+  # allocated.  Therefore, the expected sample interval is half of the given
+  # frequency.  By default, if not specified, the expected sample interval is
+  # 128KB.  Only remote-heap-page profiles are adjusted for sample size.
+  my $should_adjust_sample = 0;
+  my $sample_adjustment = 0;
   chomp($header);
   my $type = "unknown";
-  if ($header =~ m/^heap profile:\s*(\d+):\s+(\d+)\s+\[\s*(\d+):\s+(\d+)\](\s*@\s*(.*))?/) {
+  if ($header =~ m"^heap profile:\s*(\d+):\s+(\d+)\s+\[\s*(\d+):\s+(\d+)\](\s*@\s*([^/]*)(/(\d+))?)?") {
     if (defined($6) && ($6 ne '')) {
       $type = $6;
+      # The regex test here is to see if type is a substring of HEAP_PAGE
+      if (($HEAP_PAGE =~ /$type/)) {
+	$should_adjust_sample = 1;
+	if (defined($8) && ($8 ne '')) {
+	  $sample_adjustment = int($8)/2;
+	  printf STDERR ("Adjusting heap profiles for 1-in-%d sampling rate\n",
+			 $sample_adjustment);
+	}
+      }
+    } else {
+      # We detect whether or not this is a remote-heap profile by checking
+      # that the total-allocated stats ($n2,$s2) are exactly the
+      # same as the in-use stats ($n1,$s1).  It is remotely conceivable
+      # that a non-remote-heap profile may pass this check, but it is hard
+      # to imagine how that could happen.
+      my ($n1, $s1, $n2, $s2) = ($1, $2, $3, $4);
+      if (($n1 == $n2) && ($s1 == $s2)) {
+        # This is likely to be a remote-heap based sample profile
+	$should_adjust_sample = 1;
+      }
     }
   }
 
+  # For remote-heap generated profiles, adjust the counts and sizes to
+  # account for the sample rate (we sample once every 128KB by default).
+  if ($should_adjust_sample && ($sample_adjustment == 0)) {
+    # Turn on profile adjustment.
+    $sample_adjustment = 128*1024;
+    print STDERR "Adjusting heap profiles for 1-in-128KB sampling rate\n";
+  }
+
   my $profile = {};
   my $pcs = {};
   my $map = "";
@@ -1739,13 +2083,13 @@ sub ReadHeapProfile {
       # Read /proc/self/maps data as formatted by DumpAddressMap()
       my $buildvar = "";
       while (<PROFILE>) {
-	# Parse "build=<dir>" specification if supplied
-	if (m/^\s*build=(.*)\n/) {
-	  $buildvar = $1;
-	}
+        # Parse "build=<dir>" specification if supplied
+        if (m/^\s*build=(.*)\n/) {
+          $buildvar = $1;
+        }
 
-	# Expand "$build" variable if available
-	$_ =~ s/\$build\b/$buildvar/g;
+        # Expand "$build" variable if available
+        $_ =~ s/\$build\b/$buildvar/g;
 
         $map .= $_;
       }
@@ -1760,6 +2104,20 @@ sub ReadHeapProfile {
       my $stack = $5;
       my ($n1, $s1, $n2, $s2) = ($1, $2, $3, $4);
 
+      if ($sample_adjustment) {
+        my $ratio;
+        $ratio = (($s1*1.0)/$n1)/($sample_adjustment);
+        if ($ratio < 1) {
+          $n1 /= $ratio;
+          $s1 /= $ratio;
+        }
+        $ratio = (($s2*1.0)/$n2)/($sample_adjustment);
+        if ($ratio < 1) {
+          $n2 /= $ratio;
+          $s2 /= $ratio;
+        }
+      }
+
       my @counts = ($n1, $s1, $n2, $s2);
       AddEntries($profile, $pcs, $stack, $counts[$index]);
     }
@@ -1785,17 +2143,35 @@ sub ReadSynchProfile {
   my $seen_clockrate = 0;
   my $line;
 
+  my $index = 0;
+  if ($main::opt_total_delay) {
+    $index = 0;
+  } elsif ($main::opt_contentions) {
+    $index = 1;
+  } elsif ($main::opt_mean_delay) {
+    $index = 2;
+  }
+
   while ( $line = <PROFILE> ) {
-    if ( $line =~ /^(slow release).*thread \d+  \@\s*(.*?)\s*$/ ||
-         $line =~ /^\s*(\d+) \@\s*(.*?)\s*$/ ) {
-      my ($count, $stack) = ($1, $2);
-      if ($count !~ /^\d+$/) {
+    if ( $line =~ /^\s*(\d+)\s+(\d+) \@\s*(.*?)\s*$/ ) {
+      my ($cycles, $count, $stack) = ($1, $2, $3);
+
+      # Convert cycles to nanoseconds
+      $cycles /= $cyclespernanosec;
+
+      my @values = ($cycles, $count, $cycles / $count);
+      AddEntries($profile, $pcs, $stack, $values[$index]);
+
+    } elsif ( $line =~ /^(slow release).*thread \d+  \@\s*(.*?)\s*$/ ||
+              $line =~ /^\s*(\d+) \@\s*(.*?)\s*$/ ) {
+      my ($cycles, $stack) = ($1, $2);
+      if ($cycles !~ /^\d+$/) {
         next;
       }
 
       # Convert cycles to nanoseconds
-      $count /= $cyclespernanosec;
-      AddEntries($profile, $pcs, $stack, $count);
+      $cycles /= $cyclespernanosec;
+      AddEntries($profile, $pcs, $stack, $cycles);
 
     } elsif ( $line =~ m|cycles/second = (\d+)|) {
       $cyclespernanosec = $1 / 1e9;
@@ -1838,6 +2214,7 @@ sub HexExtend {
 
 # Split /proc/pid/maps dump into a list of libraries
 sub ParseLibraries {
+  return if $main::use_symbol_page;  # We don't need libraries info.
   my $prog = shift;
   my $map = shift;
   my $pcs = shift;
diff --git a/src/profiler.cc b/src/profiler.cc
index 5843720..8ddcc41 100644
--- a/src/profiler.cc
+++ b/src/profiler.cc
@@ -460,12 +460,12 @@ void ProfileData::SetHandler(void (*handler)(int)) {
 }
 
 void ProfileData::FlushTable() {
-  if (out_ < 0) {
-    // Profiling is not enabled
-    return;
-  }
-
   LOCK(&state_lock_); {
+    if (out_ < 0) {
+      // Profiling is not enabled
+      UNLOCK(&state_lock_);
+      return;
+    }
     SetHandler(SIG_IGN);       // Disable timer interrupts while we're flushing
     LOCK(&table_lock_); {
       // Move data from hash table to eviction buffer
diff --git a/src/stacktrace.cc b/src/stacktrace.cc
index 859d52a..da20659 100644
--- a/src/stacktrace.cc
+++ b/src/stacktrace.cc
@@ -45,17 +45,14 @@
 #include "stacktrace_x86-inl.h"
 #endif
 
-#if !defined(IMPLEMENTED_STACK_TRACE) && defined(USE_LIBUNWIND) && HAVE_LIBUNWIND_H
+#if !defined(IMPLEMENTED_STACK_TRACE) && defined(__x86_64__) && HAVE_LIBUNWIND_H
 #define IMPLEMENTED_STACK_TRACE
-// This is turned off by default. Possible reasons for turning on in the
-// future:
-// 1. Compiler independence
-// 2. Architecture independence
-// 3. A more liberal MIT license, which allows use with multiple compilers
+#define UNW_LOCAL_ONLY
 #include "stacktrace_libunwind-inl.h"
 #endif
 
 #if !defined(IMPLEMENTED_STACK_TRACE) && defined(__x86_64__) && HAVE_UNWIND_H
+// This implementation suffers from deadlocks. Don't enable it.
 #define IMPLEMENTED_STACK_TRACE
 #include "stacktrace_x86_64-inl.h"
 #endif
diff --git a/src/stacktrace_libunwind-inl.h b/src/stacktrace_libunwind-inl.h
index 42c28d3..bf39633 100644
--- a/src/stacktrace_libunwind-inl.h
+++ b/src/stacktrace_libunwind-inl.h
@@ -51,14 +51,14 @@ int GetStackTrace(void** result, int max_depth, int skip_count) {
 
   do {
     ret = unw_get_reg(&cursor, UNW_REG_IP, (unw_word_t *) &ip);
-    assert(ret == 0);
+    if (ret < 0)
+      break;
     if (skip_count > 0) {
       skip_count--;
     } else {
       result[n++] = ip;
     }
     ret = unw_step(&cursor);
-    assert(ret >= 0);
   } while ((n < max_depth) && (ret > 0));
 
   return n;
diff --git a/src/tcmalloc.cc b/src/tcmalloc.cc
index 5dc062e..bf45dfb 100644
--- a/src/tcmalloc.cc
+++ b/src/tcmalloc.cc
@@ -79,6 +79,7 @@
 #include <unistd.h>
 #include <errno.h>
 #include <stdarg.h>
+#include "base/commandlineflags.h"
 #include "google/malloc_hook.h"
 #include "google/malloc_extension.h"
 #include "google/stacktrace.h"
@@ -147,12 +148,27 @@ static const size_t kDefaultOverallThreadCacheSize = 16 << 20;
 // REQUIRED: kMaxPages >= kMinSystemAlloc;
 static const size_t kMaxPages = kMinSystemAlloc;
 
+/* The smallest prime > 2^n */
+static unsigned int primes_list[] = {
+	// Small values might cause high rates of sampling
+	// and hence commented out.
+	// 2, 5, 11, 17, 37, 67, 131, 257,
+	// 521, 1031, 2053, 4099, 8209, 16411,
+	32771, 65537, 131101, 262147, 524309, 1048583,
+	2097169, 4194319, 8388617, 16777259, 33554467 };
+
 // Twice the approximate gap between sampling actions.
 // I.e., we take one sample approximately once every
-//      kSampleParameter/2
+//      tcmalloc_sample_parameter/2
 // bytes of allocation, i.e., ~ once every 128KB.
 // Must be a prime number.
-static const size_t kSampleParameter = 266053;
+DEFINE_int64(tcmalloc_sample_parameter, 262147,
+	     "Twice the approximate gap between sampling actions."
+	     " Must be a prime number. Otherwise will be rounded up to a "
+	     " larger prime number");
+static size_t sample_period = 262147;
+// Protects sample_period above
+static SpinLock sample_period_lock = SPINLOCK_INITIALIZER;
 
 //-------------------------------------------------------------------
 // Mapping from size to size_class and vice versa
@@ -303,6 +319,17 @@ static int NumMoveSize(size_t size) {
   // and thread caches.
   if (num > static_cast<int>(0.8 * kMaxFreeListLength))
     num = static_cast<int>(0.8 * kMaxFreeListLength);
+
+  // Also, avoid bringing in too many objects into small object free
+  // lists.  There are lots of such lists, and if we allow each one to
+  // fetch too many at a time, we end up having to scavenge too often
+  // (especially when there are lots of threads and each thread gets a
+  // small allowance for its thread cache).
+  //
+  // TODO: Make thread cache free list sizes dynamic so that we do not
+  // have to equally divide a fixed resource amongst lots of threads.
+  if (num > 32) num = 32;
+
   return num;
 }
 
@@ -918,7 +945,7 @@ void TCMalloc_PageHeap::Dump(TCMalloc_Printer* out) {
   uint64_t large_pages = 0;
   int large_spans = 0;
   for (Span* s = large_.next; s != &large_; s = s->next) {
-    out->printf("   [ %6" PRIuS " spans ]\n", s->length);
+    out->printf("   [ %6" PRIuS " pages ]\n", s->length);
     large_pages += s->length;
     large_spans++;
   }
@@ -1057,6 +1084,7 @@ class TCMalloc_ThreadCache_FreeList {
     SLL_PopRange(&list_, N, start, end);
     ASSERT(length_ >= N);
     length_ -= N;
+    if (length_ < lowater_) lowater_ = length_;
   }
 };
 
@@ -1669,9 +1697,23 @@ void TCMalloc_ThreadCache::PickNextSample() {
   uint32_t r = rnd_;
   rnd_ = (r << 1) ^ ((static_cast<int32_t>(r) >> 31) & kPoly);
 
-  // Next point is "rnd_ % (2*sample_period)".  I.e., average
-  // increment is "sample_period".
-  bytes_until_sample_ = rnd_ % kSampleParameter;
+  // Next point is "rnd_ % (sample_period)".  I.e., average
+  // increment is "sample_period/2".
+  const int flag_value = FLAGS_tcmalloc_sample_parameter;
+  static int last_flag_value = -1;
+
+  if (flag_value != last_flag_value) {
+    SpinLockHolder h(&sample_period_lock);
+    int i;
+    for (i = 0; i < (sizeof(primes_list)/sizeof(primes_list[0]) - 1); i++) {
+      if (primes_list[i] >= flag_value) {
+        break;
+      }
+    }
+    sample_period = primes_list[i];
+    last_flag_value = flag_value;
+  }
+  bytes_until_sample_ = rnd_ % sample_period;
 }
 
 void TCMalloc_ThreadCache::InitModule() {
@@ -2118,7 +2160,7 @@ static inline void* do_malloc(size_t size) {
   }
   // The following call forces module initialization
   TCMalloc_ThreadCache* heap = TCMalloc_ThreadCache::GetCache();
-  if (heap->SampleAllocation(size)) {
+  if ((FLAGS_tcmalloc_sample_parameter > 0) && heap->SampleAllocation(size)) {
     Span* span = DoSampledAllocation(size);
     if (span != NULL) {
       ret = reinterpret_cast<void*>(span->start << kPageShift);
diff --git a/src/tests/heap-checker_unittest.cc b/src/tests/heap-checker_unittest.cc
index e9ec6c3..3e85e7a 100644
--- a/src/tests/heap-checker_unittest.cc
+++ b/src/tests/heap-checker_unittest.cc
@@ -273,6 +273,7 @@ static void DoRunHidden(Closure* c, int n) {
   if (n) {
     run_hidden_ptr(c, n-1);
     wipe_stack_ptr(n);
+    sleep(0);  // undo -foptimize-sibling-calls
   } else {
     c->Run();
   }
@@ -284,6 +285,7 @@ static void DoWipeStack(int n) {
     volatile int arr[sz];
     for (int i = 0; i < sz; ++i)  arr[i] = 0;
     wipe_stack_ptr(n-1);
+    sleep(0);  // undo -foptimize-sibling-calls
   }
 }
 
@@ -463,14 +465,14 @@ static void TestHeapLeakCheckerPProf() {
 // trick heap change: same total # of bytes and objects, but
 // different individual object sizes
 static void TestHeapLeakCheckerTrick() {
-  void* bar1 = AllocHidden(60 * sizeof(int));
+  void* bar1 = AllocHidden(240 * sizeof(int));
   Use(&bar1);
-  void* bar2 = AllocHidden(40 * sizeof(int));
+  void* bar2 = AllocHidden(160 * sizeof(int));
   Use(&bar2);
   HeapLeakChecker check("trick");
-  void* foo1 = AllocHidden(70 * sizeof(int));
+  void* foo1 = AllocHidden(280 * sizeof(int));
   Use(&foo1);
-  void* foo2 = AllocHidden(30 * sizeof(int));
+  void* foo2 = AllocHidden(120 * sizeof(int));
   Use(&foo2);
   DeAllocHidden(&bar1);
   DeAllocHidden(&bar2);
@@ -482,16 +484,16 @@ static void TestHeapLeakCheckerTrick() {
 
 // no false negatives from pprof
 static void TestHeapLeakCheckerDeathTrick() {
-  void* bar1 = AllocHidden(60 * sizeof(int));
+  void* bar1 = AllocHidden(240 * sizeof(int));
   Use(&bar1);
-  void* bar2 = AllocHidden(40 * sizeof(int));
+  void* bar2 = AllocHidden(160 * sizeof(int));
   Use(&bar2);
   HeapLeakChecker check("death_trick");
   DeAllocHidden(&bar1);
   DeAllocHidden(&bar2);
-  void* foo1 = AllocHidden(70 * sizeof(int));
+  void* foo1 = AllocHidden(280 * sizeof(int));
   Use(&foo1);
-  void* foo2 = AllocHidden(30 * sizeof(int));
+  void* foo2 = AllocHidden(120 * sizeof(int));
   Use(&foo2);
   // TODO(maxim): use the above if we make pprof work in automated test runs
   if (!FLAGS_maybe_stripped) {
@@ -733,13 +735,19 @@ static void* HeapBusyThreadBody(void* a) {
       }
     }
     if (FLAGS_test_register_leak) {
-      // Hide the register pointer value with an xor mask.
+      // Hide the register "ptr" value with an xor mask.
       // If one provides --test_register_leak flag, the test should
       // (with very high probability) crash on some leak check
       // with a leak report (of some x * sizeof(int) + y * sizeof(int*) bytes)
       // pointing at the two lines above in this function
       // with "new (initialized) int" in them as the allocators
       // of the leaked objects.
+      // CAVEAT: We can't really prevent a compiler from saving some
+      // temporary values of "ptr" on the stack and thus letting us find
+      // the heap objects not via the register.
+      // Hence it's normal if for certain compilers or optimization modes
+      // --test_register_leak does not cause a leak crash of the above form
+      // (this happens e.g. for gcc 4.0.1 in opt mode).
       ptr = reinterpret_cast<int **>(
           reinterpret_cast<uintptr_t>(ptr) ^ kHideMask);
       // busy loop to get the thread interrupted at:
diff --git a/src/tests/tcmalloc_unittest.cc b/src/tests/tcmalloc_unittest.cc
index b030e32..9f2df59 100644
--- a/src/tests/tcmalloc_unittest.cc
+++ b/src/tests/tcmalloc_unittest.cc
@@ -399,11 +399,14 @@ static void TestHugeAllocations() {
   for (size_t i = 0; i < 10000; i++) {
     TryHugeAllocation(kMaxSize - i);
   }
-
-  // Check that asking for stuff near signed/unsigned boundary returns NULL
+  // Asking for memory sizes near signed/unsigned boundary (kMaxSignedSize)
+  // might work or not, depending on the amount of virtual memory.
   for (size_t i = 0; i < 100; i++) {
-    TryHugeAllocation(kMaxSignedSize - i);
-    TryHugeAllocation(kMaxSignedSize + i);
+    void* p = NULL;
+    p = malloc(kMaxSignedSize + i);
+    if (p) free(p);    // if: free(NULL) is not necessarily defined
+    p = malloc(kMaxSignedSize - i);
+    if (p) free(p);
   }
 }
 
@@ -560,18 +563,6 @@ int main(int argc, char** argv) {
     free(p);
   }
 
-  // Check that large allocations fail with NULL instead of crashing
-  fprintf(LOGSTREAM, "==== Testing out of memory\n");
-  for (int s = 0; ; s += (10<<20)) {
-    void* large_object = malloc(s);
-    if (large_object == NULL) break;
-    free(large_object);
-  }
-
-  // Check that huge allocations fail with NULL instead of crashing
-  fprintf(LOGSTREAM, "==== Testing huge allocations\n");
-  TestHugeAllocations();
-
   // Check calloc() with various arguments
   fprintf(LOGSTREAM, "==== Testing calloc\n");
   TestCalloc(0, 0, true);
@@ -611,10 +602,16 @@ int main(int argc, char** argv) {
     threads[i] = new TesterThread(i);
   }
 
-  // Start
+  // Start the threads.
+  // Set the stack size to a small value to avoid inheriting 120MB+
+  // limit when running under the google make system.
+  pthread_attr_t attr;
+  pthread_attr_init(&attr);
+  pthread_attr_setstacksize(&attr, 1 << 20);
   for (int i = 0; i < FLAGS_numthreads; ++i) {
-    CHECK_EQ(pthread_create(&thread_ids[i], NULL, RunThread, threads[i]), 0);
+    CHECK_EQ(pthread_create(&thread_ids[i], &attr, RunThread, threads[i]), 0);
   }
+  pthread_attr_destroy(&attr);
 
   // Wait
   for (int i = 0; i < FLAGS_numthreads; ++i) {
@@ -624,6 +621,21 @@ int main(int argc, char** argv) {
 
   for (int i = 0; i < FLAGS_numthreads; ++i) delete threads[i];    // Cleanup
 
+  // Do the memory intensive tests after threads are done, since exhausting
+  // the available address space can make pthread_create fail.
+
+  // Check that huge allocations fail with NULL instead of crashing
+  fprintf(LOGSTREAM, "==== Testing huge allocations\n");
+  TestHugeAllocations();
+
+  // Check that large allocations fail with NULL instead of crashing
+  fprintf(LOGSTREAM, "==== Testing out of memory\n");
+  for (int s = 0; ; s += (10<<20)) {
+    void* large_object = malloc(s);
+    if (large_object == NULL) break;
+    free(large_object);
+  }
+
   fprintf(LOGSTREAM, "PASS\n");
   return 0;
 }