// gperftools/src/thread_cache.h

// Copyright (c) 2008, Google Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
// * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// ---
// Author: Sanjay Ghemawat <opensource@google.com>
#ifndef TCMALLOC_THREAD_CACHE_H_
#define TCMALLOC_THREAD_CACHE_H_

#include "config.h"
#include <pthread.h>     // for pthread_t, pthread_key_t
#include <stddef.h>      // for size_t, NULL
#include <stdint.h>      // for uint32_t, uint64_t
#include <sys/types.h>   // for ssize_t
#include "common.h"
#include "linked_list.h"
#include "maybe_threads.h"
#include "page_heap_allocator.h"
#include "static_vars.h"

namespace tcmalloc {

// Even if we have support for thread-local storage in the compiler
// and linker, the OS may not support it. We need to check that at
// runtime. Right now, we have to keep a manual set of "bad" OSes.
#if defined(HAVE_TLS)
extern bool kernel_supports_tls;   // defined in thread_cache.cc
void CheckIfKernelSupportsTLS();
inline bool KernelSupportsTLS() {
  return kernel_supports_tls;
}
#endif  // HAVE_TLS

//-------------------------------------------------------------------
// Data kept per thread
//-------------------------------------------------------------------

class ThreadCache {
 public:
  // All ThreadCache objects are kept in a linked list (for stats collection)
  ThreadCache* next_;
  ThreadCache* prev_;

  void Init(pthread_t tid);
  void Cleanup();

  // Accessors (mostly just for printing stats)
  int freelist_length(size_t cl) const { return list_[cl].length(); }

  // Total byte size in cache
  size_t Size() const { return size_; }

  void* Allocate(size_t size);
  void Deallocate(void* ptr, size_t size_class);

  void Scavenge();
  void Print() const;

  // Record allocation of "k" bytes. Return true iff allocation
  // should be sampled.
  bool SampleAllocation(size_t k);

  // Pick next sampling point
  void PickNextSample(size_t k);

  static void InitModule();
  static void InitTSD();
  static ThreadCache* GetThreadHeap();
  static ThreadCache* GetCache();
  static ThreadCache* GetCacheIfPresent();
  static ThreadCache* CreateCacheIfNecessary();
  static void DeleteCache(ThreadCache* heap);
  static void BecomeIdle();
  static void RecomputeThreadCacheSize();

  // Return the number of thread heaps in use.
  static inline int HeapsInUse();

  // Accumulates into *total_bytes the total number of bytes used by all
  // thread heaps. If class_count is not NULL, it must be an array of
  // size kNumClasses, and the number of items on each corresponding
  // freelist is accumulated into it. Both output parameters must be
  // zero-initialized by the caller.
  // REQUIRES: Static::pageheap_lock is held.
  static void GetThreadStats(uint64_t* total_bytes, uint64_t* class_count);
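
  // Illustrative usage (hypothetical caller, not part of this file;
  // assumes SpinLockHolder and Static::pageheap_lock() from this
  // codebase). Note the zero-initialization requirement:
  //   uint64_t total = 0;
  //   uint64_t counts[kNumClasses] = { 0 };
  //   SpinLockHolder h(Static::pageheap_lock());
  //   ThreadCache::GetThreadStats(&total, counts);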

  // Sets the total thread cache size to new_size, recomputing the
  // individual thread cache sizes as necessary.
  // REQUIRES: Static::pageheap_lock is held.
  static void set_overall_thread_cache_size(size_t new_size);
  static size_t overall_thread_cache_size() {
    return overall_thread_cache_size_;
  }

 private:
  class FreeList {
   private:
    void*    list_;       // Linked list of nodes

#ifdef _LP64
    // On 64-bit hardware, manipulating 16-bit values may be slightly slow.
    // Since it won't cost any space, let's make these fields 32 bits each.
    uint32_t length_;      // Current length
    uint32_t lowater_;     // Low water mark for list length
#else
    // If we aren't using 64-bit pointers then pack these into less space.
    uint16_t length_;
    uint16_t lowater_;
#endif

   public:
    void Init() {
      list_ = NULL;
      length_ = 0;
      lowater_ = 0;
    }

    // Return current length of list
    size_t length() const {
      return length_;
    }

    // Is list empty?
    bool empty() const {
      return list_ == NULL;
    }

    // Low-water mark management
    int lowwatermark() const { return lowater_; }
    void clear_lowwatermark() { lowater_ = length_; }

    void Push(void* ptr) {
      SLL_Push(&list_, ptr);
      length_++;
    }

    void* Pop() {
      ASSERT(list_ != NULL);
      length_--;
      if (length_ < lowater_) lowater_ = length_;
      return SLL_Pop(&list_);
    }

    void PushRange(int N, void *start, void *end) {
      SLL_PushRange(&list_, start, end);
      length_ += N;
    }

    void PopRange(int N, void **start, void **end) {
      SLL_PopRange(&list_, N, start, end);
      ASSERT(length_ >= N);
      length_ -= N;
      if (length_ < lowater_) lowater_ = length_;
    }
  };
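
  // Illustrative sketch (added note, not original text): FreeList is a
  // LIFO stack threaded through the free objects themselves, e.g.
  //   FreeList fl;
  //   fl.Init();
  //   fl.Push(obj);          // length() == 1
  //   void* p = fl.Pop();    // length() == 0, lowwatermark() == 0
  // The low-water mark tracks the minimum length seen since the last
  // clear_lowwatermark(); Scavenge() can use it to estimate how many
  // objects sat unused and may be returned to the central cache.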

  // Default bound on the total amount of thread caches.
  static const size_t kDefaultOverallThreadCacheSize = 16 << 20;

  // Lower and upper bounds on the per-thread cache sizes.
  static const size_t kMinThreadCacheSize = kMaxSize * 2;
  static const size_t kMaxThreadCacheSize = 2 << 20;

  // Gets and returns an object from the central cache, and, if possible,
  // also adds some objects of that size class to this thread cache.
  void* FetchFromCentralCache(size_t cl, size_t byte_size);

  // Releases N items from this thread cache. Returns size_.
  size_t ReleaseToCentralCache(FreeList* src, size_t cl, int N);

  // If TLS is available, we also store a copy of the per-thread object
  // in a __thread variable since __thread variables are faster to read
  // than pthread_getspecific(). We still need pthread_setspecific()
  // because __thread variables provide no way to run cleanup code when
  // a thread is destroyed.
  //
  // We also give a hint to the compiler to use the "initial exec" TLS
  // model. This is faster than the default TLS model, at the cost that
  // you cannot dlopen this library. (To see the difference, look at
  // the CPU use of __tls_get_addr with and without this attribute.)
  //
  // Since we don't really use dlopen in google code -- and using dlopen
  // on a malloc replacement is asking for trouble in any case -- that's
  // a good tradeoff for us.
#ifdef HAVE_TLS
  static __thread ThreadCache* threadlocal_heap_
# ifdef HAVE___ATTRIBUTE__
      __attribute__ ((tls_model ("initial-exec")))
# endif
      ;
#endif

  // Thread-specific key. Initialization here is somewhat tricky
  // because some Linux startup code invokes malloc() before it
  // is in a good enough state to handle pthread_key_create().
  // Therefore, we use TSD keys only after tsd_inited_ is set to true.
  // Until then, we use a slow path to get the heap object.
  static bool tsd_inited_;
  static pthread_key_t heap_key_;

  // Linked list of heap objects. Protected by Static::pageheap_lock.
  static ThreadCache* thread_heaps_;
  static int thread_heap_count_;

  // Overall thread cache size. Protected by Static::pageheap_lock.
  static size_t overall_thread_cache_size_;

  // Global per-thread cache size. Writes are protected by
  // Static::pageheap_lock. Reads are done without any locking, which should
  // be fine as long as size_t can be written atomically and we don't place
  // invariants between this variable and other pieces of state.
  static volatile size_t per_thread_cache_size_;

  // Warning: the offset of list_ affects performance. On general
  // principles, we don't like list_[x] to span multiple L1 cache
  // lines. However, merely placing list_ at offset 0 here seems to
  // cause cache conflicts.

  // We sample allocations, biased by the size of the allocation.
  size_t        bytes_until_sample_;    // Bytes until we sample next
  uint32_t      rnd_;                   // Cheap random number generator

  size_t        size_;                  // Combined size of data
  pthread_t     tid_;                   // Which thread owns it
  FreeList      list_[kNumClasses];     // Array indexed by size-class
  bool          in_setspecific_;        // In call to pthread_setspecific?

  // Allocate a new heap. REQUIRES: Static::pageheap_lock is held.
  static inline ThreadCache* NewHeap(pthread_t tid);

  // Use only as pthread thread-specific destructor function.
  static void DestroyThreadCache(void* ptr);
};

// Allocator for thread heaps.
// This is logically part of the ThreadCache class, but MSVC, at
// least, does not like using ThreadCache as a template argument
// before the class is fully defined. So we put it outside the class.
extern PageHeapAllocator<ThreadCache> threadcache_allocator;

inline int ThreadCache::HeapsInUse() {
  return threadcache_allocator.inuse();
}

inline bool ThreadCache::SampleAllocation(size_t k) {
  if (bytes_until_sample_ < k) {
    PickNextSample(k);
    return true;
  } else {
    bytes_until_sample_ -= k;
    return false;
  }
}
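
// Worked example (illustrative numbers only): if bytes_until_sample_
// is 100, three 40-byte allocations go 100 -> 60 -> 20; the third call
// finds 20 < 40, calls PickNextSample(40) to choose a new, size-biased
// sampling point, and returns true so the caller records that
// allocation. Larger allocations burn through the countdown faster and
// so get sampled proportionally more often, keeping the sample
// byte-weighted.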

inline void* ThreadCache::Allocate(size_t size) {
  ASSERT(size <= kMaxSize);
  const size_t cl = Static::sizemap()->SizeClass(size);
  const size_t alloc_size = Static::sizemap()->ByteSizeForClass(cl);
  FreeList* list = &list_[cl];
  if (list->empty()) {
    return FetchFromCentralCache(cl, alloc_size);
  }
  size_ -= alloc_size;
  return list->Pop();
}
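
// Illustrative example (actual class sizes depend on the build's
// SizeMap): a 25-byte request might round up to a 32-byte size class,
// so Allocate hands back a 32-byte object from list_[cl] and size_
// drops by 32, not 25. Accounting is always in rounded class sizes.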

inline void ThreadCache::Deallocate(void* ptr, size_t cl) {
  FreeList* list = &list_[cl];
  ssize_t list_headroom =
      static_cast<ssize_t>(kMaxFreeListLength - 1) - list->length();
  size_ += Static::sizemap()->ByteSizeForClass(cl);
  size_t cache_size = size_;
  ssize_t size_headroom = per_thread_cache_size_ - cache_size - 1;

  list->Push(ptr);

  // There are two relatively uncommon things that require further work.
  // In the common case we're done, and in that case we need a single branch
  // because of the bitwise-or trick that follows.
  if ((list_headroom | size_headroom) < 0) {
    if (list_headroom < 0) {
      cache_size = ReleaseToCentralCache(
          list, cl, Static::sizemap()->num_objects_to_move(cl));
    }
    if (cache_size >= per_thread_cache_size_) Scavenge();
  }
}
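
// Note on the bitwise-or trick above (added explanation, not original
// text): for two's-complement signed integers, the sign bit of (a | b)
// is the OR of the sign bits of a and b, so (a | b) < 0 holds exactly
// when a < 0 or b < 0. A single branch therefore tests both "free list
// past kMaxFreeListLength" and "cache over per_thread_cache_size_" on
// the common fast path.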

inline ThreadCache* ThreadCache::NewHeap(pthread_t tid) {
  // Create the heap and add it to the linked list
  ThreadCache *heap = threadcache_allocator.New();
  heap->Init(tid);
  heap->next_ = thread_heaps_;
  heap->prev_ = NULL;
  if (thread_heaps_ != NULL) thread_heaps_->prev_ = heap;
  thread_heaps_ = heap;
  thread_heap_count_++;
  RecomputeThreadCacheSize();
  return heap;
}
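
// The new heap is pushed onto the head of the doubly-linked
// thread_heaps_ list in O(1). RecomputeThreadCacheSize() (defined in
// thread_cache.cc) then redivides overall_thread_cache_size_ across
// the now-larger thread_heap_count_, so adding threads can shrink each
// thread's budget (within the kMin/kMax bounds) rather than growing
// the total.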

inline ThreadCache* ThreadCache::GetThreadHeap() {
#ifdef HAVE_TLS
  // __thread is faster, but only when the kernel supports it
  if (KernelSupportsTLS())
    return threadlocal_heap_;
#endif
  return reinterpret_cast<ThreadCache *>(
      perftools_pthread_getspecific(heap_key_));
}

inline ThreadCache* ThreadCache::GetCache() {
  ThreadCache* ptr = NULL;
  if (!tsd_inited_) {
    InitModule();
  } else {
    ptr = GetThreadHeap();
  }
  if (ptr == NULL) ptr = CreateCacheIfNecessary();
  return ptr;
}
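
// Illustrative call pattern (hypothetical caller, sketch only; the
// real fast path lives elsewhere in tcmalloc):
//   void* my_small_alloc(size_t n) {
//     ASSERT(n <= kMaxSize);
//     ThreadCache* heap = ThreadCache::GetCache();  // never NULL
//     return heap->Allocate(n);
//   }
// Deletion paths use GetCacheIfPresent() instead, as explained below.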

// In deletion paths, we do not try to create a thread-cache. This is
// because we may be in the thread destruction code and may have
// already cleaned up the cache for this thread.
inline ThreadCache* ThreadCache::GetCacheIfPresent() {
  if (!tsd_inited_) return NULL;
  return GetThreadHeap();
}

}  // namespace tcmalloc

#endif  // TCMALLOC_THREAD_CACHE_H_