* Speed up patching by not patching modules that are already loaded

git-svn-id: http://gperftools.googlecode.com/svn/trunk@88 6b5cf1ce-ec42-a296-1ba9-69fdba395a50
2025-01-01 20:22:01 +00:00 · 2010-02-11 01:32:42 +00:00 · 2010-02-11 01:32:42 +00:00 · 23dd124970
commit 23dd124970
parent 8f8a010cab
2 changed files with 189 additions and 147 deletions
--- a/src/windows/config.h
+++ b/src/windows/config.h
@ -261,10 +261,12 @@
 // ---------------------------------------------------------------------
 // Extra stuff not found in config.h.in

-// This must be defined before the windows.h is included.  It's needed
-// for mutex.h, to give access to the TryLock method.
+// This must be defined before the windows.h is included.  We need at
+// least 0x0400 for mutex.h to have access to TryLock, and at least
+// 0x0501 for patch_functions.cc to have access to GetModuleHandleEx.
+// (This latter is an optimization we could take out if need be.)
 #ifndef _WIN32_WINNT
-# define _WIN32_WINNT 0x0400
+# define _WIN32_WINNT 0x0501
 #endif

 // We want to make sure not to ever try to #include heap-checker.h
--- a/src/windows/patch_functions.cc
+++ b/src/windows/patch_functions.cc
@ -83,6 +83,7 @@
 #endif

 #include <windows.h>
+#include <stdio.h>
 #include <malloc.h>       // for _msize and _expand
 #include <Psapi.h>        // for EnumProcessModules, GetModuleInformation, etc.
 #include <set>
@ -96,8 +97,6 @@

 // The maximum number of modules we allow to be in one executable
 const int kMaxModules = 8182;
-// The maximum size of a module's basename
-const int kMaxModuleNameSize = 256;

 // These are hard-coded, unfortunately. :-( They are also probably
 // compiler specific.  See get_mangled_names.cc, in this directory,
@ -145,13 +144,15 @@ class LibcInfo {
  LibcInfo() {
    memset(this, 0, sizeof(*this));  // easiest way to initialize the array
  }
-  bool SameAs(const LibcInfo& that) const;
-  bool SameAsModuleEntry(const ModuleEntryCopy& module_entry) const;
-
-  bool patched() const { return is_valid() && module_name_[0] != '\0'; }
-  const char* module_name() const { return is_valid() ? module_name_ : ""; }

+  bool patched() const { return is_valid(); }
  void set_is_valid(bool b) { is_valid_ = b; }
+  // According to http://msdn.microsoft.com/en-us/library/ms684229(VS.85).aspx:
+  // "The load address of a module (lpBaseOfDll) is the same as the HMODULE
+  // value."
+  HMODULE hmodule() const {
+    return reinterpret_cast<HMODULE>(const_cast<void*>(module_base_address_));
+  }

  // Populates all the windows_fn_[] vars based on our module info.
  // Returns false if windows_fn_ is all NULL's, because there's
@ -167,7 +168,6 @@ class LibcInfo {
    memcpy(this->windows_fn_, that.windows_fn_, sizeof(windows_fn_));
    this->module_base_address_ = that.module_base_address_;
    this->module_base_size_ = that.module_base_size_;
-    memcpy(this->module_name_, that.module_name_, sizeof(module_name_));
  }

  enum {
@ -207,7 +207,6 @@ class LibcInfo {

  const void *module_base_address_;
  size_t module_base_size_;
-  char module_name_[kMaxModuleNameSize];

 public:
  // These shouldn't have to be public, since only subclasses of
@ -285,10 +284,8 @@ template<int> class LibcInfoWithPatchFunctions : public LibcInfo {

 // This is a subset of MODDULEENTRY32, that we need for patching.
 struct ModuleEntryCopy {
-  LPVOID  modBaseAddr;
+  LPVOID  modBaseAddr;     // the same as hmodule
  DWORD   modBaseSize;
-  HMODULE hModule;
-  TCHAR   szModule[kMaxModuleNameSize];
  // This is not part of MODDULEENTRY32, but is needed to avoid making
  // windows syscalls while we're holding patch_all_modules_lock (see
  // lock-inversion comments at patch_all_modules_lock definition, below).
@ -297,26 +294,16 @@ struct ModuleEntryCopy {
  ModuleEntryCopy() {
    modBaseAddr = NULL;
    modBaseSize = 0;
-    hModule = NULL;
-    strcpy(szModule, "<executable>");
    for (int i = 0; i < sizeof(rgProcAddresses)/sizeof(*rgProcAddresses); i++)
      rgProcAddresses[i] = LibcInfo::static_fn(i);
  }
-  ModuleEntryCopy(HANDLE hprocess, HMODULE hmodule, const MODULEINFO& mi) {
+  ModuleEntryCopy(const MODULEINFO& mi) {
    this->modBaseAddr = mi.lpBaseOfDll;
    this->modBaseSize = mi.SizeOfImage;
-    this->hModule = hmodule;
-    // TODO(csilvers): we could make more efficient by calling these
-    // lazily (not until the vars are needed, which is often never).
-    // However, there's tricky business with calling windows functions
-    // inside the patch_all_modules_lock (see the lock inversion
-    // comments with the patch_all_modules_lock definition, below), so
-    // it's safest to do it all here, where no lock is needed.
-    ::GetModuleBaseNameA(hprocess, hmodule,
-                         this->szModule, sizeof(this->szModule));
    for (int i = 0; i < sizeof(rgProcAddresses)/sizeof(*rgProcAddresses); i++)
-      rgProcAddresses[i] =
-          (GenericFnPtr)::GetProcAddress(hModule, LibcInfo::function_name(i));
+      rgProcAddresses[i] = (GenericFnPtr)::GetProcAddress(
+          reinterpret_cast<const HMODULE>(mi.lpBaseOfDll),
+          LibcInfo::function_name(i));
  }
 };

@ -479,18 +466,6 @@ const GenericFnPtr LibcInfoWithPatchFunctions<T>::perftools_fn_[] = {
  { "FreeLibrary", NULL, NULL, (GenericFnPtr)&Perftools_FreeLibrary },
 };

-bool LibcInfo::SameAs(const LibcInfo& that) const {
-  return (is_valid() &&
-          module_base_address_ == that.module_base_address_ &&
-          module_base_size_ == that.module_base_size_);
-}
-
-bool LibcInfo::SameAsModuleEntry(const ModuleEntryCopy& module_entry) const {
-  return (is_valid() &&
-          module_base_address_ == module_entry.modBaseAddr &&
-          module_base_size_ == module_entry.modBaseSize);
-}
-
 bool LibcInfo::PopulateWindowsFn(const ModuleEntryCopy& module_entry) {
  // First, store the location of the function to patch before
  // patching it.  If none of these functions are found in the module,
@ -552,10 +527,9 @@ bool LibcInfo::PopulateWindowsFn(const ModuleEntryCopy& module_entry) {
  CHECK(windows_fn_[kFree]);
  CHECK(windows_fn_[kRealloc]);

-  // OK, we successfully patched.  Let's store our member information.
+  // OK, we successfully populated.  Let's store our member information.
  module_base_address_ = module_entry.modBaseAddr;
  module_base_size_ = module_entry.modBaseSize;
-  strcpy(module_name_, module_entry.szModule);
  return true;
 }

@ -636,14 +610,6 @@ void WindowsInfo::Unpatch() {

 // You should hold the patch_all_modules_lock when calling this.
 void PatchOneModuleLocked(const LibcInfo& me_info) {
-  // Double-check we haven't seen this module before.
-  for (int i = 0; i < sizeof(g_module_libcs)/sizeof(*g_module_libcs); i++) {
-    if (g_module_libcs[i]->SameAs(me_info)) {
-      fprintf(stderr, "%s:%d: FATAL PERFTOOLS ERROR: %s double-patched somehow.\n",
-              __FILE__, __LINE__, g_module_libcs[i]->module_name());
-      CHECK(false);
-    }
-  }
  // If we don't already have info on this module, let's add it.  This
  // is where we're sad that each libcX has a different type, so we
  // can't use an array; instead, we have to use a switch statement.
@ -686,52 +652,70 @@ void PatchMainExecutableLocked() {
 // patch_all_modules_lock, inside PatchAllModules().
 static SpinLock patch_all_modules_lock(SpinLock::LINKER_INITIALIZED);

+// last_loaded: The set of modules that were loaded the last time
+// PatchAllModules was called.  This is an optimization for only
+// looking at modules that were added or removed from the last call.
+static std::set<HMODULE> *g_last_loaded;
+
 // Iterates over all the modules currently loaded by the executable,
-// and makes sure they're all patched.  For ones that aren't, we patch
-// them in.  We also check that every module we had patched in the
-// past is still loaded, and update internal data structures if so.
-// We return true if this PatchAllModules did any work, false else.
+// according to windows, and makes sure they're all patched.  Most
+// modules will already be in loaded_modules, meaning we have already
+// loaded and either patched them or determined they did not need to
+// be patched.  Others will not, which means we need to patch them
+// (if necessary).  Finally, we have to go through the existing
+// g_module_libcs and see if any of those are *not* in the modules
+// currently loaded by the executable.  If so, we need to invalidate
+// them.  Returns true if we did any work (patching or invalidating),
+// false if we were a noop.  May update loaded_modules as well.
+// NOTE: you must hold the patch_all_modules_lock to access loaded_modules.
 bool PatchAllModules() {
  std::vector<ModuleEntryCopy> modules;
  bool made_changes = false;

  const HANDLE hCurrentProcess = GetCurrentProcess();
-  MODULEINFO mi;
-  DWORD cbNeeded = 0;
+  DWORD num_modules = 0;
  HMODULE hModules[kMaxModules];  // max # of modules we support in one process
-  if (::EnumProcessModules(hCurrentProcess, hModules, sizeof(hModules),
-                           &cbNeeded)) {
-    for (int i = 0; i < cbNeeded / sizeof(*hModules); ++i) {
-      if (i >= kMaxModules) {
-        printf("PERFTOOLS ERROR: Too many modules in this executable to try"
-               " to patch them all (if you need to, raise kMaxModules in"
-               " patch_functions.cc).\n");
-        break;
-      }
-      if (::GetModuleInformation(hCurrentProcess, hModules[i], &mi, sizeof(mi)))
-        modules.push_back(ModuleEntryCopy(hCurrentProcess, hModules[i], mi));
-    }
+  if (!::EnumProcessModules(hCurrentProcess, hModules, sizeof(hModules),
+                            &num_modules)) {
+    num_modules = 0;
+  }
+  // EnumProcessModules actually set the bytes written into hModules,
+  // so we need to divide to make num_modules actually be a module-count.
+  num_modules /= sizeof(*hModules);
+  if (num_modules >= kMaxModules) {
+    printf("PERFTOOLS ERROR: Too many modules in this executable to try"
+           " to patch them all (if you need to, raise kMaxModules in"
+           " patch_functions.cc).\n");
+    num_modules = kMaxModules;
  }

-  // Now do the actual patching and unpatching.
+  // Now we handle the unpatching of modules we have in g_module_libcs
+  // but that were not found in EnumProcessModules.  We need to
+  // invalidate them.  To speed that up, we store the EnumProcessModules
+  // output in a set.
+  // At the same time, we prepare for the adding of new modules, by
+  // removing from hModules all the modules we know we've already
+  // patched (or decided don't need to be patched).  At the end,
+  // hModules will hold only the modules that we need to consider patching.
+  std::set<HMODULE> currently_loaded_modules;
  {
    SpinLockHolder h(&patch_all_modules_lock);
-    for (int i = 0; i < sizeof(g_module_libcs)/sizeof(*g_module_libcs); i++) {
-      if (!g_module_libcs[i]->is_valid())
-        continue;
-      bool still_loaded = false;
-      for (std::vector<ModuleEntryCopy>::iterator it = modules.begin();
-           it != modules.end(); ++it) {
-        if (g_module_libcs[i]->SameAsModuleEntry(*it)) {
-          // Both g_module_libcs[i] and it are still valid.  Mark it by
-          // removing it from the vector; mark g_module_libcs[i] by
-          // setting a bool.
-          modules.erase(it);
-          still_loaded = true;
-          break;
-        }
+    if (!g_last_loaded)  g_last_loaded = new std::set<HMODULE>;
+    // At the end of this loop, currently_loaded_modules contains the
+    // full list of EnumProcessModules, and hModules just the ones we
+    // haven't handled yet.
+    for (int i = 0; i < num_modules; ) {
+      currently_loaded_modules.insert(hModules[i]);
+      if (g_last_loaded->count(hModules[i]) > 0) {
+        hModules[i] = hModules[--num_modules];  // replace element i with tail
+      } else {
+        i++;                                    // keep element i
      }
-      if (!still_loaded) {
+    }
+    // Now we do the unpatching/invalidation.
+    for (int i = 0; i < sizeof(g_module_libcs)/sizeof(*g_module_libcs); i++) {
+      if (g_module_libcs[i]->patched() &&
+          currently_loaded_modules.count(g_module_libcs[i]->hmodule()) == 0) {
        // Means g_module_libcs[i] is no longer loaded (no me32 matched).
        // We could call Unpatch() here, but why bother?  The module
        // has gone away, so nobody is going to call into it anyway.
@ -739,14 +723,28 @@ bool PatchAllModules() {
        made_changes = true;
      }
    }
+    // Update the loaded module cache.
+    g_last_loaded->swap(currently_loaded_modules);
+  }

-    // We've handled all the g_module_libcs.  Now let's handle the rest
-    // of the module-entries: those that haven't already been loaded.
-    for (std::vector<ModuleEntryCopy>::const_iterator it = modules.begin();
+  // Now that we know what modules are new, let's get the info we'll
+  // need to patch them.  Note this *cannot* be done while holding the
+  // lock, since it needs to make windows calls (see the lock-inversion
+  // comments before the definition of patch_all_modules_lock).
+  MODULEINFO mi;
+  for (int i = 0; i < num_modules; i++) {
+    if (::GetModuleInformation(hCurrentProcess, hModules[i], &mi, sizeof(mi)))
+      modules.push_back(ModuleEntryCopy(mi));
+  }
+
+  // Now we can do the patching of new modules.
+  {
+    SpinLockHolder h(&patch_all_modules_lock);
+    for (std::vector<ModuleEntryCopy>::iterator it = modules.begin();
         it != modules.end(); ++it) {
      LibcInfo libc_info;
      if (libc_info.PopulateWindowsFn(*it)) { // true==module has libc routines
-        PatchOneModuleLocked(libc_info);      // updates num_patched_modules
+        PatchOneModuleLocked(libc_info);
        made_changes = true;
      }
    }
@ -759,6 +757,10 @@ bool PatchAllModules() {
      made_changes = true;
    }
  }
+  // TODO(csilvers): for this to be reliable, we need to also take
+  // into account if we *would* have patched any modules had they not
+  // already been loaded.  (That is, made_changes should ignore
+  // g_last_loaded.)
  return made_changes;
 }

@ -766,59 +768,9 @@ bool PatchAllModules() {
 }  // end unnamed namespace

 // ---------------------------------------------------------------------
-// PatchWindowsFunctions()
-//    This is the function that is exposed to the outside world.
-//    It should be called before the program becomes multi-threaded,
-//    since main_executable_windows.Patch() is not thread-safe.
-// ---------------------------------------------------------------------
-
-void PatchWindowsFunctions() {
-  // This does the libc patching in every module, and the main executable.
-  PatchAllModules();
-  main_executable_windows.Patch();
-}
-
-#if 0
-// It's possible to unpatch all the functions when we are exiting.
-
-// The idea is to handle properly windows-internal data that is
-// allocated before PatchWindowsFunctions is called.  If all
-// destruction happened in reverse order from construction, then we
-// could call UnpatchWindowsFunctions at just the right time, so that
-// that early-allocated data would be freed using the windows
-// allocation functions rather than tcmalloc.  The problem is that
-// windows allocates some structures lazily, so it would allocate them
-// late (using tcmalloc) and then try to deallocate them late as well.
-// So instead of unpatching, we just modify all the tcmalloc routines
-// so they call through to the libc rountines if the memory in
-// question doesn't seem to have been allocated with tcmalloc.  I keep
-// this unpatch code around for reference.
-
-void UnpatchWindowsFunctions() {
-  // We need to go back to the system malloc/etc at global destruct time,
-  // so objects that were constructed before tcmalloc, using the system
-  // malloc, can destroy themselves using the system free.  This depends
-  // on DLLs unloading in the reverse order in which they load!
-  //
-  // We also go back to the default HeapAlloc/etc, just for consistency.
-  // Who knows, it may help avoid weird bugs in some situations.
-  main_executable_windows.Unpatch();
-  main_executable.Unpatch();
-  if (libc1.is_valid()) libc1.Unpatch();
-  if (libc2.is_valid()) libc2.Unpatch();
-  if (libc3.is_valid()) libc3.Unpatch();
-  if (libc4.is_valid()) libc4.Unpatch();
-  if (libc5.is_valid()) libc5.Unpatch();
-  if (libc6.is_valid()) libc6.Unpatch();
-  if (libc7.is_valid()) libc7.Unpatch();
-  if (libc8.is_valid()) libc8.Unpatch();
-}
-#endif
-
-// ---------------------------------------------------------------------
-// Now that we've done all the patching machinery, let's end the file
-// by actually defining the functions we're patching in.  Mostly these
-// are simple wrappers around the do_* routines in tcmalloc.cc.
+// Now that we've done all the patching machinery, let's actually
+// define the functions we're patching in.  Mostly these are
+// simple wrappers around the do_* routines in tcmalloc.cc.
 //
 // In fact, we #include tcmalloc.cc to get at the tcmalloc internal
 // do_* functions, the better to write our own hook functions.
@ -1029,19 +981,107 @@ BOOL WINAPI WindowsInfo::Perftools_UnmapViewOfFile(LPCVOID lpBaseAddress) {
              lpBaseAddress);
 }

+// g_load_map holds a copy of windows' refcount for how many times
+// each currently loaded module has been loaded and unloaded.  We use
+// it as an optimization when the same module is loaded more than
+// once: as long as the refcount stays above 1, we don't need to worry
+// about patching because it's already patched.  Likewise, we don't
+// need to unpatch until the refcount drops to 0.  load_map is
+// maintained in LoadLibraryExW and FreeLibrary, and only covers
+// modules explicitly loaded/freed via those interfaces.
+static std::map<HMODULE, int>* g_load_map = NULL;
+
 HMODULE WINAPI WindowsInfo::Perftools_LoadLibraryExW(LPCWSTR lpFileName,
                                                     HANDLE hFile,
                                                     DWORD dwFlags) {
-  HMODULE rv = ((HMODULE (WINAPI *)(LPCWSTR, HANDLE, DWORD))
-                function_info_[kLoadLibraryExW].origstub_fn)(
-                    lpFileName, hFile, dwFlags);
-  PatchAllModules();
-  return rv;
+  HMODULE rv;
+  // Check to see if the modules is already loaded, flag 0 gets a
+  // reference if it was loaded.  If it was loaded no need to call
+  // PatchAllModules, just increase the reference count to match
+  // what GetModuleHandleExW does internally inside windows.
+  if (::GetModuleHandleExW(0, lpFileName, &rv)) {
+    return rv;
+  } else {
+    // Not already loaded, so load it.
+    rv = ((HMODULE (WINAPI *)(LPCWSTR, HANDLE, DWORD))
+                  function_info_[kLoadLibraryExW].origstub_fn)(
+                      lpFileName, hFile, dwFlags);
+    // This will patch any newly loaded libraries, if patching needs
+    // to be done.
+    PatchAllModules();
+
+    return rv;
+  }
 }

 BOOL WINAPI WindowsInfo::Perftools_FreeLibrary(HMODULE hLibModule) {
  BOOL rv = ((BOOL (WINAPI *)(HMODULE))
             function_info_[kFreeLibrary].origstub_fn)(hLibModule);
+
+  // Check to see if the module is still loaded by passing the base
+  // address and seeing if it comes back with the same address.  If it
+  // is the same address it's still loaded, so the FreeLibrary() call
+  // was a noop, and there's no need to redo the patching.
+  HMODULE owner = NULL;
+  BOOL result = ::GetModuleHandleExW(
+      (GET_MODULE_HANDLE_EX_FLAG_FROM_ADDRESS |
+       GET_MODULE_HANDLE_EX_FLAG_UNCHANGED_REFCOUNT),
+      (LPCWSTR)hLibModule,
+      &owner);
+  if (result && owner == hLibModule)
+    return rv;
+
  PatchAllModules();    // this will fix up the list of patched libraries
  return rv;
 }
+
+
+// ---------------------------------------------------------------------
+// PatchWindowsFunctions()
+//    This is the function that is exposed to the outside world.
+//    It should be called before the program becomes multi-threaded,
+//    since main_executable_windows.Patch() is not thread-safe.
+// ---------------------------------------------------------------------
+
+void PatchWindowsFunctions() {
+  // This does the libc patching in every module, and the main executable.
+  PatchAllModules();
+  main_executable_windows.Patch();
+}
+
+#if 0
+// It's possible to unpatch all the functions when we are exiting.
+
+// The idea is to handle properly windows-internal data that is
+// allocated before PatchWindowsFunctions is called.  If all
+// destruction happened in reverse order from construction, then we
+// could call UnpatchWindowsFunctions at just the right time, so that
+// that early-allocated data would be freed using the windows
+// allocation functions rather than tcmalloc.  The problem is that
+// windows allocates some structures lazily, so it would allocate them
+// late (using tcmalloc) and then try to deallocate them late as well.
+// So instead of unpatching, we just modify all the tcmalloc routines
+// so they call through to the libc rountines if the memory in
+// question doesn't seem to have been allocated with tcmalloc.  I keep
+// this unpatch code around for reference.
+
+void UnpatchWindowsFunctions() {
+  // We need to go back to the system malloc/etc at global destruct time,
+  // so objects that were constructed before tcmalloc, using the system
+  // malloc, can destroy themselves using the system free.  This depends
+  // on DLLs unloading in the reverse order in which they load!
+  //
+  // We also go back to the default HeapAlloc/etc, just for consistency.
+  // Who knows, it may help avoid weird bugs in some situations.
+  main_executable_windows.Unpatch();
+  main_executable.Unpatch();
+  if (libc1.is_valid()) libc1.Unpatch();
+  if (libc2.is_valid()) libc2.Unpatch();
+  if (libc3.is_valid()) libc3.Unpatch();
+  if (libc4.is_valid()) libc4.Unpatch();
+  if (libc5.is_valid()) libc5.Unpatch();
+  if (libc6.is_valid()) libc6.Unpatch();
+  if (libc7.is_valid()) libc7.Unpatch();
+  if (libc8.is_valid()) libc8.Unpatch();
+}
+#endif