Added Google Benchmark library (nw)

Included sample benchmark for eminline for native and noasm. Made GoogleTest compile only if tests are compiled.
2025-04-18 22:49:58 +03:00 · 2016-01-29 11:47:40 +01:00 · 2016-01-29 11:47:40 +01:00 · 042050ef67
commit 042050ef67
parent 1319453a84
69 changed files with 6852 additions and 64 deletions
--- a/.gitignore
+++ b/.gitignore
@ -2,6 +2,7 @@
 /*
 /*/
 !/3rdparty/
+!/benchmarks/
 !/artwork/
 !/docs/
 !/hash/
--- a/3rdparty/benchmark/.gitignore
+++ b/3rdparty/benchmark/.gitignore
@ -0,0 +1,46 @@
+*.a
+*.so
+*.so.?*
+*.dll
+*.exe
+*.dylib
+*.cmake
+!/cmake/*.cmake
+*~
+*.pyc
+__pycache__
+
+# lcov
+*.lcov
+/lcov
+
+# cmake files.
+/Testing
+CMakeCache.txt
+CMakeFiles/
+cmake_install.cmake
+
+# makefiles.
+Makefile
+
+# in-source build.
+bin/
+lib/
+/test/*_test
+
+# exuberant ctags.
+tags
+
+# YouCompleteMe configuration.
+.ycm_extra_conf.pyc
+
+# ninja generated files.
+.ninja_deps
+.ninja_log
+build.ninja
+install_manifest.txt
+rules.ninja
+
+# out-of-source build top-level folders.
+build/
+_build/
--- a/3rdparty/benchmark/.travis-setup.sh
+++ b/3rdparty/benchmark/.travis-setup.sh
@ -0,0 +1,26 @@
+#!/usr/bin/env bash
+
+# Before install
+
+sudo add-apt-repository -y ppa:kalakris/cmake
+if [ "$STD" = "c++11" ]; then
+    sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test
+    if [ "$CXX" = "clang++" ]; then
+        wget -O - http://llvm.org/apt/llvm-snapshot.gpg.key | sudo apt-key add -
+        sudo add-apt-repository -y "deb http://llvm.org/apt/precise/ llvm-toolchain-precise-3.6 main"
+    fi
+fi
+sudo apt-get update -qq
+
+# Install
+sudo apt-get install -qq cmake
+if [ "$STD" = "c++11" ] && [ "$CXX" = "g++" ]; then
+    sudo apt-get install -qq gcc-4.8 g++-4.8
+    sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-4.8 90
+    sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-4.8 90
+elif [ "$CXX" = "clang++" ]; then
+    sudo apt-get install -qq clang-3.6
+    sudo update-alternatives --install /usr/local/bin/clang   clang   /usr/bin/clang-3.6 90
+    sudo update-alternatives --install /usr/local/bin/clang++ clang++ /usr/bin/clang++-3.6 90
+    export PATH=/usr/local/bin:$PATH
+fi
--- a/3rdparty/benchmark/.travis.yml
+++ b/3rdparty/benchmark/.travis.yml
@ -0,0 +1,41 @@
+language: cpp
+
+# NOTE: The COMPILER variable is unused. It simply makes the display on
+# travis-ci.org more readable.
+matrix:
+    include:
+        - compiler: gcc
+          env: COMPILER=g++-4.6     STD=c++0x BUILD_TYPE=Coverage
+        - compiler: gcc
+          env: COMPILER=g++-4.6     STD=c++0x BUILD_TYPE=Debug
+        - compiler: gcc
+          env: COMPILER=g++-4.6     STD=c++0x BUILD_TYPE=Release
+        - compiler: gcc
+          env: COMPILER=g++-4.8     STD=c++11 BUILD_TYPE=Debug
+        - compiler: gcc
+          env: COMPILER=g++-4.8     STD=c++11 BUILD_TYPE=Release
+        - compiler: clang
+          env: COMPILER=clang++-3.6 STD=c++11 BUILD_TYPE=Debug
+        - compiler: clang
+          env: COMPILER=clang++-3.6 STD=c++11 BUILD_TYPE=Release
+
+before_script:
+    - source .travis-setup.sh
+    - mkdir build && cd build
+
+install:
+  - if [ "${BUILD_TYPE}" == "Coverage" -a "${TRAVIS_OS_NAME}" == "linux" ]; then
+      PATH=~/.local/bin:${PATH};
+      pip install --user --upgrade pip;
+      pip install --user cpp-coveralls;
+    fi
+
+script:
+    - cmake .. -DCMAKE_BUILD_TYPE=${BUILD_TYPE} -DCMAKE_CXX_FLAGS="-std=${STD}"
+    - make
+    - make CTEST_OUTPUT_ON_FAILURE=1 test
+
+after_success:
+  - if [ "${BUILD_TYPE}" == "Coverage" -a "${TRAVIS_OS_NAME}" == "linux" ]; then
+      coveralls --include src --include include --gcov-options '\-lp' --root .. --build-root .;
+    fi
--- a/3rdparty/benchmark/.ycm_extra_conf.py
+++ b/3rdparty/benchmark/.ycm_extra_conf.py
@ -0,0 +1,115 @@
+import os
+import ycm_core
+
+# These are the compilation flags that will be used in case there's no
+# compilation database set (by default, one is not set).
+# CHANGE THIS LIST OF FLAGS. YES, THIS IS THE DROID YOU HAVE BEEN LOOKING FOR.
+flags = [
+'-Wall',
+'-Werror',
+'-pendantic-errors',
+'-std=c++0x',
+'-fno-strict-aliasing',
+'-O3',
+'-DNDEBUG',
+# ...and the same thing goes for the magic -x option which specifies the
+# language that the files to be compiled are written in. This is mostly
+# relevant for c++ headers.
+# For a C project, you would set this to 'c' instead of 'c++'.
+'-x', 'c++',
+'-I', 'include',
+'-isystem', '/usr/include',
+'-isystem', '/usr/local/include',
+]
+
+
+# Set this to the absolute path to the folder (NOT the file!) containing the
+# compile_commands.json file to use that instead of 'flags'. See here for
+# more details: http://clang.llvm.org/docs/JSONCompilationDatabase.html
+#
+# Most projects will NOT need to set this to anything; you can just change the
+# 'flags' list of compilation flags. Notice that YCM itself uses that approach.
+compilation_database_folder = ''  # empty by default, so the 'flags' list above is used
+
+if os.path.exists( compilation_database_folder ):
+  database = ycm_core.CompilationDatabase( compilation_database_folder )
+else:
+  database = None  # FlagsForFile() falls back to the static 'flags' list
+
+SOURCE_EXTENSIONS = [ '.cc' ]  # suffixes tried when looking up flags for a header
+
+def DirectoryOfThisScript():  # absolute path of the directory containing this file
+  return os.path.dirname( os.path.abspath( __file__ ) )
+
+
+def MakeRelativePathsInFlagsAbsolute( flags, working_directory ):  # returns a new list of flags
+  if not working_directory:  # no base directory: hand back an unmodified copy
+    return list( flags )
+  new_flags = []
+  make_next_absolute = False  # set when the previous flag was a bare path flag
+  path_flags = [ '-isystem', '-I', '-iquote', '--sysroot=' ]
+  for flag in flags:
+    new_flag = flag
+
+    if make_next_absolute:  # this flag is the path argument of the previous flag
+      make_next_absolute = False
+      if not flag.startswith( '/' ):  # paths starting with '/' are left as they are
+        new_flag = os.path.join( working_directory, flag )
+
+    for path_flag in path_flags:
+      if flag == path_flag:  # separate form, e.g. '-I', 'include'
+        make_next_absolute = True
+        break
+
+      if flag.startswith( path_flag ):  # joined form, e.g. '-Iinclude'
+        path = flag[ len( path_flag ): ]
+        new_flag = path_flag + os.path.join( working_directory, path )
+        break
+
+    if new_flag:  # empty-string flags are dropped
+      new_flags.append( new_flag )
+  return new_flags
+
+
+def IsHeaderFile( filename ):  # True when the file extension is a known header suffix
+  extension = os.path.splitext( filename )[ 1 ]  # includes the leading dot
+  return extension in [ '.h', '.hxx', '.hpp', '.hh' ]
+
+
+def GetCompilationInfoForFile( filename ):
+  # The compile_commands.json file generated by CMake does not have entries
+  # for header files. So we do our best by asking the db for flags for a
+  # corresponding source file, if any. If one exists, the flags for that file
+  # should be good enough.
+  if IsHeaderFile( filename ):
+    basename = os.path.splitext( filename )[ 0 ]
+    for extension in SOURCE_EXTENSIONS:
+      replacement_file = basename + extension  # same path with a source suffix
+      if os.path.exists( replacement_file ):
+        compilation_info = database.GetCompilationInfoForFile(
+          replacement_file )
+        if compilation_info.compiler_flags_:
+          return compilation_info
+    return None  # no sibling source file yielded any flags
+  return database.GetCompilationInfoForFile( filename )
+
+
+def FlagsForFile( filename, **kwargs ):  # returns a dict of flags, or None if the database has no entry
+  if database:
+    # Bear in mind that compilation_info.compiler_flags_ does NOT return a
+    # python list, but a "list-like" StringVec object
+    compilation_info = GetCompilationInfoForFile( filename )
+    if not compilation_info:
+      return None
+
+    final_flags = MakeRelativePathsInFlagsAbsolute(
+      compilation_info.compiler_flags_,
+      compilation_info.compiler_working_dir_ )
+  else:  # no compilation database: use the static 'flags' list
+    relative_to = DirectoryOfThisScript()  # relative paths resolve against this script's directory
+    final_flags = MakeRelativePathsInFlagsAbsolute( flags, relative_to )
+
+  return {
+    'flags': final_flags,
+    'do_cache': True
+  }
--- a/3rdparty/benchmark/AUTHORS
+++ b/3rdparty/benchmark/AUTHORS
@ -0,0 +1,30 @@
+# This is the official list of benchmark authors for copyright purposes.
+# This file is distinct from the CONTRIBUTORS files.
+# See the latter for an explanation.
+#
+# Names should be added to this file as:
+#	Name or Organization <email address>
+# The email address is not required for organizations.
+#
+# Please keep the list sorted.
+
+Arne Beer <arne@twobeer.de>
+Christopher Seymour <chris.j.seymour@hotmail.com>
+David Coeurjolly <david.coeurjolly@liris.cnrs.fr>
+Dominic Hamon <dma@stripysock.com>
+Eugene Zhuk <eugene.zhuk@gmail.com>
+Evgeny Safronov <division494@gmail.com>
+Felix Homann <linuxaudio@showlabor.de>
+Google Inc.
+JianXiong Zhou <zhoujianxiong2@gmail.com>
+Kaito Udagawa <umireon@gmail.com>
+Lei Xu <eddyxu@gmail.com>
+Matt Clarkson <mattyclarkson@gmail.com>
+Oleksandr Sochka <sasha.sochka@gmail.com>
+Paul Redmond <paul.redmond@gmail.com>
+Radoslav Yovchev <radoslav.tm@gmail.com>
+Shuo Chen <chenshuo@chenshuo.com>
+Yusuke Suzuki <utatane.tea@gmail.com>
+Dirac Research 
+Zbigniew Skowron <zbychs@gmail.com>
+Dominik Czarnota <dominik.b.czarnota@gmail.com>
--- a/3rdparty/benchmark/CMakeLists.txt
+++ b/3rdparty/benchmark/CMakeLists.txt
@ -0,0 +1,112 @@
+cmake_minimum_required (VERSION 2.8.11)
+project (benchmark)
+
+option(BENCHMARK_ENABLE_TESTING "Enable testing of the benchmark library." ON)
+option(BENCHMARK_ENABLE_LTO "Enable link time optimisation of the benchmark library." OFF)
+# Make sure we can import our CMake functions
+list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
+
+# Read the git tags to determine the project version
+include(GetGitVersion)
+get_git_version(GIT_VERSION)
+
+# Tell the user what versions we are using
+string(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+" VERSION ${GIT_VERSION})
+message("-- Version: ${VERSION}")
+
+# The version of the libraries
+set(GENERIC_LIB_VERSION ${VERSION})
+string(SUBSTRING ${VERSION} 0 1 GENERIC_LIB_SOVERSION)
+
+# Import our CMake modules
+include(CheckCXXCompilerFlag)
+include(AddCXXCompilerFlag)
+include(CXXFeatureCheck)
+
+# Try and enable C++11. Don't use C++14 because it doesn't work in some
+# configurations.
+add_cxx_compiler_flag(-std=c++11)
+if (NOT HAVE_CXX_FLAG_STD_CXX11)
+  add_cxx_compiler_flag(-std=c++0x)
+endif()
+
+# Turn compiler warnings up to 11
+if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC")
+  add_cxx_compiler_flag(-W4)
+  add_definitions(-D_CRT_SECURE_NO_WARNINGS)
+else()
+  add_cxx_compiler_flag(-Wall)
+endif()
+add_cxx_compiler_flag(-Wextra)
+add_cxx_compiler_flag(-Wshadow)
+add_cxx_compiler_flag(-Werror RELEASE)
+add_cxx_compiler_flag(-pedantic)
+add_cxx_compiler_flag(-pedantic-errors)
+add_cxx_compiler_flag(-Wshorten-64-to-32)
+add_cxx_compiler_flag(-Wfloat-equal)
+add_cxx_compiler_flag(-Wzero-as-null-pointer-constant)
+add_cxx_compiler_flag(-fstrict-aliasing)
+if (HAVE_CXX_FLAG_FSTRICT_ALIASING)
+  add_cxx_compiler_flag(-Wstrict-aliasing)
+endif()
+add_cxx_compiler_flag(-Wthread-safety)
+if (HAVE_WTHREAD_SAFETY)
+  add_definitions(-DHAVE_WTHREAD_SAFETY)
+  cxx_feature_check(THREAD_SAFETY_ATTRIBUTES)
+endif()
+
+# Link time optimisation
+if (BENCHMARK_ENABLE_LTO)
+  add_cxx_compiler_flag(-flto)
+  if ("${CMAKE_C_COMPILER_ID}" STREQUAL "GNU")
+    find_program(GCC_AR gcc-ar)
+    if (GCC_AR)
+      set(CMAKE_AR ${GCC_AR})
+    endif()
+    find_program(GCC_RANLIB gcc-ranlib)
+    if (GCC_RANLIB)
+      set(CMAKE_RANLIB ${GCC_RANLIB})
+    endif()
+  endif()
+endif()
+
+# Coverage build type
+set(CMAKE_CXX_FLAGS_COVERAGE "${CMAKE_CXX_FLAGS_DEBUG}" CACHE STRING
+  "Flags used by the C++ compiler during coverage builds."
+  FORCE)
+set(CMAKE_EXE_LINKER_FLAGS_COVERAGE
+  "${CMAKE_EXE_LINKER_FLAGS_DEBUG}" CACHE STRING
+  "Flags used for linking binaries during coverage builds."
+  FORCE)
+set(CMAKE_SHARED_LINKER_FLAGS_COVERAGE
+  "${CMAKE_SHARED_LINKER_FLAGS_DEBUG}" CACHE STRING
+  "Flags used by the shared libraries linker during coverage builds."
+  FORCE)
+mark_as_advanced(
+  CMAKE_CXX_FLAGS_COVERAGE
+  CMAKE_EXE_LINKER_FLAGS_COVERAGE
+  CMAKE_SHARED_LINKER_FLAGS_COVERAGE)
+set(CMAKE_BUILD_TYPE "${CMAKE_BUILD_TYPE}" CACHE STRING
+  "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel Coverage."
+  FORCE)
+add_cxx_compiler_flag(--coverage COVERAGE)
+
+# C++ feature checks
+cxx_feature_check(STD_REGEX)
+cxx_feature_check(GNU_POSIX_REGEX)
+cxx_feature_check(POSIX_REGEX)
+cxx_feature_check(STEADY_CLOCK)
+
+# Ensure we have pthreads
+find_package(Threads REQUIRED)
+
+# Set up directories
+include_directories(${PROJECT_SOURCE_DIR}/include)
+
+# Build the targets
+add_subdirectory(src)
+
+if (BENCHMARK_ENABLE_TESTING)
+  enable_testing()
+  add_subdirectory(test)
+endif()
--- a/3rdparty/benchmark/CONTRIBUTING.md
+++ b/3rdparty/benchmark/CONTRIBUTING.md
@ -0,0 +1,58 @@
+# How to contribute #
+
+We'd love to accept your patches and contributions to this project.  There are
+just a few small guidelines you need to follow.
+
+
+## Contributor License Agreement ##
+
+Contributions to any Google project must be accompanied by a Contributor
+License Agreement.  This is not a copyright **assignment**, it simply gives
+Google permission to use and redistribute your contributions as part of the
+project.
+
+  * If you are an individual writing original source code and you're sure you
+    own the intellectual property, then you'll need to sign an [individual
+    CLA][].
+
+  * If you work for a company that wants to allow you to contribute your work,
+    then you'll need to sign a [corporate CLA][].
+
+You generally only need to submit a CLA once, so if you've already submitted
+one (even if it was for a different project), you probably don't need to do it
+again.
+
+[individual CLA]: https://developers.google.com/open-source/cla/individual
+[corporate CLA]: https://developers.google.com/open-source/cla/corporate
+
+Once your CLA is submitted (or if you already submitted one for
+another Google project), make a commit adding yourself to the
+[AUTHORS][] and [CONTRIBUTORS][] files. This commit can be part
+of your first [pull request][].
+
+[AUTHORS]: AUTHORS
+[CONTRIBUTORS]: CONTRIBUTORS
+
+
+## Submitting a patch ##
+
+  1. It's generally best to start by opening a new issue describing the bug or
+     feature you're intending to fix.  Even if you think it's relatively minor,
+     it's helpful to know what people are working on.  Mention in the initial
+     issue that you are planning to work on that bug or feature so that it can
+     be assigned to you.
+
+  1. Follow the normal process of [forking][] the project, and setup a new
+     branch to work in.  It's important that each group of changes be done in
+     separate branches in order to ensure that a pull request only includes the
+     commits related to that bug or feature.
+
+  1. Do your best to have [well-formed commit messages][] for each change.
+     This provides consistency throughout the project, and ensures that commit
+     messages are able to be formatted properly by various git tools.
+
+  1. Finally, push the commits to your fork and submit a [pull request][].
+
+[forking]: https://help.github.com/articles/fork-a-repo
+[well-formed commit messages]: http://tbaggery.com/2008/04/19/a-note-about-git-commit-messages.html
+[pull request]: https://help.github.com/articles/creating-a-pull-request
--- a/3rdparty/benchmark/CONTRIBUTORS
+++ b/3rdparty/benchmark/CONTRIBUTORS
@ -0,0 +1,46 @@
+# People who have agreed to one of the CLAs and can contribute patches.
+# The AUTHORS file lists the copyright holders; this file
+# lists people.  For example, Google employees are listed here
+# but not in AUTHORS, because Google holds the copyright.
+#
+# Names should be added to this file only after verifying that
+# the individual or the individual's organization has agreed to
+# the appropriate Contributor License Agreement, found here:
+#
+# https://developers.google.com/open-source/cla/individual
+# https://developers.google.com/open-source/cla/corporate
+#
+# The agreement for individuals can be filled out on the web.
+#
+# When adding J Random Contributor's name to this file,
+# either J's name or J's organization's name should be
+# added to the AUTHORS file, depending on whether the
+# individual or corporate CLA was used.
+#
+# Names should be added to this file as:
+#     Name <email address>
+#
+# Please keep the list sorted.
+
+Arne Beer <arne@twobeer.de>
+Chris Kennelly <ckennelly@google.com> <ckennelly@ckennelly.com>
+Christopher Seymour <chris.j.seymour@hotmail.com>
+David Coeurjolly <david.coeurjolly@liris.cnrs.fr>
+Dominic Hamon <dma@stripysock.com>
+Eugene Zhuk <eugene.zhuk@gmail.com>
+Evgeny Safronov <division494@gmail.com>
+Felix Homann <linuxaudio@showlabor.de>
+JianXiong Zhou <zhoujianxiong2@gmail.com>
+Kaito Udagawa <umireon@gmail.com>
+Lei Xu <eddyxu@gmail.com>
+Matt Clarkson <mattyclarkson@gmail.com>
+Oleksandr Sochka <sasha.sochka@gmail.com>
+Pascal Leroy <phl@google.com>
+Paul Redmond <paul.redmond@gmail.com>
+Pierre Phaneuf <pphaneuf@google.com>
+Radoslav Yovchev <radoslav.tm@gmail.com>
+Shuo Chen <chenshuo@chenshuo.com>
+Yusuke Suzuki <utatane.tea@gmail.com>
+Tobias Ulvgård <tobias.ulvgard@dirac.se>
+Zbigniew Skowron <zbychs@gmail.com>
+Dominik Czarnota <dominik.b.czarnota@gmail.com>
--- a/3rdparty/benchmark/LICENSE
+++ b/3rdparty/benchmark/LICENSE
@ -0,0 +1,202 @@
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
--- a/3rdparty/benchmark/README.md
+++ b/3rdparty/benchmark/README.md
@ -0,0 +1,295 @@
+benchmark
+=========
+[![Build Status](https://travis-ci.org/google/benchmark.svg?branch=master)](https://travis-ci.org/google/benchmark)
+[![Build status](https://ci.appveyor.com/api/projects/status/u0qsyp7t1tk7cpxs/branch/master?svg=true)](https://ci.appveyor.com/project/google/benchmark/branch/master)
+[![Coverage Status](https://coveralls.io/repos/google/benchmark/badge.svg)](https://coveralls.io/r/google/benchmark)
+
+A library to support the benchmarking of functions, similar to unit-tests.
+
+Discussion group: https://groups.google.com/d/forum/benchmark-discuss
+
+IRC channel: https://freenode.net #googlebenchmark
+
+Example usage
+-------------
+Define a function that executes the code to be measured a
+specified number of times:
+
+```c++
+static void BM_StringCreation(benchmark::State& state) {
+  while (state.KeepRunning())
+    std::string empty_string;
+}
+// Register the function as a benchmark
+BENCHMARK(BM_StringCreation);
+
+// Define another benchmark
+static void BM_StringCopy(benchmark::State& state) {
+  std::string x = "hello";
+  while (state.KeepRunning())
+    std::string copy(x);
+}
+BENCHMARK(BM_StringCopy);
+
+BENCHMARK_MAIN();
+```
+
+Sometimes a family of microbenchmarks can be implemented with
+just one routine that takes an extra argument to specify which
+one of the family of benchmarks to run.  For example, the following
+code defines a family of microbenchmarks for measuring the speed
+of `memcpy()` calls of different lengths:
+
+```c++
+static void BM_memcpy(benchmark::State& state) {
+  char* src = new char[state.range_x()]; char* dst = new char[state.range_x()];
+  memset(src, 'x', state.range_x());
+  while (state.KeepRunning())
+    memcpy(dst, src, state.range_x());
+  state.SetBytesProcessed(int64_t(state.iterations()) *
+                          int64_t(state.range_x()));
+  delete[] src;
+  delete[] dst;
+}
+BENCHMARK(BM_memcpy)->Arg(8)->Arg(64)->Arg(512)->Arg(1<<10)->Arg(8<<10);
+```
+
+The preceding code is quite repetitive, and can be replaced with the
+following short-hand.  The following invocation will pick a few
+appropriate arguments in the specified range and will generate a
+microbenchmark for each such argument.
+
+```c++
+BENCHMARK(BM_memcpy)->Range(8, 8<<10);
+```
+
+You might have a microbenchmark that depends on two inputs.  For
+example, the following code defines a family of microbenchmarks for
+measuring the speed of set insertion.
+
+```c++
+static void BM_SetInsert(benchmark::State& state) {
+  while (state.KeepRunning()) {
+    state.PauseTiming();
+    std::set<int> data = ConstructRandomSet(state.range_x());
+    state.ResumeTiming();
+    for (int j = 0; j < state.range_y(); ++j)
+      data.insert(RandomNumber());
+  }
+}
+BENCHMARK(BM_SetInsert)
+    ->ArgPair(1<<10, 1)
+    ->ArgPair(1<<10, 8)
+    ->ArgPair(1<<10, 64)
+    ->ArgPair(1<<10, 512)
+    ->ArgPair(8<<10, 1)
+    ->ArgPair(8<<10, 8)
+    ->ArgPair(8<<10, 64)
+    ->ArgPair(8<<10, 512);
+```
+
+The preceding code is quite repetitive, and can be replaced with
+the following short-hand.  The following macro will pick a few
+appropriate arguments in the product of the two specified ranges
+and will generate a microbenchmark for each such pair.
+
+```c++
+BENCHMARK(BM_SetInsert)->RangePair(1<<10, 8<<10, 1, 512);
+```
+
+For more complex patterns of inputs, passing a custom function
+to Apply allows programmatic specification of an
+arbitrary set of arguments to run the microbenchmark on.
+The following example enumerates a dense range on one parameter,
+and a sparse range on the second.
+
+```c++
+static void CustomArguments(benchmark::internal::Benchmark* b) {
+  for (int i = 0; i <= 10; ++i)
+    for (int j = 32; j <= 1024*1024; j *= 8)
+      b->ArgPair(i, j);
+}
+BENCHMARK(BM_SetInsert)->Apply(CustomArguments);
+```
+
+Templated microbenchmarks work the same way. The following example produces
+and then consumes `state.range_x()` messages on every iteration, and
+measures throughput in the absence of multiprogramming.
+
+```c++
+// Pushes and then waits for state.range_x() messages on every iteration.
+// Returns void so that it matches the benchmark signature
+// void(benchmark::State&).
+template <class Q> void BM_Sequential(benchmark::State& state) {
+  Q q;
+  typename Q::value_type v;
+  while (state.KeepRunning()) {
+    for (int i = state.range_x(); i--; )
+      q.push(v);
+    for (int e = state.range_x(); e--; )
+      q.Wait(&v);
+  }
+  // actually messages, not bytes:
+  state.SetBytesProcessed(
+      static_cast<int64_t>(state.iterations())*state.range_x());
+}
+BENCHMARK_TEMPLATE(BM_Sequential, WaitQueue<int>)->Range(1<<0, 1<<10);
+```
+
+Three macros are provided for adding benchmark templates.
+
+```c++
+#if __cplusplus >= 201103L // C++11 and greater.
+#define BENCHMARK_TEMPLATE(func, ...) // Takes any number of parameters.
+#else // C++ < C++11
+#define BENCHMARK_TEMPLATE(func, arg1)
+#endif
+#define BENCHMARK_TEMPLATE1(func, arg1)
+#define BENCHMARK_TEMPLATE2(func, arg1, arg2)
+```
+
+In a multithreaded test (benchmark invoked by multiple threads simultaneously),
+it is guaranteed that none of the threads will start until all have called
+KeepRunning, and all will have finished before KeepRunning returns false. As
+such, any global setup or teardown you want to do can be
+wrapped in a check against the thread index:
+
+```c++
+static void BM_MultiThreaded(benchmark::State& state) {
+  if (state.thread_index == 0) {
+    // Setup code here.
+  }
+  while (state.KeepRunning()) {
+    // Run the test as normal.
+  }
+  if (state.thread_index == 0) {
+    // Teardown code here.
+  }
+}
+BENCHMARK(BM_MultiThreaded)->Threads(2);
+```
+
+If the benchmarked code itself uses threads and you want to compare it to
+single-threaded code, you may want to use real-time ("wallclock") measurements
+for latency comparisons:
+
+```c++
+BENCHMARK(BM_test)->Range(8, 8<<10)->UseRealTime();
+```
+
+Without `UseRealTime`, CPU time is used by default.
+
+To prevent a value or expression from being optimized away by the compiler
+the `benchmark::DoNotOptimize(...)` function can be used.
+
+```c++
+static void BM_test(benchmark::State& state) {
+  while (state.KeepRunning()) {
+      int x = 0;
+      for (int i=0; i < 64; ++i) {
+        benchmark::DoNotOptimize(x += i);
+      }
+  }
+}
+```
+
+Benchmark Fixtures
+------------------
+Fixture tests are created by
+first defining a type that derives from ::benchmark::Fixture and then
+creating/registering the tests using the following macros:
+
+* `BENCHMARK_F(ClassName, Method)`
+* `BENCHMARK_DEFINE_F(ClassName, Method)`
+* `BENCHMARK_REGISTER_F(ClassName, Method)`
+
+For Example:
+
+```c++
+class MyFixture : public benchmark::Fixture {};
+
+BENCHMARK_F(MyFixture, FooTest)(benchmark::State& st) {
+   while (st.KeepRunning()) {
+     ...
+  }
+}
+
+BENCHMARK_DEFINE_F(MyFixture, BarTest)(benchmark::State& st) {
+   while (st.KeepRunning()) {
+     ...
+  }
+}
+/* BarTest is NOT registered */
+BENCHMARK_REGISTER_F(MyFixture, BarTest)->Threads(2);
+/* BarTest is now registered */
+```
+
+Output Formats
+--------------
+The library supports multiple output formats. Use the
+`--benchmark_format=<tabular|json|csv>` flag to set the format type. `tabular` is
+the default format.
+
+The Tabular format is intended to be a human readable format. By default
+the format generates color output. Context is output on stderr and the 
+tabular data on stdout. Example tabular output looks like:
+```
+Benchmark                               Time(ns)    CPU(ns) Iterations
+----------------------------------------------------------------------
+BM_SetInsert/1024/1                        28928      29349      23853  133.097kB/s   33.2742k items/s
+BM_SetInsert/1024/8                        32065      32913      21375  949.487kB/s   237.372k items/s
+BM_SetInsert/1024/10                       33157      33648      21431  1.13369MB/s   290.225k items/s
+```
+
+The JSON format outputs human readable json split into two top level attributes.
+The `context` attribute contains information about the run in general, including
+information about the CPU and the date.
+The `benchmarks` attribute contains a list of every benchmark run. Example json
+output looks like:
+```
+{
+  "context": {
+    "date": "2015/03/17-18:40:25",
+    "num_cpus": 40,
+    "mhz_per_cpu": 2801,
+    "cpu_scaling_enabled": false,
+    "build_type": "debug"
+  },
+  "benchmarks": [
+    {
+      "name": "BM_SetInsert/1024/1",
+      "iterations": 94877,
+      "real_time": 29275,
+      "cpu_time": 29836,
+      "bytes_per_second": 134066,
+      "items_per_second": 33516
+    },
+    {
+      "name": "BM_SetInsert/1024/8",
+      "iterations": 21609,
+      "real_time": 32317,
+      "cpu_time": 32429,
+      "bytes_per_second": 986770,
+      "items_per_second": 246693
+    },
+    {
+      "name": "BM_SetInsert/1024/10",
+      "iterations": 21393,
+      "real_time": 32724,
+      "cpu_time": 33355,
+      "bytes_per_second": 1199226,
+      "items_per_second": 299807
+    }
+  ]
+}
+```
+
+The CSV format outputs comma-separated values. The `context` is output on stderr
+and the CSV itself on stdout. Example CSV output looks like:
+```
+name,iterations,real_time,cpu_time,bytes_per_second,items_per_second,label
+"BM_SetInsert/1024/1",65465,17890.7,8407.45,475768,118942,
+"BM_SetInsert/1024/8",116606,18810.1,9766.64,3.27646e+06,819115,
+"BM_SetInsert/1024/10",106365,17238.4,8421.53,4.74973e+06,1.18743e+06,
+```
+
+Linking against the library
+---------------------------
+When using gcc, it is necessary to link against pthread to avoid runtime exceptions. This is due to how gcc implements std::thread. See [issue #67](https://github.com/google/benchmark/issues/67) for more details.
--- a/3rdparty/benchmark/appveyor.yml
+++ b/3rdparty/benchmark/appveyor.yml
@ -0,0 +1,55 @@
+version: '{build}'
+
+configuration:
+  - Static Debug
+  - Static Release
+#  - Shared Debug
+#  - Shared Release
+
+platform:
+  - x86
+  - x64
+
+environment:
+  matrix:
+    - compiler: gcc-4.9.2-posix
+#    - compiler: gcc-4.8.4-posix
+#    - compiler: msvc-12-seh
+
+install:
+  # derive some extra information
+  - for /f "tokens=1-2" %%a in ("%configuration%") do (@set "linkage=%%a")
+  - for /f "tokens=1-2" %%a in ("%configuration%") do (@set "variant=%%b")
+  - if "%linkage%"=="Shared" (set shared=YES) else (set shared=NO)
+  - for /f "tokens=1-3 delims=-" %%a in ("%compiler%") do (@set "compiler_name=%%a")
+  - for /f "tokens=1-3 delims=-" %%a in ("%compiler%") do (@set "compiler_version=%%b")
+  - for /f "tokens=1-3 delims=-" %%a in ("%compiler%") do (@set "compiler_threading=%%c")
+  - if "%platform%"=="x64" (set arch=x86_64)
+  - if "%platform%"=="x86" (set arch=i686)
+  # download the specific version of MinGW
+  - if "%compiler_name%"=="gcc" (for /f %%a in ('python mingw.py --quiet --version "%compiler_version%" --arch "%arch%" --threading "%compiler_threading%" --location "C:\mingw-builds"') do @set "compiler_path=%%a")
+
+before_build:
+  # Set up mingw commands
+  - if "%compiler_name%"=="gcc" (set "generator=MinGW Makefiles")
+  - if "%compiler_name%"=="gcc" (set "build=mingw32-make -j4")
+  - if "%compiler_name%"=="gcc" (set "test=mingw32-make CTEST_OUTPUT_ON_FAILURE=1 test")
+  # msvc specific commands
+  # TODO :)
+  # add the compiler path if needed
+  - if not "%compiler_path%"=="" (set "PATH=%PATH%;%compiler_path%")
+  # git bash conflicts with MinGW makefiles
+  - if "%generator%"=="MinGW Makefiles" (set "PATH=%PATH:C:\Program Files (x86)\Git\bin=%")
+
+build_script:
+  - cmake -G "%generator%" "-DCMAKE_BUILD_TYPE=%variant%" "-DBUILD_SHARED_LIBS=%shared%"
+  - cmd /c "%build%"
+
+test_script:
+  - cmd /c "%test%"
+
+matrix:
+  fast_finish: true
+
+cache:
+  - C:\mingw-builds
--- a/3rdparty/benchmark/cmake/AddCXXCompilerFlag.cmake
+++ b/3rdparty/benchmark/cmake/AddCXXCompilerFlag.cmake
@ -0,0 +1,37 @@
+# - Adds a compiler flag if it is supported by the compiler
+#
+# This function checks that the supplied compiler flag is supported and then
+# adds it to the corresponding compiler flags
+#
+#  add_cxx_compiler_flag(<FLAG> [<VARIANT>])
+#
+# - Example
+#
+# include(AddCXXCompilerFlag)
+# add_cxx_compiler_flag(-Wall)
+# add_cxx_compiler_flag(-no-strict-aliasing RELEASE)
+# Requires CMake 2.6+
+
+if(__add_cxx_compiler_flag)
+  return()
+endif()
+set(__add_cxx_compiler_flag INCLUDED)
+
+include(CheckCXXCompilerFlag)
+
+# Appends FLAG to CMAKE_CXX_FLAGS in the caller's scope, or to
+# CMAKE_CXX_FLAGS_<VARIANT> when an optional second argument is given, but
+# only if the compiler check below succeeds.
+function(add_cxx_compiler_flag FLAG)
+  # Derive the result-variable name from the flag: upper-case it, map "+" to
+  # "X", replace every other non-identifier character with "_", and collapse
+  # runs of underscores.
+  string(TOUPPER "HAVE_CXX_FLAG_${FLAG}" SANITIZED_FLAG)
+  string(REPLACE "+" "X" SANITIZED_FLAG ${SANITIZED_FLAG})
+  string(REGEX REPLACE "[^A-Za-z_0-9]" "_" SANITIZED_FLAG ${SANITIZED_FLAG})
+  string(REGEX REPLACE "_+" "_" SANITIZED_FLAG ${SANITIZED_FLAG})
+  # The flag is supplied through CMAKE_REQUIRED_FLAGS; the check itself is
+  # invoked with an empty flag string.
+  set(CMAKE_REQUIRED_FLAGS "${FLAG}")
+  check_cxx_compiler_flag("" ${SANITIZED_FLAG})
+  if(${SANITIZED_FLAG})
+    # ARGV1 is the optional VARIANT (e.g. RELEASE). When present it becomes
+    # an upper-cased "_<VARIANT>" suffix; otherwise the suffix stays empty.
+    set(VARIANT ${ARGV1})
+    if(ARGV1)
+      string(TOUPPER "_${VARIANT}" VARIANT)
+    endif()
+    set(CMAKE_CXX_FLAGS${VARIANT} "${CMAKE_CXX_FLAGS${VARIANT}} ${FLAG}" PARENT_SCOPE)
+  endif()
+endfunction()
+
--- a/3rdparty/benchmark/cmake/CXXFeatureCheck.cmake
+++ b/3rdparty/benchmark/cmake/CXXFeatureCheck.cmake
@ -0,0 +1,39 @@
+# - Compile and run code to check for C++ features
+#
+# This function compiles and runs the source file `cmake/<file>.cpp` and,
+# when it builds and exits with status 0, sets the corresponding
+# `HAVE_<FILE>` variable and compile definition
+#
+#  cxx_feature_check(<FILE>)
+#
+# - Example
+#
+# include(CXXFeatureCheck)
+# cxx_feature_check(STD_REGEX)
+# Requires CMake 2.6+
+
+if(__cxx_feature_check)
+  return()
+endif()
+set(__cxx_feature_check INCLUDED)
+
+# Builds and runs cmake/<file>.cpp, where <file> is FILE lower-cased. If the
+# program compiles and exits with 0, sets HAVE_<FILE> (upper-cased) to 1 in
+# the caller's scope and adds -DHAVE_<FILE> as a compile definition;
+# otherwise only reports why the check failed.
+function(cxx_feature_check FILE)
+  # FILE is lower-cased for the source path and upper-cased for the
+  # variable / macro name.
+  string(TOLOWER ${FILE} FILE)
+  string(TOUPPER ${FILE} VAR)
+  string(TOUPPER "HAVE_${VAR}" FEATURE)
+  message("-- Performing Test ${FEATURE}")
+  # RUN_<FEATURE> receives the program's exit status, COMPILE_<FEATURE>
+  # whether the build succeeded.
+  try_run(RUN_${FEATURE} COMPILE_${FEATURE}
+          ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/cmake/${FILE}.cpp)
+  if(RUN_${FEATURE} EQUAL 0)
+    message("-- Performing Test ${FEATURE} -- success")
+    set(HAVE_${VAR} 1 PARENT_SCOPE)
+    add_definitions(-DHAVE_${VAR})
+  else()
+    # Distinguish a build failure from a program that ran but returned a
+    # non-zero status.
+    if(NOT COMPILE_${FEATURE})
+      message("-- Performing Test ${FEATURE} -- failed to compile")
+    else()
+      message("-- Performing Test ${FEATURE} -- compiled but failed to run")
+    endif()
+  endif()
+endfunction()
+
--- a/3rdparty/benchmark/cmake/GetGitVersion.cmake
+++ b/3rdparty/benchmark/cmake/GetGitVersion.cmake
@ -0,0 +1,51 @@
+# - Returns a version string from Git tags
+#
+# This function inspects the annotated git tags for the project and returns a string
+# into a CMake variable
+#
+#  get_git_version(<var>)
+#
+# - Example
+#
+# include(GetGitVersion)
+# get_git_version(GIT_VERSION)
+#
+# Requires CMake 2.8.11+
+find_package(Git)
+
+if(__get_git_version)
+  return()
+endif()
+set(__get_git_version INCLUDED)
+
+# Computes a version string from `git describe` and stores it in <var> in the
+# caller's scope.
+#
+# The base version is "v0.0.0" when Git is unavailable or no matching tag can
+# be described; otherwise it is the described tag with the "-<count>-g" infix
+# collapsed to "-". A "-dirty" suffix is appended when `git diff-index`
+# reports files that differ from HEAD.
+function(get_git_version var)
+  if(GIT_EXECUTABLE)
+      # Run Git in the source tree: out-of-source builds execute CMake from
+      # the build directory, which need not be inside the repository.
+      execute_process(COMMAND ${GIT_EXECUTABLE} describe --match "v[0-9]*.[0-9]*.[0-9]*" --abbrev=8
+          WORKING_DIRECTORY "${PROJECT_SOURCE_DIR}"
+          RESULT_VARIABLE status
+          OUTPUT_VARIABLE GIT_VERSION
+          ERROR_QUIET)
+      # RESULT_VARIABLE holds either an exit code or an error string, so
+      # compare it against 0 instead of expanding it into the condition.
+      if(NOT status EQUAL 0)
+          set(GIT_VERSION "v0.0.0")
+      else()
+          # Quote the value so that empty output cannot change the number of
+          # arguments passed to string().
+          string(STRIP "${GIT_VERSION}" GIT_VERSION)
+          string(REGEX REPLACE "-[0-9]+-g" "-" GIT_VERSION "${GIT_VERSION}")
+      endif()
+
+      # Work out if the repository is dirty
+      execute_process(COMMAND ${GIT_EXECUTABLE} update-index -q --refresh
+          WORKING_DIRECTORY "${PROJECT_SOURCE_DIR}"
+          OUTPUT_QUIET
+          ERROR_QUIET)
+      execute_process(COMMAND ${GIT_EXECUTABLE} diff-index --name-only HEAD --
+          WORKING_DIRECTORY "${PROJECT_SOURCE_DIR}"
+          OUTPUT_VARIABLE GIT_DIFF_INDEX
+          ERROR_QUIET)
+      # Any listed file name means the tree differs from HEAD.
+      string(COMPARE NOTEQUAL "${GIT_DIFF_INDEX}" "" GIT_DIRTY)
+      if(GIT_DIRTY)
+          set(GIT_VERSION "${GIT_VERSION}-dirty")
+      endif()
+  else()
+      set(GIT_VERSION "v0.0.0")
+  endif()
+
+  message("-- git Version: ${GIT_VERSION}")
+  set(${var} "${GIT_VERSION}" PARENT_SCOPE)
+endfunction()
--- a/3rdparty/benchmark/cmake/gnu_posix_regex.cpp
+++ b/3rdparty/benchmark/cmake/gnu_posix_regex.cpp
@ -0,0 +1,12 @@
+#include <gnuregex.h>
+#include <string>
+// Feature-check program: exits with 0 only if the regex API declared in
+// <gnuregex.h> compiles the pattern and matches the sample string.
+int main() {
+  std::string str = "test0159";
+  regex_t re;
+  int ec = regcomp(&re, "^[a-z]+[0-9]+$", REG_EXTENDED | REG_NOSUB);
+  if (ec != 0) {
+    // Pattern compilation failed; report the non-zero error code.
+    return ec;
+  }
+  // A zero result from regexec() becomes exit status 0, anything else -1.
+  return regexec(&re, str.c_str(), 0, nullptr, 0) ? -1 : 0;
+}
+
--- a/3rdparty/benchmark/cmake/posix_regex.cpp
+++ b/3rdparty/benchmark/cmake/posix_regex.cpp
@ -0,0 +1,12 @@
+#include <regex.h>
+#include <string>
+// Feature-check program: exits with 0 only if the regex API declared in
+// <regex.h> compiles the pattern and matches the sample string.
+int main() {
+  std::string str = "test0159";
+  regex_t re;
+  int ec = regcomp(&re, "^[a-z]+[0-9]+$", REG_EXTENDED | REG_NOSUB);
+  if (ec != 0) {
+    // Pattern compilation failed; report the non-zero error code.
+    return ec;
+  }
+  // A zero result from regexec() becomes exit status 0, anything else -1.
+  return regexec(&re, str.c_str(), 0, nullptr, 0) ? -1 : 0;
+}
+
--- a/3rdparty/benchmark/cmake/std_regex.cpp
+++ b/3rdparty/benchmark/cmake/std_regex.cpp
@ -0,0 +1,10 @@
+#include <regex>
+#include <string>
+// Feature-check program: exits with 0 only if std::regex can be built from
+// the extended-syntax pattern and std::regex_search finds a match in the
+// sample string.
+int main() {
+  const std::string str = "test0159";
+  std::regex re;
+  re = std::regex("^[a-z]+[0-9]+$",
+       std::regex_constants::extended | std::regex_constants::nosubs);
+  // Match found -> 0, no match -> -1.
+  return std::regex_search(str, re) ? 0 : -1;
+}
+
--- a/3rdparty/benchmark/cmake/steady_clock.cpp
+++ b/3rdparty/benchmark/cmake/steady_clock.cpp
@ -0,0 +1,7 @@
+#include <chrono>
+
+// Feature-check program: builds only if std::chrono::steady_clock and its
+// time_point / now() members are available.
+int main() {
+    typedef std::chrono::steady_clock Clock;
+    Clock::time_point tp = Clock::now();
+    // Cast to void so the otherwise unused variable does not trigger a warning.
+    ((void)tp);
+}
--- a/3rdparty/benchmark/cmake/thread_safety_attributes.cpp
+++ b/3rdparty/benchmark/cmake/thread_safety_attributes.cpp
@ -0,0 +1,4 @@
+// Compile probe: defines HAVE_THREAD_SAFETY_ATTRIBUTES before including
+// mutex.h and provides an empty main().
+// NOTE(review): presumably this verifies that the compiler accepts the
+// thread-safety annotations enabled by that macro -- mutex.h is not visible
+// here, confirm.
+#define HAVE_THREAD_SAFETY_ATTRIBUTES
+#include "../src/mutex.h"
+
+int main() {}
--- a/3rdparty/benchmark/include/benchmark/benchmark.h
+++ b/3rdparty/benchmark/include/benchmark/benchmark.h
@ -0,0 +1,21 @@
+// Copyright 2015 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#ifndef BENCHMARK_BENCHMARK_H_
+#define BENCHMARK_BENCHMARK_H_
+
+#include "macros.h"
+#include "benchmark_api.h"
+#include "reporter.h"
+
+#endif // BENCHMARK_BENCHMARK_H_
--- a/3rdparty/benchmark/include/benchmark/benchmark_api.h
+++ b/3rdparty/benchmark/include/benchmark/benchmark_api.h
@ -0,0 +1,602 @@
+// Support for registering benchmarks for functions.
+
+/* Example usage:
+// Define a function that executes the code to be measured a
+// specified number of times:
+static void BM_StringCreation(benchmark::State& state) {
+  while (state.KeepRunning())
+    std::string empty_string;
+}
+
+// Register the function as a benchmark
+BENCHMARK(BM_StringCreation);
+
+// Define another benchmark
+static void BM_StringCopy(benchmark::State& state) {
+  std::string x = "hello";
+  while (state.KeepRunning())
+    std::string copy(x);
+}
+BENCHMARK(BM_StringCopy);
+
+// Augment the main() program to invoke benchmarks if specified
+// via the --benchmark_filter command line flag.  E.g.,
+//       my_unittest --benchmark_filter=all
+//       my_unittest --benchmark_filter=BM_StringCreation
+//       my_unittest --benchmark_filter=String
+//       my_unittest --benchmark_filter='Copy|Creation'
+int main(int argc, char** argv) {
+  benchmark::Initialize(&argc, argv);
+  benchmark::RunSpecifiedBenchmarks();
+  return 0;
+}
+
+// Sometimes a family of microbenchmarks can be implemented with
+// just one routine that takes an extra argument to specify which
+// one of the family of benchmarks to run.  For example, the following
+// code defines a family of microbenchmarks for measuring the speed
+// of memcpy() calls of different lengths:
+
+static void BM_memcpy(benchmark::State& state) {
+  char* src = new char[state.range_x()]; char* dst = new char[state.range_x()];
+  memset(src, 'x', state.range_x());
+  while (state.KeepRunning())
+    memcpy(dst, src, state.range_x());
+  state.SetBytesProcessed(int64_t(state.iterations()) *
+                          int64_t(state.range_x()));
+  delete[] src; delete[] dst;
+}
+BENCHMARK(BM_memcpy)->Arg(8)->Arg(64)->Arg(512)->Arg(1<<10)->Arg(8<<10);
+
+// The preceding code is quite repetitive, and can be replaced with the
+// following short-hand.  The following invocation will pick a few
+// appropriate arguments in the specified range and will generate a
+// microbenchmark for each such argument.
+BENCHMARK(BM_memcpy)->Range(8, 8<<10);
+
+// You might have a microbenchmark that depends on two inputs.  For
+// example, the following code defines a family of microbenchmarks for
+// measuring the speed of set insertion.
+static void BM_SetInsert(benchmark::State& state) {
+  while (state.KeepRunning()) {
+    state.PauseTiming();
+    set<int> data = ConstructRandomSet(state.range_x());
+    state.ResumeTiming();
+    for (int j = 0; j < state.range_y(); ++j)
+      data.insert(RandomNumber());
+  }
+}
+BENCHMARK(BM_SetInsert)
+   ->ArgPair(1<<10, 1)
+   ->ArgPair(1<<10, 8)
+   ->ArgPair(1<<10, 64)
+   ->ArgPair(1<<10, 512)
+   ->ArgPair(8<<10, 1)
+   ->ArgPair(8<<10, 8)
+   ->ArgPair(8<<10, 64)
+   ->ArgPair(8<<10, 512);
+
+// The preceding code is quite repetitive, and can be replaced with
+// the following short-hand.  The following macro will pick a few
+// appropriate arguments in the product of the two specified ranges
+// and will generate a microbenchmark for each such pair.
+BENCHMARK(BM_SetInsert)->RangePair(1<<10, 8<<10, 1, 512);
+
+// For more complex patterns of inputs, passing a custom function
+// to Apply allows programmatic specification of an
+// arbitrary set of arguments to run the microbenchmark on.
+// The following example enumerates a dense range on
+// one parameter, and a sparse range on the second.
+static void CustomArguments(benchmark::internal::Benchmark* b) {
+  for (int i = 0; i <= 10; ++i)
+    for (int j = 32; j <= 1024*1024; j *= 8)
+      b->ArgPair(i, j);
+}
+BENCHMARK(BM_SetInsert)->Apply(CustomArguments);
+
+// Templated microbenchmarks work the same way:
+// Produce then consume 'size' messages 'iters' times
+// Measures throughput in the absence of multiprogramming.
+template <class Q> void BM_Sequential(benchmark::State& state) {
+  Q q;
+  typename Q::value_type v;
+  while (state.KeepRunning()) {
+    for (int i = state.range_x(); i--; )
+      q.push(v);
+    for (int e = state.range_x(); e--; )
+      q.Wait(&v);
+  }
+  // actually messages, not bytes:
+  state.SetBytesProcessed(
+      static_cast<int64_t>(state.iterations())*state.range_x());
+}
+BENCHMARK_TEMPLATE(BM_Sequential, WaitQueue<int>)->Range(1<<0, 1<<10);
+
+Use `Benchmark::MinTime(double t)` to set the minimum time used to run the
+benchmark. This option overrides the `benchmark_min_time` flag.
+
+void BM_test(benchmark::State& state) {
+ ... body ...
+}
+BENCHMARK(BM_test)->MinTime(2.0); // Run for at least 2 seconds.
+
+In a multithreaded test, it is guaranteed that none of the threads will start
+until all have called KeepRunning, and all will have finished before KeepRunning
+returns false. As such, any global setup or teardown you want to do can be
+wrapped in a check against the thread index:
+
+static void BM_MultiThreaded(benchmark::State& state) {
+  if (state.thread_index == 0) {
+    // Setup code here.
+  }
+  while (state.KeepRunning()) {
+    // Run the test as normal.
+  }
+  if (state.thread_index == 0) {
+    // Teardown code here.
+  }
+}
+BENCHMARK(BM_MultiThreaded)->Threads(4);
+*/
+
+#ifndef BENCHMARK_BENCHMARK_API_H_
+#define BENCHMARK_BENCHMARK_API_H_
+
+#include <assert.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#include "macros.h"
+
+namespace benchmark {
+class BenchmarkReporter;
+
+void Initialize(int* argc, char** argv);
+
+// Run all benchmarks specified by the --benchmark_filter flag,
+// and exit after running the benchmarks.
+void RunSpecifiedBenchmarks();
+void RunSpecifiedBenchmarks(BenchmarkReporter* reporter);
+
+// If this routine is called, peak memory allocation past this point in the
+// benchmark is reported at the end of the benchmark report line. (It is
+// computed by running the benchmark once with a single iteration and a memory
+// tracer.)
+// TODO(dominic)
+// void MemoryUsage();
+
+namespace internal {
+class Benchmark;
+class BenchmarkImp;
+class BenchmarkFamilies;
+
+// Maps any type T to void. Used below so that naming a nested type of T in a
+// template argument is only well-formed when that nested type exists.
+template <class T> struct Voider {
+    typedef void type;
+};
+
+// Primary template: has no nested `type`, so EnableIfString<T>::type is
+// ill-formed for types without a nested `basic_string` name.
+template <class T, class = void>
+struct EnableIfString {};
+
+// Specialization chosen when T::basic_string names a type; exposes `type`
+// (int), which State::SetLabel uses as a defaulted parameter type.
+template <class T>
+struct EnableIfString<T, typename Voider<typename T::basic_string>::type> {
+    typedef int type;
+};
+
+void UseCharPointer(char const volatile*);
+
+// Take ownership of the pointer and register the benchmark. Return the
+// registered benchmark.
+Benchmark* RegisterBenchmarkInternal(Benchmark*);
+
+} // end namespace internal
+
+
+// The DoNotOptimize(...) function can be used to prevent a value or
+// expression from being optimized away by the compiler. This function is
+// intended to add little to no overhead.
+// See: http://stackoverflow.com/questions/28287064
+//
+// Three implementations are selected by compiler: Clang, other GNU-compatible
+// compilers, and a generic fallback.
+#if defined(__clang__) && defined(__GNUC__)
+// TODO(ericwf): Clang has a bug where it tries to always use a register
+// even if value must be stored in memory. This causes codegen to fail.
+// To work around this we remove the "r" modifier so the operand is always
+// loaded into memory.
+template <class Tp>
+inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp const& value) {
+    // Empty asm statement that lists `value` as a read-write memory operand.
+    asm volatile("" : "+m" (const_cast<Tp&>(value)));
+}
+#elif defined(__GNUC__)
+template <class Tp>
+inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp const& value) {
+    // Empty asm statement that lists `value` as a read-write operand which
+    // may live in a register or in memory.
+    asm volatile("" : "+rm" (const_cast<Tp&>(value)));
+}
+#else
+template <class Tp>
+inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp const& value) {
+    // Fallback: hand the address of `value` to internal::UseCharPointer,
+    // which is declared above and defined outside this header.
+    internal::UseCharPointer(&reinterpret_cast<char const volatile&>(value));
+}
+#endif
+
+
+// State is passed to a running Benchmark and contains state for the
+// benchmark to use: the iteration counter driven by KeepRunning(), the
+// optional range arguments, and the bytes/items counters a benchmark may set.
+class State {
+public:
+  // NOTE(review): defined outside this header; presumably the arguments seed
+  // max_iterations, the range_x/range_y values with their has_* flags, and
+  // thread_index -- confirm against the definition.
+  State(size_t max_iters, bool has_x, int x, bool has_y, int y, int thread_i);
+
+  // Returns true iff the benchmark should continue through another iteration.
+  // NOTE: A benchmark may not return from the test until KeepRunning() has
+  // returned false.
+  bool KeepRunning() {
+    // First call: start the timer before any iteration is counted.
+    if (BENCHMARK_BUILTIN_EXPECT(!started_, false)) {
+        ResumeTiming();
+        started_ = true;
+    }
+    // Post-increment: the comparison uses the count from before this call.
+    bool const res = total_iterations_++ < max_iterations;
+    if (BENCHMARK_BUILTIN_EXPECT(!res, false)) {
+        assert(started_);
+        PauseTiming();
+        // Total iterations now is one greater than max iterations. Fix this.
+        total_iterations_ = max_iterations;
+    }
+    return res;
+  }
+
+  // REQUIRES: timer is running
+  // Stop the benchmark timer.  If not called, the timer will be
+  // automatically stopped after KeepRunning() returns false for the first time.
+  //
+  // For threaded benchmarks the PauseTiming() function acts
+  // like a barrier.  I.e., the ith call by a particular thread to this
+  // function will block until all threads have made their ith call.
+  // The timer will stop when the last thread has called this function.
+  //
+  // NOTE: PauseTiming()/ResumeTiming() are relatively
+  // heavyweight, and so their use should generally be avoided
+  // within each benchmark iteration, if possible.
+  void PauseTiming();
+
+  // REQUIRES: timer is not running
+  // Start the benchmark timer.  The timer is NOT running on entrance to the
+  // benchmark function. It begins running after the first call to KeepRunning()
+  //
+  // For threaded benchmarks the ResumeTiming() function acts
+  // like a barrier.  I.e., the ith call by a particular thread to this
+  // function will block until all threads have made their ith call.
+  // The timer will start when the last thread has called this function.
+  //
+  // NOTE: PauseTiming()/ResumeTiming() are relatively
+  // heavyweight, and so their use should generally be avoided
+  // within each benchmark iteration, if possible.
+  void ResumeTiming();
+
+  // Set the number of bytes processed by the current benchmark
+  // execution.  This routine is typically called once at the end of a
+  // throughput oriented benchmark.  If this routine is called with a
+  // value > 0, the report is printed in MB/sec instead of nanoseconds
+  // per iteration.
+  //
+  // REQUIRES: a benchmark has exited its KeepRunning loop.
+  BENCHMARK_ALWAYS_INLINE
+  void SetBytesProcessed(size_t bytes) {
+    bytes_processed_ = bytes;
+  }
+
+  // Returns the value last stored by SetBytesProcessed().
+  BENCHMARK_ALWAYS_INLINE
+  size_t bytes_processed() const {
+    return bytes_processed_;
+  }
+
+  // If this routine is called with items > 0, then an items/s
+  // label is printed on the benchmark report line for the currently
+  // executing benchmark. It is typically called at the end of a processing
+  // benchmark where a processing items/second output is desired.
+  //
+  // REQUIRES: a benchmark has exited its KeepRunning loop.
+  BENCHMARK_ALWAYS_INLINE
+  void SetItemsProcessed(size_t items) {
+    items_processed_ = items;
+  }
+
+  // Returns the value last stored by SetItemsProcessed().
+  BENCHMARK_ALWAYS_INLINE
+  size_t items_processed() const {
+    return items_processed_;
+  }
+
+  // If this routine is called, the specified label is printed at the
+  // end of the benchmark report line for the currently executing
+  // benchmark.  Example:
+  //  static void BM_Compress(benchmark::State& state) {
+  //    ...
+  //    double compression = input_size / output_size;
+  //    state.SetLabel(StringPrintf("compress:%.1f%%", 100.0*compression));
+  //  }
+  // Produces output that looks like:
+  //  BM_Compress   50         50   14115038  compress:27.3%
+  //
+  // REQUIRES: a benchmark has exited its KeepRunning loop.
+  void SetLabel(const char* label);
+
+  // Allow the use of std::string without actually including <string>.
+  // This function does not participate in overload resolution unless StringType
+  // has the nested typename `basic_string`. This typename should be provided
+  // as an injected class name in the case of std::string.
+  template <class StringType>
+  void SetLabel(StringType const & str,
+                typename internal::EnableIfString<StringType>::type = 1) {
+    // Forward to the const char* overload.
+    this->SetLabel(str.c_str());
+  }
+
+  // Range arguments for this run. Each accessor asserts that the
+  // corresponding argument has been set.
+  BENCHMARK_ALWAYS_INLINE
+  int range_x() const {
+    assert(has_range_x_);
+    ((void)has_range_x_); // Prevent unused warning.
+    return range_x_;
+  }
+
+  BENCHMARK_ALWAYS_INLINE
+  int range_y() const {
+    assert(has_range_y_);
+    ((void)has_range_y_); // Prevent unused warning.
+    return range_y_;
+  }
+
+  // Number of KeepRunning() iterations counted so far; KeepRunning() clamps
+  // it to max_iterations once the loop has finished.
+  BENCHMARK_ALWAYS_INLINE
+  size_t iterations() const { return total_iterations_; }
+
+private:
+  // Set by the first KeepRunning() call, which also starts the timer.
+  bool started_;
+  // Incremented by every KeepRunning() call.
+  size_t total_iterations_;
+
+  // Whether range_x_ holds a value, and the value returned by range_x().
+  bool has_range_x_;
+  int range_x_;
+
+  // Whether range_y_ holds a value, and the value returned by range_y().
+  bool has_range_y_;
+  int range_y_;
+
+  // Values stored by SetBytesProcessed() / SetItemsProcessed().
+  size_t bytes_processed_;
+  size_t items_processed_;
+
+public:
+  // Index of the calling thread; the usage examples compare it against 0 to
+  // guard global setup and teardown.
+  const int thread_index;
+  // Iteration limit that KeepRunning() compares against.
+  const size_t max_iterations;
+
+private:
+  BENCHMARK_DISALLOW_COPY_AND_ASSIGN(State);
+};
+
+namespace internal {
+
+typedef void(Function)(State&);
+
+// ------------------------------------------------------
+// Benchmark registration object.  The BENCHMARK() macro expands
+// into an internal::Benchmark* object.  Various methods can
+// be called on this object to change the properties of the benchmark.
+// Each method returns "this" so that multiple method calls can
+// be chained into one expression.
+class Benchmark {
+public:
+  virtual ~Benchmark();
+
+  // Note: the following methods all return "this" so that multiple
+  // method calls can be chained together in one expression.
+
+  // Run this benchmark once with "x" as the extra argument passed
+  // to the function.
+  // REQUIRES: The function passed to the constructor must accept an arg1.
+  Benchmark* Arg(int x);
+
+  // Run this benchmark once for a number of values picked from the
+  // range [start..limit].  (start and limit are always picked.)
+  // REQUIRES: The function passed to the constructor must accept an arg1.
+  Benchmark* Range(int start, int limit);
+
+  // Run this benchmark once for every value in the range [start..limit]
+  // REQUIRES: The function passed to the constructor must accept an arg1.
+  Benchmark* DenseRange(int start, int limit);
+
+  // Run this benchmark once with "x,y" as the extra arguments passed
+  // to the function.
+  // REQUIRES: The function passed to the constructor must accept arg1,arg2.
+  Benchmark* ArgPair(int x, int y);
+
+  // Pick a set of values A from the range [lo1..hi1] and a set
+  // of values B from the range [lo2..hi2].  Run the benchmark for
+  // every pair of values in the cartesian product of A and B
+  // (i.e., for all combinations of the values in A and B).
+  // REQUIRES: The function passed to the constructor must accept arg1,arg2.
+  Benchmark* RangePair(int lo1, int hi1, int lo2, int hi2);
+
+  // Pass this benchmark object to *func, which can customize
+  // the benchmark by calling various methods like Arg, ArgPair,
+  // Threads, etc.
+  Benchmark* Apply(void (*func)(Benchmark* benchmark));
+
+  // Set the minimum amount of time to use when running this benchmark. This
+  // option overrides the `benchmark_min_time` flag.
+  Benchmark* MinTime(double t);
+
+  // If a particular benchmark is I/O bound, runs multiple threads internally or
+  // if for some reason CPU timings are not representative, call this method. If
+  // called, the elapsed time will be used to control how many iterations are
+  // run, and in the printing of items/second or MB/seconds values.  If not
+  // called, the cpu time used by the benchmark will be used.
+  Benchmark* UseRealTime();
+
+  // Support for running multiple copies of the same benchmark concurrently
+  // in multiple threads.  This may be useful when measuring the scaling
+  // of some piece of code.
+
+  // Run one instance of this benchmark concurrently in t threads.
+  Benchmark* Threads(int t);
+
+  // Pick a set of values T from [min_threads,max_threads].
+  // min_threads and max_threads are always included in T.  Run this
+  // benchmark once for each value in T.  The benchmark run for a
+  // particular value t consists of t threads running the benchmark
+  // function concurrently.  For example, consider:
+  //    BENCHMARK(Foo)->ThreadRange(1,16);
+  // This will run the following benchmarks:
+  //    Foo in 1 thread
+  //    Foo in 2 threads
+  //    Foo in 4 threads
+  //    Foo in 8 threads
+  //    Foo in 16 threads
+  Benchmark* ThreadRange(int min_threads, int max_threads);
+
+  // Equivalent to ThreadRange(NumCPUs(), NumCPUs())
+  Benchmark* ThreadPerCpu();
+
+  // Pure virtual: each concrete subclass supplies the code that is executed
+  // with the given State.
+  virtual void Run(State& state) = 0;
+
+  // Used inside the benchmark implementation
+  struct Instance;
+
+protected:
+  // The constructors and SetName are protected: they are meant to be used
+  // by subclasses rather than by client code.
+  explicit Benchmark(const char* name);
+  Benchmark(Benchmark const&);
+  void SetName(const char* name);
+
+private:
+  friend class BenchmarkFamilies;
+  // Pointer to the implementation object; BenchmarkImp is declared outside
+  // this class.
+  BenchmarkImp* imp_;
+
+  // Copy assignment is private and therefore unavailable to client code.
+  Benchmark& operator=(Benchmark const&);
+};
+
+// The class used to hold all Benchmarks created from static functions
+// (i.e. those created using the BENCHMARK(...) macros).
+class FunctionBenchmark : public Benchmark {
+public:
+    // Stores `func` and forwards `name` to the Benchmark base class.
+    FunctionBenchmark(const char* name, Function* func)
+        : Benchmark(name), func_(func)
+    {}
+
+    // Overrides Benchmark::Run; the definition lives outside this header.
+    virtual void Run(State& st);
+private:
+    // The function pointer handed to the constructor.
+    Function* func_;
+};
+
+}  // end namespace internal
+
+// Base class from which every fixture-based benchmark derives.
+class Fixture: public internal::Benchmark {
+public:
+    // Fixtures start out with an empty name.
+    Fixture() : internal::Benchmark("") {}
+
+    // Brackets a single call to BenchmarkCase() with SetUp() and TearDown().
+    virtual void Run(State& st) {
+      SetUp();
+      BenchmarkCase(st);
+      TearDown();
+    }
+
+    // Hooks that subclasses may override; both do nothing by default.
+    virtual void SetUp() {}
+    virtual void TearDown() {}
+
+protected:
+    // The benchmark body; every concrete fixture has to implement it.
+    virtual void BenchmarkCase(State&) = 0;
+};
+
+}  // end namespace benchmark
+
+
+// ------------------------------------------------------
+// Macro to register benchmarks
+
+// Check that __COUNTER__ is defined and that __COUNTER__ increases by 1
+// every time it is expanded. X + 1 == X + 0 is used in case X is defined to be
+// empty. If X is empty the expression becomes (+1 == +0).
+#if defined(__COUNTER__) && (__COUNTER__ + 1 == __COUNTER__ + 0)
+#define BENCHMARK_PRIVATE_UNIQUE_ID __COUNTER__
+#else
+#define BENCHMARK_PRIVATE_UNIQUE_ID __LINE__
+#endif
+
+// Helpers for generating unique variable names
+#define BENCHMARK_PRIVATE_NAME(n) \
+    BENCHMARK_PRIVATE_CONCAT(_benchmark_, BENCHMARK_PRIVATE_UNIQUE_ID, n)
+#define BENCHMARK_PRIVATE_CONCAT(a, b, c) BENCHMARK_PRIVATE_CONCAT2(a, b, c)
+#define BENCHMARK_PRIVATE_CONCAT2(a, b, c) a##b##c
+
+#define BENCHMARK_PRIVATE_DECLARE(n)       \
+  static ::benchmark::internal::Benchmark* \
+  BENCHMARK_PRIVATE_NAME(n) BENCHMARK_UNUSED
+
+#define BENCHMARK(n) \
+    BENCHMARK_PRIVATE_DECLARE(n) =                               \
+        (::benchmark::internal::RegisterBenchmarkInternal(       \
+            new ::benchmark::internal::FunctionBenchmark(#n, n)))
+
+// Old-style macros
+#define BENCHMARK_WITH_ARG(n, a) BENCHMARK(n)->Arg((a))
+#define BENCHMARK_WITH_ARG2(n, a1, a2) BENCHMARK(n)->ArgPair((a1), (a2))
+#define BENCHMARK_RANGE(n, lo, hi) BENCHMARK(n)->Range((lo), (hi))
+#define BENCHMARK_RANGE2(n, l1, h1, l2, h2) \
+  BENCHMARK(n)->RangePair((l1), (h1), (l2), (h2))
+
+// This will register a benchmark for a templatized function.  For example:
+//
+// template<int arg>
+// void BM_Foo(benchmark::State& state);
+//
+// BENCHMARK_TEMPLATE(BM_Foo, 1);
+//
+// will register BM_Foo<1> as a benchmark.
+#define BENCHMARK_TEMPLATE1(n, a) \
+  BENCHMARK_PRIVATE_DECLARE(n) =  \
+      (::benchmark::internal::RegisterBenchmarkInternal( \
+        new ::benchmark::internal::FunctionBenchmark(#n "<" #a ">", n<a>)))
+
+#define BENCHMARK_TEMPLATE2(n, a, b)                     \
+  BENCHMARK_PRIVATE_DECLARE(n) =                         \
+      (::benchmark::internal::RegisterBenchmarkInternal( \
+        new ::benchmark::internal::FunctionBenchmark(    \
+            #n "<" #a "," #b ">", n<a, b>)))
+
+#if __cplusplus >= 201103L
+#define BENCHMARK_TEMPLATE(n, ...)           \
+  BENCHMARK_PRIVATE_DECLARE(n) =             \
+      (::benchmark::internal::RegisterBenchmarkInternal( \
+        new ::benchmark::internal::FunctionBenchmark( \
+        #n "<" #__VA_ARGS__ ">", n<__VA_ARGS__>)))
+#else
+#define BENCHMARK_TEMPLATE(n, a) BENCHMARK_TEMPLATE1(n, a)
+#endif
+
+
+#define BENCHMARK_PRIVATE_DECLARE_F(BaseClass, Method)      \
+class BaseClass##_##Method##_Benchmark : public BaseClass { \
+public:                                                     \
+    BaseClass##_##Method##_Benchmark() : BaseClass() {      \
+        this->SetName(#BaseClass "/" #Method);}             \
+protected:                                                  \
+    virtual void BenchmarkCase(::benchmark::State&);        \
+};
+
+#define BENCHMARK_DEFINE_F(BaseClass, Method) \
+    BENCHMARK_PRIVATE_DECLARE_F(BaseClass, Method) \
+    void BaseClass##_##Method##_Benchmark::BenchmarkCase
+
+#define BENCHMARK_REGISTER_F(BaseClass, Method) \
+    BENCHMARK_PRIVATE_REGISTER_F(BaseClass##_##Method##_Benchmark)
+
+#define BENCHMARK_PRIVATE_REGISTER_F(TestName) \
+    BENCHMARK_PRIVATE_DECLARE(TestName) = \
+        (::benchmark::internal::RegisterBenchmarkInternal(new TestName()))
+
+// This macro will define and register a benchmark within a fixture class.
+#define BENCHMARK_F(BaseClass, Method) \
+    BENCHMARK_PRIVATE_DECLARE_F(BaseClass, Method) \
+    BENCHMARK_REGISTER_F(BaseClass, Method); \
+    void BaseClass##_##Method##_Benchmark::BenchmarkCase
+
+
+// Helper macro to create a main routine in a test that runs the benchmarks
+#define BENCHMARK_MAIN()                   \
+  int main(int argc, char** argv) {        \
+    ::benchmark::Initialize(&argc, argv);  \
+    ::benchmark::RunSpecifiedBenchmarks(); \
+  }
+
+#endif  // BENCHMARK_BENCHMARK_API_H_
--- a/3rdparty/benchmark/include/benchmark/macros.h
+++ b/3rdparty/benchmark/include/benchmark/macros.h
@ -0,0 +1,48 @@
+// Copyright 2015 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#ifndef BENCHMARK_MACROS_H_
+#define BENCHMARK_MACROS_H_
+
+#if __cplusplus < 201103L
+# define BENCHMARK_DISALLOW_COPY_AND_ASSIGN(TypeName)  \
+    TypeName(const TypeName&);                         \
+    TypeName& operator=(const TypeName&)
+#else
+# define BENCHMARK_DISALLOW_COPY_AND_ASSIGN(TypeName)  \
+    TypeName(const TypeName&) = delete;                \
+    TypeName& operator=(const TypeName&) = delete
+#endif
+
+// Compiler-specific spellings for the "unused", "always inline" and
+// "noexcept" annotations used throughout the library.
+#if defined(__GNUC__)
+# define BENCHMARK_UNUSED __attribute__((unused))
+# define BENCHMARK_ALWAYS_INLINE __attribute__((always_inline))
+// `noexcept` only became a keyword in C++11. For older dialects expand to
+// nothing, exactly like the non-GNU branches below, so that a C++03
+// translation unit using the macro still compiles.
+# if __cplusplus >= 201103L
+#  define BENCHMARK_NOEXCEPT noexcept
+# else
+#  define BENCHMARK_NOEXCEPT
+# endif
+#elif defined(_MSC_VER) && !defined(__clang__)
+# define BENCHMARK_UNUSED
+# define BENCHMARK_ALWAYS_INLINE __forceinline
+# define BENCHMARK_NOEXCEPT
+# define __func__ __FUNCTION__
+#else
+# define BENCHMARK_UNUSED
+# define BENCHMARK_ALWAYS_INLINE
+# define BENCHMARK_NOEXCEPT
+#endif
+
+#if defined(__GNUC__)
+# define BENCHMARK_BUILTIN_EXPECT(x, y) __builtin_expect(x, y)
+#else
+# define BENCHMARK_BUILTIN_EXPECT(x, y) x
+#endif
+
+#endif  // BENCHMARK_MACROS_H_
--- a/3rdparty/benchmark/include/benchmark/reporter.h
+++ b/3rdparty/benchmark/include/benchmark/reporter.h
@ -0,0 +1,122 @@
+// Copyright 2015 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#ifndef BENCHMARK_REPORTER_H_
+#define BENCHMARK_REPORTER_H_
+
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "benchmark_api.h" // For forward declaration of BenchmarkReporter
+
+namespace benchmark {
+
+// Interface for custom benchmark result printers.
+// By default, benchmark reports are printed to stdout. However an application
+// can control the destination of the reports by calling
+// RunSpecifiedBenchmarks and passing it a custom reporter object.
+// The reporter object must implement the following interface.
+class BenchmarkReporter {
+ public:
+  struct Context {
+    int num_cpus;
+    double mhz_per_cpu;
+    bool cpu_scaling_enabled;
+
+    // The number of chars in the longest benchmark name.
+    size_t name_field_width;
+  };
+
+  struct Run {
+    Run() :
+      iterations(1),
+      real_accumulated_time(0),
+      cpu_accumulated_time(0),
+      bytes_per_second(0),
+      items_per_second(0),
+      max_heapbytes_used(0) {}
+
+    std::string benchmark_name;
+    std::string report_label;  // Empty if not set by benchmark.
+    int64_t iterations;
+    double real_accumulated_time;
+    double cpu_accumulated_time;
+
+    // Zero if not set by benchmark.
+    double bytes_per_second;
+    double items_per_second;
+
+    // This is set to 0.0 if memory tracing is not enabled.
+    double max_heapbytes_used;
+  };
+
+  // Called once for every suite of benchmarks run.
+  // The parameter "context" contains information that the
+  // reporter may wish to use when generating its report, for example the
+  // platform under which the benchmarks are running. The benchmark run is
+  // never started if this function returns false, allowing the reporter
+  // to skip runs based on the context information.
+  virtual bool ReportContext(const Context& context) = 0;
+
+  // Called once for each group of benchmark runs, gives information about
+  // cpu-time and heap memory usage during the benchmark run.
+  // Note that all the grouped benchmark runs should refer to the same
+  // benchmark, thus have the same name.
+  virtual void ReportRuns(const std::vector<Run>& report) = 0;
+
+  // Called once and only once after every group of benchmarks has been run
+  // and reported.
+  virtual void Finalize();
+
+  virtual ~BenchmarkReporter();
+protected:
+    static void ComputeStats(std::vector<Run> const& reports, Run* mean, Run* stddev);
+};
+
+// Simple reporter that outputs benchmark data to the console. This is the
+// default reporter used by RunSpecifiedBenchmarks().
+class ConsoleReporter : public BenchmarkReporter {
+ public:
+  virtual bool ReportContext(const Context& context);
+  virtual void ReportRuns(const std::vector<Run>& reports);
+protected:
+  virtual void PrintRunData(const Run& report);
+
+  size_t name_field_width_;
+};
+
+// BenchmarkReporter implementation that overrides ReportContext, ReportRuns
+// and Finalize. NOTE(review): presumably emits the results as JSON, as the
+// name suggests -- confirm against the implementation file.
+class JSONReporter : public BenchmarkReporter {
+public:
+  JSONReporter() : first_report_(true) {}
+  virtual bool ReportContext(const Context& context);
+  virtual void ReportRuns(const std::vector<Run>& reports);
+  virtual void Finalize();
+
+private:
+  void PrintRunData(const Run& report);
+
+  // Starts out true; how it is updated is decided by the implementation.
+  bool first_report_;
+};
+
+// BenchmarkReporter implementation that overrides ReportContext and
+// ReportRuns and keeps the inherited Finalize. NOTE(review): presumably
+// emits the results as CSV, as the name suggests -- confirm against the
+// implementation file.
+class CSVReporter : public BenchmarkReporter {
+public:
+  virtual bool ReportContext(const Context& context);
+  virtual void ReportRuns(const std::vector<Run>& reports);
+
+private:
+  void PrintRunData(const Run& report);
+};
+
+} // end namespace benchmark
+#endif // BENCHMARK_REPORTER_H_
--- a/3rdparty/benchmark/mingw.py
+++ b/3rdparty/benchmark/mingw.py
@ -0,0 +1,320 @@
+#! /usr/bin/env python
+# encoding: utf-8
+
+import argparse
+import errno
+import logging
+import os
+import platform
+import re
+import sys
+import subprocess
+import tempfile
+
+try:
+    import winreg
+except ImportError:
+    import _winreg as winreg
+try:
+    import urllib.request as request
+except ImportError:
+    import urllib as request
+try:
+    import urllib.parse as parse
+except ImportError:
+    import urlparse as parse
+
+class EmptyLogger(object):
+    '''
+    Provides an implementation that performs no logging
+    '''
+    def debug(self, *k, **kw):
+        pass
+    def info(self, *k, **kw):
+        pass
+    def warn(self, *k, **kw):
+        pass
+    def error(self, *k, **kw):
+        pass
+    def critical(self, *k, **kw):
+        pass
+    def setLevel(self, *k, **kw):
+        pass
+
+urls = (
+    'http://downloads.sourceforge.net/project/mingw-w64/Toolchains%20'
+        'targetting%20Win32/Personal%20Builds/mingw-builds/installer/'
+        'repository.txt',
+    'http://downloads.sourceforge.net/project/mingwbuilds/host-windows/'
+        'repository.txt'
+)
+'''
+A list of mingw-build repositories
+'''
+
+def repository(urls = urls, log = EmptyLogger()):
+    '''
+    Downloads and parse mingw-build repository files and parses them
+    '''
+    log.info('getting mingw-builds repository')
+    versions = {}
+    re_sourceforge = re.compile(r'http://sourceforge.net/projects/([^/]+)/files')
+    re_sub = r'http://downloads.sourceforge.net/project/\1'
+    for url in urls:
+        log.debug(' - requesting: %s', url)
+        socket = request.urlopen(url)
+        repo = socket.read()
+        if not isinstance(repo, str):
+            repo = repo.decode();
+        socket.close()
+        for entry in repo.split('\n')[:-1]:
+            value = entry.split('|')
+            version = tuple([int(n) for n in value[0].strip().split('.')])
+            version = versions.setdefault(version, {})
+            arch = value[1].strip()
+            if arch == 'x32':
+                arch = 'i686'
+            elif arch == 'x64':
+                arch = 'x86_64'
+            arch = version.setdefault(arch, {})
+            threading = arch.setdefault(value[2].strip(), {})
+            exceptions = threading.setdefault(value[3].strip(), {})
+            revision = exceptions.setdefault(int(value[4].strip()[3:]),
+                re_sourceforge.sub(re_sub, value[5].strip()))
+    return versions
+
+def find_in_path(file, path=None):
+    '''
+    Attempts to find an executable in the path
+    '''
+    if platform.system() == 'Windows':
+        file += '.exe'
+    if path is None:
+        path = os.environ.get('PATH', '')
+    if type(path) is type(''):
+        path = path.split(os.pathsep)
+    return list(filter(os.path.exists,
+        map(lambda dir, file=file: os.path.join(dir, file), path)))
+
+def find_7zip(log = EmptyLogger()):
+    '''
+    Attempts to find 7zip for unpacking the mingw-build archives
+    '''
+    log.info('finding 7zip')
+    path = find_in_path('7z')
+    if not path:
+        key = winreg.OpenKey(winreg.HKEY_LOCAL_MACHINE, r'SOFTWARE\7-Zip')
+        path, _ = winreg.QueryValueEx(key, 'Path')
+        path = [os.path.join(path, '7z.exe')]
+    log.debug('found \'%s\'', path[0])
+    return path[0]
+
+find_7zip()
+
+def unpack(archive, location, log = EmptyLogger()):
+    '''
+    Unpacks a mingw-builds archive
+    '''
+    sevenzip = find_7zip(log)
+    log.info('unpacking %s', os.path.basename(archive))
+    cmd = [sevenzip, 'x', archive, '-o' + location, '-y']
+    log.debug(' - %r', cmd)
+    with open(os.devnull, 'w') as devnull:
+        subprocess.check_call(cmd, stdout = devnull)
+
+def download(url, location, log = EmptyLogger()):
+    '''
+    Downloads and unpacks a mingw-builds archive
+    '''
+    log.info('downloading MinGW')
+    log.debug(' - url: %s', url)
+    log.debug(' - location: %s', location)
+
+    re_content = re.compile(r'attachment;[ \t]*filename=(")?([^"]*)(")?[\r\n]*')
+
+    stream = request.urlopen(url)
+    try:
+        content = stream.getheader('Content-Disposition') or ''
+    except AttributeError:
+        content = stream.headers.getheader('Content-Disposition') or ''
+    matches = re_content.match(content)
+    if matches:
+        filename = matches.group(2)
+    else:
+        parsed = parse.urlparse(stream.geturl())
+        filename = os.path.basename(parsed.path)
+
+    try:
+        os.makedirs(location)
+    except OSError as e:
+        if e.errno == errno.EEXIST and os.path.isdir(location):
+            pass
+        else:
+            raise
+
+    archive = os.path.join(location, filename)
+    with open(archive, 'wb') as out:
+        while True:
+            buf = stream.read(1024)
+            if not buf:
+                break
+            out.write(buf)
+    unpack(archive, location, log = log)
+    os.remove(archive)
+
+    possible = os.path.join(location, 'mingw64')
+    if not os.path.exists(possible):
+        possible = os.path.join(location, 'mingw32')
+        if not os.path.exists(possible):
+            raise ValueError('Failed to find unpacked MinGW: ' + possible)
+    return possible
+
+def root(location = None, arch = None, version = None, threading = None,
+        exceptions = None, revision = None, log = EmptyLogger()):
+    '''
+    Returns the root folder of a specific version of the mingw-builds variant
+    of gcc. Will download the compiler if needed
+    '''
+
+    # Get the repository if we don't have all the information
+    if not (arch and version and threading and exceptions and revision):
+        versions = repository(log = log)
+
+    # Determine some defaults
+    version = version or max(versions.keys())
+    if not arch:
+        arch = platform.machine().lower()
+        if arch == 'x86':
+            arch = 'i686'
+        elif arch == 'amd64':
+            arch = 'x86_64'
+    if not threading:
+        keys = versions[version][arch].keys()
+        if 'posix' in keys:
+            threading = 'posix'
+        elif 'win32' in keys:
+            threading = 'win32'
+        else:
+            threading = keys[0]
+    if not exceptions:
+        keys = versions[version][arch][threading].keys()
+        if 'seh' in keys:
+            exceptions = 'seh'
+        elif 'sjlj' in keys:
+            exceptions = 'sjlj'
+        else:
+            exceptions = keys[0]
+    if revision == None:
+        revision = max(versions[version][arch][threading][exceptions].keys())
+    if not location:
+        location = os.path.join(tempfile.gettempdir(), 'mingw-builds')
+
+    # Get the download url
+    url = versions[version][arch][threading][exceptions][revision]
+
+    # Tell the user whatzzup
+    log.info('finding MinGW %s', '.'.join(str(v) for v in version))
+    log.debug(' - arch: %s', arch)
+    log.debug(' - threading: %s', threading)
+    log.debug(' - exceptions: %s', exceptions)
+    log.debug(' - revision: %s', revision)
+    log.debug(' - url: %s', url)
+
+    # Store each specific revision differently
+    slug = '{version}-{arch}-{threading}-{exceptions}-rev{revision}'
+    slug = slug.format(
+        version = '.'.join(str(v) for v in version),
+        arch = arch,
+        threading = threading,
+        exceptions = exceptions,
+        revision = revision
+    )
+    if arch == 'x86_64':
+        root_dir = os.path.join(location, slug, 'mingw64')
+    elif arch == 'i686':
+        root_dir = os.path.join(location, slug, 'mingw32')
+    else:
+        raise ValueError('Unknown MinGW arch: ' + arch)
+
+    # Download if needed
+    if not os.path.exists(root_dir):
+        downloaded = download(url, os.path.join(location, slug), log = log)
+        if downloaded != root_dir:
+            raise ValueError('The location of mingw did not match\n%s\n%s'
+                % (downloaded, root_dir))
+
+    return root_dir
+
+def str2ver(string):
+    '''
+    Converts a version string into a tuple
+    '''
+    try:
+        version = tuple(int(v) for v in string.split('.'))
+        if len(version) is not 3:
+            raise ValueError()
+    except ValueError:
+        raise argparse.ArgumentTypeError(
+            'please provide a three digit version string')
+    return version
+
+def main():
+    '''
+    Invoked when the script is run directly by the python interpreter
+    '''
+    parser = argparse.ArgumentParser(
+        description = 'Downloads a specific version of MinGW',
+        formatter_class = argparse.ArgumentDefaultsHelpFormatter
+    )
+    parser.add_argument('--location',
+        help = 'the location to download the compiler to',
+        default = os.path.join(tempfile.gettempdir(), 'mingw-builds'))
+    parser.add_argument('--arch', required = True, choices = ['i686', 'x86_64'],
+        help = 'the target MinGW architecture string')
+    parser.add_argument('--version', type = str2ver,
+        help = 'the version of GCC to download')
+    parser.add_argument('--threading', choices = ['posix', 'win32'],
+        help = 'the threading type of the compiler')
+    parser.add_argument('--exceptions', choices = ['sjlj', 'seh', 'dwarf'],
+        help = 'the method to throw exceptions')
+    parser.add_argument('--revision', type=int,
+        help = 'the revision of the MinGW release')
+    group = parser.add_mutually_exclusive_group()
+    group.add_argument('-v', '--verbose', action='store_true',
+        help='increase the script output verbosity')
+    group.add_argument('-q', '--quiet', action='store_true',
+        help='only print errors and warning')
+    args = parser.parse_args()
+
+    # Create the logger
+    logger = logging.getLogger('mingw')
+    handler = logging.StreamHandler()
+    formatter = logging.Formatter('%(message)s')
+    handler.setFormatter(formatter)
+    logger.addHandler(handler)
+    logger.setLevel(logging.INFO)
+    if args.quiet:
+        logger.setLevel(logging.WARN)
+    if args.verbose:
+        logger.setLevel(logging.DEBUG)
+
+    # Get MinGW
+    root_dir = root(location = args.location, arch = args.arch,
+        version = args.version, threading = args.threading,
+        exceptions = args.exceptions, revision = args.revision,
+        log = logger)
+
+    sys.stdout.write('%s\n' % os.path.join(root_dir, 'bin'))
+
+if __name__ == '__main__':
+    try:
+        main()
+    except IOError as e:
+        sys.stderr.write('IO error: %s\n' % e)
+        sys.exit(1)
+    except OSError as e:
+        sys.stderr.write('OS error: %s\n' % e)
+        sys.exit(1)
+    except KeyboardInterrupt as e:
+        sys.stderr.write('Killed\n')
+        sys.exit(1)
--- a/3rdparty/benchmark/src/CMakeLists.txt
+++ b/3rdparty/benchmark/src/CMakeLists.txt
@ -0,0 +1,51 @@
+# Allow the source files to find headers in src/
+include_directories(${PROJECT_SOURCE_DIR}/src)
+
+# Define the source files
+set(SOURCE_FILES "benchmark.cc" "colorprint.cc" "commandlineflags.cc"
+                 "console_reporter.cc" "csv_reporter.cc" "json_reporter.cc"
+                 "log.cc" "reporter.cc" "sleep.cc" "string_util.cc"
+                 "sysinfo.cc" "walltime.cc")
+# Determine the correct regular expression engine to use
+if(HAVE_STD_REGEX)
+  set(RE_FILES "re_std.cc")
+elseif(HAVE_GNU_POSIX_REGEX)
+  set(RE_FILES "re_posix.cc")
+elseif(HAVE_POSIX_REGEX)
+  set(RE_FILES "re_posix.cc")
+else()
+  message(FATAL_ERROR "Failed to determine the source files for the regular expression backend")
+endif()
+
+add_library(benchmark ${SOURCE_FILES} ${RE_FILES})
+
+
+set_target_properties(benchmark PROPERTIES
+  OUTPUT_NAME "benchmark"
+  VERSION ${GENERIC_LIB_VERSION}
+  SOVERSION ${GENERIC_LIB_SOVERSION}
+)
+
+# Link threads.
+target_link_libraries(benchmark ${CMAKE_THREAD_LIBS_INIT})
+
+# We need extra libraries on Windows
+if(${CMAKE_SYSTEM_NAME} MATCHES "Windows")
+  target_link_libraries(benchmark Shlwapi)
+endif()
+
+# Expose public API
+target_include_directories(benchmark PUBLIC ${PROJECT_SOURCE_DIR}/include)
+
+# Install target (will install the library to specified CMAKE_INSTALL_PREFIX variable)
+install(
+  TARGETS benchmark
+  ARCHIVE DESTINATION lib
+  LIBRARY DESTINATION lib
+  RUNTIME DESTINATION bin
+  COMPONENT library)
+
+install(
+  DIRECTORY "${PROJECT_SOURCE_DIR}/include/benchmark"
+  DESTINATION include
+  FILES_MATCHING PATTERN "*.*h")
--- a/3rdparty/benchmark/src/arraysize.h
+++ b/3rdparty/benchmark/src/arraysize.h
@ -0,0 +1,34 @@
+#ifndef BENCHMARK_ARRAYSIZE_H_
+#define BENCHMARK_ARRAYSIZE_H_
+
+#include "internal_macros.h"
+
+namespace benchmark {
+namespace internal {
+// The arraysize(arr) macro returns the # of elements in an array arr.
+// The expression is a compile-time constant, and therefore can be
+// used in defining new arrays, for example.  If you use arraysize on
+// a pointer by mistake, you will get a compile-time error.
+//
+
+
+// This template function declaration is used in defining arraysize.
+// Note that the function doesn't need an implementation, as we only
+// use its type.
+template <typename T, size_t N>
+char (&ArraySizeHelper(T (&array)[N]))[N];
+
+// That gcc wants both of these prototypes seems mysterious. VC, for
+// its part, can't decide which to use (another mystery). Matching of
+// template overloads: the final frontier.
+#ifndef COMPILER_MSVC
+template <typename T, size_t N>
+char (&ArraySizeHelper(const T (&array)[N]))[N];
+#endif
+
+#define arraysize(array) (sizeof(::benchmark::internal::ArraySizeHelper(array)))
+
+} // end namespace internal
+} // end namespace benchmark
+
+#endif // BENCHMARK_ARRAYSIZE_H_
--- a/3rdparty/benchmark/src/benchmark.cc
+++ b/3rdparty/benchmark/src/benchmark.cc
@ -0,0 +1,919 @@
+// Copyright 2015 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "benchmark/benchmark.h"
+#include "internal_macros.h"
+
+#ifndef BENCHMARK_OS_WINDOWS
+#include <sys/time.h>
+#include <sys/resource.h>
+#include <unistd.h>
+#endif
+
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <algorithm>
+#include <atomic>
+#include <condition_variable>
+#include <iostream>
+#include <limits>
+#include <memory>
+#include <thread>
+
+#include "check.h"
+#include "commandlineflags.h"
+#include "log.h"
+#include "mutex.h"
+#include "re.h"
+#include "stat.h"
+#include "string_util.h"
+#include "sysinfo.h"
+#include "walltime.h"
+
+// Command-line flags understood by the library.  Their values are filled in
+// by internal::ParseCommandLineFlags() further down in this file.
+
+// When set, RunSpecifiedBenchmarks() only prints the benchmark names.
+DEFINE_bool(benchmark_list_tests, false,
+            "Print a list of benchmarks. This option overrides all other "
+            "options.");
+
+// NOTE(review): the help text says an empty filter runs nothing, but
+// RunSpecifiedBenchmarks() in this file rewrites both an empty filter and
+// "all" to ".", i.e. it runs everything - the two should be reconciled.
+DEFINE_string(benchmark_filter, ".",
+              "A regular expression that specifies the set of benchmarks "
+              "to execute.  If this flag is empty, no benchmarks are run.  "
+              "If this flag is the string \"all\", all benchmarks linked "
+              "into the process are run.");
+
+// Used by RunBenchmark() whenever a benchmark did not set its own MinTime().
+DEFINE_double(benchmark_min_time, 0.5,
+              "Minimum number of seconds we should run benchmark before "
+              "results are considered significant.  For cpu-time based "
+              "tests, this is the lower bound on the total cpu time "
+              "used by all threads that make up the test.  For real-time "
+              "based tests, this is the lower bound on the elapsed time "
+              "of the benchmark execution, regardless of number of "
+              "threads.");
+
+DEFINE_int32(benchmark_repetitions, 1,
+             "The number of runs of each benchmark. If greater than 1, the "
+             "mean and standard deviation of the runs will be reported.");
+
+// Validated at the end of ParseCommandLineFlags(); any other value prints the
+// usage text and exits.
+DEFINE_string(benchmark_format, "tabular",
+              "The format to use for console output. Valid values are "
+              "'tabular', 'json', or 'csv'.");
+
+// Read by ColorPrintf() in colorprint.cc.
+DEFINE_bool(color_print, true, "Enables colorized logging.");
+
+// Passed to internal::SetLogLevel() by Initialize().
+DEFINE_int32(v, 0, "The level of verbose logging to output");
+
+
+namespace benchmark {
+
+namespace internal {
+
+// Deliberately empty: accepts a pointer and does nothing with it.
+// NOTE(review): presumably an out-of-line sink that the public header uses to
+// keep the compiler from optimizing a value away - confirm in benchmark.h.
+void UseCharPointer(char const volatile*) {}
+
+// NOTE: This is a dummy "mutex" type used to denote the actual mutex
+// returned by GetBenchmarkLock(). This is only used to placate the thread
+// safety warnings by giving the return of GetBenchmarkLock() a name.
+struct CAPABILITY("mutex") BenchmarkLockType {};
+BenchmarkLockType BenchmarkLockVar;
+
+} // end namespace internal
+
+// Returns the single process-wide mutex used in this file to guard the report
+// label and the per-run ThreadStats totals.  The mutex is a function-local
+// static, so it is constructed on the first call.  The RETURN_CAPABILITY
+// annotation names the returned lock as internal::BenchmarkLockVar for the
+// thread-safety annotations.
+inline Mutex& RETURN_CAPABILITY(::benchmark::internal::BenchmarkLockVar)
+GetBenchmarkLock()
+{
+  static Mutex lock;
+  return lock;
+}
+
+namespace {
+
+// Returns true when the magnitude of n is below the machine epsilon of double,
+// which this file uses as its "was this value left at 0.0?" test (for example
+// to detect a benchmark that never called MinTime()).
+//
+// std::fabs is used instead of std::abs so that the floating-point overload is
+// always the one selected, even on standard libraries where <cstdlib> alone
+// only declares the integer overloads of abs (which would truncate n first).
+bool IsZero(double n) {
+    return std::fabs(n) < std::numeric_limits<double>::epsilon();
+}
+
+// For non-dense Range, intermediate values are powers of kRangeMultiplier.
+static const int kRangeMultiplier = 8;
+// Upper bound on the iteration count RunBenchmark() will request for one run.
+static const size_t kMaxIterations = 1000000000;
+
+// Set to true by RunBenchmark() while a benchmark body is executing; the
+// State timing/label methods CHECK it before touching the timer manager.
+bool running_benchmark = false;
+
+// Global variable so that a benchmark can cause a little extra printing.
+// Returns a pointer to a function-local static string; every access in this
+// file is made while holding GetBenchmarkLock(), as the GUARDED_BY annotation
+// states.
+std::string* GetReportLabel() {
+    static std::string label GUARDED_BY(GetBenchmarkLock());
+    return &label;
+}
+
+// TODO(ericwf): support MallocCounter.
+//static benchmark::MallocCounter *benchmark_mc;
+
+// Totals gathered over all threads of one benchmark run.  RunInThread() adds
+// each thread's byte and item counters here; both start at zero.
+struct ThreadStats {
+    int64_t bytes_processed = 0;
+    int64_t items_processed = 0;
+
+    ThreadStats() = default;
+};
+
+// Timer management class.
+//
+// One TimerManager is created per benchmark run and shared by all of its
+// threads.  StartTimer() and StopTimer() double as barriers: every one of the
+// num_threads threads has to call them, the last thread to arrive performs the
+// actual start/stop and then wakes the others.  When all threads have called
+// Finalize() the Notification handed to the constructor is signalled.
+class TimerManager {
+ public:
+  // num_threads: number of threads taking part in the run.
+  // done: notified by Finalize() once every thread has finalized; not owned.
+  TimerManager(int num_threads, Notification* done)
+      : num_threads_(num_threads),
+        done_(done),
+        running_(false),
+        real_time_used_(0),
+        cpu_time_used_(0),
+        num_finalized_(0),
+        phase_number_(0),
+        entered_(0) {
+  }
+
+  // Called by each thread.  Blocks until all threads have arrived; the last
+  // one records the wall-clock and CPU start times.
+  void StartTimer() EXCLUDES(lock_) {
+    bool last_thread = false;
+    {
+      MutexLock ml(lock_);
+      last_thread = Barrier(ml);
+      if (last_thread) {
+        CHECK(!running_) << "Called StartTimer when timer is already running";
+        running_ = true;
+        start_real_time_ = walltime::Now();
+        start_cpu_time_ = MyCPUUsage() + ChildrenCPUUsage();
+       }
+     }
+     // Wake the waiting threads only after `ml` has gone out of scope.
+     if (last_thread) {
+       phase_condition_.notify_all();
+     }
+  }
+
+  // Called by each thread.  Blocks until all threads have arrived; the last
+  // one adds the elapsed slice to the accumulated times via InternalStop().
+  void StopTimer() EXCLUDES(lock_) {
+    bool last_thread = false;
+    {
+      MutexLock ml(lock_);
+      last_thread = Barrier(ml);
+      if (last_thread) {
+        CHECK(running_) << "Called StopTimer when timer is already stopped";
+        InternalStop();
+      }
+    }
+    // Wake the waiting threads only after `ml` has gone out of scope.
+    if (last_thread) {
+      phase_condition_.notify_all();
+    }
+  }
+
+  // Called by each thread when it has finished running the benchmark body.
+  // The call that brings the count up to num_threads_ notifies done_.
+  void Finalize() EXCLUDES(lock_) {
+    MutexLock l(lock_);
+    num_finalized_++;
+    if (num_finalized_ == num_threads_) {
+      CHECK(!running_) <<
+        "The timer should be stopped before the timer is finalized";
+      done_->Notify();
+    }
+  }
+
+  // Returns the accumulated wall-clock time.
+  // REQUIRES: timer is not running
+  double real_time_used() EXCLUDES(lock_) {
+    MutexLock l(lock_);
+    CHECK(!running_);
+    return real_time_used_;
+  }
+
+  // Returns the accumulated CPU time (this process plus its children).
+  // REQUIRES: timer is not running
+  double cpu_time_used() EXCLUDES(lock_) {
+    MutexLock l(lock_);
+    CHECK(!running_);
+    return cpu_time_used_;
+  }
+
+ private:
+  Mutex lock_;
+  Condition phase_condition_;   // Signalled when a barrier phase completes
+  int num_threads_;
+  Notification* done_;
+
+  bool running_;                // Is the timer running
+  double start_real_time_;      // If running_
+  double start_cpu_time_;       // If running_
+
+  // Accumulated time so far (does not contain current slice if running_)
+  double real_time_used_;
+  double cpu_time_used_;
+
+  // How many threads have called Finalize()
+  int num_finalized_;
+
+  // State for barrier management
+  int phase_number_;    // Incremented each time the last thread arrives
+  int entered_;         // Number of threads that have entered this barrier
+
+  // Stops the timer and adds the time since the matching start to the
+  // accumulated wall-clock and CPU totals.
+  void InternalStop() REQUIRES(lock_) {
+    CHECK(running_);
+    running_ = false;
+    real_time_used_ += walltime::Now() - start_real_time_;
+    cpu_time_used_ += ((MyCPUUsage() + ChildrenCPUUsage())
+                       - start_cpu_time_);
+  }
+
+  // Enter the barrier and wait until all other threads have also
+  // entered the barrier.  Returns true iff this is the last thread to
+  // enter the barrier.
+  bool Barrier(MutexLock& ml) REQUIRES(lock_) {
+    CHECK_LT(entered_, num_threads_);
+    entered_++;
+    if (entered_ < num_threads_) {
+      // Wait for all threads to enter: sleep until phase_number_ has moved
+      // past the value observed on entry.
+      int phase_number_cp = phase_number_;
+      auto cb = [this, phase_number_cp]() {
+        return this->phase_number_ > phase_number_cp;
+      };
+      phase_condition_.wait(ml.native_handle(), cb);
+      return false;  // I was not the last one
+    } else {
+      // Last thread has reached the barrier: open the next phase and reset
+      // the arrival counter for the next Start/Stop call.
+      phase_number_++;
+      entered_ = 0;
+      return true;
+    }
+  }
+};
+
+// TimerManager for current run.
+static std::unique_ptr<TimerManager> timer_manager = nullptr;
+
+} // end namespace
+
+namespace internal {
+
+// Information kept per benchmark we may want to run.  One Instance is produced
+// by BenchmarkFamilies::FindBenchmarks() for every combination of argument
+// pair and thread count of a family.
+struct Benchmark::Instance {
+  std::string    name;       // Family name plus argument/option suffixes
+  Benchmark*     benchmark;  // The family this instance runs (not owned)
+  bool           has_arg1;   // True when the family takes at least one arg
+  int            arg1;
+  bool           has_arg2;   // True when the family takes exactly two args
+  int            arg2;
+  bool           use_real_time;  // Judge run length by wall-clock time
+  double         min_time;   // Per-family minimum run time; 0 means unset
+  int            threads;    // Number of concurrent threads to use
+  bool           multithreaded;  // Is benchmark multi-threaded?
+};
+
+// Class for managing registered benchmarks.  Note that each registered
+// benchmark identifies a family of related benchmarks to run.
+// The registry is a singleton (see GetInstance()); its family list is guarded
+// by mutex_.
+class BenchmarkFamilies {
+ public:
+  // Returns the process-wide registry.
+  static BenchmarkFamilies* GetInstance();
+
+  // Registers a benchmark family and returns the index assigned to it.
+  // The registry takes ownership of the family.
+  size_t AddBenchmark(std::unique_ptr<Benchmark> family);
+
+  // Extract the list of benchmark instances that match the specified
+  // regular expression.  Matching instances are appended to *benchmarks.
+  // Returns false if the expression could not be compiled.
+  bool FindBenchmarks(const std::string& re,
+                      std::vector<Benchmark::Instance>* benchmarks);
+ private:
+  // Private: instances are only obtained through GetInstance().
+  BenchmarkFamilies() {}
+
+  std::vector<std::unique_ptr<Benchmark>> families_;
+  Mutex mutex_;
+};
+
+
+// Holds the mutable configuration of one benchmark family: its name, the
+// argument sets to run with, the minimum run time, the timing mode and the
+// thread counts.  The public Benchmark builder methods forward to this class,
+// and BenchmarkFamilies (a friend) reads the fields when expanding a family
+// into runnable instances.
+class BenchmarkImp {
+public:
+  explicit BenchmarkImp(const char* name);
+  ~BenchmarkImp();
+
+  // Single-argument families.
+  void Arg(int x);                         // run once with argument x
+  void Range(int start, int limit);        // start, powers of 8, limit
+  void DenseRange(int start, int limit);   // every value in [start, limit]
+
+  // Two-argument families.
+  void ArgPair(int x, int y);              // run once with the pair (x, y)
+  void RangePair(int lo1, int hi1, int lo2, int hi2);  // cross product
+
+  void MinTime(double t);                  // t must be > 0
+  void UseRealTime();
+  void Threads(int t);                     // adds one thread count
+  void ThreadRange(int min_threads, int max_threads);
+  void ThreadPerCpu();                     // adds NumCPUs() as a thread count
+  void SetName(const char* name);
+
+  // Appends lo, the powers of mult strictly between lo and hi, and hi to *dst.
+  static void AddRange(std::vector<int>* dst, int lo, int hi, int mult);
+
+private:
+  friend class BenchmarkFamilies;
+
+  std::string name_;
+  int arg_count_;  // -1 until arguments are added, then 1 or 2
+  std::vector< std::pair<int, int> > args_;  // Args for all benchmark runs
+  double min_time_;  // 0.0 means "use the command-line default"
+  bool use_real_time_;
+  std::vector<int> thread_counts_;  // empty means single-threaded
+
+  // Copy construction is used by Benchmark's copy constructor; assignment is
+  // not supported.
+  BenchmarkImp& operator=(BenchmarkImp const&) = delete;
+};
+
+// Returns the address of the single registry object.  It is a function-local
+// static, so it is constructed the first time this function is called.
+BenchmarkFamilies* BenchmarkFamilies::GetInstance() {
+  static BenchmarkFamilies instance;
+  return &instance;
+}
+
+
+// Takes ownership of `family`, stores it at the end of the family list and
+// returns the position it was stored at.  The list is modified under mutex_.
+size_t BenchmarkFamilies::AddBenchmark(std::unique_ptr<Benchmark> family) {
+  MutexLock guard(mutex_);
+  const size_t assigned_slot = families_.size();
+  families_.push_back(std::move(family));
+  return assigned_slot;
+}
+
+// Expands every registered family into its concrete instances (one per
+// argument pair and thread count), builds the display name of each instance
+// and appends those whose name matches the regular expression `spec` to
+// *benchmarks.  Returns false, after printing a message to std::cerr, when
+// `spec` cannot be compiled.
+bool BenchmarkFamilies::FindBenchmarks(
+    const std::string& spec,
+    std::vector<Benchmark::Instance>* benchmarks) {
+  // Compile the filter first; nothing is collected for a malformed pattern.
+  Regex filter;
+  std::string compile_error;
+  if (!filter.Init(spec, &compile_error)) {
+    std::cerr << "Could not compile benchmark re: " << compile_error
+              << std::endl;
+    return false;
+  }
+
+  // Thread-count list used for families that never asked for threads.
+  const std::vector<int> single_thread(1, 1);
+
+  MutexLock guard(mutex_);
+  for (std::unique_ptr<Benchmark>& owner : families_) {
+    // Skip slots that hold no family.
+    if (!owner) continue;
+    BenchmarkImp* imp = owner->imp_;
+
+    // A family without explicit arguments still runs once, with placeholder
+    // argument values.
+    if (imp->arg_count_ == -1) {
+      imp->arg_count_ = 0;
+      imp->args_.emplace_back(-1, -1);
+    }
+
+    const bool threaded = !imp->thread_counts_.empty();
+    const std::vector<int>* counts = &single_thread;
+    if (threaded) counts = &imp->thread_counts_;
+
+    for (auto const& arg_pair : imp->args_) {
+      for (int thread_count : *counts) {
+        Benchmark::Instance candidate;
+        candidate.benchmark = owner.get();
+        candidate.has_arg1 = imp->arg_count_ >= 1;
+        candidate.arg1 = arg_pair.first;
+        candidate.has_arg2 = imp->arg_count_ == 2;
+        candidate.arg2 = arg_pair.second;
+        candidate.min_time = imp->min_time_;
+        candidate.use_real_time = imp->use_real_time_;
+        candidate.threads = thread_count;
+        candidate.multithreaded = threaded;
+
+        // Display name: family name, then arguments, then option suffixes.
+        candidate.name = imp->name_;
+        if (imp->arg_count_ >= 1) {
+          AppendHumanReadable(candidate.arg1, &candidate.name);
+        }
+        if (imp->arg_count_ >= 2) {
+          AppendHumanReadable(candidate.arg2, &candidate.name);
+        }
+        if (!IsZero(imp->min_time_)) {
+          candidate.name += StringPrintF("/min_time:%0.3f", imp->min_time_);
+        }
+        if (imp->use_real_time_) {
+          candidate.name += "/real_time";
+        }
+        if (threaded) {
+          candidate.name += StringPrintF("/threads:%d", candidate.threads);
+        }
+
+        if (filter.Match(candidate.name)) {
+          benchmarks->push_back(candidate);
+        }
+      }
+    }
+  }
+  return true;
+}
+
+// Creates the configuration for a family called `name`.  arg_count_ starts at
+// -1 ("no arguments added yet"), min_time_ at 0.0 (which IsZero() treats as
+// "not set") and real-time measurement is off.
+BenchmarkImp::BenchmarkImp(const char* name)
+    : name_(name), arg_count_(-1),
+      min_time_(0.0), use_real_time_(false) {
+}
+
+// Nothing to release explicitly; members clean up after themselves.
+BenchmarkImp::~BenchmarkImp() {
+}
+
+// Adds a single run with argument x.  Only valid for families that use one
+// argument (or none so far).  Unused second slots are stored as -1.
+void BenchmarkImp::Arg(int x) {
+  CHECK(arg_count_ == -1 || arg_count_ == 1);
+  arg_count_ = 1;
+  args_.push_back(std::make_pair(x, -1));
+}
+
+// Adds one run per value produced by AddRange(start, limit) using the sparse
+// kRangeMultiplier spacing.
+void BenchmarkImp::Range(int start, int limit) {
+  CHECK(arg_count_ == -1 || arg_count_ == 1);
+  arg_count_ = 1;
+
+  std::vector<int> values;
+  AddRange(&values, start, limit, kRangeMultiplier);
+  for (std::vector<int>::const_iterator it = values.begin();
+       it != values.end(); ++it) {
+    args_.emplace_back(*it, -1);
+  }
+}
+
+// Adds one run for every integer in [start, limit].
+void BenchmarkImp::DenseRange(int start, int limit) {
+  CHECK(arg_count_ == -1 || arg_count_ == 1);
+  arg_count_ = 1;
+  CHECK_GE(start, 0);
+  CHECK_LE(start, limit);
+
+  int value = start;
+  while (value <= limit) {
+    args_.emplace_back(value, -1);
+    value++;
+  }
+}
+
+// Adds a single run with the argument pair (x, y).  Only valid for families
+// that use two arguments (or none so far).
+void BenchmarkImp::ArgPair(int x, int y) {
+  CHECK(arg_count_ == -1 || arg_count_ == 2);
+  arg_count_ = 2;
+  args_.push_back(std::make_pair(x, y));
+}
+
+// Adds one run for every combination of a value from the sparse range
+// [lo1, hi1] with a value from the sparse range [lo2, hi2]; the first
+// argument varies slowest.
+void BenchmarkImp::RangePair(int lo1, int hi1, int lo2, int hi2) {
+  CHECK(arg_count_ == -1 || arg_count_ == 2);
+  arg_count_ = 2;
+
+  std::vector<int> first_values;
+  std::vector<int> second_values;
+  AddRange(&first_values, lo1, hi1, kRangeMultiplier);
+  AddRange(&second_values, lo2, hi2, kRangeMultiplier);
+
+  for (std::size_t a = 0; a < first_values.size(); ++a) {
+    for (std::size_t b = 0; b < second_values.size(); ++b) {
+      args_.emplace_back(first_values[a], second_values[b]);
+    }
+  }
+}
+
+// Sets the family-specific minimum run time in seconds; t must be positive.
+void BenchmarkImp::MinTime(double t) {
+  CHECK(t > 0.0);
+  min_time_ = t;
+}
+
+// Makes the run-length decision use wall-clock time instead of CPU time.
+void BenchmarkImp::UseRealTime() {
+  use_real_time_ = true;
+}
+
+// Adds t (which must be > 0) to the list of thread counts to run with.
+void BenchmarkImp::Threads(int t) {
+  CHECK_GT(t, 0);
+  thread_counts_.push_back(t);
+}
+
+// Adds min_threads, the powers of two strictly between the bounds, and
+// max_threads to the list of thread counts (see AddRange with mult == 2).
+void BenchmarkImp::ThreadRange(int min_threads, int max_threads) {
+  CHECK_GT(min_threads, 0);
+  CHECK_GE(max_threads, min_threads);
+
+  AddRange(&thread_counts_, min_threads, max_threads, 2);
+}
+
+// Adds the value of NumCPUs() as a thread count.  NumCPUs() is queried once
+// and cached in a function-local static.
+void BenchmarkImp::ThreadPerCpu() {
+  static int num_cpus = NumCPUs();
+  thread_counts_.push_back(num_cpus);
+}
+
+// Replaces the family name.
+void BenchmarkImp::SetName(const char* name) {
+  name_ = name;
+}
+
+// Appends a sparse range to *dst: first `lo`, then every power of `mult`
+// that is strictly greater than lo and strictly less than hi, and finally
+// `hi` (unless it equals lo).
+//
+// Requires 0 <= lo <= hi and mult >= 2.  Powers of mult are only generated
+// while they stay below INT32_MAX / mult, which keeps `i *= mult` from
+// overflowing.
+void BenchmarkImp::AddRange(std::vector<int>* dst, int lo, int hi, int mult) {
+  CHECK_GE(lo, 0);
+  CHECK_GE(hi, lo);
+  // With mult < 2 the loop variable below never grows (1 * 1 == 1, 1 * 0 == 0),
+  // so the loop would keep pushing values forever whenever hi > 1.
+  CHECK_GE(mult, 2);
+
+  // Add "lo"
+  dst->push_back(lo);
+
+  static const int kint32max = std::numeric_limits<int32_t>::max();
+
+  // Now space out the benchmarks in multiples of "mult"
+  for (int32_t i = 1; i < kint32max/mult; i *= mult) {
+    if (i >= hi) break;
+    if (i > lo) {
+      dst->push_back(i);
+    }
+  }
+  // Add "hi" (if different from "lo")
+  if (hi != lo) {
+    dst->push_back(hi);
+  }
+}
+
+// Creates a benchmark family called `name` with a freshly allocated
+// BenchmarkImp holding its configuration.
+Benchmark::Benchmark(const char* name)
+    : imp_(new BenchmarkImp(name))
+{
+}
+
+// Releases the configuration object owned through the raw imp_ pointer.
+Benchmark::~Benchmark()  {
+  delete imp_;
+}
+
+// Copies the other family's configuration into a new BenchmarkImp, so the
+// two Benchmark objects never share an imp_.
+Benchmark::Benchmark(Benchmark const& other)
+  : imp_(new BenchmarkImp(*other.imp_))
+{
+}
+
+// The builder methods below each forward to the BenchmarkImp method of the
+// same name and return `this`, so that calls can be chained.
+
+Benchmark* Benchmark::Arg(int x) {
+  imp_->Arg(x);
+  return this;
+}
+
+Benchmark* Benchmark::Range(int start, int limit) {
+  imp_->Range(start, limit);
+  return this;
+}
+
+Benchmark* Benchmark::DenseRange(int start, int limit) {
+  imp_->DenseRange(start, limit);
+  return this;
+}
+
+Benchmark* Benchmark::ArgPair(int x, int y) {
+  imp_->ArgPair(x, y);
+  return this;
+}
+
+Benchmark* Benchmark::RangePair(int lo1, int hi1, int lo2, int hi2) {
+  imp_->RangePair(lo1, hi1, lo2, hi2);
+  return this;
+}
+
+// Calls custom_arguments with this benchmark so the callback can add an
+// arbitrary set of arguments itself.
+Benchmark* Benchmark::Apply(void (*custom_arguments)(Benchmark* benchmark)) {
+  custom_arguments(this);
+  return this;
+}
+
+Benchmark* Benchmark::MinTime(double t) {
+  imp_->MinTime(t);
+  return this;
+}
+
+Benchmark* Benchmark::UseRealTime() {
+  imp_->UseRealTime();
+  return this;
+}
+
+Benchmark* Benchmark::Threads(int t) {
+  imp_->Threads(t);
+  return this;
+}
+
+Benchmark* Benchmark::ThreadRange(int min_threads, int max_threads) {
+  imp_->ThreadRange(min_threads, max_threads);
+  return this;
+}
+
+Benchmark* Benchmark::ThreadPerCpu() {
+  imp_->ThreadPerCpu();
+  return this;
+}
+
+// Unlike the builders above this returns nothing.
+void Benchmark::SetName(const char* name) {
+  imp_->SetName(name);
+}
+
+// Runs the wrapped benchmark function with the given state.
+void FunctionBenchmark::Run(State& st) {
+  func_(st);
+}
+
+} // end namespace internal
+
+namespace {
+
+
+// Runs the body of benchmark instance *b for `iters` iterations on behalf of
+// thread `thread_id`.  Afterwards the thread's byte and item counters are
+// added to *total under GetBenchmarkLock(), and the timer manager of the
+// current run is told that this thread has finished.
+void RunInThread(const benchmark::internal::Benchmark::Instance* b,
+                 size_t iters, int thread_id,
+                 ThreadStats* total) EXCLUDES(GetBenchmarkLock()) {
+  State state(iters, b->has_arg1, b->arg1, b->has_arg2, b->arg2, thread_id);
+  b->benchmark->Run(state);
+  CHECK(state.iterations() == state.max_iterations) <<
+    "Benchmark returned before State::KeepRunning() returned false!";
+
+  // The counters live in this thread's own State, so they can be read before
+  // taking the lock; only the shared totals need protection.
+  const auto thread_bytes = state.bytes_processed();
+  const auto thread_items = state.items_processed();
+  {
+    MutexLock guard(GetBenchmarkLock());
+    total->bytes_processed += thread_bytes;
+    total->items_processed += thread_items;
+  }
+
+  timer_manager->Finalize();
+}
+
+// Runs benchmark instance `b` FLAGS_benchmark_repetitions times and hands the
+// collected reports to `br`.
+//
+// During the first repetition the iteration count is grown until the measured
+// time reaches the minimum run time (or another stop condition below holds);
+// later repetitions reuse the iteration count found that way.
+void RunBenchmark(const benchmark::internal::Benchmark::Instance& b,
+                  BenchmarkReporter* br) EXCLUDES(GetBenchmarkLock()) {
+  size_t iters = 1;
+
+  std::vector<BenchmarkReporter::Run> reports;
+
+  // Worker threads are only used for multi-threaded benchmarks; otherwise the
+  // body runs directly on the calling thread.
+  std::vector<std::thread> pool;
+  if (b.multithreaded)
+    pool.resize(b.threads);
+
+  for (int i = 0; i < FLAGS_benchmark_repetitions; i++) {
+    std::string mem;
+    for (;;) {
+      // Try benchmark
+      VLOG(2) << "Running " << b.name << " for " << iters << "\n";
+
+      {
+        MutexLock l(GetBenchmarkLock());
+        GetReportLabel()->clear();
+      }
+
+      // Fresh timer state for this attempt; `done` is signalled once every
+      // thread has called TimerManager::Finalize().
+      Notification done;
+      timer_manager = std::unique_ptr<TimerManager>(new TimerManager(b.threads, &done));
+
+      ThreadStats total;
+      running_benchmark = true;
+      if (b.multithreaded) {
+        // If this is our first iteration of the for(;;) loop then the
+        // threads haven't been started and can't be joined. Otherwise we need
+        // to join the thread before replacing them.
+        for (std::thread& thread : pool) {
+          if (thread.joinable())
+            thread.join();
+        }
+        for (std::size_t ti = 0; ti < pool.size(); ++ti) {
+            pool[ti] = std::thread(&RunInThread, &b, iters, ti, &total);
+        }
+      } else {
+        // Run directly in this thread
+        RunInThread(&b, iters, 0, &total);
+      }
+      done.WaitForNotification();
+      running_benchmark = false;
+
+      // Read the results before the timer manager is destroyed.
+      const double cpu_accumulated_time = timer_manager->cpu_time_used();
+      const double real_accumulated_time = timer_manager->real_time_used();
+      timer_manager.reset();
+
+      VLOG(2) << "Ran in " << cpu_accumulated_time << "/"
+              << real_accumulated_time << "\n";
+
+      // Base decisions off of real time if requested by this benchmark.
+      double seconds = cpu_accumulated_time;
+      if (b.use_real_time) {
+          seconds = real_accumulated_time;
+      }
+
+      std::string label;
+      {
+        MutexLock l(GetBenchmarkLock());
+        label = *GetReportLabel();
+      }
+
+      // A per-benchmark MinTime() takes precedence over the command-line flag.
+      const double min_time = !IsZero(b.min_time) ? b.min_time
+                                                  : FLAGS_benchmark_min_time;
+
+      // If this was the first run, was elapsed time or cpu time large enough?
+      // If this is not the first run, go with the current value of iter.
+      // The attempt is also accepted when the iteration cap has been reached
+      // or when wall-clock time is at least five times the minimum.
+      if ((i > 0) ||
+          (iters >= kMaxIterations) ||
+          (seconds >= min_time) ||
+          (real_accumulated_time >= 5*min_time)) {
+        double bytes_per_second = 0;
+        if (total.bytes_processed > 0 && seconds > 0.0) {
+          bytes_per_second = (total.bytes_processed / seconds);
+        }
+        double items_per_second = 0;
+        if (total.items_processed > 0 && seconds > 0.0) {
+          items_per_second = (total.items_processed / seconds);
+        }
+
+        // Create report about this benchmark run.
+        BenchmarkReporter::Run report;
+        report.benchmark_name = b.name;
+        report.report_label = label;
+        // Report the total iterations across all threads.
+        report.iterations = static_cast<int64_t>(iters) * b.threads;
+        report.real_accumulated_time = real_accumulated_time;
+        report.cpu_accumulated_time = cpu_accumulated_time;
+        report.bytes_per_second = bytes_per_second;
+        report.items_per_second = items_per_second;
+        reports.push_back(report);
+        break;
+      }
+
+      // See how much iterations should be increased by
+      // Note: Avoid division by zero with max(seconds, 1ns).
+      double multiplier = min_time * 1.4 / std::max(seconds, 1e-9);
+      // If our last run was at least 10% of FLAGS_benchmark_min_time then we
+      // use the multiplier directly. Otherwise we use at most 10 times
+      // expansion.
+      // NOTE: When the last run was at least 10% of the min time the max
+      // expansion should be 14x.
+      bool is_significant = (seconds / min_time) > 0.1;
+      multiplier = is_significant ? multiplier : std::min(10.0, multiplier);
+      if (multiplier <= 1.0) multiplier = 2.0;
+      // Always grow by at least one iteration, and never past kMaxIterations.
+      double next_iters = std::max(multiplier * iters, iters + 1.0);
+      if (next_iters > kMaxIterations) {
+        next_iters = kMaxIterations;
+      }
+      VLOG(3) << "Next iters: " << next_iters << ", " << multiplier << "\n";
+      // Round to nearest; the value fits in an int because it was clamped to
+      // kMaxIterations (1e9) above.
+      iters = static_cast<int>(next_iters + 0.5);
+    }
+  }
+  br->ReportRuns(reports);
+  // The workers of the last attempt have not been joined yet.
+  if (b.multithreaded) {
+    for (std::thread& thread : pool)
+      thread.join();
+  }
+}
+
+}  // namespace
+
+// Builds the per-thread state handed to a benchmark body.
+//   max_iters:    number of iterations this thread has to run; must not be 0
+//   has_x / x:    whether a first range argument exists, and its value
+//   has_y / y:    whether a second range argument exists, and its value
+//   thread_i:     index of the thread this state belongs to
+// All counters start at zero and the state starts out as not started.
+State::State(size_t max_iters, bool has_x, int x, bool has_y, int y,
+             int thread_i)
+    : started_(false), total_iterations_(0),
+      has_range_x_(has_x), range_x_(x),
+      has_range_y_(has_y), range_y_(y),
+      bytes_processed_(0), items_processed_(0),
+      thread_index(thread_i),
+      max_iterations(max_iters)
+{
+    CHECK(max_iterations != 0) << "At least one iteration must be run";
+}
+
+// Stops the shared timer of the current run.  Only valid while a benchmark is
+// running.  TimerManager::StopTimer() is a barrier, so every thread of the
+// run has to make this call.
+void State::PauseTiming() {
+  // Add in time accumulated so far
+  CHECK(running_benchmark);
+  timer_manager->StopTimer();
+}
+
+// Restarts the shared timer of the current run.  Only valid while a benchmark
+// is running; like PauseTiming() it has to be called by every thread.
+void State::ResumeTiming() {
+  CHECK(running_benchmark);
+  timer_manager->StartTimer();
+}
+
+// Replaces the report label of the current run with `label`, under
+// GetBenchmarkLock().  Only valid while a benchmark is running.
+void State::SetLabel(const char* label) {
+  CHECK(running_benchmark);
+  MutexLock l(GetBenchmarkLock());
+  *GetReportLabel() = label;
+}
+
+namespace internal {
+namespace {
+
+// Prints the name of every registered benchmark instance to std::cout, one
+// per line.  Prints nothing if the lookup fails.
+void PrintBenchmarkList() {
+  std::vector<Benchmark::Instance> everything;
+  if (!BenchmarkFamilies::GetInstance()->FindBenchmarks(".", &everything)) {
+    return;
+  }
+
+  for (auto it = everything.begin(); it != everything.end(); ++it) {
+    std::cout <<  it->name << "\n";
+  }
+}
+
+// Runs every benchmark instance whose name matches the regular expression
+// `spec` and reports the results through `reporter` (which must not be null).
+// Does nothing when `spec` is empty or cannot be compiled.  The benchmarks
+// are only run if the reporter accepts the context describing the machine.
+void RunMatchingBenchmarks(const std::string& spec,
+                           BenchmarkReporter* reporter) {
+  CHECK(reporter != nullptr);
+  if (spec.empty()) return;
+
+  std::vector<Benchmark::Instance> selected;
+  if (!BenchmarkFamilies::GetInstance()->FindBenchmarks(spec, &selected)) {
+    return;
+  }
+
+  // The name column is as wide as the longest name, but at least 10.
+  size_t widest_name = 10;
+  for (const Benchmark::Instance& candidate : selected) {
+    if (candidate.name.size() > widest_name) {
+      widest_name = candidate.name.size();
+    }
+  }
+  // Leave room for the suffix used when several repetitions are summarized.
+  if (FLAGS_benchmark_repetitions > 1)
+    widest_name += std::strlen("_stddev");
+
+  // Describe the machine to the reporter before anything is run.
+  BenchmarkReporter::Context context;
+  context.num_cpus = NumCPUs();
+  context.mhz_per_cpu = CyclesPerSecond() / 1000000.0f;
+  context.cpu_scaling_enabled = CpuScalingEnabled();
+  context.name_field_width = widest_name;
+
+  if (!reporter->ReportContext(context)) return;
+
+  for (const auto& candidate : selected) {
+    RunBenchmark(candidate, reporter);
+  }
+}
+
+// Creates the reporter selected by --benchmark_format: "tabular" gives a
+// ConsoleReporter, "json" a JSONReporter and "csv" a CSVReporter.  Any other
+// value is reported on std::cerr and terminates the process with status 1.
+std::unique_ptr<BenchmarkReporter> GetDefaultReporter() {
+  typedef std::unique_ptr<BenchmarkReporter> PtrType;
+
+  if (FLAGS_benchmark_format == "tabular") return PtrType(new ConsoleReporter);
+  if (FLAGS_benchmark_format == "json") return PtrType(new JSONReporter);
+  if (FLAGS_benchmark_format == "csv") return PtrType(new CSVReporter);
+
+  std::cerr << "Unexpected format: '" << FLAGS_benchmark_format << "'\n";
+  std::exit(1);
+}
+
+} // end namespace
+} // end namespace internal
+
+// Runs the benchmarks selected on the command line with the default reporter
+// for the requested output format.
+void RunSpecifiedBenchmarks() {
+  RunSpecifiedBenchmarks(nullptr);
+}
+
+// Runs the benchmarks selected by --benchmark_filter and reports them through
+// `reporter`.  When `reporter` is null a default reporter is created and owned
+// for the duration of the call.  With --benchmark_list_tests the benchmark
+// names are printed instead and nothing is run.
+void RunSpecifiedBenchmarks(BenchmarkReporter* reporter) {
+  if (FLAGS_benchmark_list_tests) {
+    internal::PrintBenchmarkList();
+    return;
+  }
+  // Both an empty filter and the literal "all" select every benchmark.
+  std::string spec = FLAGS_benchmark_filter;
+  if (spec.empty() || spec == "all")
+    spec = ".";  // Regexp that matches all benchmarks
+
+  std::unique_ptr<BenchmarkReporter> default_reporter;
+  if (!reporter) {
+    default_reporter = internal::GetDefaultReporter();
+    reporter = default_reporter.get();
+  }
+  internal::RunMatchingBenchmarks(spec, reporter);
+  reporter->Finalize();
+}
+
+namespace internal {
+
+// Prints the list of supported command-line flags to stdout and terminates
+// the process with exit status 0.  Note that the status is 0 even when this
+// is reached because of an invalid --benchmark_format value.
+void PrintUsageAndExit() {
+  fprintf(stdout,
+          "benchmark"
+          " [--benchmark_list_tests={true|false}]\n"
+          "          [--benchmark_filter=<regex>]\n"
+          "          [--benchmark_min_time=<min_time>]\n"
+          "          [--benchmark_repetitions=<num_repetitions>]\n"
+          "          [--benchmark_format=<tabular|json|csv>]\n"
+          "          [--color_print={true|false}]\n"
+          "          [--v=<verbosity>]\n");
+  exit(0);
+}
+
+// Scans argv[1 .. *argc-1] for the flags defined at the top of this file.
+// Every recognized flag is stored in its FLAGS_ variable and removed from
+// argv, with *argc reduced accordingly; unrecognized arguments are left in
+// place.  A "help" flag, or an unsupported --benchmark_format value, prints
+// the usage text and exits.
+void ParseCommandLineFlags(int* argc, char** argv) {
+  using namespace benchmark;
+  for (int i = 1; i < *argc; ++i) {
+    if (
+        ParseBoolFlag(argv[i], "benchmark_list_tests",
+                      &FLAGS_benchmark_list_tests) ||
+        ParseStringFlag(argv[i], "benchmark_filter",
+                        &FLAGS_benchmark_filter) ||
+        ParseDoubleFlag(argv[i], "benchmark_min_time",
+                        &FLAGS_benchmark_min_time) ||
+        ParseInt32Flag(argv[i], "benchmark_repetitions",
+                       &FLAGS_benchmark_repetitions) ||
+        ParseStringFlag(argv[i], "benchmark_format",
+                        &FLAGS_benchmark_format) ||
+        ParseBoolFlag(argv[i], "color_print",
+                       &FLAGS_color_print) ||
+        ParseInt32Flag(argv[i], "v", &FLAGS_v)) {
+      // Remove the consumed argument by shifting the rest of argv down.
+      // NOTE(review): the last step copies argv[*argc], which relies on argv
+      // having a terminating entry past its last argument, as the argv
+      // passed to main() does - confirm for other callers.
+      for (int j = i; j != *argc; ++j) argv[j] = argv[j + 1];
+
+      --(*argc);
+      // Re-examine index i, which now holds the next argument.
+      --i;
+    } else if (IsFlag(argv[i], "help")) {
+      PrintUsageAndExit();
+    }
+  }
+  if (FLAGS_benchmark_format != "tabular" &&
+      FLAGS_benchmark_format != "json" &&
+      FLAGS_benchmark_format != "csv") {
+    PrintUsageAndExit();
+  }
+}
+
+// Hands ownership of `bench` to the global BenchmarkFamilies registry and
+// returns the same pointer, so the caller can keep configuring the family.
+Benchmark* RegisterBenchmarkInternal(Benchmark* bench) {
+    std::unique_ptr<Benchmark> owned(bench);
+    BenchmarkFamilies::GetInstance()->AddBenchmark(std::move(owned));
+    return bench;
+}
+
+} // end namespace internal
+
+// Library entry point: parses (and removes from argv) the benchmark flags,
+// applies the requested log verbosity, and warms up the CPU-usage and
+// wall-clock helpers so that their one-time setup does not happen inside a
+// measured benchmark.
+void Initialize(int* argc, char** argv) {
+  internal::ParseCommandLineFlags(argc, argv);
+  internal::SetLogLevel(FLAGS_v);
+  // TODO remove this. It prints some output the first time it is called.
+  // We don't want to have this output printed during benchmarking.
+  MyCPUUsage();
+  // The first call to walltime::Now initializes it. Call it once to
+  // prevent the initialization from happening in a benchmark.
+  walltime::Now();
+}
+
+} // end namespace benchmark
--- a/3rdparty/benchmark/src/check.h
+++ b/3rdparty/benchmark/src/check.h
@ -0,0 +1,60 @@
+#ifndef CHECK_H_
+#define CHECK_H_
+
+#include <cstdlib>
+#include <ostream>
+
+#include "internal_macros.h"
+#include "log.h"
+
+namespace benchmark {
+namespace internal {
+
+// CheckHandler is the class constructed by failing CHECK macros. CheckHandler
+// will log information about the failures and abort when it is destructed.
+// Because the CHECK macro creates it as a temporary, destruction happens at
+// the end of the full CHECK(...) << ... expression, i.e. after any extra
+// message has been streamed into GetLog().
+class CheckHandler {
+public:
+  // Writes "<file>:<line>: <func>: Check `<check>' failed. " to the error log.
+  CheckHandler(const char* check, const char* file, const char* func, int line)
+    : log_(GetErrorLogInstance())
+  {
+    log_ << file << ":" << line << ": " << func << ": Check `"
+          << check << "' failed. ";
+  }
+
+  // Stream to which the caller can append details about the failure.
+  std::ostream& GetLog() {
+    return log_;
+  }
+
+  // Ends the log line (std::endl also flushes) and aborts the process.
+  BENCHMARK_NORETURN ~CheckHandler() {
+      log_ << std::endl;
+      std::abort();
+  }
+
+  // Not copyable, not assignable and not default-constructible.
+  CheckHandler & operator=(const CheckHandler&) = delete;
+  CheckHandler(const CheckHandler&) = delete;
+  CheckHandler() = delete;
+private:
+  std::ostream& log_;  // The error log obtained from GetErrorLogInstance()
+};
+
+} // end namespace internal
+} // end namespace benchmark
+
+// The CHECK macro returns a std::ostream object that can have extra information
+// written to it, e.g.  CHECK(n > 0) << "n was " << n;
+//
+// In debug builds a true condition yields the null log instance, while a false
+// condition constructs a CheckHandler, which logs the failed expression and
+// aborts at the end of the statement.  When NDEBUG is defined the condition is
+// NOT evaluated at all, so it must not have side effects the program relies
+// on.
+//
+// The macro argument is parenthesized so that conditions containing operators
+// with lower precedence than ?: (another ?:, an assignment, a comma) are
+// still tested as a whole.
+#ifndef NDEBUG
+# define CHECK(b)  ((b) ? ::benchmark::internal::GetNullLogInstance()      \
+                        : ::benchmark::internal::CheckHandler(             \
+                            #b, __FILE__, __func__, __LINE__).GetLog())
+#else
+# define CHECK(b) ::benchmark::internal::GetNullLogInstance()
+#endif
+
+// Comparison shorthands; each expands to CHECK of the matching comparison.
+#define CHECK_EQ(a, b) CHECK((a) == (b))
+#define CHECK_NE(a, b) CHECK((a) != (b))
+#define CHECK_GE(a, b) CHECK((a) >= (b))
+#define CHECK_LE(a, b) CHECK((a) <= (b))
+#define CHECK_GT(a, b) CHECK((a) > (b))
+#define CHECK_LT(a, b) CHECK((a) < (b))
+
+#endif  // CHECK_H_
--- a/3rdparty/benchmark/src/colorprint.cc
+++ b/3rdparty/benchmark/src/colorprint.cc
@ -0,0 +1,116 @@
+// Copyright 2015 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "colorprint.h"
+
+#include <cstdarg>
+#include <cstdio>
+
+#include "commandlineflags.h"
+#include "internal_macros.h"
+
+#ifdef BENCHMARK_OS_WINDOWS
+#include <Windows.h>
+#endif
+
+DECLARE_bool(color_print);
+
+namespace benchmark {
+namespace {
+#ifdef BENCHMARK_OS_WINDOWS
+typedef WORD PlatformColorCode;
+#else
+typedef const char* PlatformColorCode;
+#endif
+
+// Translates a LogColor into the platform's representation of that color.
+//
+// On Windows the result is a combination of FOREGROUND_* console attribute
+// bits; COLOR_WHITE and any unlisted value (including COLOR_DEFAULT) map to 0.
+// Elsewhere the result is the one-digit string that ColorPrintf() inserts into
+// its "\033[0;3%sm" escape sequence, or nullptr for any unlisted value
+// (including COLOR_DEFAULT), in which case no color sequence is emitted.
+PlatformColorCode GetPlatformColorCode(LogColor color) {
+#ifdef BENCHMARK_OS_WINDOWS
+  switch (color) {
+    case COLOR_RED:
+      return FOREGROUND_RED;
+    case COLOR_GREEN:
+      return FOREGROUND_GREEN;
+    case COLOR_YELLOW:
+      return FOREGROUND_RED | FOREGROUND_GREEN;
+    case COLOR_BLUE:
+      return FOREGROUND_BLUE;
+    case COLOR_MAGENTA:
+      return FOREGROUND_BLUE | FOREGROUND_RED;
+    case COLOR_CYAN:
+      return FOREGROUND_BLUE | FOREGROUND_GREEN;
+    case COLOR_WHITE:  // fall through to default
+    default:
+      return 0;
+  }
+#else
+  switch (color) {
+    case COLOR_RED:
+      return "1";
+    case COLOR_GREEN:
+      return "2";
+    case COLOR_YELLOW:
+      return "3";
+    case COLOR_BLUE:
+      return "4";
+    case COLOR_MAGENTA:
+      return "5";
+    case COLOR_CYAN:
+      return "6";
+    case COLOR_WHITE:
+      return "7";
+    default:
+      return nullptr;
+  };
+#endif
+}
+}  // end namespace
+
+// printf-style output to stdout in the given color.
+//
+// When the color_print flag is off the text is printed without any color
+// handling.  Otherwise the text color is switched before printing and put
+// back afterwards: on Windows through the console attribute API, elsewhere
+// through terminal escape sequences.
+void ColorPrintf(LogColor color, const char* fmt, ...) {
+  va_list args;
+  va_start(args, fmt);
+
+  if (!FLAGS_color_print) {
+    vprintf(fmt, args);
+    va_end(args);
+    return;
+  }
+
+#ifdef BENCHMARK_OS_WINDOWS
+  const HANDLE stdout_handle = GetStdHandle(STD_OUTPUT_HANDLE);
+
+  // Gets the current text color.
+  CONSOLE_SCREEN_BUFFER_INFO buffer_info;
+  GetConsoleScreenBufferInfo(stdout_handle, &buffer_info);
+  const WORD old_color_attrs = buffer_info.wAttributes;
+
+  // We need to flush the stream buffers into the console before each
+  // SetConsoleTextAttribute call lest it affect the text that is already
+  // printed but has not yet reached the console.
+  fflush(stdout);
+  SetConsoleTextAttribute(stdout_handle,
+                          GetPlatformColorCode(color) | FOREGROUND_INTENSITY);
+  vprintf(fmt, args);
+
+  fflush(stdout);
+  // Restores the text color.
+  SetConsoleTextAttribute(stdout_handle, old_color_attrs);
+#else
+  // A null color code (e.g. for COLOR_DEFAULT) means no color sequence is
+  // written; the reset sequence below is written in either case.
+  const char* color_code = GetPlatformColorCode(color);
+  if (color_code) fprintf(stdout, "\033[0;3%sm", color_code);
+  vprintf(fmt, args);
+  printf("\033[m");  // Resets the terminal to default.
+#endif
+  va_end(args);
+}
+}  // end namespace benchmark
--- a/3rdparty/benchmark/src/colorprint.h
+++ b/3rdparty/benchmark/src/colorprint.h
@ -0,0 +1,19 @@
+#ifndef BENCHMARK_COLORPRINT_H_
+#define BENCHMARK_COLORPRINT_H_
+
+namespace benchmark {
+// Colors accepted by ColorPrintf().
+enum LogColor {
+  COLOR_DEFAULT,
+  COLOR_RED,
+  COLOR_GREEN,
+  COLOR_YELLOW,
+  COLOR_BLUE,
+  COLOR_MAGENTA,
+  COLOR_CYAN,
+  COLOR_WHITE
+};
+
+// Prints the printf-style message 'fmt' (plus variadic arguments) to stdout.
+// The text is colored with 'color' only when the color_print flag is set;
+// otherwise it is forwarded to vprintf() unchanged.
+void ColorPrintf(LogColor color, const char* fmt, ...);
+}  // end namespace benchmark
+
+#endif  // BENCHMARK_COLORPRINT_H_
--- a/3rdparty/benchmark/src/commandlineflags.cc
+++ b/3rdparty/benchmark/src/commandlineflags.cc
@ -0,0 +1,220 @@
+// Copyright 2015 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "commandlineflags.h"
+
+#include <cctype>
+#include <cerrno>
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+#include <limits>
+
+namespace benchmark {
+// Parses 'str' for a 32-bit signed integer written in decimal notation.
+//
+// 'src_text' names where the value came from (a flag or an environment
+// variable) and is only used as the prefix of diagnostics sent to std::cerr.
+//
+// If successful, writes the result to *value and returns true.  Returns false
+// and leaves *value unchanged when 'str' contains no digits, has characters
+// left over after the number, or does not fit into an int32_t.
+bool ParseInt32(const std::string& src_text, const char* str, int32_t* value) {
+  char* end = nullptr;
+  // strtol() never clears errno, so reset it to be able to detect ERANGE.
+  errno = 0;
+  const long long_value = strtol(str, &end, 10);  // NOLINT
+
+  // Has strtol() consumed at least one character and the whole string?
+  // (end == str covers the empty string, which would otherwise parse as 0.)
+  if (end == str || *end != '\0') {
+    // No - the input is empty or an invalid character was encountered.
+    std::cerr << src_text << " is expected to be a 32-bit integer, "
+              << "but actually has value \"" << str << "\".\n";
+    return false;
+  }
+
+  // Is the parsed value in the range of an Int32?  ERANGE reports overflow of
+  // 'long' itself; testing errno instead of comparing against LONG_MAX and
+  // LONG_MIN keeps INT32_MAX/INT32_MIN valid where long is only 32 bits wide.
+  if (errno == ERANGE ||
+      long_value < std::numeric_limits<int32_t>::min() ||
+      long_value > std::numeric_limits<int32_t>::max()) {
+    std::cerr << src_text << " is expected to be a 32-bit integer, "
+              << "but actually has value \"" << str << "\", "
+              << "which overflows.\n";
+    return false;
+  }
+
+  *value = static_cast<int32_t>(long_value);
+  return true;
+}
+
+// Parses 'str' for a double.
+//
+// 'src_text' names where the value came from and is only used as the prefix
+// of the diagnostic sent to std::cerr.
+//
+// If successful, writes the result to *value and returns true.  Returns false
+// and leaves *value unchanged when 'str' contains no number or has characters
+// left over after the number.
+bool ParseDouble(const std::string& src_text, const char* str, double* value) {
+  // Parses the text as a floating point number.
+  char* end = nullptr;
+  const double double_value = strtod(str, &end);  // NOLINT
+
+  // Has strtod() consumed at least one character and the whole string?
+  // (end == str covers the empty string, which would otherwise parse as 0.)
+  if (end == str || *end != '\0') {
+    // No - the input is empty or an invalid character was encountered.
+    std::cerr << src_text << " is expected to be a double, "
+              << "but actually has value \"" << str << "\".\n";
+    return false;
+  }
+
+  *value = double_value;
+  return true;
+}
+
+// Looks up the environment variable 'name' via getenv().  Returns its value,
+// or nullptr when it is not set.
+inline const char* GetEnv(const char* name) {
+  const char* const env = getenv(name);
+#if defined(__BORLANDC__) || defined(__SunOS_5_8) || defined(__SunOS_5_9)
+  // Environment variables which we programmatically clear will be set to the
+  // empty string rather than unset (nullptr).  Handle that case.
+  if (env != nullptr && env[0] == '\0') return nullptr;
+#endif
+  return env;
+}
+
+// Returns the name of the environment variable corresponding to the
+// given flag: the flag name converted to upper case and prefixed with
+// "BENCHMARK_".  For example, FlagToEnvVar("foo") will return
+// "BENCHMARK_FOO".
+static std::string FlagToEnvVar(const char* flag) {
+  std::string env_var("BENCHMARK_");
+  for (const char* p = flag; *p != '\0'; ++p) {
+    // toupper() is only defined for EOF and values representable as unsigned
+    // char, so a plain (possibly negative) char must be converted first.
+    env_var += static_cast<char>(::toupper(static_cast<unsigned char>(*p)));
+  }
+  return env_var;
+}
+
+// Looks up the Boolean environment variable that belongs to 'flag'.
+// Returns default_value when the variable is not set; otherwise the
+// result is true for every value except the exact string "0".
+bool BoolFromEnv(const char* flag, bool default_value) {
+  const std::string var_name = FlagToEnvVar(flag);
+  const char* const raw = GetEnv(var_name.c_str());
+  if (raw == nullptr) {
+    return default_value;
+  }
+  return strcmp(raw, "0") != 0;
+}
+
+// Reads and returns a 32-bit integer stored in the environment
+// variable corresponding to the given flag; if it isn't set or
+// doesn't represent a valid 32-bit integer, returns default_value.
+//
+// When the variable is set but invalid, ParseInt32() reports the problem on
+// std::cerr and a follow-up notice about the fallback is written there too.
+int32_t Int32FromEnv(const char* flag, int32_t default_value) {
+  const std::string env_var = FlagToEnvVar(flag);
+  const char* const string_value = GetEnv(env_var.c_str());
+  if (string_value == nullptr) {
+    // The environment variable is not set.
+    return default_value;
+  }
+
+  int32_t result = default_value;
+  if (!ParseInt32(std::string("Environment variable ") + env_var, string_value,
+                  &result)) {
+    // Keep the notice on the same stream as the error it belongs to; stdout
+    // is where the reporters write their (possibly machine-readable) results.
+    std::cerr << "The default value " << default_value << " is used.\n";
+    return default_value;
+  }
+
+  return result;
+}
+
+// Looks up the string environment variable that belongs to 'flag' and
+// returns its value, or default_value when the variable is not set.
+const char* StringFromEnv(const char* flag, const char* default_value) {
+  const std::string var_name = FlagToEnvVar(flag);
+  const char* const raw = GetEnv(var_name.c_str());
+  if (raw != nullptr) {
+    return raw;
+  }
+  return default_value;
+}
+
+// Matches 'str' against the command line flag 'flag'.  The accepted form is
+// "--flag=value"; with def_optional set, a bare "--flag" is accepted as well.
+//
+// Returns a pointer to the value inside 'str' (the empty string at the end of
+// 'str' for a bare flag), or nullptr if 'str' does not match.
+const char* ParseFlagValue(const char* str, const char* flag,
+                           bool def_optional) {
+  // Neither argument may be nullptr.
+  if (str == nullptr || flag == nullptr) return nullptr;
+
+  // 'str' has to begin with "--" followed by the flag name.
+  const std::string prefix = std::string("--") + std::string(flag);
+  if (strncmp(str, prefix.c_str(), prefix.length()) != 0) return nullptr;
+
+  // Whatever follows the name decides the outcome.
+  const char* const rest = str + prefix.length();
+  switch (rest[0]) {
+    case '=':
+      // "--flag=value": the value starts right after the '='.
+      return rest + 1;
+    case '\0':
+      // "--flag" without a value is only acceptable when it is optional.
+      return def_optional ? rest : nullptr;
+    default:
+      // The name is merely a prefix of a longer argument.
+      return nullptr;
+  }
+}
+
+// Parses 'str' as the bool flag 'flag' ("--flag" or "--flag=value").  On a
+// match stores the result in *value and returns true; otherwise returns false
+// and leaves *value untouched.
+bool ParseBoolFlag(const char* str, const char* flag, bool* value) {
+  // The "=value" part is optional for bool flags.
+  const char* const text = ParseFlagValue(str, flag, true);
+  if (text == nullptr) return false;
+
+  // Only a value starting with '0', 'f' or 'F' means false; everything else,
+  // including a missing value, means true.
+  switch (*text) {
+    case '0':
+    case 'f':
+    case 'F':
+      *value = false;
+      break;
+    default:
+      *value = true;
+      break;
+  }
+  return true;
+}
+
+// Parses 'str' as the Int32 flag 'flag' in the form "--flag=value".  Returns
+// true and stores the number in *value only if the flag matches and its value
+// is accepted by ParseInt32().
+bool ParseInt32Flag(const char* str, const char* flag, int32_t* value) {
+  // A value is mandatory for integer flags.
+  const char* const text = ParseFlagValue(str, flag, false);
+
+  // Short-circuits when the flag did not match, so ParseInt32() only ever
+  // sees a real value string.
+  return text != nullptr &&
+         ParseInt32(std::string("The value of flag --") + flag, text, value);
+}
+
+// Parses 'str' as the double flag 'flag' in the form "--flag=value".  Returns
+// true and stores the number in *value only if the flag matches and its value
+// is accepted by ParseDouble().
+bool ParseDoubleFlag(const char* str, const char* flag, double* value) {
+  // A value is mandatory for double flags.
+  const char* const text = ParseFlagValue(str, flag, false);
+
+  // Short-circuits when the flag did not match, so ParseDouble() only ever
+  // sees a real value string.
+  return text != nullptr &&
+         ParseDouble(std::string("The value of flag --") + flag, text, value);
+}
+
+// Parses 'str' as the string flag 'flag' in the form "--flag=value".  On a
+// match copies the value into *value and returns true; otherwise returns
+// false and leaves *value untouched.
+bool ParseStringFlag(const char* str, const char* flag, std::string* value) {
+  // A value is mandatory for string flags.
+  if (const char* const text = ParseFlagValue(str, flag, false)) {
+    *value = text;
+    return true;
+  }
+  return false;
+}
+
+// Returns true if 'str' is the flag 'flag', given either as "--flag" or as
+// "--flag=value".
+bool IsFlag(const char* str, const char* flag) {
+  const char* const remainder = ParseFlagValue(str, flag, true);
+  return remainder != nullptr;
+}
+}  // end namespace benchmark
--- a/3rdparty/benchmark/src/commandlineflags.h
+++ b/3rdparty/benchmark/src/commandlineflags.h
@ -0,0 +1,76 @@
+#ifndef BENCHMARK_COMMANDLINEFLAGS_H_
+#define BENCHMARK_COMMANDLINEFLAGS_H_
+
+#include <cstdint>
+#include <string>
+
+// Macro for referencing flags.  FLAG(name) expands to the identifier
+// FLAGS_<name>, i.e. the variable that holds the flag's value.
+#define FLAG(name) FLAGS_##name
+
+// Macros for declaring flags.  Each one expands to an 'extern' declaration of
+// FLAGS_<name> with the matching type; the expansion carries no trailing
+// semicolon, so the caller has to supply it.
+#define DECLARE_bool(name) extern bool FLAG(name)
+#define DECLARE_int32(name) extern int32_t FLAG(name)
+#define DECLARE_int64(name) extern int64_t FLAG(name)
+#define DECLARE_double(name) extern double FLAG(name)
+#define DECLARE_string(name) extern std::string FLAG(name)
+
+// Macros for defining flags.  Each one expands to the definition of
+// FLAGS_<name>, initialised with default_val.  The 'doc' argument does not
+// appear in the expansion; it only documents the flag at the definition site.
+#define DEFINE_bool(name, default_val, doc) bool FLAG(name) = (default_val)
+#define DEFINE_int32(name, default_val, doc) int32_t FLAG(name) = (default_val)
+#define DEFINE_int64(name, default_val, doc) int64_t FLAG(name) = (default_val)
+#define DEFINE_double(name, default_val, doc) double FLAG(name) = (default_val)
+#define DEFINE_string(name, default_val, doc) \
+  std::string FLAG(name) = (default_val)
+
+namespace benchmark {
+// Parses 'str' for a 32-bit signed integer.  If successful, writes the result
+// to *value and returns true; otherwise leaves *value unchanged and returns
+// false.  'src_text' is used as the prefix of the diagnostic written to
+// std::cerr on failure.
+bool ParseInt32(const std::string& src_text, const char* str, int32_t* value);
+
+// Parses 'str' for a double.  If successful, writes the result to *value and
+// returns true; otherwise leaves *value unchanged and returns false.
+// 'src_text' is used as the prefix of the diagnostic written to std::cerr on
+// failure.
+bool ParseDouble(const std::string& src_text, const char* str, double* value);
+
+// Parses a bool/Int32/double/string from the environment variable
+// corresponding to the given benchmark flag (the upper-cased flag name
+// prefixed with "BENCHMARK_").  Returns default_val when the variable is not
+// set.
+// NOTE(review): commandlineflags.cc does not appear to define DoubleFromEnv;
+// confirm a definition exists before calling it.
+bool BoolFromEnv(const char* flag, bool default_val);
+int32_t Int32FromEnv(const char* flag, int32_t default_val);
+double DoubleFromEnv(const char* flag, double default_val);
+const char* StringFromEnv(const char* flag, const char* default_val);
+
+// Parses a string for a bool flag, in the form of either
+// "--flag=value" or "--flag".
+//
+// In the former case, the value is taken as true as long as it does
+// not start with '0', 'f', or 'F'.
+//
+// In the latter case, the value is taken as true.
+//
+// On success, stores the value of the flag in *value, and returns
+// true.  On failure, returns false without changing *value.
+bool ParseBoolFlag(const char* str, const char* flag, bool* value);
+
+// Parses a string for an Int32 flag, in the form of
+// "--flag=value".
+//
+// On success, stores the value of the flag in *value, and returns
+// true.  On failure, returns false without changing *value.
+bool ParseInt32Flag(const char* str, const char* flag, int32_t* value);
+
+// Parses a string for a Double flag, in the form of
+// "--flag=value".
+//
+// On success, stores the value of the flag in *value, and returns
+// true.  On failure, returns false without changing *value.
+bool ParseDoubleFlag(const char* str, const char* flag, double* value);
+
+// Parses a string for a string flag, in the form of
+// "--flag=value".
+//
+// On success, stores the value of the flag in *value, and returns
+// true.  On failure, returns false without changing *value.
+bool ParseStringFlag(const char* str, const char* flag, std::string* value);
+
+// Returns true if the string matches the flag, given either as "--flag" or
+// as "--flag=value".
+bool IsFlag(const char* str, const char* flag);
+
+}  // end namespace benchmark
+
+#endif  // BENCHMARK_COMMANDLINEFLAGS_H_
--- a/3rdparty/benchmark/src/console_reporter.cc
+++ b/3rdparty/benchmark/src/console_reporter.cc
@ -0,0 +1,116 @@
+// Copyright 2015 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "benchmark/reporter.h"
+
+#include <cstdint>
+#include <cstdio>
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include "check.h"
+#include "colorprint.h"
+#include "string_util.h"
+#include "walltime.h"
+
+namespace benchmark {
+
+// Prints the run preamble: machine information, date and warnings go to
+// std::cerr, the column header and a ruler of dashes go to stdout.  Also
+// records the name column width used later by PrintRunData().  Always
+// returns true.
+bool ConsoleReporter::ReportContext(const Context& context) {
+  name_field_width_ = context.name_field_width;
+
+  // "CPU" gets its plural 's' appended directly, without a separating space.
+  std::cerr << "Run on (" << context.num_cpus << " X " << context.mhz_per_cpu
+            << " MHz CPU" << ((context.num_cpus > 1) ? "s" : "") << ")\n";
+
+  std::cerr << LocalDateTimeString() << "\n";
+
+  if (context.cpu_scaling_enabled) {
+    std::cerr << "***WARNING*** CPU scaling is enabled, the benchmark "
+                 "real time measurements may be noisy and will incur extra "
+                 "overhead.\n";
+  }
+
+#ifndef NDEBUG
+  std::cerr << "***WARNING*** Library was built as DEBUG. Timings may be "
+               "affected.\n";
+#endif
+
+  const int output_width = fprintf(stdout, "%-*s %10s %10s %10s\n",
+                                   static_cast<int>(name_field_width_),
+                                   "Benchmark", "Time(ns)", "CPU(ns)",
+                                   "Iterations");
+  // fprintf() returns a negative value on failure; turning that into a string
+  // length would request an absurdly large allocation, so skip the ruler.
+  if (output_width > 0) {
+    // The header's trailing newline is not part of the ruler.
+    std::cout << std::string(output_width - 1, '-') << "\n";
+  }
+
+  return true;
+}
+
+// Prints one line per run in 'reports' (all of which must belong to the same
+// benchmark).  When there are at least two runs, a mean line and a stddev
+// line are printed after them.
+void ConsoleReporter::ReportRuns(const std::vector<Run>& reports) {
+  if (reports.empty()) return;
+
+  for (auto it = reports.begin(); it != reports.end(); ++it) {
+    Run const& run = *it;
+    CHECK_EQ(reports[0].benchmark_name, run.benchmark_name);
+    PrintRunData(run);
+  }
+
+  // Aggregated data is only reported when there was more than a single run.
+  if (reports.size() >= 2) {
+    Run mean_data;
+    Run stddev_data;
+    BenchmarkReporter::ComputeStats(reports, &mean_data, &stddev_data);
+
+    // The aggregates are printed like ordinary runs.
+    PrintRunData(mean_data);
+    PrintRunData(stddev_data);
+  }
+}
+
+// Prints one result line to stdout: name, real and CPU time in nanoseconds
+// (per iteration when the iteration count is non-zero), the iteration count
+// and, when present, the bytes/s rate, the items/s rate and the label.
+void ConsoleReporter::PrintRunData(const Run& result) {
+  // Format bytes per second
+  std::string rate;
+  if (result.bytes_per_second > 0) {
+    rate = StrCat(" ", HumanReadableNumber(result.bytes_per_second), "B/s");
+  }
+
+  // Format items per second
+  std::string items;
+  if (result.items_per_second > 0) {
+    items = StrCat(" ", HumanReadableNumber(result.items_per_second),
+                   " items/s");
+  }
+
+  double const multiplier = 1e9; // nano second multiplier
+  double real_time = result.real_accumulated_time * multiplier;
+  double cpu_time = result.cpu_accumulated_time * multiplier;
+  if (result.iterations != 0) {
+    // Report the time of a single iteration rather than the accumulated one.
+    real_time = real_time / static_cast<double>(result.iterations);
+    cpu_time = cpu_time / static_cast<double>(result.iterations);
+  }
+
+  // Arguments consumed by '*' must be of type int and those consumed by
+  // "%lld" of type long long; variadic calls perform no conversion, so the
+  // casts are required for the call to be well defined.
+  ColorPrintf(COLOR_GREEN, "%-*s ",
+              static_cast<int>(name_field_width_),
+              result.benchmark_name.c_str());
+  ColorPrintf(COLOR_YELLOW, "%10.0f %10.0f ", real_time, cpu_time);
+  ColorPrintf(COLOR_CYAN, "%10lld",
+              static_cast<long long>(result.iterations));
+  ColorPrintf(COLOR_DEFAULT, "%*s %*s %s\n",
+              13, rate.c_str(),
+              18, items.c_str(),
+              result.report_label.c_str());
+}
+
+}  // end namespace benchmark
--- a/3rdparty/benchmark/src/csv_reporter.cc
+++ b/3rdparty/benchmark/src/csv_reporter.cc
@ -0,0 +1,105 @@
+// Copyright 2015 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "benchmark/reporter.h"
+
+#include <cstdint>
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include "string_util.h"
+#include "walltime.h"
+
+// File format reference: http://edoceo.com/utilitas/csv-file-format.
+
+namespace benchmark {
+
+// Prints the run preamble: machine information, date and warnings go to
+// std::cerr so that std::cout carries nothing but CSV, starting with the
+// column header line written here.  Always returns true.
+bool CSVReporter::ReportContext(const Context& context) {
+  // "CPU" gets its plural 's' appended directly, without a separating space.
+  std::cerr << "Run on (" << context.num_cpus << " X " << context.mhz_per_cpu
+            << " MHz CPU" << ((context.num_cpus > 1) ? "s" : "") << ")\n";
+
+  std::cerr << LocalDateTimeString() << "\n";
+
+  if (context.cpu_scaling_enabled) {
+    std::cerr << "***WARNING*** CPU scaling is enabled, the benchmark "
+                 "real time measurements may be noisy and will incur extra "
+                 "overhead.\n";
+  }
+
+#ifndef NDEBUG
+  std::cerr << "***WARNING*** Library was built as DEBUG. Timings may be "
+               "affected.\n";
+#endif
+  std::cout << "name,iterations,real_time,cpu_time,bytes_per_second,"
+               "items_per_second,label\n";
+  return true;
+}
+
+// Writes one CSV row per run in 'reports'.  When there are at least two
+// runs, a mean row and a stddev row are appended after them.
+void CSVReporter::ReportRuns(std::vector<Run> const& reports) {
+  if (reports.empty()) return;
+
+  // Work on a copy so the aggregate rows can be appended to the list.
+  std::vector<Run> rows(reports);
+  if (reports.size() >= 2) {
+    Run mean_data;
+    Run stddev_data;
+    BenchmarkReporter::ComputeStats(reports, &mean_data, &stddev_data);
+    rows.push_back(mean_data);
+    rows.push_back(stddev_data);
+  }
+
+  for (Run const& row : rows) {
+    PrintRunData(row);
+  }
+}
+
+// Writes a single CSV row for 'run' to std::cout, in the column order
+// name, iterations, real_time, cpu_time, bytes_per_second, items_per_second,
+// label.  The last three columns are left empty when the run has no such
+// value.
+void CSVReporter::PrintRunData(Run const& run) {
+  std::ostream& out = std::cout;
+
+  // Times are reported in nanoseconds, per iteration when the iteration
+  // count is non-zero.
+  double const ns_multiplier = 1e9;
+  double real_ns = run.real_accumulated_time * ns_multiplier;
+  double cpu_ns = run.cpu_accumulated_time * ns_multiplier;
+  if (run.iterations != 0) {
+    double const iteration_count = static_cast<double>(run.iterations);
+    real_ns = real_ns / iteration_count;
+    cpu_ns = cpu_ns / iteration_count;
+  }
+
+  // Text fields are wrapped in double quotes, with embedded double quotes
+  // doubled.
+  std::string quoted_name = run.benchmark_name;
+  ReplaceAll(&quoted_name, "\"", "\"\"");
+  out << "\"" << quoted_name << "\",";
+
+  out << run.iterations << "," << real_ns << "," << cpu_ns << ",";
+
+  if (run.bytes_per_second > 0.0) out << run.bytes_per_second;
+  out << ",";
+
+  if (run.items_per_second > 0.0) out << run.items_per_second;
+  out << ",";
+
+  if (!run.report_label.empty()) {
+    // Same quoting rule as for the name.
+    std::string quoted_label = run.report_label;
+    ReplaceAll(&quoted_label, "\"", "\"\"");
+    out << "\"" << quoted_label << "\"";
+  }
+  out << '\n';
+}
+
+}  // end namespace benchmark
--- a/3rdparty/benchmark/src/cycleclock.h
+++ b/3rdparty/benchmark/src/cycleclock.h
@ -0,0 +1,137 @@
+// ----------------------------------------------------------------------
+// CycleClock
+//    A CycleClock tells you the current time in Cycles.  The "time"
+//    is actually time since power-on.  This is like time() but doesn't
+//    involve a system call and is much more precise.
+//
+// NOTE: Not all cpu/platform/kernel combinations guarantee that this
+// clock increments at a constant rate or is synchronized across all logical
+// cpus in a system.
+//
+// If you need the above guarantees, please consider using a different
+// API. There are efforts to provide an interface which provides a millisecond
+// granularity and implemented as a memory read. A memory read is generally
+// cheaper than the CycleClock for many architectures.
+//
+// Also, in some out of order CPU implementations, the CycleClock is not
+// serializing. So if you're trying to count at cycles granularity, your
+// data might be inaccurate due to out of order instruction execution.
+// ----------------------------------------------------------------------
+
+#ifndef BENCHMARK_CYCLECLOCK_H_
+#define BENCHMARK_CYCLECLOCK_H_
+
+#include <cstdint>
+
+#include "benchmark/macros.h"
+#include "internal_macros.h"
+
+#if defined(BENCHMARK_OS_MACOSX)
+#include <mach/mach_time.h>
+#endif
+// For MSVC, we want to use '_asm rdtsc' when possible (since it works
+// with even ancient MSVC compilers), and when not possible the
+// __rdtsc intrinsic, declared in <intrin.h>.  Unfortunately, in some
+// environments, <windows.h> and <intrin.h> have conflicting
+// declarations of some other intrinsics, breaking compilation.
+// Therefore, we simply declare __rdtsc ourselves. See also
+// http://connect.microsoft.com/VisualStudio/feedback/details/262047
+#if defined(COMPILER_MSVC) && !defined(_M_IX86)
+extern "C" uint64_t __rdtsc();
+#pragma intrinsic(__rdtsc)
+#endif
+
+#ifndef BENCHMARK_OS_WINDOWS
+#include <sys/time.h>
+#endif
+
+namespace benchmark {
+// NOTE: only i386 and x86_64 have been well tested.
+// PPC, sparc, alpha, and ia64 are based on
+//    http://peter.kuscsik.com/wordpress/?p=14
+// with modifications by m3b.  See also
+//    https://setisvn.ssl.berkeley.edu/svn/lib/fftw-3.0.1/kernel/cycle.h
+namespace cycleclock {
+// This should return the number of cycles since power-on.  Thread-safe.
+//
+// The implementation is chosen at compile time from the target OS and CPU.
+// Depending on the branch the returned unit is a CPU cycle counter, a
+// platform time base, or microseconds from gettimeofday(); platforms without
+// an implementation fail to compile via #error.
+inline BENCHMARK_ALWAYS_INLINE int64_t Now() {
+#if defined(BENCHMARK_OS_MACOSX)
+  // this goes at the top because we need ALL Macs, regardless of
+  // architecture, to return the number of "mach time units" that
+  // have passed since startup.  See sysinfo.cc where
+  // InitializeSystemInfo() sets the supposed cpu clock frequency of
+  // macs to the number of mach time units per second, not actual
+  // CPU clock frequency (which can change in the face of CPU
+  // frequency scaling).  Also note that when the Mac sleeps, this
+  // counter pauses; it does not continue counting, nor does it
+  // reset to zero.
+  return mach_absolute_time();
+#elif defined(__i386__)
+  int64_t ret;
+  __asm__ volatile("rdtsc" : "=A"(ret));
+  return ret;
+#elif defined(__x86_64__) || defined(__amd64__)
+  uint64_t low, high;
+  __asm__ volatile("rdtsc" : "=a"(low), "=d"(high));
+  return (high << 32) | low;
+#elif defined(__powerpc__) || defined(__ppc__)
+  // This returns a time-base, which is not always precisely a cycle-count.
+  int64_t tbl, tbu0, tbu1;
+  asm("mftbu %0" : "=r"(tbu0));
+  asm("mftb  %0" : "=r"(tbl));
+  asm("mftbu %0" : "=r"(tbu1));
+  // If the upper half changed between the two reads, the lower half wrapped
+  // in between: the mask is then 0 and clears tbl, otherwise it is all ones.
+  tbl &= -static_cast<int64_t>(tbu0 == tbu1);
+  // high 32 bits in tbu1; low 32 bits in tbl  (tbu0 is garbage)
+  return (tbu1 << 32) | tbl;
+#elif defined(__sparc__)
+  int64_t tick;
+  asm(".byte 0x83, 0x41, 0x00, 0x00");
+  asm("mov   %%g1, %0" : "=r"(tick));
+  return tick;
+#elif defined(__ia64__)
+  int64_t itc;
+  asm("mov %0 = ar.itc" : "=r"(itc));
+  return itc;
+#elif defined(COMPILER_MSVC) && defined(_M_IX86)
+  // Older MSVC compilers (like 7.x) don't seem to support the
+  // __rdtsc intrinsic properly, so I prefer to use _asm instead
+  // when I know it will work.  Otherwise, I'll use __rdtsc and hope
+  // the code is being compiled with a non-ancient compiler.
+  _asm rdtsc
+#elif defined(COMPILER_MSVC)
+  return __rdtsc();
+#elif defined(__ARM_ARCH)
+#if (__ARM_ARCH >= 6)  // V6 is the earliest arch that has a standard cyclecount
+  uint32_t pmccntr;
+  uint32_t pmuseren;
+  uint32_t pmcntenset;
+  // Read the user mode perf monitor counter access permissions.
+  asm("mrc p15, 0, %0, c9, c14, 0" : "=r"(pmuseren));
+  if (pmuseren & 1) {  // Allows reading perfmon counters for user mode code.
+    asm("mrc p15, 0, %0, c9, c12, 1" : "=r"(pmcntenset));
+    if (pmcntenset & 0x80000000ul) {  // Is it counting?
+      asm("mrc p15, 0, %0, c9, c13, 0" : "=r"(pmccntr));
+      // The counter is set up to count every 64th cycle
+      return static_cast<int64_t>(pmccntr) * 64;  // Should optimize to << 6
+    }
+  }
+#endif
+  // Reached when the cycle counter is unavailable or not readable from user
+  // mode: fall back to wall-clock microseconds.
+  struct timeval tv;
+  gettimeofday(&tv, nullptr);
+  return static_cast<int64_t>(tv.tv_sec) * 1000000 + tv.tv_usec;
+#elif defined(__mips__)
+  // mips apparently only allows rdtsc for superusers, so we fall
+  // back to gettimeofday.  It's possible clock_gettime would be better.
+  struct timeval tv;
+  gettimeofday(&tv, nullptr);
+  return static_cast<int64_t>(tv.tv_sec) * 1000000 + tv.tv_usec;
+#else
+// The soft failover to a generic implementation is automatic only for ARM.
+// For other platforms the developer is expected to make an attempt to create
+// a fast implementation and use generic version if nothing better is available.
+#error You need to define CycleTimer for your OS and CPU
+#endif
+}
+}  // end namespace cycleclock
+}  // end namespace benchmark
+
+#endif  // BENCHMARK_CYCLECLOCK_H_
--- a/3rdparty/benchmark/src/internal_macros.h
+++ b/3rdparty/benchmark/src/internal_macros.h
@ -0,0 +1,40 @@
+#ifndef BENCHMARK_INTERNAL_MACROS_H_
+#define BENCHMARK_INTERNAL_MACROS_H_
+
+#include "benchmark/macros.h"
+
+// Compilers that do not provide __has_feature get a fallback that reports
+// every feature as unavailable, so the checks below stay well-formed.
+#ifndef __has_feature
+# define __has_feature(x) 0
+#endif
+
+// BENCHMARK_NORETURN marks a function that never returns: the C++11
+// attribute when __has_feature reports attribute support, the GNU attribute
+// for other GNU-compatible compilers, and nothing otherwise.
+#if __has_feature(cxx_attributes)
+# define BENCHMARK_NORETURN [[noreturn]]
+#elif defined(__GNUC__)
+# define BENCHMARK_NORETURN __attribute__((noreturn))
+#else
+# define BENCHMARK_NORETURN
+#endif
+
+// Exactly one BENCHMARK_OS_* macro is defined for a recognised platform.
+// The order matters: __CYGWIN__ is tested before _WIN32, so a build that
+// defines both is classified as Cygwin.
+#if defined(__CYGWIN__)
+# define BENCHMARK_OS_CYGWIN 1
+#elif defined(_WIN32)
+# define BENCHMARK_OS_WINDOWS 1
+#elif defined(__APPLE__)
+// TODO(ericwf) This doesn't actually check that it is a Mac OSX system. Just
+// that it is an apple system.
+# define BENCHMARK_OS_MACOSX 1
+#elif defined(__FreeBSD__)
+# define BENCHMARK_OS_FREEBSD 1
+#elif defined(__linux__)
+# define BENCHMARK_OS_LINUX 1
+#endif
+
+// At most one COMPILER_* macro is defined.  __clang__ is tested first, so a
+// clang that also defines _MSC_VER or __GNUC__ is classified as clang.
+#if defined(__clang__)
+# define COMPILER_CLANG
+#elif defined(_MSC_VER)
+# define COMPILER_MSVC
+#elif defined(__GNUC__)
+# define COMPILER_GCC
+#endif
+
+#endif // BENCHMARK_INTERNAL_MACROS_H_
--- a/3rdparty/benchmark/src/json_reporter.cc
+++ b/3rdparty/benchmark/src/json_reporter.cc
@ -0,0 +1,159 @@
+// Copyright 2015 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "benchmark/reporter.h"
+
+#include <cstdint>
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include "string_util.h"
+#include "walltime.h"
+
+namespace benchmark {
+
+namespace {
+
+// Returns 'value' with every character that may not appear verbatim inside a
+// JSON string literal replaced by its escape sequence: the double quote, the
+// backslash and all control characters below 0x20.
+std::string EscapeJsonString(std::string const& value) {
+  static const char kHexDigits[] = "0123456789abcdef";
+  std::string escaped;
+  escaped.reserve(value.size());
+  for (size_t i = 0; i != value.size(); ++i) {
+    const unsigned char c = static_cast<unsigned char>(value[i]);
+    switch (c) {
+      case '"':  escaped += "\\\""; break;
+      case '\\': escaped += "\\\\"; break;
+      case '\b': escaped += "\\b"; break;
+      case '\f': escaped += "\\f"; break;
+      case '\n': escaped += "\\n"; break;
+      case '\r': escaped += "\\r"; break;
+      case '\t': escaped += "\\t"; break;
+      default:
+        if (c < 0x20) {
+          // Remaining control characters use the generic \u00XX form.
+          escaped += "\\u00";
+          escaped += kHexDigits[(c >> 4) & 0xF];
+          escaped += kHexDigits[c & 0xF];
+        } else {
+          escaped += static_cast<char>(c);
+        }
+        break;
+    }
+  }
+  return escaped;
+}
+
+// The FormatKV overloads render a single JSON member of the form
+// "key": value.  Keys are emitted verbatim (callers pass literals); string
+// values are quoted and escaped, bools and integers are emitted bare.
+std::string FormatKV(std::string const& key, std::string const& value) {
+  return "\"" + key + "\": \"" + EscapeJsonString(value) + "\"";
+}
+
+std::string FormatKV(std::string const& key, const char* value) {
+  return "\"" + key + "\": \"" + EscapeJsonString(value) + "\"";
+}
+
+std::string FormatKV(std::string const& key, bool value) {
+  return "\"" + key + "\": " + (value ? "true" : "false");
+}
+
+std::string FormatKV(std::string const& key, int64_t value) {
+  // std::to_string() always produces plain decimal digits, independent of
+  // the global locale, which is what JSON requires.
+  return "\"" + key + "\": " + std::to_string(value);
+}
+
+// Rounds to the nearest integer by adding 0.5 and truncating; this is only
+// correct for non-negative inputs.
+int64_t RoundDouble(double v) {
+    return static_cast<int64_t>(v + 0.5);
+}
+
+} // end namespace
+
+// Opens the top-level JSON object on std::cout, writes the "context" member
+// (date, CPU information and library build type) and opens the "benchmarks"
+// array that later calls fill in.  Always returns true.
+bool JSONReporter::ReportContext(const Context& context) {
+  std::ostream& out = std::cout;
+  const std::string inner_indent(2, ' ');
+  const std::string indent(4, ' ');
+
+  // Open the top-level object and the context block.
+  out << "{\n";
+  out << inner_indent << "\"context\": {\n";
+
+  const std::string walltime_value = LocalDateTimeString();
+  out << indent << FormatKV("date", walltime_value) << ",\n";
+
+  const int64_t cpu_count = static_cast<int64_t>(context.num_cpus);
+  out << indent << FormatKV("num_cpus", cpu_count) << ",\n";
+
+  const int64_t cpu_mhz = RoundDouble(context.mhz_per_cpu);
+  out << indent << FormatKV("mhz_per_cpu", cpu_mhz) << ",\n";
+
+  out << indent
+      << FormatKV("cpu_scaling_enabled", context.cpu_scaling_enabled)
+      << ",\n";
+
+#if defined(NDEBUG)
+  const char build_type[] = "release";
+#else
+  const char build_type[] = "debug";
+#endif
+  // Last entry of the context block, hence no trailing comma.
+  out << indent << FormatKV("library_build_type", build_type) << "\n";
+
+  // Close the context block and open the list of benchmarks.
+  out << inner_indent << "},\n";
+  out << inner_indent << "\"benchmarks\": [\n";
+  return true;
+}
+
+// Appends one JSON object per run in 'reports' to the "benchmarks" array on
+// std::cout.  When there are at least two runs, objects for the mean and the
+// stddev follow.  Objects are comma-separated, both within this call and
+// from those written by earlier calls.
+void JSONReporter::ReportRuns(std::vector<Run> const& reports) {
+  if (reports.empty()) return;
+
+  std::ostream& out = std::cout;
+  const std::string indent(4, ' ');
+
+  // Every batch but the first has to be separated from its predecessor.
+  if (first_report_) {
+    first_report_ = false;
+  } else {
+    out << ",\n";
+  }
+
+  // Work on a copy so the aggregate entries can be appended to the list.
+  std::vector<Run> entries(reports);
+  if (reports.size() >= 2) {
+    Run mean_data;
+    Run stddev_data;
+    BenchmarkReporter::ComputeStats(reports, &mean_data, &stddev_data);
+    entries.push_back(mean_data);
+    entries.push_back(stddev_data);
+  }
+
+  const size_t entry_count = entries.size();
+  for (size_t i = 0; i != entry_count; ++i) {
+     out << indent << "{\n";
+     PrintRunData(entries[i]);
+     out << indent << '}';
+     // No separator after the last object of this batch.
+     if (i + 1 != entry_count) {
+         out << ",\n";
+     }
+  }
+}
+
+// Terminates the JSON document on std::cout by closing the "benchmarks"
+// array and the top-level object.
+void JSONReporter::Finalize() {
+    std::ostream& out = std::cout;
+    out << "\n  ]\n}\n";
+}
+
+// Writes the members of one run object to std::cout (without the enclosing
+// braces): name, iterations, real_time and cpu_time always, followed by
+// bytes_per_second, items_per_second and label when the run has them.
+void JSONReporter::PrintRunData(Run const& run) {
+    std::ostream& out = std::cout;
+    const std::string indent(6, ' ');
+    // Each member after the first is preceded by a comma, a newline and the
+    // indentation.
+    const std::string separator = ",\n" + indent;
+
+    // Times are reported in nanoseconds, per iteration when the iteration
+    // count is non-zero.
+    double const ns_multiplier = 1e9;
+    double real_ns = run.real_accumulated_time * ns_multiplier;
+    double cpu_ns = run.cpu_accumulated_time * ns_multiplier;
+    if (run.iterations != 0) {
+        double const iteration_count = static_cast<double>(run.iterations);
+        real_ns = real_ns / iteration_count;
+        cpu_ns = cpu_ns / iteration_count;
+    }
+
+    out << indent << FormatKV("name", run.benchmark_name);
+    out << separator << FormatKV("iterations", run.iterations);
+    out << separator << FormatKV("real_time", RoundDouble(real_ns));
+    out << separator << FormatKV("cpu_time", RoundDouble(cpu_ns));
+
+    if (run.bytes_per_second > 0.0) {
+        out << separator
+            << FormatKV("bytes_per_second", RoundDouble(run.bytes_per_second));
+    }
+    if (run.items_per_second > 0.0) {
+        out << separator
+            << FormatKV("items_per_second", RoundDouble(run.items_per_second));
+    }
+    if (!run.report_label.empty()) {
+        out << separator << FormatKV("label", run.report_label);
+    }
+    out << '\n';
+}
+
+} // end namespace benchmark
--- a/3rdparty/benchmark/src/log.cc
+++ b/3rdparty/benchmark/src/log.cc
@ -0,0 +1,40 @@
+#include "log.h"
+
+#include <iostream>
+
+namespace benchmark {
+namespace internal {
+
+// Storage for the current log level.  It lives in a function-local static,
+// initialised to 0, and is handed out by reference.
+int& LoggingLevelImp() {
+    static int current_level = 0;
+    return current_level;
+}
+
+// Replaces the current log level with 'value'.
+void SetLogLevel(int value) {
+    int& current_level = LoggingLevelImp();
+    current_level = value;
+}
+
+// Returns the current log level.
+int GetLogLevel() {
+    const int current_level = LoggingLevelImp();
+    return current_level;
+}
+
+// Stream buffer that stores nothing: every character handed to overflow() is
+// reported back as written and then dropped.
+class NullLogBuffer : public std::streambuf
+{
+public:
+  int overflow(int c) { return c; }
+};
+
+// Returns a stream whose output is discarded.
+std::ostream& GetNullLogInstance() {
+  static NullLogBuffer discard_buffer;
+  static std::ostream discard_stream(&discard_buffer);
+  return discard_stream;
+}
+
+// Returns the stream that receives real log output (std::clog).
+std::ostream& GetErrorLogInstance() {
+  std::ostream& error_stream = std::clog;
+  return error_stream;
+}
+
+} // end namespace internal
+} // end namespace benchmark
--- a/3rdparty/benchmark/src/log.h
+++ b/3rdparty/benchmark/src/log.h
@ -0,0 +1,28 @@
+#ifndef BENCHMARK_LOG_H_
+#define BENCHMARK_LOG_H_
+
+#include <ostream>
+
+namespace benchmark {
+namespace internal {
+
+int GetLogLevel();
+void SetLogLevel(int level);
+
+std::ostream& GetNullLogInstance();
+std::ostream& GetErrorLogInstance();
+
+inline std::ostream& GetLogInstanceForLevel(int level) {
+  if (level <= GetLogLevel()) {
+    return GetErrorLogInstance();
+  }
+  return GetNullLogInstance();
+}
+
+} // end namespace internal
+} // end namespace benchmark
+
+// Writes the "-- LOG(<level>): " prefix to the stream chosen for verbosity `x`
+// and yields that stream so the caller can append the message with <<.
+// The argument is parenthesised so that compound expressions (for example a
+// conditional expression) expand correctly.
+// NOTE: `x` is expanded twice, so it must not have side effects.
+#define VLOG(x) (::benchmark::internal::GetLogInstanceForLevel((x)) \
+                 << "-- LOG(" << (x) << "): ")
+
+#endif
--- a/3rdparty/benchmark/src/mutex.h
+++ b/3rdparty/benchmark/src/mutex.h
@ -0,0 +1,142 @@
+#ifndef BENCHMARK_MUTEX_H_
+#define BENCHMARK_MUTEX_H_
+
+#include <mutex>
+#include <condition_variable>
+
+// Enable thread safety attributes only with clang.
+// The attributes can be safely erased when compiling with other compilers.
+//
+// The switch is the HAVE_THREAD_SAFETY_ATTRIBUTES macro: when it is defined the
+// argument is wrapped in __attribute__((...)), otherwise the annotation
+// expands to nothing.
+// NOTE(review): presumably the build system defines
+// HAVE_THREAD_SAFETY_ATTRIBUTES after probing the compiler -- confirm.
+#if defined(HAVE_THREAD_SAFETY_ATTRIBUTES)
+#define THREAD_ANNOTATION_ATTRIBUTE__(x)   __attribute__((x))
+#else
+#define THREAD_ANNOTATION_ATTRIBUTE__(x)   // no-op
+#endif
+
+// Annotation macros. Each one forwards to THREAD_ANNOTATION_ATTRIBUTE__ with
+// the attribute named in its body, so they all vanish when that macro is a
+// no-op.
+
+// Marks a class as a capability (lockable) type named `x`.
+#define CAPABILITY(x) \
+  THREAD_ANNOTATION_ATTRIBUTE__(capability(x))
+
+// Marks a class whose objects acquire on construction / release on destruction.
+#define SCOPED_CAPABILITY \
+  THREAD_ANNOTATION_ATTRIBUTE__(scoped_lockable)
+
+// Data member annotations: the member (or what it points to) is protected by
+// the capability `x`.
+#define GUARDED_BY(x) \
+  THREAD_ANNOTATION_ATTRIBUTE__(guarded_by(x))
+
+#define PT_GUARDED_BY(x) \
+  THREAD_ANNOTATION_ATTRIBUTE__(pt_guarded_by(x))
+
+// Lock-ordering annotations.
+#define ACQUIRED_BEFORE(...) \
+  THREAD_ANNOTATION_ATTRIBUTE__(acquired_before(__VA_ARGS__))
+
+#define ACQUIRED_AFTER(...) \
+  THREAD_ANNOTATION_ATTRIBUTE__(acquired_after(__VA_ARGS__))
+
+// Function annotations: capabilities the caller must already hold.
+#define REQUIRES(...) \
+  THREAD_ANNOTATION_ATTRIBUTE__(requires_capability(__VA_ARGS__))
+
+#define REQUIRES_SHARED(...) \
+  THREAD_ANNOTATION_ATTRIBUTE__(requires_shared_capability(__VA_ARGS__))
+
+// Function annotations: capabilities the function acquires or releases.
+#define ACQUIRE(...) \
+  THREAD_ANNOTATION_ATTRIBUTE__(acquire_capability(__VA_ARGS__))
+
+#define ACQUIRE_SHARED(...) \
+  THREAD_ANNOTATION_ATTRIBUTE__(acquire_shared_capability(__VA_ARGS__))
+
+#define RELEASE(...) \
+  THREAD_ANNOTATION_ATTRIBUTE__(release_capability(__VA_ARGS__))
+
+#define RELEASE_SHARED(...) \
+  THREAD_ANNOTATION_ATTRIBUTE__(release_shared_capability(__VA_ARGS__))
+
+#define TRY_ACQUIRE(...) \
+  THREAD_ANNOTATION_ATTRIBUTE__(try_acquire_capability(__VA_ARGS__))
+
+#define TRY_ACQUIRE_SHARED(...) \
+  THREAD_ANNOTATION_ATTRIBUTE__(try_acquire_shared_capability(__VA_ARGS__))
+
+// Function annotation: capabilities the caller must NOT hold.
+#define EXCLUDES(...) \
+  THREAD_ANNOTATION_ATTRIBUTE__(locks_excluded(__VA_ARGS__))
+
+#define ASSERT_CAPABILITY(x) \
+  THREAD_ANNOTATION_ATTRIBUTE__(assert_capability(x))
+
+#define ASSERT_SHARED_CAPABILITY(x) \
+  THREAD_ANNOTATION_ATTRIBUTE__(assert_shared_capability(x))
+
+#define RETURN_CAPABILITY(x) \
+  THREAD_ANNOTATION_ATTRIBUTE__(lock_returned(x))
+
+// Opts a function out of the analysis entirely.
+#define NO_THREAD_SAFETY_ANALYSIS \
+  THREAD_ANNOTATION_ATTRIBUTE__(no_thread_safety_analysis)
+
+
+namespace benchmark {
+
+typedef std::condition_variable Condition;
+
+// NOTE: std::mutex and std::unique_lock are wrapped so that the wrappers can
+// carry thread safety attributes and be checked with clang's -Wthread-safety
+// warning. The standard library types cannot be used directly because they
+// do not provide the required annotations.
+class CAPABILITY("mutex") Mutex
+{
+public:
+  Mutex() {}
+
+  // Acquires the wrapped std::mutex.
+  void lock() ACQUIRE() {
+    mut_.lock();
+  }
+
+  // Releases the wrapped std::mutex.
+  void unlock() RELEASE() {
+    mut_.unlock();
+  }
+
+  // Exposes the wrapped std::mutex, e.g. to build a std::unique_lock on it.
+  std::mutex& native_handle() { return mut_; }
+
+private:
+  std::mutex mut_;
+};
+
+
+// Scoped lock over a Mutex. Construction builds a std::unique_lock on the
+// Mutex's std::mutex (which locks it); destroying the MutexLock destroys that
+// std::unique_lock.
+class SCOPED_CAPABILITY MutexLock
+{
+  typedef std::unique_lock<std::mutex> MutexLockImp;
+public:
+  MutexLock(Mutex& m) ACQUIRE(m)
+      : ml_(m.native_handle())
+  {
+  }
+
+  ~MutexLock() RELEASE() { }
+
+  // Gives access to the owned std::unique_lock, which is what
+  // std::condition_variable::wait() expects.
+  MutexLockImp& native_handle() {
+    return ml_;
+  }
+
+private:
+  MutexLockImp ml_;
+};
+
+
+// A flag guarded by a mutex, paired with a condition variable so that callers
+// can block until the flag has been raised by Notify().
+class Notification
+{
+public:
+  Notification() : notified_yet_(false) { }
+
+  // Blocks until Notify() has been called; returns at once if it already has.
+  void WaitForNotification() const EXCLUDES(mutex_) {
+    MutexLock guard(mutex_);
+    // Explicit form of the predicate wait: the flag is re-checked after every
+    // wake-up, so spurious wake-ups simply go back to waiting.
+    while (!HasBeenNotified()) {
+      cv_.wait(guard.native_handle());
+    }
+  }
+
+  // Raises the flag under the lock, then wakes every waiter.
+  void Notify() EXCLUDES(mutex_) {
+    {
+      MutexLock guard(mutex_);
+      notified_yet_ = true;
+    }
+    // The lock is dropped before notifying.
+    cv_.notify_all();
+  }
+
+private:
+  // Reads the flag; the caller must hold mutex_.
+  bool HasBeenNotified() const REQUIRES(mutex_) {
+    return notified_yet_;
+  }
+
+  // Both are mutable so that the const WaitForNotification() can use them.
+  mutable Mutex mutex_;
+  mutable std::condition_variable cv_;
+  bool notified_yet_ GUARDED_BY(mutex_);
+};
+
+} // end namespace benchmark
+
+#endif // BENCHMARK_MUTEX_H_
--- a/3rdparty/benchmark/src/re.h
+++ b/3rdparty/benchmark/src/re.h
@ -0,0 +1,60 @@
+// Copyright 2015 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef BENCHMARK_RE_H_
+#define BENCHMARK_RE_H_
+
+#if defined(HAVE_STD_REGEX)
+#include <regex>
+#elif defined(HAVE_GNU_POSIX_REGEX)
+#include <gnuregex.h>
+#elif defined(HAVE_POSIX_REGEX)
+#include <regex.h>
+#else
+#error No regular expression backend was found!
+#endif
+#include <string>
+
+namespace benchmark {
+
+// A small regular expression wrapper with automatic cleanup. The backend is
+// chosen at compile time: std::regex when HAVE_STD_REGEX is defined, otherwise
+// the POSIX regex_t API (HAVE_POSIX_REGEX / HAVE_GNU_POSIX_REGEX).
+class Regex {
+ public:
+  Regex();
+  ~Regex();
+
+  // Compile a regular expression matcher from spec.  Returns true on success.
+  //
+  // On failure, returns false and, if error is not nullptr, stores a human
+  // readable error message in *error.
+  bool Init(const std::string& spec, std::string* error);
+
+  // Returns whether str matches the compiled regular expression.
+  bool Match(const std::string& str);
+ private:
+  // Set by the constructor and by Init(); records whether re_ currently holds
+  // a compiled expression.
+  bool init_;
+  // Underlying regular expression object
+#if defined(HAVE_STD_REGEX)
+  std::regex re_;
+#elif defined(HAVE_POSIX_REGEX) || defined(HAVE_GNU_POSIX_REGEX)
+  regex_t re_;
+#else
+# error No regular expression backend implementation available
+#endif
+};
+
+}  // end namespace benchmark
+
+#endif  // BENCHMARK_RE_H_
--- a/3rdparty/benchmark/src/re_posix.cc
+++ b/3rdparty/benchmark/src/re_posix.cc
@ -0,0 +1,59 @@
+// Copyright 2015 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "check.h"
+#include "re.h"
+
+namespace benchmark {
+
+// Starts in the "not initialised" state; re_ is not touched until Init().
+Regex::Regex() : init_(false) { }
+
+// Compiles `spec` as a POSIX extended regular expression without sub-match
+// reporting (REG_EXTENDED | REG_NOSUB).
+//
+// Returns true on success. On failure returns false and, when `error` is
+// non-null, stores the regerror() description in *error. After a failure the
+// object is in the "not initialised" state.
+bool Regex::Init(const std::string& spec, std::string* error) {
+  // Release the pattern left by an earlier successful Init() so that calling
+  // Init() again neither leaks it nor leaves init_ set over a failed compile.
+  if (init_) {
+    regfree(&re_);
+    init_ = false;
+  }
+
+  int ec = regcomp(&re_, spec.c_str(), REG_EXTENDED | REG_NOSUB);
+  if (ec != 0) {
+    if (error) {
+      // With a zero-sized buffer regerror() only reports how many bytes the
+      // message needs, including the terminating NUL.
+      size_t needed = regerror(ec, &re_, nullptr, 0);
+      CHECK_NE(needed, 0);
+
+      // A std::string owns the scratch space, so no manual delete[] is needed.
+      std::string errbuf(needed, '\0');
+      regerror(ec, &re_, &errbuf[0], needed);
+
+      // Drop the terminating NUL when handing the message back.
+      error->assign(errbuf.data(), needed - 1);
+    }
+
+    return false;
+  }
+
+  init_ = true;
+  return true;
+}
+
+// Frees the compiled pattern, if Init() produced one.
+Regex::~Regex() {
+  if (!init_) {
+    return;
+  }
+  regfree(&re_);
+}
+
+// Returns true when a pattern has been compiled and `str` matches it.
+bool Regex::Match(const std::string& str) {
+  // regexec() signals a match by returning 0; it is not called at all when no
+  // pattern has been compiled.
+  return init_ && regexec(&re_, str.c_str(), 0, nullptr, 0) == 0;
+}
+
+}  // end namespace benchmark
--- a/3rdparty/benchmark/src/re_std.cc
+++ b/3rdparty/benchmark/src/re_std.cc
@ -0,0 +1,44 @@
+// Copyright 2015 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "re.h"
+
+namespace benchmark {
+
+// Starts in the "not initialised" state; Match() returns false until Init()
+// succeeds.
+Regex::Regex() : init_(false) { }
+
+// Compiles `spec` with the std::regex "extended" grammar.
+//
+// Returns true on success. If std::regex throws std::regex_error, returns
+// false and, when `error` is non-null, stores e.what() in *error.
+bool Regex::Init(const std::string& spec, std::string* error) {
+  // Begin from "not initialised" so that a failed re-Init() is reported as a
+  // failure rather than returning the `true` left behind by an earlier success.
+  init_ = false;
+  try {
+    re_ = std::regex(spec, std::regex_constants::extended);
+
+    init_ = true;
+  } catch (const std::regex_error& e) {
+    if (error) {
+      *error = e.what();
+    }
+  }
+  return init_;
+}
+
+// Nothing to release by hand: std::regex cleans up after itself.
+Regex::~Regex() { }
+
+// Returns true when Init() has succeeded and the pattern is found in `str`
+// (a search, not a full-string match).
+bool Regex::Match(const std::string& str) {
+  return init_ && std::regex_search(str, re_);
+}
+
+}  // end namespace benchmark
--- a/3rdparty/benchmark/src/reporter.cc
+++ b/3rdparty/benchmark/src/reporter.cc
@ -0,0 +1,86 @@
+// Copyright 2015 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "benchmark/reporter.h"
+
+#include <cstdlib>
+#include <vector>
+
+#include "check.h"
+#include "stat.h"
+
+namespace benchmark {
+
+// Builds two summary runs from the repeated runs in `reports`: *mean_data
+// ("<name>_mean") and *stddev_data ("<name>_stddev"). Requires at least two
+// reports, all with the same benchmark name and iteration count.
+void BenchmarkReporter::ComputeStats(
+    const std::vector<Run>& reports,
+    Run* mean_data, Run* stddev_data) {
+  CHECK(reports.size() >= 2) << "Cannot compute stats for less than 2 reports";
+
+  Run const& first = reports.front();
+  // Every repetition is expected to use the same number of iterations, so the
+  // first report's count stands in for all of them (verified per run below).
+  int64_t const iters = first.iterations;
+
+  // One weighted accumulator per reported quantity.
+  Stat1_d real_stat;
+  Stat1_d cpu_stat;
+  Stat1_d bytes_stat;
+  Stat1_d items_stat;
+
+  // The label is only carried over when all runs report the same one.
+  bool same_label = true;
+
+  for (std::size_t i = 0; i < reports.size(); ++i) {
+    Run const& cur = reports[i];
+    CHECK_EQ(first.benchmark_name, cur.benchmark_name);
+    CHECK_EQ(iters, cur.iterations);
+
+    // Times enter as per-iteration values; every sample is weighted by the
+    // run's iteration count.
+    real_stat += Stat1_d(cur.real_accumulated_time/cur.iterations,
+                         cur.iterations);
+    cpu_stat += Stat1_d(cur.cpu_accumulated_time/cur.iterations,
+                        cur.iterations);
+    bytes_stat += Stat1_d(cur.bytes_per_second, cur.iterations);
+    items_stat += Stat1_d(cur.items_per_second, cur.iterations);
+
+    if (cur.report_label != first.report_label) {
+      same_label = false;
+    }
+  }
+
+  // Mean: per-iteration means are scaled back up to accumulated times.
+  mean_data->benchmark_name = first.benchmark_name + "_mean";
+  mean_data->iterations = iters;
+  mean_data->real_accumulated_time = real_stat.Mean() * iters;
+  mean_data->cpu_accumulated_time = cpu_stat.Mean() * iters;
+  mean_data->bytes_per_second = bytes_stat.Mean();
+  mean_data->items_per_second = items_stat.Mean();
+  if (same_label) {
+    mean_data->report_label = first.report_label;
+  } else {
+    mean_data->report_label = "";
+  }
+
+  // Standard deviation: stored as per-iteration values with iterations = 0.
+  stddev_data->benchmark_name = first.benchmark_name + "_stddev";
+  stddev_data->report_label = mean_data->report_label;
+  stddev_data->iterations = 0;
+  stddev_data->real_accumulated_time = real_stat.StdDev();
+  stddev_data->cpu_accumulated_time = cpu_stat.StdDev();
+  stddev_data->bytes_per_second = bytes_stat.StdDev();
+  stddev_data->items_per_second = items_stat.StdDev();
+}
+
+// Default implementation: does nothing.
+void BenchmarkReporter::Finalize() {
+}
+
+// Out-of-line destructor with an empty body.
+BenchmarkReporter::~BenchmarkReporter() {
+}
+
+} // end namespace benchmark
--- a/3rdparty/benchmark/src/sleep.cc
+++ b/3rdparty/benchmark/src/sleep.cc
@ -0,0 +1,50 @@
+// Copyright 2015 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "sleep.h"
+
+#include <cerrno>
+#include <ctime>
+
+#include "internal_macros.h"
+
+#ifdef BENCHMARK_OS_WINDOWS
+#include <Windows.h>
+#endif
+
+namespace benchmark {
+#ifdef BENCHMARK_OS_WINDOWS
+// The Windows Sleep() call takes its duration in milliseconds, so the value
+// is passed straight through.
+void SleepForMilliseconds(int milliseconds) {
+  Sleep(milliseconds);
+}
+// Converts `seconds` to whole milliseconds (truncating) and sleeps for that.
+void SleepForSeconds(double seconds) {
+  const int millis = static_cast<int>(kNumMillisPerSecond * seconds);
+  SleepForMilliseconds(millis);
+}
+#else   // BENCHMARK_OS_WINDOWS
+// Sleeps for `microseconds` using nanosleep(), resuming after interruptions.
+void SleepForMicroseconds(int microseconds) {
+  const int64_t whole_seconds = microseconds / kNumMicrosPerSecond;
+  const int64_t leftover_micros = microseconds % kNumMicrosPerSecond;
+
+  struct timespec remaining;
+  remaining.tv_sec = whole_seconds;
+  remaining.tv_nsec = leftover_micros * kNumNanosPerMicro;
+
+  // The same struct is passed as request and as "time left" output, so a
+  // retry after EINTR continues with whatever is still outstanding. Any
+  // other failure ends the loop.
+  for (;;) {
+    if (nanosleep(&remaining, &remaining) == 0) break;
+    if (errno != EINTR) break;
+  }
+}
+
+// Converts milliseconds to microseconds and delegates.
+void SleepForMilliseconds(int milliseconds) {
+  // The product is 64-bit; it is narrowed to int by the call below.
+  const int64_t micros = milliseconds * kNumMicrosPerMilli;
+  SleepForMicroseconds(micros);
+}
+
+// Converts `seconds` to whole microseconds (truncating) and delegates.
+void SleepForSeconds(double seconds) {
+  const int micros = static_cast<int>(seconds * kNumMicrosPerSecond);
+  SleepForMicroseconds(micros);
+}
+#endif  // BENCHMARK_OS_WINDOWS
+}  // end namespace benchmark
--- a/3rdparty/benchmark/src/sleep.h
+++ b/3rdparty/benchmark/src/sleep.h
@ -0,0 +1,17 @@
+#ifndef BENCHMARK_SLEEP_H_
+#define BENCHMARK_SLEEP_H_
+
+#include <cstdint>
+
+namespace benchmark {
+// Unit conversion factors. All are 64-bit so that products of them (and of
+// them with int arguments) are computed in 64-bit arithmetic.
+const int64_t kNumMillisPerSecond = 1000LL;
+const int64_t kNumMicrosPerMilli = 1000LL;
+const int64_t kNumMicrosPerSecond = kNumMillisPerSecond * 1000LL;  // 1e6
+const int64_t kNumNanosPerMicro = 1000LL;
+const int64_t kNumNanosPerSecond = kNumNanosPerMicro * kNumMicrosPerSecond;  // 1e9
+
+void SleepForMilliseconds(int milliseconds);
+void SleepForSeconds(double seconds);
+}  // end namespace benchmark
+
+#endif  // BENCHMARK_SLEEP_H_
--- a/3rdparty/benchmark/src/stat.h
+++ b/3rdparty/benchmark/src/stat.h
@ -0,0 +1,307 @@
+#ifndef BENCHMARK_STAT_H_
+#define BENCHMARK_STAT_H_
+
+#include <cmath>
+#include <limits>
+#include <ostream>
+#include <type_traits>
+
+
+namespace benchmark {
+
+// Forward declarations of the statistics accumulators defined below.
+template <typename VType, typename NumType>
+class Stat1;
+
+template <typename VType, typename NumType>
+class Stat1MinMax;
+
+// Convenience aliases: float/double values with a 64-bit integer weight.
+// NOTE(review): int64_t is used here although this header does not include
+// <cstdint> itself -- confirm every includer provides it.
+typedef Stat1<float, int64_t> Stat1_f;
+typedef Stat1<double, int64_t> Stat1_d;
+typedef Stat1MinMax<float, int64_t> Stat1MinMax_f;
+typedef Stat1MinMax<double, int64_t> Stat1MinMax_d;
+
+// Vector types are only declared, never defined, in this header; they exist
+// so that the Sqr()/Sqrt() overloads in Stat1 can name them.
+template <typename VType>
+class Vector2;
+template <typename VType>
+class Vector3;
+template <typename VType>
+class Vector4;
+
+// Weighted running statistics over samples of type VType.
+//
+// Only three quantities are stored (total weight, weighted sum, weighted sum
+// of squares), from which the mean and standard deviation are derived.
+// NumType is the integral type used for weights.
+template <typename VType, typename NumType>
+class Stat1 {
+ public:
+  typedef Stat1<VType, NumType> Self;
+
+  // Create an empty sample set.
+  Stat1() { Clear(); }
+  // Create a sample of value dat and weight 1
+  explicit Stat1(const VType &dat) {
+    sum_ = dat;
+    sum_squares_ = Sqr(dat);
+    numsamples_ = 1;
+  }
+  // Create statistics for all the samples between begin (included)
+  // and end(excluded)
+  explicit Stat1(const VType *begin, const VType *end) {
+    Clear();
+    for (const VType *item = begin; item < end; ++item) {
+      (*this) += Stat1(*item);
+    }
+  }
+  // Create a sample of value dat and weight w
+  Stat1(const VType &dat, const NumType &w) {
+    sum_ = w * dat;
+    sum_squares_ = w * Sqr(dat);
+    numsamples_ = w;
+  }
+  // Copy constructor
+  Stat1(const Self &stat) {
+    sum_ = stat.sum_;
+    sum_squares_ = stat.sum_squares_;
+    numsamples_ = stat.numsamples_;
+  }
+
+  // Reset to the empty sample set (all accumulators value-initialised).
+  void Clear() {
+    numsamples_ = NumType();
+    sum_squares_ = sum_ = VType();
+  }
+
+  Self &operator=(const Self &stat) {
+    sum_ = stat.sum_;
+    sum_squares_ = stat.sum_squares_;
+    numsamples_ = stat.numsamples_;
+    return (*this);
+  }
+  // Merge statistics from two sample sets.
+  Self &operator+=(const Self &stat) {
+    sum_ += stat.sum_;
+    sum_squares_ += stat.sum_squares_;
+    numsamples_ += stat.numsamples_;
+    return (*this);
+  }
+  // The operation opposite to +=
+  Self &operator-=(const Self &stat) {
+    sum_ -= stat.sum_;
+    sum_squares_ -= stat.sum_squares_;
+    numsamples_ -= stat.numsamples_;
+    return (*this);
+  }
+  // Multiply the weight of the set of samples by a factor k
+  Self &operator*=(const VType &k) {
+    sum_ *= k;
+    sum_squares_ *= k;
+    numsamples_ *= k;
+    return (*this);
+  }
+
+  // Merge statistics from two sample sets.
+  Self operator+(const Self &stat) const { return Self(*this) += stat; }
+
+  // The operation opposite to +
+  Self operator-(const Self &stat) const { return Self(*this) -= stat; }
+
+  // Multiply the weight of the set of samples by a factor k
+  Self operator*(const VType &k) const { return Self(*this) *= k; }
+
+  // Return the total weight of this sample set
+  NumType numSamples() const { return numsamples_; }
+
+  // Return the sum of this sample set
+  VType Sum() const { return sum_; }
+
+  // Return the mean of this sample set (VType() for an empty set)
+  VType Mean() const {
+    if (numsamples_ == 0) return VType();
+    return sum_ * (1.0 / numsamples_);
+  }
+
+  // Return the mean of this sample set and compute the standard deviation at
+  // the same time. For an empty set both the return value and *stddev (when
+  // stddev is non-null) are VType(), matching Mean() and StdDev().
+  VType Mean(VType *stddev) const {
+    if (numsamples_ == 0) {
+      // Never leave the caller's output unwritten.
+      if (stddev) *stddev = VType();
+      return VType();
+    }
+    VType mean = sum_ * (1.0 / numsamples_);
+    if (stddev) {
+      VType avg_squares = sum_squares_ * (1.0 / numsamples_);
+      *stddev = Sqrt(avg_squares - Sqr(mean));
+    }
+    return mean;
+  }
+
+  // Return the standard deviation of the sample set (VType() for an empty set)
+  VType StdDev() const {
+    if (numsamples_ == 0) return VType();
+    VType mean = Mean();
+    VType avg_squares = sum_squares_ * (1.0 / numsamples_);
+    return Sqrt(avg_squares - Sqr(mean));
+  }
+
+ private:
+  static_assert(std::is_integral<NumType>::value &&
+                !std::is_same<NumType, bool>::value,
+                "NumType must be an integral type that is not bool.");
+  // Let i be the index of the samples provided (using +=)
+  // and weight[i],value[i] be the data of sample #i
+  // then the variables have the following meaning:
+  NumType numsamples_;  // sum of weight[i];
+  VType sum_;           // sum of weight[i]*value[i];
+  VType sum_squares_;   // sum of weight[i]*value[i]^2;
+
+  // Template function used to square a number.
+  // For a vector we square all components
+  template <typename SType>
+  static inline SType Sqr(const SType &dat) {
+    return dat * dat;
+  }
+
+  template <typename SType>
+  static inline Vector2<SType> Sqr(const Vector2<SType> &dat) {
+    return dat.MulComponents(dat);
+  }
+
+  template <typename SType>
+  static inline Vector3<SType> Sqr(const Vector3<SType> &dat) {
+    return dat.MulComponents(dat);
+  }
+
+  template <typename SType>
+  static inline Vector4<SType> Sqr(const Vector4<SType> &dat) {
+    return dat.MulComponents(dat);
+  }
+
+  // Template function used to take the square root of a number.
+  // For a vector we take the square root of all components
+  template <typename SType>
+  static inline SType Sqrt(const SType &dat) {
+    // Avoid NaN due to imprecision in the calculations
+    if (dat < 0) return 0;
+    // <cmath> only guarantees the std:: overloads; the using-declaration
+    // makes them visible while still allowing ADL for other types.
+    using std::sqrt;
+    return sqrt(dat);
+  }
+
+  template <typename SType>
+  static inline Vector2<SType> Sqrt(const Vector2<SType> &dat) {
+    // Avoid NaN due to imprecision in the calculations
+    return Max(dat, Vector2<SType>()).Sqrt();
+  }
+
+  template <typename SType>
+  static inline Vector3<SType> Sqrt(const Vector3<SType> &dat) {
+    // Avoid NaN due to imprecision in the calculations
+    return Max(dat, Vector3<SType>()).Sqrt();
+  }
+
+  template <typename SType>
+  static inline Vector4<SType> Sqrt(const Vector4<SType> &dat) {
+    // Avoid NaN due to imprecision in the calculations
+    return Max(dat, Vector4<SType>()).Sqrt();
+  }
+};
+
+// Useful printing function: writes "{ avg = <mean> std = <stddev> nsamples =
+// <total weight>}" for a Stat1.
+template <typename VType, typename NumType>
+std::ostream &operator<<(std::ostream &out, const Stat1<VType, NumType> &s) {
+  // The accessor is spelled numSamples(); there is no NumSamples() member, so
+  // the previous spelling failed to compile once this template was used.
+  out << "{ avg = " << s.Mean() << " std = " << s.StdDev()
+      << " nsamples = " << s.numSamples() << "}";
+  return out;
+}
+
+// Stat1MinMax: same as Stat1, but it also
+// keeps the Min and Max values; the "-"
+// operator is disabled because it cannot be implemented
+// efficiently
+template <typename VType, typename NumType>
+class Stat1MinMax : public Stat1<VType, NumType> {
+ public:
+  typedef Stat1MinMax<VType, NumType> Self;
+
+  // Create an empty sample set.
+  Stat1MinMax() { Clear(); }
+  // Create a sample of value dat and weight 1
+  explicit Stat1MinMax(const VType &dat) : Stat1<VType, NumType>(dat) {
+    max_ = dat;
+    min_ = dat;
+  }
+  // Create statistics for all the samples between begin (included)
+  // and end(excluded)
+  explicit Stat1MinMax(const VType *begin, const VType *end) {
+    Clear();
+    for (const VType *item = begin; item < end; ++item) {
+      (*this) += Stat1MinMax(*item);
+    }
+  }
+  // Create a sample of value dat and weight w
+  Stat1MinMax(const VType &dat, const NumType &w)
+      : Stat1<VType, NumType>(dat, w) {
+    max_ = dat;
+    min_ = dat;
+  }
+  // Copy constructor
+  Stat1MinMax(const Self &stat) : Stat1<VType, NumType>(stat) {
+    max_ = stat.max_;
+    min_ = stat.min_;
+  }
+
+  // Reset to the empty sample set. min_/max_ are seeded with the extreme
+  // values of VType so that the first merged sample replaces both.
+  void Clear() {
+    Stat1<VType, NumType>::Clear();
+    if (std::numeric_limits<VType>::has_infinity) {
+      min_ = std::numeric_limits<VType>::infinity();
+      max_ = -std::numeric_limits<VType>::infinity();
+    } else {
+      min_ = std::numeric_limits<VType>::max();
+      // lowest(), not min(): for floating-point types min() is the smallest
+      // positive value, which would hide every non-positive sample. For
+      // integral types lowest() == min().
+      max_ = std::numeric_limits<VType>::lowest();
+    }
+  }
+
+  Self &operator=(const Self &stat) {
+    this->Stat1<VType, NumType>::operator=(stat);
+    max_ = stat.max_;
+    min_ = stat.min_;
+    return (*this);
+  }
+  // Merge statistics from two sample sets.
+  Self &operator+=(const Self &stat) {
+    this->Stat1<VType, NumType>::operator+=(stat);
+    if (stat.max_ > max_) max_ = stat.max_;
+    if (stat.min_ < min_) min_ = stat.min_;
+    return (*this);
+  }
+  // Multiply the weight of the set of samples by a factor k
+  // (min and max are values, not weights, so they are unaffected)
+  Self &operator*=(const VType &stat) {
+    this->Stat1<VType, NumType>::operator*=(stat);
+    return (*this);
+  }
+  // Merge statistics from two sample sets.
+  Self operator+(const Self &stat) const { return Self(*this) += stat; }
+  // Multiply the weight of the set of samples by a factor k
+  Self operator*(const VType &k) const { return Self(*this) *= k; }
+
+  // Return the maximal value in this sample set
+  VType Max() const { return max_; }
+  // Return the minimal value in this sample set
+  VType Min() const { return min_; }
+
+ private:
+  // The - operation makes no sense with Min/Max
+  // unless we keep the full list of values (but we don't)
+  // make it private, and let it undefined so nobody can call it
+  Self &operator-=(const Self &stat);  // senseless. let it undefined.
+
+  // The operation opposite to +
+  Self operator-(const Self &stat) const;  // senseless. let it undefined.
+
+  // Let i be the index of the samples provided (using +=)
+  // and weight[i],value[i] be the data of sample #i
+  // then the variables have the following meaning:
+  VType max_;  // max of value[i]
+  VType min_;  // min of value[i]
+};
+
+// Useful printing function: like the Stat1 printer, with " min = " and
+// " max = " appended.
+template <typename VType, typename NumType>
+std::ostream &operator<<(std::ostream &out,
+                         const Stat1MinMax<VType, NumType> &s) {
+  // The inherited accessor is spelled numSamples(); NumSamples() does not
+  // exist and made this template uncompilable when used.
+  out << "{ avg = " << s.Mean() << " std = " << s.StdDev()
+      << " nsamples = " << s.numSamples() << " min = " << s.Min()
+      << " max = " << s.Max() << "}";
+  return out;
+}
+}  // end namespace benchmark
+
+#endif  // BENCHMARK_STAT_H_
--- a/3rdparty/benchmark/src/string_util.cc
+++ b/3rdparty/benchmark/src/string_util.cc
@ -0,0 +1,169 @@
+#include "string_util.h"
+
+#include <cmath>
+#include <cstdarg>
+#include <array>
+#include <memory>
+#include <sstream>
+#include <stdio.h>
+
+#include "arraysize.h"
+
+namespace benchmark {
+namespace {
+
+// Prefix tables: entry i is the prefix for exponent (i + 1).
+// kilo, Mega, Giga, Tera, Peta, Exa, Zetta, Yotta.
+const char kBigSIUnits[] = "kMGTPEZY";
+// Kibi, Mebi, Gibi, Tebi, Pebi, Exbi, Zebi, Yobi.
+const char kBigIECUnits[] = "KMGTPEZY";
+// milli, micro, nano, pico, femto, atto, zepto, yocto.
+const char kSmallSIUnits[] = "munpfazy";
+
+// We require that all three arrays have the same size.
+static_assert(arraysize(kBigSIUnits) == arraysize(kBigIECUnits),
+              "SI and IEC unit arrays must be the same size");
+static_assert(arraysize(kSmallSIUnits) == arraysize(kBigSIUnits),
+              "Small SI and Big SI unit arrays must be the same size");
+
+// NOTE(review): presumably arraysize() yields the element count of the array,
+// which for these string literals includes the terminating NUL (8 prefixes
+// plus 1) -- confirm against arraysize.h.
+static const int64_t kUnitsSize = arraysize(kBigSIUnits);
+
+} // end anonymous namespace
+
+// Splits `val` into a printable mantissa and an exponent counted in powers of
+// `one_k`.
+//
+//   val        value to render; its sign is carried into the mantissa text.
+//   thresh     scaling threshold; raised to 10^-precision if smaller.
+//   precision  only used to bound the threshold (see above).
+//   one_k      size of one unit step (e.g. 1000.0 or 1024.0).
+//   mantissa   out: the (possibly scaled) number as text.
+//   exponent   out: > 0 for big units, < 0 for small units, 0 for unscaled.
+//
+// Values too large or too small for the prefix tables are written unscaled
+// with exponent 0.
+void ToExponentAndMantissa(double val, double thresh, int precision,
+                           double one_k, std::string* mantissa,
+                           int64_t* exponent) {
+  std::stringstream mantissa_stream;
+
+  if (val < 0) {
+    mantissa_stream << "-";
+    val = -val;
+  }
+
+  // The prefix tables are string literals, so arraysize() also counts their
+  // terminating NUL. Only the real prefix characters may be used as steps;
+  // otherwise the last step would select the NUL as the "prefix".
+  const size_t num_big_units = arraysize(kBigSIUnits) - 1;
+  const size_t num_small_units = arraysize(kSmallSIUnits) - 1;
+
+  // Adjust threshold so that it never excludes things which can't be rendered
+  // in 'precision' digits.
+  const double adjusted_threshold =
+      std::max(thresh, 1.0 / std::pow(10.0, precision));
+  const double big_threshold = adjusted_threshold * one_k;
+  const double small_threshold = adjusted_threshold;
+
+  if (val > big_threshold) {
+    // Positive powers
+    double scaled = val;
+    for (size_t i = 0; i < num_big_units; ++i) {
+      scaled /= one_k;
+      if (scaled <= big_threshold) {
+        mantissa_stream << scaled;
+        *exponent = i + 1;
+        *mantissa = mantissa_stream.str();
+        return;
+      }
+    }
+    // Larger than the biggest prefix: fall back to the raw value.
+    mantissa_stream << val;
+    *exponent = 0;
+  } else if (val < small_threshold) {
+    // Negative powers
+    double scaled = val;
+    for (size_t i = 0; i < num_small_units; ++i) {
+      scaled *= one_k;
+      if (scaled >= small_threshold) {
+        mantissa_stream << scaled;
+        *exponent = -static_cast<int64_t>(i + 1);
+        *mantissa = mantissa_stream.str();
+        return;
+      }
+    }
+    // Smaller than the smallest prefix (this includes 0): raw value.
+    mantissa_stream << val;
+    *exponent = 0;
+  } else {
+    mantissa_stream << val;
+    *exponent = 0;
+  }
+  *mantissa = mantissa_stream.str();
+}
+
+// Maps an exponent produced by ToExponentAndMantissa() to its unit prefix.
+//
+// Positive exponents index the big-unit table (IEC letters when `iec` is
+// true), negative exponents the small SI table. When `iec` is true an "i" is
+// appended. Returns "" for exponent 0 and for exponents beyond the tables.
+std::string ExponentToPrefix(int64_t exponent, bool iec) {
+  if (exponent == 0) return "";
+
+  const int64_t index = (exponent > 0 ? exponent - 1 : -exponent - 1);
+  if (index >= kUnitsSize) return "";
+
+  const char* array =
+      (exponent > 0 ? (iec ? kBigIECUnits : kBigSIUnits) : kSmallSIUnits);
+  // The tables are string literals, so the last in-bounds slot is the
+  // terminating NUL; treat it as "no prefix" instead of returning a string
+  // that contains a NUL character.
+  if (array[index] == '\0') return "";
+  if (iec)
+    return array[index] + std::string("i");
+  else
+    return std::string(1, array[index]);
+}
+
+// Renders `value` scaled in steps of 1024 and appends the matching prefix
+// (requested with iec == false).
+std::string ToBinaryStringFullySpecified(double value, double threshold,
+                                         int precision) {
+  int64_t exponent;
+  std::string mantissa;
+  ToExponentAndMantissa(value, threshold, precision, 1024.0, &mantissa,
+                        &exponent);
+
+  std::string rendered(mantissa);
+  rendered += ExponentToPrefix(exponent, false);
+  return rendered;
+}
+
+// Appends "/" followed by the human readable form of `n` to *str.
+void AppendHumanReadable(int n, std::string* str) {
+  // Threshold 1.0 and precision 0: round down to the nearest prefix.
+  const std::string rendered = ToBinaryStringFullySpecified(n, 1.0, 0);
+  str->append("/");
+  str->append(rendered);
+}
+
+// Renders `n` with a unit prefix, using one decimal place of precision.
+std::string HumanReadableNumber(double n) {
+  // Figures up to 1.1k are still shown with the next unit down, which softens
+  // edge effects around unit boundaries.
+  const double threshold = 1.1;
+  // One decimal place of precision.
+  const int precision = 1;
+  return ToBinaryStringFullySpecified(n, threshold, precision);
+}
+
+// printf-style formatting into a std::string.
+//
+// Formats into a 256-byte stack buffer first; if the result does not fit, a
+// heap buffer of exactly the required size is used for a second pass.
+// Returns an empty string when the expansion is empty or when vsnprintf
+// reports an error (negative return value). The caller keeps ownership of
+// `args` and is responsible for va_end on it.
+std::string StringPrintFImp(const char *msg, va_list args)
+{
+  // we might need a second shot at this, so pre-emptively make a copy:
+  // a va_list cannot be reused after vsnprintf has consumed it.
+  va_list args_cp;
+  va_copy(args_cp, args);
+
+  // First attempt uses a std::array on the stack to avoid a memory
+  // allocation for short results.
+  std::array<char, 256> local_buff;
+  std::size_t size = local_buff.size();
+  // 2015-10-08: vsnprintf is used instead of std::vsnprintf due to a limitation in the android-ndk
+  auto ret = vsnprintf(local_buff.data(), size, msg, args_cp);
+
+  va_end(args_cp);
+
+  // handle empty expansion, and formatting errors: a negative value must not
+  // be turned into a size below.
+  if (ret <= 0)
+    return std::string{};
+  if (static_cast<std::size_t>(ret) < size)
+    return std::string(local_buff.data());
+
+  // we did not provide a long enough buffer on our first attempt.
+  // add 1 to size to account for null-byte in size cast to prevent overflow
+  size = static_cast<std::size_t>(ret) + 1;
+  auto buff_ptr = std::unique_ptr<char[]>(new char[size]);
+  // 2015-10-08: vsnprintf is used instead of std::vsnprintf due to a limitation in the android-ndk
+  ret = vsnprintf(buff_ptr.get(), size, msg, args);
+  if (ret < 0)
+    return std::string{};
+  return std::string(buff_ptr.get());
+}
+
+// Variadic front end: packages the arguments into a va_list and hands them to
+// StringPrintFImp().
+std::string StringPrintF(const char* format, ...)
+{
+  va_list arg_list;
+  va_start(arg_list, format);
+  std::string formatted = StringPrintFImp(format, arg_list);
+  va_end(arg_list);
+  return formatted;
+}
+
+// Replaces every non-overlapping occurrence of `from` in *str with `to`,
+// scanning left to right. Text inserted by a replacement is not rescanned.
+// An empty `from` leaves *str unchanged.
+void ReplaceAll(std::string* str, const std::string& from,
+                const std::string& to) {
+  // find("") matches at every position, so without this guard the loop below
+  // would never terminate.
+  if (from.empty()) return;
+
+  std::size_t start = 0;
+  while((start = str->find(from, start)) != std::string::npos) {
+    str->replace(start, from.length(), to);
+    // Continue after the inserted text.
+    start += to.length();
+  }
+}
+
+} // end namespace benchmark
--- a/3rdparty/benchmark/src/string_util.h
+++ b/3rdparty/benchmark/src/string_util.h
@ -0,0 +1,44 @@
+#ifndef BENCHMARK_STRING_UTIL_H_
+#define BENCHMARK_STRING_UTIL_H_
+
+#include <string>
+#include <sstream>
+#include <utility>
+#include "internal_macros.h"
+
+namespace benchmark {
+
+void AppendHumanReadable(int n, std::string* str);
+
+std::string HumanReadableNumber(double n);
+
+std::string StringPrintF(const char* format, ...);
+
+// Base case of the StringCatImp recursion: no values left to write, so the
+// stream is returned unchanged.
+inline std::ostream&
+StringCatImp(std::ostream& out) BENCHMARK_NOEXCEPT
+{
+  return out;
+}
+
+// Recursive case: streams the first value into `out` with operator<<, then
+// recurses on the remaining ones. Arguments are perfectly forwarded.
+template <class First, class ...Rest>
+inline std::ostream&
+StringCatImp(std::ostream& out, First&& f, Rest&&... rest)
+{
+  out << std::forward<First>(f);
+  return StringCatImp(out, std::forward<Rest>(rest)...);
+}
+
+// Concatenates the stream representations (operator<<) of all arguments, in
+// order, into one std::string. With no arguments the result is empty.
+template<class ...Args>
+inline std::string StrCat(Args&&... args)
+{
+  std::ostringstream joined;
+  StringCatImp(joined, std::forward<Args>(args)...);
+  return joined.str();
+}
+
+void ReplaceAll(std::string* str, const std::string& from,
+                const std::string& to);
+
+} // end namespace benchmark
+
+#endif // BENCHMARK_STRING_UTIL_H_
--- a/3rdparty/benchmark/src/sysinfo.cc
+++ b/3rdparty/benchmark/src/sysinfo.cc
@ -0,0 +1,416 @@
+// Copyright 2015 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "sysinfo.h"
+#include "internal_macros.h"
+
+#ifdef BENCHMARK_OS_WINDOWS
+#include <Shlwapi.h>
+#include <Windows.h>
+#include <VersionHelpers.h>
+#else
+#include <fcntl.h>
+#include <sys/resource.h>
+#include <sys/types.h> // this header must be included before 'sys/sysctl.h' to avoid compilation error on FreeBSD
+#include <sys/time.h>
+#include <unistd.h>
+#if defined BENCHMARK_OS_FREEBSD || defined BENCHMARK_OS_MACOSX
+#include <sys/sysctl.h>
+#endif
+#endif
+
+#include <cerrno>
+#include <cstdio>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+#include <limits>
+#include <mutex>
+
+#include "arraysize.h"
+#include "check.h"
+#include "cycleclock.h"
+#include "internal_macros.h"
+#include "log.h"
+#include "sleep.h"
+#include "string_util.h"
+
+namespace benchmark {
+namespace {
+// Guards the one-time run of InitializeSystemInfo() (see CyclesPerSecond()
+// and NumCPUs()).
+std::once_flag cpuinfo_init;
+// Cycle-counter frequency in Hz; placeholder until InitializeSystemInfo() runs.
+double cpuinfo_cycles_per_second = 1.0;
+int cpuinfo_num_cpus = 1;  // Conservative guess
+// Held by MyCPUUsage() around the /proc/self/cputime_ns read path.
+std::mutex cputimens_mutex;
+
+#if !defined BENCHMARK_OS_MACOSX
+// Length of the calibration sleep used by EstimateCyclesPerSecond().
+const int64_t estimate_time_ms = 1000;
+
+// Fallback frequency probe: samples the cycle counter on both sides of an
+// estimate_time_ms sleep and returns the number of cycles that elapsed.
+// A short sleep would make the estimate noticeably less accurate.
+int64_t EstimateCyclesPerSecond() {
+  const int64_t ticks_before = cycleclock::Now();
+  SleepForMilliseconds(estimate_time_ms);
+  const int64_t ticks_after = cycleclock::Now();
+  return ticks_after - ticks_before;
+}
+#endif
+
+#if defined BENCHMARK_OS_LINUX || defined BENCHMARK_OS_CYGWIN
+// Reads a single base-10 integer from the start of `file`.
+// Returns true and stores the number in `*value` only when the file could be
+// opened and read and its first line consists of a number followed by a
+// newline or end of data. On any failure (missing file, read error, empty
+// file, non-numeric content) returns false and leaves `*value` untouched.
+bool ReadIntFromFile(const char* file, long* value) {
+  const int fd = open(file, O_RDONLY);
+  if (fd == -1) return false;
+
+  bool ret = false;
+  char line[1024];
+  // Zero-fill so whatever read() delivers is always NUL-terminated.
+  memset(line, '\0', sizeof(line));
+  // read() yields -1 on error and 0 for an empty file. Neither is a reason
+  // to abort the process; both simply mean "no value available".
+  const ssize_t bytes_read = read(fd, line, sizeof(line) - 1);
+  if (bytes_read > 0) {
+    char* err;
+    const long temp_value = strtol(line, &err, 10);
+    // Require at least one parsed character (err != line) and nothing but a
+    // newline or the end of the data after the number.
+    if (err != line && (*err == '\n' || *err == '\0')) {
+      *value = temp_value;
+      ret = true;
+    }
+  }
+  close(fd);
+  return ret;
+}
+#endif
+
+// Fills in cpuinfo_cycles_per_second and, where the platform branch supports
+// it, cpuinfo_num_cpus. Intended to run exactly once via
+// std::call_once(cpuinfo_init, ...). Each platform branch falls back to
+// EstimateCyclesPerSecond() when the OS cannot supply a frequency (the
+// Mac OS X branch derives it from the mach timebase instead).
+void InitializeSystemInfo() {
+#if defined BENCHMARK_OS_LINUX || defined BENCHMARK_OS_CYGWIN
+  char line[1024];
+  char* err;
+  long freq;
+
+  bool saw_mhz = false;
+
+  // If the kernel is exporting the tsc frequency use that. There are issues
+  // where cpuinfo_max_freq cannot be relied on because the BIOS may be
+  // exporting an invalid p-state (on x86) or p-states may be used to put the
+  // processor in a new mode (turbo mode). Essentially, those frequencies
+  // cannot always be relied upon. The same reasons apply to /proc/cpuinfo as
+  // well.
+  if (!saw_mhz &&
+      ReadIntFromFile("/sys/devices/system/cpu/cpu0/tsc_freq_khz", &freq)) {
+    // The value is in kHz (as the file name suggests).  For example, on a
+    // 2GHz workstation, the file contains the value "2000000".
+    cpuinfo_cycles_per_second = freq * 1000.0;
+    saw_mhz = true;
+  }
+
+  // If CPU scaling is in effect, we want to use the *maximum* frequency,
+  // not whatever CPU speed some random processor happens to be using now.
+  if (!saw_mhz &&
+      ReadIntFromFile("/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq",
+                      &freq)) {
+    // The value is in kHz.  For example, on a 2GHz workstation, the file
+    // contains the value "2000000".
+    cpuinfo_cycles_per_second = freq * 1000.0;
+    saw_mhz = true;
+  }
+
+  // Read /proc/cpuinfo for other values, and if there is no cpuinfo_max_freq.
+  const char* pname = "/proc/cpuinfo";
+  int fd = open(pname, O_RDONLY);
+  if (fd == -1) {
+    perror(pname);
+    if (!saw_mhz) {
+      cpuinfo_cycles_per_second = static_cast<double>(EstimateCyclesPerSecond());
+    }
+    return;
+  }
+
+  double bogo_clock = 1.0;
+  bool saw_bogo = false;
+  long max_cpu_id = 0;
+  int num_cpus = 0;
+  line[0] = line[1] = '\0';
+  size_t chars_read = 0;
+  do {  // we'll exit when the last read didn't read anything
+    // Move the next line to the beginning of the buffer
+    const size_t oldlinelen = strlen(line);
+    if (sizeof(line) == oldlinelen + 1)  // oldlinelen took up entire line
+      line[0] = '\0';
+    else  // still other lines left to save
+      memmove(line, line + oldlinelen + 1, sizeof(line) - (oldlinelen + 1));
+    // Terminate the new line, reading more if we can't find the newline
+    char* newline = strchr(line, '\n');
+    if (newline == nullptr) {
+      const size_t linelen = strlen(line);
+      const size_t bytes_to_read = sizeof(line) - 1 - linelen;
+      CHECK(bytes_to_read > 0);  // because the memmove recovered >=1 bytes
+      // read() returns -1 on error. Stored directly in a size_t that would
+      // become SIZE_MAX, misplace the terminator below and keep the loop
+      // running forever, so an error is treated like end-of-file.
+      const ssize_t read_result = read(fd, line + linelen, bytes_to_read);
+      chars_read = read_result > 0 ? static_cast<size_t>(read_result) : 0;
+      line[linelen + chars_read] = '\0';
+      newline = strchr(line, '\n');
+    }
+    if (newline != nullptr) *newline = '\0';
+
+    // When parsing the "cpu MHz" and "bogomips" (fallback) entries, we only
+    // accept positive values. Some environments (virtual machines) report zero,
+    // which would cause infinite looping in WallTime_Init.
+    if (!saw_mhz && strncasecmp(line, "cpu MHz", sizeof("cpu MHz") - 1) == 0) {
+      const char* freqstr = strchr(line, ':');
+      if (freqstr) {
+        cpuinfo_cycles_per_second = strtod(freqstr + 1, &err) * 1000000.0;
+        if (freqstr[1] != '\0' && *err == '\0' && cpuinfo_cycles_per_second > 0)
+          saw_mhz = true;
+      }
+    } else if (strncasecmp(line, "bogomips", sizeof("bogomips") - 1) == 0) {
+      const char* freqstr = strchr(line, ':');
+      if (freqstr) {
+        bogo_clock = strtod(freqstr + 1, &err) * 1000000.0;
+        if (freqstr[1] != '\0' && *err == '\0' && bogo_clock > 0)
+          saw_bogo = true;
+      }
+    } else if (strncasecmp(line, "processor", sizeof("processor") - 1) == 0) {
+      num_cpus++;  // count up every time we see an "processor :" entry
+      const char* freqstr = strchr(line, ':');
+      if (freqstr) {
+        const long cpu_id = strtol(freqstr + 1, &err, 10);
+        if (freqstr[1] != '\0' && *err == '\0' && max_cpu_id < cpu_id)
+          max_cpu_id = cpu_id;
+      }
+    }
+  } while (chars_read > 0);
+  close(fd);
+
+  if (!saw_mhz) {
+    if (saw_bogo) {
+      // If we didn't find anything better, we'll use bogomips, but
+      // we're not happy about it.
+      cpuinfo_cycles_per_second = bogo_clock;
+    } else {
+      // If we don't even have bogomips, we'll use the slow estimation.
+      cpuinfo_cycles_per_second = static_cast<double>(EstimateCyclesPerSecond());
+    }
+  }
+  if (num_cpus == 0) {
+    fprintf(stderr, "Failed to read num. CPUs correctly from /proc/cpuinfo\n");
+  } else {
+    if ((max_cpu_id + 1) != num_cpus) {
+      fprintf(stderr,
+              "CPU ID assignments in /proc/cpuinfo seems messed up."
+              " This is usually caused by a bad BIOS.\n");
+    }
+    cpuinfo_num_cpus = num_cpus;
+  }
+
+#elif defined BENCHMARK_OS_FREEBSD
+// For this sysctl to work, the machine must be configured without
+// SMP, APIC, or APM support.  hz should be 64-bit in freebsd 7.0
+// and later.  Before that, it's a 32-bit quantity (and gives the
+// wrong answer on machines faster than 2^32 Hz).  See
+//  http://lists.freebsd.org/pipermail/freebsd-i386/2004-November/001846.html
+// But also compare FreeBSD 7.0:
+//  http://fxr.watson.org/fxr/source/i386/i386/tsc.c?v=RELENG70#L223
+//  231         error = sysctl_handle_quad(oidp, &freq, 0, req);
+// To FreeBSD 6.3 (it's the same in 6-STABLE):
+//  http://fxr.watson.org/fxr/source/i386/i386/tsc.c?v=RELENG6#L131
+//  139         error = sysctl_handle_int(oidp, &freq, sizeof(freq), req);
+#if __FreeBSD__ >= 7
+  uint64_t hz = 0;
+#else
+  unsigned int hz = 0;
+#endif
+  size_t sz = sizeof(hz);
+  const char* sysctl_path = "machdep.tsc_freq";
+  if (sysctlbyname(sysctl_path, &hz, &sz, nullptr, 0) != 0) {
+    fprintf(stderr, "Unable to determine clock rate from sysctl: %s: %s\n",
+            sysctl_path, strerror(errno));
+    cpuinfo_cycles_per_second = static_cast<double>(EstimateCyclesPerSecond());
+  } else {
+    cpuinfo_cycles_per_second = hz;
+  }
+// TODO: also figure out cpuinfo_num_cpus
+
+#elif defined BENCHMARK_OS_WINDOWS
+  // In NT, read MHz from the registry. If we fail to do so or we're in win9x
+  // then make a crude estimate.
+  DWORD data, data_size = sizeof(data);
+  // SHGetValueA returns a Win32 error code (ERROR_SUCCESS on success), not an
+  // HRESULT. Error codes are positive, so SUCCEEDED() would accept them and
+  // `data` would be used uninitialized; compare against ERROR_SUCCESS.
+  if (IsWindowsXPOrGreater() &&
+      SHGetValueA(HKEY_LOCAL_MACHINE,
+                  "HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0",
+                  "~MHz", nullptr, &data, &data_size) == ERROR_SUCCESS)
+    cpuinfo_cycles_per_second = static_cast<double>((int64_t)data * (int64_t)(1000 * 1000));  // was mhz
+  else
+    cpuinfo_cycles_per_second = static_cast<double>(EstimateCyclesPerSecond());
+// TODO: also figure out cpuinfo_num_cpus
+
+#elif defined BENCHMARK_OS_MACOSX
+  // returning "mach time units" per second. the current number of elapsed
+  // mach time units can be found by calling uint64 mach_absolute_time();
+  // while not as precise as actual CPU cycles, it is accurate in the face
+  // of CPU frequency scaling and multi-cpu/core machines.
+  // Our mac users have these types of machines, and accuracy
+  // (i.e. correctness) trumps precision.
+  // See cycleclock.h: CycleClock::Now(), which returns number of mach time
+  // units on Mac OS X.
+  mach_timebase_info_data_t timebase_info;
+  mach_timebase_info(&timebase_info);
+  double mach_time_units_per_nanosecond =
+      static_cast<double>(timebase_info.denom) /
+      static_cast<double>(timebase_info.numer);
+  cpuinfo_cycles_per_second = mach_time_units_per_nanosecond * 1e9;
+
+  int num_cpus = 0;
+  size_t size = sizeof(num_cpus);
+  int numcpus_name[] = {CTL_HW, HW_NCPU};
+  if (::sysctl(numcpus_name, arraysize(numcpus_name), &num_cpus, &size, nullptr, 0) ==
+          0 &&
+      (size == sizeof(num_cpus)))
+    cpuinfo_num_cpus = num_cpus;
+
+#else
+  // Generic cycles per second counter
+  cpuinfo_cycles_per_second = static_cast<double>(EstimateCyclesPerSecond());
+#endif
+}
+}  // end namespace
+
+// getrusage() based implementation of MyCPUUsage
+// Returns the user + system CPU time consumed by this process, in seconds
+// (GetProcessTimes() is used on Windows). Returns 0.0 when the OS query fails.
+static double MyCPUUsageRUsage() {
+#ifndef BENCHMARK_OS_WINDOWS
+  struct rusage ru;
+  if (getrusage(RUSAGE_SELF, &ru) == 0) {
+    return (static_cast<double>(ru.ru_utime.tv_sec) +
+            static_cast<double>(ru.ru_utime.tv_usec) * 1e-6 +
+            static_cast<double>(ru.ru_stime.tv_sec) +
+            static_cast<double>(ru.ru_stime.tv_usec) * 1e-6);
+  } else {
+    return 0.0;
+  }
+#else
+  HANDLE proc = GetCurrentProcess();
+  FILETIME creation_time;
+  FILETIME exit_time;
+  FILETIME kernel_time;
+  FILETIME user_time;
+  ULARGE_INTEGER kernel;
+  ULARGE_INTEGER user;
+  // On failure the FILETIME outputs are not filled in; mirror the POSIX
+  // branch and report 0.0 rather than converting uninitialized data.
+  if (!GetProcessTimes(proc, &creation_time, &exit_time, &kernel_time,
+                       &user_time)) {
+    return 0.0;
+  }
+  kernel.HighPart = kernel_time.dwHighDateTime;
+  kernel.LowPart = kernel_time.dwLowDateTime;
+  user.HighPart = user_time.dwHighDateTime;
+  user.LowPart = user_time.dwLowDateTime;
+  // The sum is scaled by 1e-7 to express it in seconds.
+  return (static_cast<double>(kernel.QuadPart) +
+          static_cast<double>(user.QuadPart)) * 1e-7;
+#endif  // OS_WINDOWS
+}
+
+#ifndef BENCHMARK_OS_WINDOWS
+// Reads /proc/self/cputime_ns, divides the value by 1e9 and stores the result
+// in `*cputime`. The descriptor is opened on first use and kept in a
+// function-local static; it is closed and reset whenever a read or parse
+// fails. Returns false (leaving `*cputime` untouched) on any failure.
+// The caller (MyCPUUsage) holds cputimens_mutex around this call.
+static bool MyCPUUsageCPUTimeNsLocked(double* cputime) {
+  static int cputime_fd = -1;
+  if (cputime_fd == -1) {
+    const int opened = open("/proc/self/cputime_ns", O_RDONLY);
+    if (opened < 0) {
+      return false;  // cputime_fd is still -1
+    }
+    cputime_fd = opened;
+  }
+
+  char buff[64];
+  memset(buff, 0, sizeof(buff));
+  const bool read_ok = pread(cputime_fd, buff, sizeof(buff) - 1, 0) > 0;
+  // strtoull reports overflow by returning the maximum value.
+  const unsigned long long nanos = read_ok ? strtoull(buff, nullptr, 0) : 0;
+  const bool parse_ok =
+      read_ok && nanos != (std::numeric_limits<unsigned long long>::max)();
+  if (!parse_ok) {
+    close(cputime_fd);
+    cputime_fd = -1;
+    return false;
+  }
+
+  *cputime = static_cast<double>(nanos) / 1e9;
+  return true;
+}
+#endif  // OS_WINDOWS
+
+// CPU time consumed by this process, in seconds. Outside Windows the
+// /proc/self/cputime_ns reader is tried first; after its first failure the
+// function permanently falls back to MyCPUUsageRUsage().
+double MyCPUUsage() {
+#ifndef BENCHMARK_OS_WINDOWS
+  {
+    std::lock_guard<std::mutex> guard(cputimens_mutex);
+    // Latched to false after the first failed cputime_ns read.
+    static bool use_cputime_ns = true;
+    if (use_cputime_ns) {
+      double seconds;
+      const bool have_value = MyCPUUsageCPUTimeNsLocked(&seconds);
+      if (have_value) {
+        return seconds;
+      }
+      // Once MyCPUUsageCPUTimeNsLocked fails once fall back to getrusage().
+      VLOG(1) << "Reading /proc/self/cputime_ns failed. Using getrusage().\n";
+      use_cputime_ns = false;
+    }
+  }  // lock released before the fallback runs
+#endif  // OS_WINDOWS
+  return MyCPUUsageRUsage();
+}
+
+// User + system CPU time reported by getrusage(RUSAGE_CHILDREN), in seconds.
+// Returns 0.0 if getrusage() fails, and always 0.0 on Windows.
+double ChildrenCPUUsage() {
+#ifndef BENCHMARK_OS_WINDOWS
+  struct rusage usage;
+  if (getrusage(RUSAGE_CHILDREN, &usage) != 0) {
+    return 0.0;
+  }
+  const double user_sec = static_cast<double>(usage.ru_utime.tv_sec);
+  const double user_usec = static_cast<double>(usage.ru_utime.tv_usec);
+  const double sys_sec = static_cast<double>(usage.ru_stime.tv_sec);
+  const double sys_usec = static_cast<double>(usage.ru_stime.tv_usec);
+  return user_sec + user_usec * 1e-6 + sys_sec + sys_usec * 1e-6;
+#else
+  // TODO: Not sure what this even means on Windows
+  return 0.0;
+#endif  // OS_WINDOWS
+}
+
+// Returns cpuinfo_cycles_per_second, running InitializeSystemInfo() first if
+// it has not run yet (std::call_once on cpuinfo_init).
+double CyclesPerSecond(void) {
+  std::call_once(cpuinfo_init, InitializeSystemInfo);
+  return cpuinfo_cycles_per_second;
+}
+
+// Returns cpuinfo_num_cpus, running InitializeSystemInfo() first if it has
+// not run yet. The value stays at its default of 1 on platform branches that
+// do not detect the CPU count.
+int NumCPUs(void) {
+  std::call_once(cpuinfo_init, InitializeSystemInfo);
+  return cpuinfo_num_cpus;
+}
+
+// Length of a string literal, excluding the terminating NUL.
+// The ""'s catch people who don't pass in a literal for "str"
+#define strliterallen(str) (sizeof("" str "") - 1)
+
+// If the first `len` bytes at `str` begin with the literal `prefix`, yields a
+// pointer just past the prefix; otherwise yields nullptr.
+// Must use a string literal for prefix.
+// `str` and `len` are parenthesized so expression arguments expand safely.
+#define memprefix(str, len, prefix)                         \
+  ((((len) >= strliterallen(prefix)) &&                     \
+    std::memcmp((str), prefix, strliterallen(prefix)) == 0) \
+       ? (str) + strliterallen(prefix)                      \
+       : nullptr)
+
+// Returns true when some CPU's cpufreq scaling_governor file can be opened
+// and its contents do not start with "performance". Scanning stops at the
+// first CPU whose governor file cannot be opened. Always false on Windows.
+bool CpuScalingEnabled() {
+#ifndef BENCHMARK_OS_WINDOWS
+  // On Linux, the CPUfreq subsystem exposes CPU information as files on the
+  // local file system. If reading the exported files fails, then we may not be
+  // running on Linux, so we silently ignore all the read errors.
+  const int num_cpus = NumCPUs();
+  for (int cpu = 0; cpu < num_cpus; ++cpu) {
+    const std::string governor_file = StrCat("/sys/devices/system/cpu/cpu", cpu,
+                                             "/cpufreq/scaling_governor");
+    FILE* file = fopen(governor_file.c_str(), "r");
+    if (file == nullptr) {
+      break;
+    }
+    char buff[16];
+    const size_t bytes_read = fread(buff, 1, sizeof(buff), file);
+    fclose(file);
+    const bool is_performance =
+        memprefix(buff, bytes_read, "performance") != nullptr;
+    if (!is_performance) {
+      return true;
+    }
+  }
+#endif
+  return false;
+}
+
+}  // end namespace benchmark
--- a/3rdparty/benchmark/src/sysinfo.h
+++ b/3rdparty/benchmark/src/sysinfo.h
@ -0,0 +1,12 @@
+#ifndef BENCHMARK_SYSINFO_H_
+#define BENCHMARK_SYSINFO_H_
+
+namespace benchmark {
+// CPU time (seconds) consumed so far by the current process.
+double MyCPUUsage();
+// CPU time (seconds) from getrusage(RUSAGE_CHILDREN); always 0.0 on Windows.
+double ChildrenCPUUsage();
+// CPU count detected on first use; 1 when the platform code does not detect it.
+int NumCPUs();
+// Cycle-counter ticks per second, determined once on first use.
+double CyclesPerSecond();
+// True when a readable cpufreq scaling_governor is not "performance";
+// always false on Windows.
+bool CpuScalingEnabled();
+}  // end namespace benchmark
+
+#endif  // BENCHMARK_SYSINFO_H_
--- a/3rdparty/benchmark/src/walltime.cc
+++ b/3rdparty/benchmark/src/walltime.cc
@ -0,0 +1,263 @@
+// Copyright 2015 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "benchmark/macros.h"
+#include "internal_macros.h"
+#include "walltime.h"
+
+#if defined(BENCHMARK_OS_WINDOWS)
+#include <time.h>
+#include <winsock.h> // for timeval
+#else
+#include <sys/time.h>
+#endif
+
+#include <cstdio>
+#include <cstdint>
+#include <cstring>
+#include <ctime>
+
+#include <atomic>
+#include <chrono>
+#include <limits>
+
+#include "arraysize.h"
+#include "check.h"
+#include "cycleclock.h"
+#include "log.h"
+#include "sysinfo.h"
+
+namespace benchmark {
+namespace walltime {
+
+namespace {
+
+#if defined(HAVE_STEADY_CLOCK)
+// Selects a steady clock at compile time. The primary template (chosen when
+// high_resolution_clock is steady) yields high_resolution_clock itself.
+template <bool HighResIsSteady = std::chrono::high_resolution_clock::is_steady>
+struct ChooseSteadyClock {
+    using type = std::chrono::high_resolution_clock;
+};
+
+// high_resolution_clock is not steady: fall back to steady_clock.
+template <>
+struct ChooseSteadyClock<false> {
+    using type = std::chrono::steady_clock;
+};
+#endif
+
+// Clock used by ChronoWalltimeNow(): a steady clock when the build defines
+// HAVE_STEADY_CLOCK, plain high_resolution_clock otherwise.
+struct ChooseClockType {
+#if defined(HAVE_STEADY_CLOCK)
+  using type = ChooseSteadyClock<>::type;
+#else
+  using type = std::chrono::high_resolution_clock;
+#endif
+};
+
+// Wall clock derived from the CPU cycle counter. A (wall time, cycle count)
+// base pair is captured at construction; Now() extrapolates from it using
+// seconds_per_cycle_ and adds a drift correction that is refreshed from the
+// OS clock (Slow()).
+class WallTimeImp
+{
+public:
+  // Current wall time in seconds; defined out of line below.
+  WallTime Now();
+
+  // Returns the single shared instance. It is heap-allocated on first use and
+  // never deleted.
+  static WallTimeImp& GetWallTimeImp() {
+    static WallTimeImp* imp = new WallTimeImp();
+    return *imp;
+  }
+
+private:
+  WallTimeImp();
+  // Helper routines to load/store a float from an AtomicWord. Required because
+  // g++ < 4.7 doesn't support std::atomic<float> correctly. I cannot wait to
+  // get rid of this horror show.
+  // Stores the bit pattern of `f` in drift_adjust_.
+  void SetDrift(float f) {
+    int32_t w;
+    memcpy(&w, &f, sizeof(f));
+    std::atomic_store(&drift_adjust_, w);
+  }
+
+  // Reassembles the float whose bit pattern is held in drift_adjust_.
+  float GetDrift() const {
+    float f;
+    int32_t w = std::atomic_load(&drift_adjust_);
+    memcpy(&f, &w, sizeof(f));
+    return f;
+  }
+
+  // Reads the OS wall clock and returns it as seconds (with a fractional
+  // microsecond part): gettimeofday() on POSIX; on Windows GetSystemTime()
+  // is converted through FILETIME.
+  WallTime Slow() const {
+    struct timeval tv;
+#if defined(BENCHMARK_OS_WINDOWS)
+    FILETIME    file_time;
+    SYSTEMTIME  system_time;
+    ULARGE_INTEGER ularge;
+    // NOTE(review): presumably the FILETIME value of 1970-01-01, i.e. the
+    // offset between the Windows and Unix epochs -- confirm.
+    const unsigned __int64 epoch = 116444736000000000LL;
+
+    GetSystemTime(&system_time);
+    SystemTimeToFileTime(&system_time, &file_time);
+    ularge.LowPart = file_time.dwLowDateTime;
+    ularge.HighPart = file_time.dwHighDateTime;
+
+    // Whole seconds come from the FILETIME difference divided by 10^7; the
+    // sub-second part comes from the SYSTEMTIME millisecond field.
+    tv.tv_sec = (long)((ularge.QuadPart - epoch) / (10L * 1000 * 1000));
+    tv.tv_usec = (long)(system_time.wMilliseconds * 1000);
+#else
+    gettimeofday(&tv, nullptr);
+#endif
+    return tv.tv_sec + tv.tv_usec * 1e-6;
+  }
+
+private:
+  // SetDrift()/GetDrift() copy sizeof(float) bytes into/out of an int32_t.
+  static_assert(sizeof(float) <= sizeof(int32_t),
+               "type sizes don't allow the drift_adjust hack");
+
+  WallTime base_walltime_;       // Slow() reading taken at construction
+  int64_t base_cycletime_;       // cycle counter reading paired with it
+  int64_t cycles_per_second_;    // CyclesPerSecond() truncated to an integer
+  double seconds_per_cycle_;     // 1.0 / cycles_per_second_
+  uint32_t last_adjust_time_;    // upper 32 bits of the cycle count at the
+                                 // last drift update
+  std::atomic<int32_t> drift_adjust_;  // bit pattern of the float drift
+  int64_t max_interval_cycles_;  // cycle budget for pairing a cycle reading
+                                 // with a Slow() reading
+
+  BENCHMARK_DISALLOW_COPY_AND_ASSIGN(WallTimeImp);
+};
+
+
+// Returns the current wall time in seconds.
+// Fast path: extrapolate from the base pair using the cycle counter and add
+// the cached drift. Slow path (taken when the upper 32 bits of the cycle
+// count differ from last_adjust_time_): re-read the OS clock, store the new
+// drift and return the OS reading.
+// NOTE(review): last_adjust_time_ is a plain uint32_t read and written here
+// without synchronization, while only drift_adjust_ is atomic -- confirm this
+// is acceptable for concurrent callers.
+WallTime WallTimeImp::Now() {
+  WallTime now = 0.0;
+  WallTime result = 0.0;
+  int64_t ct = 0;
+  uint32_t top_bits = 0;
+  do {
+    ct = cycleclock::Now();
+    int64_t cycle_delta = ct - base_cycletime_;
+    result = base_walltime_ + cycle_delta * seconds_per_cycle_;
+
+    top_bits = static_cast<uint32_t>(uint64_t(ct) >> 32);
+    // Recompute drift no more often than every 2^32 cycles.
+    // I.e., @2GHz, ~ every two seconds
+    if (top_bits == last_adjust_time_) {  // don't need to recompute drift
+      return result + GetDrift();
+    }
+
+    now = Slow();
+    // Retry when reading the OS clock took longer than the cycle budget,
+    // since `now` and `result` would then be too far apart to compare.
+  } while (cycleclock::Now() - ct > max_interval_cycles_);
+  // We are now sure that "now" and "result" were produced within
+  // kMaxErrorInterval of one another.
+
+  SetDrift(static_cast<float>(now - result));
+  last_adjust_time_ = top_bits;
+  return now;
+}
+
+
+// Calibrates the clock: caches the cycle frequency, derives the retry budget
+// from kMaxErrorInterval, and captures a (cycle count, wall time) base pair,
+// retrying until both readings were taken within that budget.
+WallTimeImp::WallTimeImp()
+    : base_walltime_(0.0), base_cycletime_(0),
+      cycles_per_second_(0), seconds_per_cycle_(0.0),
+      last_adjust_time_(0), drift_adjust_(0),
+      max_interval_cycles_(0) {
+  // Largest tolerated gap, in seconds, between a cycle reading and the OS
+  // clock reading paired with it.
+  const double kMaxErrorInterval = 100e-6;
+  cycles_per_second_ = static_cast<int64_t>(CyclesPerSecond());
+  // A zero frequency would make the division below meaningless.
+  CHECK(cycles_per_second_ != 0);
+  seconds_per_cycle_ = 1.0 / cycles_per_second_;
+  max_interval_cycles_ =
+      static_cast<int64_t>(cycles_per_second_ * kMaxErrorInterval);
+  do {
+    base_cycletime_ = cycleclock::Now();
+    base_walltime_ = Slow();
+  } while (cycleclock::Now() - base_cycletime_ > max_interval_cycles_);
+  // We are now sure that "base_walltime" and "base_cycletime" were produced
+  // within kMaxErrorInterval of one another.
+
+  SetDrift(0.0);
+  // Start the drift-refresh epoch at the base reading.
+  last_adjust_time_ = static_cast<uint32_t>(uint64_t(base_cycletime_) >> 32);
+}
+
+// Wall time in seconds as reported by the shared cycle-counter based
+// WallTimeImp instance.
+WallTime CPUWalltimeNow() {
+  return WallTimeImp::GetWallTimeImp().Now();
+}
+
+// Wall time in seconds taken from the std::chrono clock selected by
+// ChooseClockType, measured from that clock's epoch.
+WallTime ChronoWalltimeNow() {
+  using Clock = ChooseClockType::type;
+  using FPSeconds =
+      std::chrono::duration<WallTime, std::chrono::seconds::period>;
+  static_assert(std::chrono::treat_as_floating_point<WallTime>::value,
+                "This type must be treated as a floating point type.");
+  const Clock::duration since_epoch = Clock::now().time_since_epoch();
+  const FPSeconds seconds = std::chrono::duration_cast<FPSeconds>(since_epoch);
+  return seconds.count();
+}
+
+// Reports whether the cycle-counter clock should back walltime::Now(): true
+// when CpuScalingEnabled() returns false. The choice is logged at VLOG(1).
+bool UseCpuCycleClock() {
+    const bool use_cycle_clock = !CpuScalingEnabled();
+    if (!use_cycle_clock) {
+        VLOG(1) << "Using std::chrono to provide walltime::Now().\n";
+        return false;
+    }
+    VLOG(1) << "Using the CPU cycle clock to provide walltime::Now().\n";
+    return true;
+}
+
+
+} // end anonymous namespace
+
+// WallTimeImp doesn't work when CPU Scaling is enabled. The decision is made
+// once, on the first call: if CPU scaling is enabled at that point, the
+// std::chrono clock selected by ChooseClockType is used instead.
+WallTime Now()
+{
+  static const bool use_cycle_clock = UseCpuCycleClock();
+  return use_cycle_clock ? CPUWalltimeNow() : ChronoWalltimeNow();
+}
+
+}  // end namespace walltime
+
+
+namespace {
+
+// Formats the current system_clock time as text.
+//   local == true  -> local time, otherwise UTC.
+// The format is "%F %T" everywhere except Windows, which uses "%x %X".
+// Returns an empty string if strftime() could not format the time: in that
+// case it returns 0 and the buffer contents must not be read.
+std::string DateTimeString(bool local) {
+  typedef std::chrono::system_clock Clock;
+  std::time_t now = Clock::to_time_t(Clock::now());
+  char storage[128];
+  std::size_t written;
+
+#if defined(BENCHMARK_OS_WINDOWS)
+  written = std::strftime(storage, sizeof(storage), "%x %X",
+                          local ? ::localtime(&now) : ::gmtime(&now));
+#else
+  std::tm timeinfo;
+  std::memset(&timeinfo, 0, sizeof(std::tm));
+  if (local) {
+    ::localtime_r(&now, &timeinfo);
+  } else {
+    ::gmtime_r(&now, &timeinfo);
+  }
+  written = std::strftime(storage, sizeof(storage), "%F %T", &timeinfo);
+#endif
+  CHECK(written < arraysize(storage));
+  // Build the result from exactly the bytes strftime reported, so a failed
+  // call (written == 0) yields "" instead of reading an indeterminate buffer.
+  return std::string(storage, written);
+}
+
+} // end namespace
+
+// Current date and time rendered in the local time zone
+// (DateTimeString with local == true).
+std::string LocalDateTimeString() {
+  return DateTimeString(true);
+}
+
+}  // end namespace benchmark
--- a/3rdparty/benchmark/src/walltime.h
+++ b/3rdparty/benchmark/src/walltime.h
@ -0,0 +1,17 @@
+#ifndef BENCHMARK_WALLTIME_H_
+#define BENCHMARK_WALLTIME_H_
+
+#include <string>
+
+namespace benchmark {
+// Wall-clock timestamps are plain doubles holding seconds.
+typedef double WallTime;
+
+namespace walltime {
+// Current wall time in seconds. Which clock backs it (cycle counter or
+// std::chrono) is decided on the first call.
+WallTime Now();
+}  // end namespace walltime
+
+// Current local date and time formatted as text.
+std::string LocalDateTimeString();
+
+}  // end namespace benchmark
+
+#endif  // BENCHMARK_WALLTIME_H_
--- a/3rdparty/benchmark/test/CMakeLists.txt
+++ b/3rdparty/benchmark/test/CMakeLists.txt
@ -0,0 +1,89 @@
+# Enable the tests
+
+find_package(Threads REQUIRED)
+
+# Compiler flags for the C++03 compatibility test: the project flags with any
+# -std=c++11 / -std=c++0x switch replaced by -std=c++03.
+set(CXX03_FLAGS "${CMAKE_CXX_FLAGS}")
+string(REPLACE "-std=c++11" "-std=c++03" CXX03_FLAGS "${CXX03_FLAGS}")
+string(REPLACE "-std=c++0x" "-std=c++03" CXX03_FLAGS "${CXX03_FLAGS}")
+
+# compile_benchmark_test(<name>): builds <name>.cc into an executable named
+# <name>, linked against the benchmark library and the thread library.
+macro(compile_benchmark_test name)
+  add_executable(${name} "${name}.cc")
+  target_link_libraries(${name} benchmark ${CMAKE_THREAD_LIBS_INIT})
+endmacro(compile_benchmark_test)
+
+# Demonstration executable
+compile_benchmark_test(benchmark_test)
+add_test(benchmark benchmark_test --benchmark_min_time=0.01)
+
+compile_benchmark_test(filter_test)
+# add_filter_test(<name> <filter> <expect>): runs filter_test with
+# --benchmark_filter=<filter> and passes <expect> as a trailing argument.
+# NOTE(review): <expect> is presumably the number of benchmarks the filter
+# should select -- confirm against filter_test.cc's main().
+macro(add_filter_test name filter expect)
+  add_test(${name} filter_test --benchmark_min_time=0.01 --benchmark_filter=${filter} ${expect})
+endmacro(add_filter_test)
+
+add_filter_test(filter_simple "Foo" 3)
+add_filter_test(filter_suffix "BM_.*" 4)
+add_filter_test(filter_regex_all ".*" 5)
+add_filter_test(filter_regex_blank "" 5)
+add_filter_test(filter_regex_none "monkey" 0)
+add_filter_test(filter_regex_wildcard ".*Foo.*" 3)
+add_filter_test(filter_regex_begin "^BM_.*" 4)
+add_filter_test(filter_regex_begin2 "^N" 1)
+add_filter_test(filter_regex_end ".*Ba$" 1)
+
+compile_benchmark_test(options_test)
+add_test(options_benchmarks options_test --benchmark_min_time=0.01)
+
+compile_benchmark_test(basic_test)
+add_test(basic_benchmark basic_test --benchmark_min_time=0.01)
+
+compile_benchmark_test(fixture_test)
+add_test(fixture_test fixture_test --benchmark_min_time=0.01)
+
+# cxx03_test is the only target compiled with the C++03 flags computed above.
+compile_benchmark_test(cxx03_test)
+set_target_properties(cxx03_test
+    PROPERTIES COMPILE_FLAGS "${CXX03_FLAGS}")
+add_test(cxx03 cxx03_test --benchmark_min_time=0.01)
+
+# Add the coverage command(s)
+# CMAKE_BUILD_TYPE may be unset, in which case CMAKE_BUILD_TYPE_LOWER is never
+# defined here. The expansions are quoted so that an empty value still forms a
+# valid string(TOLOWER ...) / if(... MATCHES ...) call instead of a configure
+# error.
+if(CMAKE_BUILD_TYPE)
+  string(TOLOWER "${CMAKE_BUILD_TYPE}" CMAKE_BUILD_TYPE_LOWER)
+endif()
+if ("${CMAKE_BUILD_TYPE_LOWER}" MATCHES "coverage")
+  find_program(GCOV gcov)
+  find_program(LCOV lcov)
+  find_program(GENHTML genhtml)
+  find_program(CTEST ctest)
+  if (GCOV AND LCOV AND GENHTML AND CTEST AND HAVE_CXX_FLAG_COVERAGE)
+    # Capture a baseline, run the tests, capture again, merge both captures,
+    # remove the test sources from the data and render the HTML report.
+    add_custom_command(
+      OUTPUT ${CMAKE_BINARY_DIR}/lcov/index.html
+      COMMAND ${LCOV} -q -z -d .
+      COMMAND ${LCOV} -q --no-external -c -b "${CMAKE_SOURCE_DIR}" -d . -o before.lcov -i
+      COMMAND ${CTEST} --force-new-ctest-process
+      COMMAND ${LCOV} -q --no-external -c -b "${CMAKE_SOURCE_DIR}" -d . -o after.lcov
+      COMMAND ${LCOV} -q -a before.lcov -a after.lcov --output-file final.lcov
+      COMMAND ${LCOV} -q -r final.lcov "'${CMAKE_SOURCE_DIR}/test/*'" -o final.lcov
+      COMMAND ${GENHTML} final.lcov -o lcov --demangle-cpp --sort -p "${CMAKE_BINARY_DIR}" -t benchmark
+      DEPENDS filter_test benchmark_test options_test basic_test fixture_test cxx03_test
+      WORKING_DIRECTORY ${CMAKE_BINARY_DIR}
+      COMMENT "Running LCOV"
+    )
+    add_custom_target(coverage
+      DEPENDS ${CMAKE_BINARY_DIR}/lcov/index.html
+      COMMENT "LCOV report at lcov/index.html"
+    )
+    message(STATUS "Coverage command added")
+  else()
+    # Report which prerequisite is missing.
+    if (HAVE_CXX_FLAG_COVERAGE)
+      set(CXX_FLAG_COVERAGE_MESSAGE supported)
+    else()
+      set(CXX_FLAG_COVERAGE_MESSAGE unavailable)
+    endif()
+    message(WARNING
+      "Coverage not available:\n"
+      "  gcov: ${GCOV}\n"
+      "  lcov: ${LCOV}\n"
+      "  genhtml: ${GENHTML}\n"
+      "  ctest: ${CTEST}\n"
+      "  --coverage flag: ${CXX_FLAG_COVERAGE_MESSAGE}")
+  endif()
+endif()
--- a/3rdparty/benchmark/test/basic_test.cc
+++ b/3rdparty/benchmark/test/basic_test.cc
@ -0,0 +1,102 @@
+
+#include "benchmark/benchmark_api.h"
+
+// Registers benchmark `x` with the three arguments 8, 512 and 8192. The
+// expansion has no trailing semicolon, so further modifiers can be chained.
+#define BASIC_BENCHMARK_TEST(x) \
+    BENCHMARK(x)->Arg(8)->Arg(512)->Arg(8192)
+
+// Near-empty loop body: only reads state.iterations() and passes it to
+// DoNotOptimize. Registered plainly and again with ThreadPerCpu().
+void BM_empty(benchmark::State& state) {
+  while (state.KeepRunning()) {
+    benchmark::DoNotOptimize(state.iterations());
+  }
+}
+BENCHMARK(BM_empty);
+BENCHMARK(BM_empty)->ThreadPerCpu();
+
+// Spins range_x() times inside every timed iteration.
+void BM_spin_empty(benchmark::State& state) {
+  while (state.KeepRunning()) {
+    for (int x = 0; x < state.range_x(); ++x) {
+      benchmark::DoNotOptimize(x);
+    }
+  }
+}
+BASIC_BENCHMARK_TEST(BM_spin_empty);
+BASIC_BENCHMARK_TEST(BM_spin_empty)->ThreadPerCpu();
+
+// Spins range_x() times before the KeepRunning() loop is entered, then spins
+// the same amount in every iteration.
+void BM_spin_pause_before(benchmark::State& state) {
+  for (int i = 0; i < state.range_x(); ++i) {
+    benchmark::DoNotOptimize(i);
+  }
+  while(state.KeepRunning()) {
+    for (int i = 0; i < state.range_x(); ++i) {
+      benchmark::DoNotOptimize(i);
+    }
+  }
+}
+BASIC_BENCHMARK_TEST(BM_spin_pause_before);
+BASIC_BENCHMARK_TEST(BM_spin_pause_before)->ThreadPerCpu();
+
+
+// Each iteration spins twice: once between PauseTiming() and ResumeTiming(),
+// and once more after timing has been resumed.
+void BM_spin_pause_during(benchmark::State& state) {
+  while(state.KeepRunning()) {
+    state.PauseTiming();
+    for (int i = 0; i < state.range_x(); ++i) {
+      benchmark::DoNotOptimize(i);
+    }
+    state.ResumeTiming();
+    for (int i = 0; i < state.range_x(); ++i) {
+      benchmark::DoNotOptimize(i);
+    }
+  }
+}
+BASIC_BENCHMARK_TEST(BM_spin_pause_during);
+BASIC_BENCHMARK_TEST(BM_spin_pause_during)->ThreadPerCpu();
+
+// Calls PauseTiming() immediately followed by ResumeTiming() in every
+// iteration, with no other work. Registered in four variants: default,
+// ThreadPerCpu, UseRealTime, and both combined.
+void BM_pause_during(benchmark::State& state) {
+  while(state.KeepRunning()) {
+    state.PauseTiming();
+    state.ResumeTiming();
+  }
+}
+BENCHMARK(BM_pause_during);
+BENCHMARK(BM_pause_during)->ThreadPerCpu();
+BENCHMARK(BM_pause_during)->UseRealTime();
+BENCHMARK(BM_pause_during)->UseRealTime()->ThreadPerCpu();
+
+// Spins range_x() times in every iteration and once more after the
+// KeepRunning() loop has finished.
+void BM_spin_pause_after(benchmark::State& state) {
+  while(state.KeepRunning()) {
+    for (int i = 0; i < state.range_x(); ++i) {
+      benchmark::DoNotOptimize(i);
+    }
+  }
+  for (int i = 0; i < state.range_x(); ++i) {
+    benchmark::DoNotOptimize(i);
+  }
+}
+BASIC_BENCHMARK_TEST(BM_spin_pause_after);
+BASIC_BENCHMARK_TEST(BM_spin_pause_after)->ThreadPerCpu();
+
+
+// Spins range_x() times before the KeepRunning() loop, in every iteration of
+// it, and once more after it has finished.
+void BM_spin_pause_before_and_after(benchmark::State& state) {
+  for (int i = 0; i < state.range_x(); ++i) {
+    benchmark::DoNotOptimize(i);
+  }
+  while(state.KeepRunning()) {
+    for (int i = 0; i < state.range_x(); ++i) {
+      benchmark::DoNotOptimize(i);
+    }
+  }
+  for (int i = 0; i < state.range_x(); ++i) {
+    benchmark::DoNotOptimize(i);
+  }
+}
+BASIC_BENCHMARK_TEST(BM_spin_pause_before_and_after);
+BASIC_BENCHMARK_TEST(BM_spin_pause_before_and_after)->ThreadPerCpu();
+
+
+// Completely empty loop body: the only work is the KeepRunning() call itself.
+void BM_empty_stop_start(benchmark::State& state) {
+  while (state.KeepRunning()) { }
+}
+BENCHMARK(BM_empty_stop_start);
+BENCHMARK(BM_empty_stop_start)->ThreadPerCpu();
+
+BENCHMARK_MAIN()
--- a/3rdparty/benchmark/test/benchmark_test.cc
+++ b/3rdparty/benchmark/test/benchmark_test.cc
@ -0,0 +1,154 @@
+#include "benchmark/benchmark.h"
+
+#include <assert.h>
+#include <math.h>
+#include <stdint.h>
+
+#include <cstdlib>
+#include <iostream>
+#include <limits>
+#include <list>
+#include <map>
+#include <mutex>
+#include <set>
+#include <sstream>
+#include <string>
+#include <vector>
+
+// Function attribute that forbids inlining on compilers defining __GNUC__;
+// expands to nothing elsewhere.
+#if defined(__GNUC__)
+# define BENCHMARK_NOINLINE __attribute__((noinline))
+#else
+# define BENCHMARK_NOINLINE
+#endif
+
+namespace {
+
+// Recursive factorial used as a small, non-inlinable workload.
+// 0! and 1! are both 1. Stopping at n <= 1 (rather than n == 1) keeps
+// Factorial(0) from wrapping the unsigned argument around and recursing
+// without bound.
+int BENCHMARK_NOINLINE Factorial(uint32_t n) {
+  return (n <= 1) ? 1 : n * Factorial(n - 1);
+}
+
+// Approximates pi with `depth` terms of an alternating series: term i
+// contributes +-1/(2i - 1). Term 0 contributes an extra +1, which is
+// subtracted again before the final scaling by 4.
+double CalculatePi(int depth) {
+  double sum = 0.0;
+  int term = 0;
+  while (term < depth) {
+    const double sign = static_cast<double>((term % 2 == 0) ? -1 : 1);
+    const double odd = static_cast<double>((2 * term) - 1);
+    sum += sign / odd;
+    ++term;
+  }
+  return (sum - 1.0) * 4;
+}
+
+// Builds a set holding the integers 0 .. size-1. Despite the name, the
+// contents are deterministic. A non-positive size yields an empty set.
+std::set<int> ConstructRandomSet(int size) {
+  std::set<int> result;
+  int next = 0;
+  while (next < size) {
+    result.insert(next);
+    ++next;
+  }
+  return result;
+}
+
+// Shared state for BM_SetupTeardown: the mutex guards pushes and pops on the
+// vector, which thread 0 allocates before and deletes after its loop.
+std::mutex test_vector_mu;
+std::vector<int>* test_vector = nullptr;
+
+}  // end namespace
+
+// Times Factorial(8). The result is published through SetLabel so the call
+// cannot be optimized away.
+// NOTE(review): the local is named fac_42 although it holds Factorial(8).
+static void BM_Factorial(benchmark::State& state) {
+  int fac_42 = 0;
+  while (state.KeepRunning())
+    fac_42 = Factorial(8);
+  // Prevent compiler optimizations
+  std::stringstream ss;
+  ss << fac_42;
+  state.SetLabel(ss.str());
+}
+BENCHMARK(BM_Factorial);
+BENCHMARK(BM_Factorial)->UseRealTime();
+
+// Times CalculatePi(range_x()) and reports the last computed value as the
+// benchmark label. Registered over the range 1 .. 1024*1024.
+static void BM_CalculatePiRange(benchmark::State& state) {
+  double pi = 0.0;
+  while (state.KeepRunning())
+    pi = CalculatePi(state.range_x());
+  std::stringstream ss;
+  ss << pi;
+  state.SetLabel(ss.str());
+}
+BENCHMARK_RANGE(BM_CalculatePiRange, 1, 1024 * 1024);
+
+// Times CalculatePi at a fixed depth of 1024. Registered with Threads(8),
+// ThreadRange(1, 32) and ThreadPerCpu().
+static void BM_CalculatePi(benchmark::State& state) {
+  static const int depth = 1024;
+  while (state.KeepRunning()) {
+    benchmark::DoNotOptimize(CalculatePi(depth));
+  }
+}
+BENCHMARK(BM_CalculatePi)->Threads(8);
+BENCHMARK(BM_CalculatePi)->ThreadRange(1, 32);
+BENCHMARK(BM_CalculatePi)->ThreadPerCpu();
+
+// Inserts range_y() rand() values into a set of range_x() elements. The set
+// is rebuilt in each iteration between PauseTiming() and ResumeTiming().
+// Items and bytes processed are reported as iterations * range_y()
+// (times sizeof(int) for bytes).
+static void BM_SetInsert(benchmark::State& state) {
+  while (state.KeepRunning()) {
+    state.PauseTiming();
+    std::set<int> data = ConstructRandomSet(state.range_x());
+    state.ResumeTiming();
+    for (int j = 0; j < state.range_y(); ++j)
+      data.insert(rand());
+  }
+  state.SetItemsProcessed(state.iterations() * state.range_y());
+  state.SetBytesProcessed(state.iterations() * state.range_y() * sizeof(int));
+}
+BENCHMARK(BM_SetInsert)->RangePair(1<<10,8<<10, 1,10);
+
+// Times filling a fresh Container with range_x() copies of the value 42 via
+// push_back. Items and bytes processed are reported as
+// iterations * range_x() (times sizeof(v) for bytes).
+template<typename Container, typename ValueType = typename Container::value_type>
+static void BM_Sequential(benchmark::State& state) {
+  ValueType v = 42;
+  while (state.KeepRunning()) {
+    Container c;
+    // Push exactly range_x() elements so the work matches the reported item
+    // count. Counting down with `i > 0` also terminates at once for a zero
+    // or negative range.
+    for (int i = state.range_x(); i > 0; --i)
+      c.push_back(v);
+  }
+  const size_t items_processed = state.iterations() * state.range_x();
+  state.SetItemsProcessed(items_processed);
+  state.SetBytesProcessed(items_processed * sizeof(v));
+}
+BENCHMARK_TEMPLATE2(BM_Sequential, std::vector<int>, int)->Range(1 << 0, 1 << 10);
+BENCHMARK_TEMPLATE(BM_Sequential, std::list<int>)->Range(1 << 0, 1 << 10);
+// Test the variadic version of BENCHMARK_TEMPLATE in C++11 and beyond.
+#if __cplusplus >= 201103L
+BENCHMARK_TEMPLATE(BM_Sequential, std::vector<int>, int)->Arg(512);
+#endif
+
+// Times std::string::compare on two equal strings of range_x() '-'
+// characters. The strings are built once, outside the timed loop.
+static void BM_StringCompare(benchmark::State& state) {
+  std::string s1(state.range_x(), '-');
+  std::string s2(state.range_x(), '-');
+  while (state.KeepRunning())
+    benchmark::DoNotOptimize(s1.compare(s2));
+}
+BENCHMARK(BM_StringCompare)->Range(1, 1<<20);
+
+// Exercises per-benchmark setup and teardown around the timed loop: thread 0
+// allocates the shared vector before its loop and deletes it afterwards.
+// Inside the loop every thread alternates push_back / pop_back under
+// test_vector_mu.
+// NOTE(review): relies on the library keeping other threads out of the loop
+// while thread 0 runs the setup/teardown -- confirm in the State docs.
+static void BM_SetupTeardown(benchmark::State& state) {
+  if (state.thread_index == 0) {
+    // No need to lock test_vector_mu here as this is running single-threaded.
+    test_vector = new std::vector<int>();
+  }
+  int i = 0;
+  while (state.KeepRunning()) {
+    std::lock_guard<std::mutex> l(test_vector_mu);
+    // Even steps push, odd steps pop, so each thread's pop follows its own
+    // push.
+    if (i%2 == 0)
+      test_vector->push_back(i);
+    else
+      test_vector->pop_back();
+    ++i;
+  }
+  if (state.thread_index == 0) {
+    delete test_vector;
+  }
+}
+BENCHMARK(BM_SetupTeardown)->ThreadPerCpu();
+
+// Long-running iterations: each one accumulates range_x() additions into
+// `tracker`. Registered over the range 1<<16 .. 1<<28.
+static void BM_LongTest(benchmark::State& state) {
+  double tracker = 0.0;
+  while (state.KeepRunning()) {
+    for (int i = 0; i < state.range_x(); ++i)
+      benchmark::DoNotOptimize(tracker += i);
+  }
+}
+BENCHMARK(BM_LongTest)->Range(1<<16,1<<28);
+
+BENCHMARK_MAIN()
+
--- a/3rdparty/benchmark/test/cxx03_test.cc
+++ b/3rdparty/benchmark/test/cxx03_test.cc
@ -0,0 +1,31 @@
+
+#include <cstddef>
+
+#include "benchmark/benchmark.h"
+
+#if __cplusplus >= 201103L
+#error C++11 or greater detected. Should be C++03.
+#endif
+
+// Benchmark with an effectively empty body: each iteration only stores the
+// current iteration count in a volatile local.
+void BM_empty(benchmark::State& state) {
+    while (state.KeepRunning()) {
+        volatile std::size_t iterations_seen = state.iterations();
+        // Mark the local as deliberately unused.
+        static_cast<void>(iterations_seen);
+    }
+}
+BENCHMARK(BM_empty);
+
+// Two-parameter benchmark template. The type arguments are not used by the
+// body, which simply delegates to BM_empty.
+template <typename T, typename U>
+void BM_template2(benchmark::State& state) {
+    BM_empty(state);
+}
+BENCHMARK_TEMPLATE2(BM_template2, int, long);
+
+// One-parameter benchmark template. The type argument is not used by the
+// body, which simply delegates to BM_empty. It is registered once through
+// BENCHMARK_TEMPLATE and once through BENCHMARK_TEMPLATE1.
+template <typename T>
+void BM_template1(benchmark::State& state) {
+    BM_empty(state);
+}
+BENCHMARK_TEMPLATE(BM_template1, long);
+BENCHMARK_TEMPLATE1(BM_template1, int);
+
+BENCHMARK_MAIN()
--- a/3rdparty/benchmark/test/filter_test.cc
+++ b/3rdparty/benchmark/test/filter_test.cc
@ -0,0 +1,91 @@
+#include "benchmark/benchmark.h"
+
+#include <cassert>
+#include <cmath>
+#include <cstdint>
+#include <cstdlib>
+
+#include <iostream>
+#include <limits>
+#include <sstream>
+#include <string>
+
+namespace {
+
+// Console reporter that additionally counts how many times ReportRuns() is
+// invoked, so that main() can compare the count with an expected value.
+class TestReporter : public benchmark::ConsoleReporter {
+ public:
+  TestReporter() : count_(0) {}
+
+  virtual ~TestReporter() {}
+
+  // Forwards the context to the console reporter unchanged.
+  virtual bool ReportContext(const Context& context) {
+    return ConsoleReporter::ReportContext(context);
+  }
+
+  // Counts the call, then lets the console reporter print the runs.
+  virtual void ReportRuns(const std::vector<Run>& report) {
+    ++count_;
+    ConsoleReporter::ReportRuns(report);
+  }
+
+  // Number of ReportRuns() calls observed so far.
+  size_t GetCount() const { return count_; }
+
+ private:
+  size_t count_;
+};
+
+}  // end namespace
+
+
+// Benchmark with an empty measurement loop; its name carries no "BM_" prefix.
+static void NoPrefix(benchmark::State& state) {
+  while (state.KeepRunning()) {
+    // Intentionally empty.
+  }
+}
+BENCHMARK(NoPrefix);
+
+// Benchmark named "BM_Foo" with an empty measurement loop.
+static void BM_Foo(benchmark::State& state) {
+  while (state.KeepRunning()) {
+    // Intentionally empty.
+  }
+}
+BENCHMARK(BM_Foo);
+
+
+// Benchmark named "BM_Bar" with an empty measurement loop.
+static void BM_Bar(benchmark::State& state) {
+  while (state.KeepRunning()) {
+    // Intentionally empty.
+  }
+}
+BENCHMARK(BM_Bar);
+
+
+// Benchmark named "BM_FooBar" with an empty measurement loop.
+static void BM_FooBar(benchmark::State& state) {
+  while (state.KeepRunning()) {
+    // Intentionally empty.
+  }
+}
+BENCHMARK(BM_FooBar);
+
+
+// Benchmark named "BM_FooBa" with an empty measurement loop.
+static void BM_FooBa(benchmark::State& state) {
+  while (state.KeepRunning()) {
+    // Intentionally empty.
+  }
+}
+BENCHMARK(BM_FooBa);
+
+
+
+// Runs the selected benchmarks through a TestReporter and optionally checks
+// how many reports were produced.
+//
+// benchmark::Initialize() is given &argc and argv first. If argc is exactly 2
+// afterwards, argv[1] is parsed as the expected number of ReportRuns() calls.
+//
+// Returns 0 on success, or -1 when argv[1] is not a valid count or differs
+// from the number of reports actually seen.
+int main(int argc, char* argv[]) {
+  benchmark::Initialize(&argc, argv);
+
+  TestReporter test_reporter;
+  benchmark::RunSpecifiedBenchmarks(&test_reporter);
+
+  if (argc == 2) {
+    // Make sure we ran all of the tests
+    std::stringstream ss(argv[1]);
+    size_t expected = 0;
+    // Reject a malformed count instead of silently comparing against whatever
+    // value a failed extraction leaves behind.
+    if (!(ss >> expected)) {
+      std::cerr << "ERROR: Invalid expected test count '" << argv[1] << "'"
+                << std::endl;
+      return -1;
+    }
+
+    const size_t count = test_reporter.GetCount();
+    if (count != expected) {
+      // The mismatch can go either way, so the message does not say "only".
+      std::cerr << "ERROR: Expected " << expected << " tests to be run but "
+                << count << " completed" << std::endl;
+      return -1;
+    }
+  }
+  return 0;
+}
--- a/3rdparty/benchmark/test/fixture_test.cc
+++ b/3rdparty/benchmark/test/fixture_test.cc
@ -0,0 +1,42 @@
+
+#include "benchmark/benchmark.h"
+
+#include <cassert>
+
+// Fixture that owns a heap-allocated int. SetUp() allocates it, TearDown()
+// frees it and resets the pointer, and the assertions check that teardown
+// has happened before the fixture is destroyed.
+class MyFixture : public ::benchmark::Fixture
+{
+public:
+    // Start with no allocation so the destructor's check reads a defined
+    // value even if SetUp() is never called on this instance.
+    MyFixture() : data(nullptr) {}
+
+    // Allocates the value the benchmark bodies inspect.
+    void SetUp() {
+        data = new int(42);
+    }
+
+    // Releases the value allocated by SetUp().
+    void TearDown() {
+        assert(data != nullptr);
+        delete data;
+        data = nullptr;
+    }
+
+    // Verifies that nothing is left allocated.
+    ~MyFixture() {
+      assert(data == nullptr);
+    }
+
+    // Owned by the fixture between SetUp() and TearDown(); null otherwise.
+    int* data;
+};
+
+
+// Checks that the fixture's data is populated with 42 before the measurement
+// loop starts, then spins an empty loop.
+BENCHMARK_F(MyFixture, Foo)(benchmark::State& st) {
+    assert(data != nullptr);
+    assert(*data == 42);
+    while (st.KeepRunning()) {
+        // Intentionally empty.
+    }
+}
+
+// Fixture benchmark that is defined and registered separately. It spins an
+// empty loop and then reports state.range_x() as the items processed.
+BENCHMARK_DEFINE_F(MyFixture, Bar)(benchmark::State& st) {
+  while (st.KeepRunning()) {
+    // Intentionally empty.
+  }
+  st.SetItemsProcessed(st.range_x());
+}
+BENCHMARK_REGISTER_F(MyFixture, Bar)->Arg(42);
+
+
+BENCHMARK_MAIN()
--- a/3rdparty/benchmark/test/options_test.cc
+++ b/3rdparty/benchmark/test/options_test.cc
@ -0,0 +1,26 @@
+#include "benchmark/benchmark_api.h"
+
+// Benchmark with an empty measurement loop. It is registered repeatedly below
+// with different option combinations.
+void BM_basic(benchmark::State& state) {
+  while (state.KeepRunning()) {
+    // Intentionally empty.
+  }
+}
+BENCHMARK(BM_basic);
+BENCHMARK(BM_basic)->Arg(42);
+BENCHMARK(BM_basic)->Range(1, 8);
+BENCHMARK(BM_basic)->DenseRange(10, 15);
+BENCHMARK(BM_basic)->ArgPair(42, 42);
+BENCHMARK(BM_basic)->RangePair(64, 512, 64, 512);
+BENCHMARK(BM_basic)->MinTime(0.7);
+BENCHMARK(BM_basic)->UseRealTime();
+BENCHMARK(BM_basic)->ThreadRange(2, 4);
+BENCHMARK(BM_basic)->ThreadPerCpu();
+
+// Adds the arguments 0 through 9, one Arg() call each, to the benchmark `b`.
+void CustomArgs(benchmark::internal::Benchmark* b) {
+  int arg = 0;
+  while (arg < 10) {
+    b->Arg(arg);
+    ++arg;
+  }
+}
+
+BENCHMARK(BM_basic)->Apply(CustomArgs);
+
+BENCHMARK_MAIN()
--- a/benchmarks/eminline_native.cpp
+++ b/benchmarks/eminline_native.cpp
@ -0,0 +1,15 @@
+// license:BSD-3-Clause
+// copyright-holders:Miodrag Milanovic
+
+#include "benchmark/benchmark_api.h"
+#include "osdcomm.h"
+#include "eminline.h"
+// Measures count_leading_zeros() as provided by eminline.h, feeding it a
+// different input on every iteration.
+static void BM_count_leading_zeros_native(benchmark::State& state) {
+	UINT32 cnt = 0x332533;
+	while (state.KeepRunning()) {
+		// Hand the result to DoNotOptimize: a value that is merely cast to
+		// void is unused, so the optimizer would be free to drop the call
+		// and leave an empty loop to be timed.
+		benchmark::DoNotOptimize(count_leading_zeros(cnt));
+		cnt++;
+	}
+}
+// Register the function as a benchmark
+BENCHMARK(BM_count_leading_zeros_native);
--- a/benchmarks/eminline_noasm.cpp
+++ b/benchmarks/eminline_noasm.cpp
@ -0,0 +1,24 @@
+// license:BSD-3-Clause
+// copyright-holders:Miodrag Milanovic
+
+#include "benchmark/benchmark_api.h"
+#include <time.h>
+#include "osdcore.h"
+#include "osdcomm.h"
+#define MAME_NOASM 1
+// Local osd_ticks() definition for this translation unit, backed by the C
+// library clock() function.
+osd_ticks_t osd_ticks(void)
+{
+	// use the standard library clock function
+	const clock_t now = clock();
+	return now;
+}
+#include "eminline.h"
+
+// Measures count_leading_zeros() from eminline.h as seen by this file, which
+// defines MAME_NOASM before including that header. A different input is used
+// on every iteration.
+static void BM_count_leading_zeros_noasm(benchmark::State& state) {
+	UINT32 cnt = 0x332533;
+	while (state.KeepRunning()) {
+		// Hand the result to DoNotOptimize: a value that is merely cast to
+		// void is unused, so the optimizer would be free to drop the call
+		// and leave an empty loop to be timed.
+		benchmark::DoNotOptimize(count_leading_zeros(cnt));
+		cnt++;
+	}
+}
+// Register the function as a benchmark
+BENCHMARK(BM_count_leading_zeros_noasm);
--- a/benchmarks/main.cpp
+++ b/benchmarks/main.cpp
@ -0,0 +1,6 @@
+// license:BSD-3-Clause
+// copyright-holders:Miodrag Milanovic
+
+#include "benchmark/benchmark_api.h"
+
+BENCHMARK_MAIN();
--- a/makefile
+++ b/makefile
@ -20,6 +20,7 @@
 # SUBTARGET = tiny
 # TOOLS = 1
 # TESTS = 1
+# BENCHMARKS = 1
 # OSD = sdl

 # USE_BGFX = 1
@ -473,6 +474,10 @@ ifdef TESTS
 PARAMS += --with-tests
 endif

+ifdef BENCHMARKS
+PARAMS += --with-benchmarks
+endif
+
 ifdef SYMBOLS
 PARAMS += --SYMBOLS='$(SYMBOLS)'
 endif
@ -698,6 +703,7 @@ SCRIPTS = scripts/genie.lua \
 	scripts/src/sound.lua \
 	scripts/src/tools.lua \
 	scripts/src/tests.lua \
+	scripts/src/benchmarks.lua \
 	scripts/src/video.lua \
 	scripts/src/bus.lua \
 	scripts/src/netlist.lua \
--- a/scripts/genie.lua
+++ b/scripts/genie.lua
@ -82,6 +82,11 @@ newoption {
 	description = "Enable building tests.",
 }

+-- Command-line switch "--with-benchmarks"; checked later in this script to
+-- decide whether src/benchmarks.lua is loaded.
+newoption {
+	trigger = "with-benchmarks",
+	description = "Enable building benchmarks.",
+}
+
 newoption {
 	trigger = "osd",
 	description = "Choose OSD layer implementation",
@ -1296,3 +1301,8 @@ if _OPTIONS["with-tests"] then
 	group "tests"
 	dofile(path.join("src", "tests.lua"))
 end
+
+-- Load the benchmark project definitions, under the "benchmarks" group, only
+-- when the with-benchmarks option was given.
+if _OPTIONS["with-benchmarks"] then
+	group "benchmarks"
+	dofile(path.join("src", "benchmarks.lua"))
+end
--- a/scripts/src/3rdparty.lua
+++ b/scripts/src/3rdparty.lua
@ -917,39 +917,3 @@ links {
 	"portaudio",
 }
 end
-
--------------------------------------------------
-- GoogleTest library objects
--------------------------------------------------
-
-project "gtest"
-	uuid "fa306a8d-fb10-4d4a-9d2e-fdb9076407b4"
-	kind "StaticLib"
-
-	configuration { "gmake" }
-		buildoptions {
-			"-Wno-undef",
-			"-Wno-unused-variable",
-		}
-
-	configuration { "mingw-clang" }
-		buildoptions {
-			"-O0", -- crash of compiler when doing optimization
-		}
-
-	configuration { "vs*" }
-if _OPTIONS["vs"]=="intel-15" then
-		buildoptions {
-			"/Qwd1195", 			-- error #1195: conversion from integer to smaller pointer
-		}
-end
-
-	configuration { }
-
-	includedirs {
-		MAME_DIR .. "3rdparty/googletest/googletest/include",
-		MAME_DIR .. "3rdparty/googletest/googletest",
-	}
-	files {
-		MAME_DIR .. "3rdparty/googletest/googletest/src/gtest-all.cc",
-	}
--- a/scripts/src/benchmarks.lua
+++ b/scripts/src/benchmarks.lua
@ -0,0 +1,74 @@
+-- license:BSD-3-Clause
+-- copyright-holders:MAMEdev Team
+
+---------------------------------------------------------------------------
+--
+--   benchmarks.lua
+--
+--   Rules for building benchmarks
+--
+---------------------------------------------------------------------------
+
+--------------------------------------------------
+-- Google Benchmark library objects
+--------------------------------------------------
+
+-- Static library built from the sources under 3rdparty/benchmark/src.
+project "benchmark"
+	uuid "60a7e05c-8b4f-497c-bfda-2949a009ba0d"
+	kind "StaticLib"
+
+	-- Applies to every configuration.
+	configuration { }
+		defines {
+			-- NOTE(review): presumably selects the std::regex backend, which
+			-- would match re_std.cc in the file list below -- confirm against
+			-- the library sources.
+			"HAVE_STD_REGEX",
+		}
+
+	includedirs {
+		MAME_DIR .. "3rdparty/benchmark/include",
+	}
+	-- Library sources compiled into the static library.
+	files {
+		MAME_DIR .. "3rdparty/benchmark/src/benchmark.cc",
+		MAME_DIR .. "3rdparty/benchmark/src/colorprint.cc",
+		MAME_DIR .. "3rdparty/benchmark/src/commandlineflags.cc",
+		MAME_DIR .. "3rdparty/benchmark/src/console_reporter.cc",
+		MAME_DIR .. "3rdparty/benchmark/src/csv_reporter.cc",
+		MAME_DIR .. "3rdparty/benchmark/src/json_reporter.cc",
+		MAME_DIR .. "3rdparty/benchmark/src/log.cc",
+		MAME_DIR .. "3rdparty/benchmark/src/reporter.cc",
+		MAME_DIR .. "3rdparty/benchmark/src/sleep.cc",
+		MAME_DIR .. "3rdparty/benchmark/src/string_util.cc",
+		MAME_DIR .. "3rdparty/benchmark/src/sysinfo.cc",
+		MAME_DIR .. "3rdparty/benchmark/src/walltime.cc",
+		MAME_DIR .. "3rdparty/benchmark/src/re_std.cc",
+	}
+
+
+
+-- Console executable that links the "benchmark" library with the benchmark
+-- sources under benchmarks/.
+project("benchmarks")
+	uuid ("a9750a48-d283-4a6d-b126-31c7ce049af1")
+	kind "ConsoleApp"	
+
+	flags {
+		"Symbols", -- always include minimum symbols for executables 	
+	}
+
+	-- Unless SEPARATE_BIN is "1", place the executable directly in MAME_DIR.
+	if _OPTIONS["SEPARATE_BIN"]~="1" then 
+		targetdir(MAME_DIR)
+	end
+
+	-- Applies to every configuration.
+	configuration { }
+
+	links {
+		"benchmark",
+	}
+
+	-- src/osd is on the include path for the osdcomm.h / osdcore.h /
+	-- eminline.h includes used by the benchmark sources.
+	includedirs {
+		MAME_DIR .. "3rdparty/benchmark/include",
+		MAME_DIR .. "src/osd",
+	}
+
+	files {
+		MAME_DIR .. "benchmarks/main.cpp",
+		MAME_DIR .. "benchmarks/eminline_native.cpp",
+		MAME_DIR .. "benchmarks/eminline_noasm.cpp",
+	}
+
--- a/scripts/src/tests.lua
+++ b/scripts/src/tests.lua
@ -8,42 +8,78 @@
 --   Rules for building tests
 --
 ---------------------------------------------------------------------------
+--------------------------------------------------
+-- GoogleTest library objects
+--------------------------------------------------

-project("tests")
-uuid ("66d4c639-196b-4065-a411-7ee9266564f5")
-kind "ConsoleApp"	
+project "gtest"
+	uuid "fa306a8d-fb10-4d4a-9d2e-fdb9076407b4"
+	kind "StaticLib"

-flags {
-	"Symbols", -- always include minimum symbols for executables 	
-}
+	configuration { "gmake" }
+		buildoptions {
+			"-Wno-undef",
+			"-Wno-unused-variable",
+		}

-if _OPTIONS["SEPARATE_BIN"]~="1" then 
-	targetdir(MAME_DIR)
+	configuration { "mingw-clang" }
+		buildoptions {
+			"-O0", -- crash of compiler when doing optimization
+		}
+
+	configuration { "vs*" }
+if _OPTIONS["vs"]=="intel-15" then
+		buildoptions {
+			"/Qwd1195", 			-- error #1195: conversion from integer to smaller pointer
+		}
 end

-configuration { "gmake" }
-	buildoptions {
-		"-Wno-undef",
+	configuration { }
+
+	includedirs {
+		MAME_DIR .. "3rdparty/googletest/googletest/include",
+		MAME_DIR .. "3rdparty/googletest/googletest",
+	}
+	files {
+		MAME_DIR .. "3rdparty/googletest/googletest/src/gtest-all.cc",
 	}

-configuration { }

-links {
-	"gtest",
-	"utils",
-	"expat",
-	"zlib",
-	"ocore_" .. _OPTIONS["osd"],
-}
+project("tests")
+	uuid ("66d4c639-196b-4065-a411-7ee9266564f5")
+	kind "ConsoleApp"	

-includedirs {
-	MAME_DIR .. "3rdparty/googletest/googletest/include",
-	MAME_DIR .. "src/osd",
-	MAME_DIR .. "src/lib/util",
-}
+	flags {
+		"Symbols", -- always include minimum symbols for executables 	
+	}

-files {
-	MAME_DIR .. "tests/main.cpp",
-	MAME_DIR .. "tests/lib/util/corestr.cpp",
-}
+	if _OPTIONS["SEPARATE_BIN"]~="1" then 
+		targetdir(MAME_DIR)
+	end
+
+	configuration { "gmake" }
+		buildoptions {
+			"-Wno-undef",
+		}
+
+	configuration { }
+
+	links {
+		"gtest",
+		"utils",
+		"expat",
+		"zlib",
+		"ocore_" .. _OPTIONS["osd"],
+	}
+
+	includedirs {
+		MAME_DIR .. "3rdparty/googletest/googletest/include",
+		MAME_DIR .. "src/osd",
+		MAME_DIR .. "src/lib/util",
+	}
+
+	files {
+		MAME_DIR .. "tests/main.cpp",
+		MAME_DIR .. "tests/lib/util/corestr.cpp",
+	}