diff --git a/.clang-tidy b/.clang-tidy
index 56938a598d..1e229e582e 100644
--- a/.clang-tidy
+++ b/.clang-tidy
@@ -2,6 +2,5 @@
 Checks:          'clang-analyzer-*,readability-redundant-*,performance-*'
 WarningsAsErrors: 'clang-analyzer-*,readability-redundant-*,performance-*'
 HeaderFilterRegex: '.*'
-AnalyzeTemporaryDtors: false
 FormatStyle:     none
 User:            user
diff --git a/.github/install_bazel.sh b/.github/install_bazel.sh
index d07db0e758..1b0d63c98e 100644
--- a/.github/install_bazel.sh
+++ b/.github/install_bazel.sh
@@ -3,11 +3,10 @@ if ! bazel version; then
   if [ "$arch" == "aarch64" ]; then
     arch="arm64"
   fi
-  echo "Installing wget and downloading $arch Bazel binary from GitHub releases."
-  yum install -y wget
-  wget "https://github.com/bazelbuild/bazel/releases/download/6.4.0/bazel-6.4.0-linux-$arch" -O /usr/local/bin/bazel
-  chmod +x /usr/local/bin/bazel
+  echo "Downloading $arch Bazel binary from GitHub releases."
+  curl -L -o $HOME/bin/bazel --create-dirs "https://github.com/bazelbuild/bazel/releases/download/7.1.1/bazel-7.1.1-linux-$arch"
+  chmod +x $HOME/bin/bazel
 else
-  # bazel is installed for the correct architecture
+  # Bazel is installed for the correct architecture
   exit 0
 fi
diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml
index a36d312aa6..8b772cd8b9 100644
--- a/.github/workflows/wheels.yml
+++ b/.github/workflows/wheels.yml
@@ -15,16 +15,16 @@ jobs:
         uses: actions/checkout@v4
         with:
           fetch-depth: 0
-      - name: Install Python 3.11
+      - name: Install Python 3.12
         uses: actions/setup-python@v5
         with:
-          python-version: 3.11
+          python-version: 3.12
       - run: python -m pip install build
       - name: Build sdist
         run: python -m build --sdist
-      - uses: actions/upload-artifact@v3
+      - uses: actions/upload-artifact@v4
         with:
-          name: dist
+          name: dist-sdist
           path: dist/*.tar.gz
 
   build_wheels:
@@ -32,7 +32,7 @@ jobs:
     runs-on: ${{ matrix.os }}
     strategy:
       matrix:
-        os: [ubuntu-latest, macos-latest, windows-latest]
+        os: [ubuntu-latest, macos-13, macos-14, windows-latest]
 
     steps:
       - name: Check out Google Benchmark
@@ -47,32 +47,44 @@ jobs:
           platforms: all
 
       - name: Build wheels on ${{ matrix.os }} using cibuildwheel
-        uses: pypa/cibuildwheel@v2.16.2
+        uses: pypa/cibuildwheel@v2.17
         env:
-          CIBW_BUILD: 'cp38-* cp39-* cp310-* cp311-* cp312-*'
+          CIBW_BUILD: "cp38-* cp39-* cp310-* cp311-* cp312-*"
           CIBW_SKIP: "*-musllinux_*"
-          CIBW_TEST_SKIP: "*-macosx_arm64"
-          CIBW_ARCHS_LINUX: x86_64 aarch64
-          CIBW_ARCHS_MACOS: x86_64 arm64
-          CIBW_ARCHS_WINDOWS: AMD64
+          CIBW_TEST_SKIP: "cp38-macosx_*:arm64"
+          CIBW_ARCHS_LINUX: auto64 aarch64
+          CIBW_ARCHS_WINDOWS: auto64
           CIBW_BEFORE_ALL_LINUX: bash .github/install_bazel.sh
+          # Grab the rootless Bazel installation inside the container.
+          CIBW_ENVIRONMENT_LINUX: PATH=$PATH:$HOME/bin
           CIBW_TEST_COMMAND: python {project}/bindings/python/google_benchmark/example.py
 
       - name: Upload Google Benchmark ${{ matrix.os }} wheels
-        uses: actions/upload-artifact@v3
+        uses: actions/upload-artifact@v4
         with:
-          name: dist
+          name: dist-${{ matrix.os }}
           path: wheelhouse/*.whl
 
+  merge_wheels:
+    name: Merge all built wheels into one artifact
+    runs-on: ubuntu-latest
+    needs: build_wheels
+    steps:
+      - name: Merge wheels
+        uses: actions/upload-artifact/merge@v4
+        with:
+          name: dist
+          pattern: dist-*
+          delete-merged: true
+
   pypi_upload:
     name: Publish google-benchmark wheels to PyPI
-    needs: [build_sdist, build_wheels]
+    needs: [merge_wheels]
     runs-on: ubuntu-latest
     permissions:
       id-token: write
     steps:
-    - uses: actions/download-artifact@v3
-      with:
-        name: dist
-        path: dist
-    - uses: pypa/gh-action-pypi-publish@v1.8.11
+      - uses: actions/download-artifact@v4
+        with:
+          path: dist
+      - uses: pypa/gh-action-pypi-publish@v1
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 0247d1b062..93455ab60d 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -11,7 +11,7 @@ repos:
         types_or: [ python, pyi ]
         args: [ "--ignore-missing-imports", "--scripts-are-modules" ]
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.1.13
+    rev: v0.3.1
     hooks:
       - id: ruff
         args: [ --fix, --exit-non-zero-on-fix ]
diff --git a/BUILD.bazel b/BUILD.bazel
index d72ae86728..15d836998c 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -1,5 +1,22 @@
 licenses(["notice"])
 
+COPTS = [
+    "-pedantic",
+    "-pedantic-errors",
+    "-std=c++11",
+    "-Wall",
+    "-Wconversion",
+    "-Wextra",
+    "-Wshadow",
+    #    "-Wshorten-64-to-32",
+    "-Wfloat-equal",
+    "-fstrict-aliasing",
+    ## assert() are used a lot in tests upstream, which may be optimised out leading to
+    ## unused-variable warning.
+    "-Wno-unused-variable",
+    "-Werror=old-style-cast",
+]
+
 config_setting(
     name = "qnx",
     constraint_values = ["@platforms//os:qnx"],
@@ -47,7 +64,7 @@ cc_library(
     ],
     copts = select({
         ":windows": [],
-        "//conditions:default": ["-Werror=old-style-cast"],
+        "//conditions:default": COPTS,
     }),
     defines = [
         "BENCHMARK_STATIC_DEFINE",
diff --git a/CMakeLists.txt b/CMakeLists.txt
index d9bcc6a493..23b519c250 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -190,6 +190,7 @@ else()
   add_cxx_compiler_flag(-Wshadow)
   add_cxx_compiler_flag(-Wfloat-equal)
   add_cxx_compiler_flag(-Wold-style-cast)
+  add_cxx_compiler_flag(-Wconversion)
   if(BENCHMARK_ENABLE_WERROR)
       add_cxx_compiler_flag(-Werror)
   endif()
diff --git a/MODULE.bazel b/MODULE.bazel
index 7e0e016123..95db0b1292 100644
--- a/MODULE.bazel
+++ b/MODULE.bazel
@@ -4,11 +4,11 @@ module(
 )
 
 bazel_dep(name = "bazel_skylib", version = "1.5.0")
-bazel_dep(name = "platforms", version = "0.0.7")
+bazel_dep(name = "platforms", version = "0.0.8")
 bazel_dep(name = "rules_foreign_cc", version = "0.10.1")
 bazel_dep(name = "rules_cc", version = "0.0.9")
 
-bazel_dep(name = "rules_python", version = "0.27.1", dev_dependency = True)
+bazel_dep(name = "rules_python", version = "0.31.0", dev_dependency = True)
 bazel_dep(name = "googletest", version = "1.12.1", dev_dependency = True, repo_name = "com_google_googletest")
 
 bazel_dep(name = "libpfm", version = "4.11.0")
@@ -19,7 +19,14 @@ bazel_dep(name = "libpfm", version = "4.11.0")
 # of relying on the changing default version from rules_python.
 
 python = use_extension("@rules_python//python/extensions:python.bzl", "python", dev_dependency = True)
+python.toolchain(python_version = "3.8")
 python.toolchain(python_version = "3.9")
+python.toolchain(python_version = "3.10")
+python.toolchain(python_version = "3.11")
+python.toolchain(
+    is_default = True,
+    python_version = "3.12",
+)
 
 pip = use_extension("@rules_python//python/extensions:pip.bzl", "pip", dev_dependency = True)
 pip.parse(
@@ -30,3 +37,5 @@ pip.parse(
 use_repo(pip, "tools_pip_deps")
 
 # -- bazel_dep definitions -- #
+
+bazel_dep(name = "nanobind_bazel", version = "1.0.0", dev_dependency = True)
diff --git a/WORKSPACE b/WORKSPACE
index 2562070225..503202465e 100644
--- a/WORKSPACE
+++ b/WORKSPACE
@@ -22,9 +22,3 @@ pip_parse(
 load("@tools_pip_deps//:requirements.bzl", "install_deps")
 
 install_deps()
-
-new_local_repository(
-    name = "python_headers",
-    build_file = "@//bindings/python:python_headers.BUILD",
-    path = "<PYTHON_INCLUDE_PATH>",  # May be overwritten by setup.py.
-)
diff --git a/bindings/python/BUILD b/bindings/python/BUILD
deleted file mode 100644
index d61dcb12a1..0000000000
--- a/bindings/python/BUILD
+++ /dev/null
@@ -1,3 +0,0 @@
-exports_files(glob(["*.BUILD"]))
-
-exports_files(["build_defs.bzl"])
diff --git a/bindings/python/build_defs.bzl b/bindings/python/build_defs.bzl
deleted file mode 100644
index b0c1b0f580..0000000000
--- a/bindings/python/build_defs.bzl
+++ /dev/null
@@ -1,29 +0,0 @@
-"""
-This file contains some build definitions for C++ extensions used in the Google Benchmark Python bindings.
-"""
-
-_SHARED_LIB_SUFFIX = {
-    "//conditions:default": ".so",
-    "//:windows": ".dll",
-}
-
-def py_extension(name, srcs, hdrs = [], copts = [], features = [], deps = []):
-    for shared_lib_suffix in _SHARED_LIB_SUFFIX.values():
-        shared_lib_name = name + shared_lib_suffix
-        native.cc_binary(
-            name = shared_lib_name,
-            linkshared = True,
-            linkstatic = True,
-            srcs = srcs + hdrs,
-            copts = copts,
-            features = features,
-            deps = deps,
-        )
-
-    return native.py_library(
-        name = name,
-        data = select({
-            platform: [name + shared_lib_suffix]
-            for platform, shared_lib_suffix in _SHARED_LIB_SUFFIX.items()
-        }),
-    )
diff --git a/bindings/python/google_benchmark/BUILD b/bindings/python/google_benchmark/BUILD
index f516a693eb..0c8e3c103f 100644
--- a/bindings/python/google_benchmark/BUILD
+++ b/bindings/python/google_benchmark/BUILD
@@ -1,4 +1,4 @@
-load("//bindings/python:build_defs.bzl", "py_extension")
+load("@nanobind_bazel//:build_defs.bzl", "nanobind_extension")
 
 py_library(
     name = "google_benchmark",
@@ -9,22 +9,10 @@ py_library(
     ],
 )
 
-py_extension(
+nanobind_extension(
     name = "_benchmark",
     srcs = ["benchmark.cc"],
-    copts = [
-        "-fexceptions",
-        "-fno-strict-aliasing",
-    ],
-    features = [
-        "-use_header_modules",
-        "-parse_headers",
-    ],
-    deps = [
-        "//:benchmark",
-        "@nanobind",
-        "@python_headers",
-    ],
+    deps = ["//:benchmark"],
 )
 
 py_test(
diff --git a/bindings/python/google_benchmark/__init__.py b/bindings/python/google_benchmark/__init__.py
index e14769f451..c1393b4e58 100644
--- a/bindings/python/google_benchmark/__init__.py
+++ b/bindings/python/google_benchmark/__init__.py
@@ -26,6 +26,7 @@ def my_benchmark(state):
   if __name__ == '__main__':
     benchmark.main()
 """
+
 import atexit
 
 from absl import app
diff --git a/bindings/python/nanobind.BUILD b/bindings/python/nanobind.BUILD
deleted file mode 100644
index 9874b80d1f..0000000000
--- a/bindings/python/nanobind.BUILD
+++ /dev/null
@@ -1,59 +0,0 @@
-load("@bazel_skylib//lib:selects.bzl", "selects")
-
-licenses(["notice"])
-
-package(default_visibility = ["//visibility:public"])
-
-config_setting(
-    name = "msvc_compiler",
-    flag_values = {"@bazel_tools//tools/cpp:compiler": "msvc-cl"},
-)
-
-selects.config_setting_group(
-    name = "winplusmsvc",
-    match_all = [
-        "@platforms//os:windows",
-        ":msvc_compiler",
-    ],
-)
-
-cc_library(
-    name = "nanobind",
-    srcs = glob([
-        "src/*.cpp",
-    ]),
-    additional_linker_inputs = select({
-        "@platforms//os:macos": [":cmake/darwin-ld-cpython.sym"],
-        "//conditions:default": [],
-    }),
-    copts = select({
-        ":msvc_compiler": [
-            "/EHsc",  # exceptions
-            "/Os",  # size optimizations
-            "/GL",  # LTO / whole program optimization
-        ],
-        # these should work on both clang and gcc.
-        "//conditions:default": [
-            "-fexceptions",
-            "-flto",
-            "-Os",
-        ],
-    }),
-    includes = [
-        "ext/robin_map/include",
-        "include",
-    ],
-    linkopts = select({
-        ":winplusmsvc": ["/LTGC"],  # Windows + MSVC.
-        "@platforms//os:macos": ["-Wl,@$(location :cmake/darwin-ld-cpython.sym)"],  # Apple.
-        "//conditions:default": [],
-    }),
-    textual_hdrs = glob(
-        [
-            "include/**/*.h",
-            "src/*.h",
-            "ext/robin_map/include/tsl/*.h",
-        ],
-    ),
-    deps = ["@python_headers"],
-)
diff --git a/bindings/python/python_headers.BUILD b/bindings/python/python_headers.BUILD
deleted file mode 100644
index 8f139f8621..0000000000
--- a/bindings/python/python_headers.BUILD
+++ /dev/null
@@ -1,10 +0,0 @@
-licenses(["notice"])
-
-package(default_visibility = ["//visibility:public"])
-
-cc_library(
-    name = "python_headers",
-    hdrs = glob(["**/*.h"]),
-    includes = ["."],
-    visibility = ["//visibility:public"],
-)
diff --git a/cmake/benchmark_main.pc.in b/cmake/benchmark_main.pc.in
new file mode 100644
index 0000000000..a90f3cd060
--- /dev/null
+++ b/cmake/benchmark_main.pc.in
@@ -0,0 +1,7 @@
+libdir=@CMAKE_INSTALL_FULL_LIBDIR@
+
+Name: @PROJECT_NAME@
+Description: Google microbenchmark framework (with main() function)
+Version: @VERSION@
+Requires: benchmark
+Libs: -L${libdir} -lbenchmark_main
diff --git a/pyproject.toml b/pyproject.toml
index aa24ae8c3f..62507a8703 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -75,11 +75,12 @@ src = ["bindings/python"]
 line-length = 80
 target-version = "py311"
 
+[tool.ruff.lint]
 # Enable pycodestyle (`E`, `W`), Pyflakes (`F`), and isort (`I`) codes by default.
 select = ["E", "F", "I", "W"]
 ignore = [
     "E501", # line too long
 ]
 
-[tool.ruff.isort]
+[tool.ruff.lint.isort]
 combine-as-imports = true
diff --git a/setup.py b/setup.py
index cb20042da5..40cdc8d339 100644
--- a/setup.py
+++ b/setup.py
@@ -1,46 +1,70 @@
 import contextlib
 import os
 import platform
+import re
 import shutil
-import sysconfig
 from pathlib import Path
-from typing import Generator
+from typing import Any, Generator
 
 import setuptools
 from setuptools.command import build_ext
 
-PYTHON_INCLUDE_PATH_PLACEHOLDER = "<PYTHON_INCLUDE_PATH>"
-
 IS_WINDOWS = platform.system() == "Windows"
 IS_MAC = platform.system() == "Darwin"
+IS_LINUX = platform.system() == "Linux"
+
+# hardcoded SABI-related options. Requires that each Python interpreter
+# (hermetic or not) participating is of the same major-minor version.
+version_tuple = tuple(int(i) for i in platform.python_version_tuple())
+py_limited_api = version_tuple >= (3, 12)
+options = {"bdist_wheel": {"py_limited_api": "cp312"}} if py_limited_api else {}
+
+
+def is_cibuildwheel() -> bool:
+    return os.getenv("CIBUILDWHEEL") is not None
 
 
 @contextlib.contextmanager
-def temp_fill_include_path(fp: str) -> Generator[None, None, None]:
-    """Temporarily set the Python include path in a file."""
-    with open(fp, "r+") as f:
-        try:
-            content = f.read()
-            replaced = content.replace(
-                PYTHON_INCLUDE_PATH_PLACEHOLDER,
-                Path(sysconfig.get_paths()["include"]).as_posix(),
+def _maybe_patch_toolchains() -> Generator[None, None, None]:
+    """
+    Patch rules_python toolchains to ignore root user error
+    when run in a Docker container on Linux in cibuildwheel.
+    """
+
+    def fmt_toolchain_args(matchobj):
+        suffix = "ignore_root_user_error = True"
+        callargs = matchobj.group(1)
+        # toolchain def is broken over multiple lines
+        if callargs.endswith("\n"):
+            callargs = callargs + "    " + suffix + ",\n"
+        # toolchain def is on one line.
+        else:
+            callargs = callargs + ", " + suffix
+        return "python.toolchain(" + callargs + ")"
+
+    CIBW_LINUX = is_cibuildwheel() and IS_LINUX
+    try:
+        if CIBW_LINUX:
+            module_bazel = Path("MODULE.bazel")
+            content: str = module_bazel.read_text()
+            module_bazel.write_text(
+                re.sub(
+                    r"python.toolchain\(([\w\"\s,.=]*)\)",
+                    fmt_toolchain_args,
+                    content,
+                )
             )
-            f.seek(0)
-            f.write(replaced)
-            f.truncate()
-            yield
-        finally:
-            # revert to the original content after exit
-            f.seek(0)
-            f.write(content)
-            f.truncate()
+        yield
+    finally:
+        if CIBW_LINUX:
+            module_bazel.write_text(content)
 
 
 class BazelExtension(setuptools.Extension):
     """A C/C++ extension that is defined as a Bazel BUILD target."""
 
-    def __init__(self, name: str, bazel_target: str):
-        super().__init__(name=name, sources=[])
+    def __init__(self, name: str, bazel_target: str, **kwargs: Any):
+        super().__init__(name=name, sources=[], **kwargs)
 
         self.bazel_target = bazel_target
         stripped_target = bazel_target.split("//")[-1]
@@ -67,49 +91,48 @@ def copy_extensions_to_source(self):
 
     def bazel_build(self, ext: BazelExtension) -> None:
         """Runs the bazel build to create the package."""
-        with temp_fill_include_path("WORKSPACE"):
-            temp_path = Path(self.build_temp)
-
-            bazel_argv = [
-                "bazel",
-                "build",
-                ext.bazel_target,
-                "--enable_bzlmod=false",
-                f"--symlink_prefix={temp_path / 'bazel-'}",
-                f"--compilation_mode={'dbg' if self.debug else 'opt'}",
-                # C++17 is required by nanobind
-                f"--cxxopt={'/std:c++17' if IS_WINDOWS else '-std=c++17'}",
-            ]
-
-            if IS_WINDOWS:
-                # Link with python*.lib.
-                for library_dir in self.library_dirs:
-                    bazel_argv.append("--linkopt=/LIBPATH:" + library_dir)
-            elif IS_MAC:
-                if platform.machine() == "x86_64":
-                    # C++17 needs macOS 10.14 at minimum
-                    bazel_argv.append("--macos_minimum_os=10.14")
-
-                    # cross-compilation for Mac ARM64 on GitHub Mac x86 runners.
-                    # ARCHFLAGS is set by cibuildwheel before macOS wheel builds.
-                    archflags = os.getenv("ARCHFLAGS", "")
-                    if "arm64" in archflags:
-                        bazel_argv.append("--cpu=darwin_arm64")
-                        bazel_argv.append("--macos_cpus=arm64")
-
-                elif platform.machine() == "arm64":
-                    bazel_argv.append("--macos_minimum_os=11.0")
-
+        temp_path = Path(self.build_temp)
+        # omit the patch version to avoid build errors if the toolchain is not
+        # yet registered in the current @rules_python version.
+        # patch version differences should be fine.
+        python_version = ".".join(platform.python_version_tuple()[:2])
+
+        bazel_argv = [
+            "bazel",
+            "build",
+            ext.bazel_target,
+            f"--symlink_prefix={temp_path / 'bazel-'}",
+            f"--compilation_mode={'dbg' if self.debug else 'opt'}",
+            # C++17 is required by nanobind
+            f"--cxxopt={'/std:c++17' if IS_WINDOWS else '-std=c++17'}",
+            f"--@rules_python//python/config_settings:python_version={python_version}",
+        ]
+
+        if ext.py_limited_api:
+            bazel_argv += ["--@nanobind_bazel//:py-limited-api=cp312"]
+
+        if IS_WINDOWS:
+            # Link with python*.lib.
+            for library_dir in self.library_dirs:
+                bazel_argv.append("--linkopt=/LIBPATH:" + library_dir)
+        elif IS_MAC:
+            # C++17 needs macOS 10.14 at minimum
+            bazel_argv.append("--macos_minimum_os=10.14")
+
+        with _maybe_patch_toolchains():
             self.spawn(bazel_argv)
 
-            shared_lib_suffix = ".dll" if IS_WINDOWS else ".so"
-            ext_name = ext.target_name + shared_lib_suffix
-            ext_bazel_bin_path = (
-                temp_path / "bazel-bin" / ext.relpath / ext_name
-            )
+        if IS_WINDOWS:
+            suffix = ".pyd"
+        else:
+            suffix = ".abi3.so" if ext.py_limited_api else ".so"
 
-            ext_dest_path = Path(self.get_ext_fullpath(ext.name))
-            shutil.copyfile(ext_bazel_bin_path, ext_dest_path)
+        ext_name = ext.target_name + suffix
+        ext_bazel_bin_path = temp_path / "bazel-bin" / ext.relpath / ext_name
+        ext_dest_path = Path(self.get_ext_fullpath(ext.name)).with_name(
+            ext_name
+        )
+        shutil.copyfile(ext_bazel_bin_path, ext_dest_path)
 
 
 setuptools.setup(
@@ -118,6 +141,8 @@ def bazel_build(self, ext: BazelExtension) -> None:
         BazelExtension(
             name="google_benchmark._benchmark",
             bazel_target="//bindings/python/google_benchmark:_benchmark",
+            py_limited_api=py_limited_api,
         )
     ],
+    options=options,
 )
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 943594b70b..5551099b2a 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -86,6 +86,7 @@ set(generated_dir "${PROJECT_BINARY_DIR}")
 set(version_config "${generated_dir}/${PROJECT_NAME}ConfigVersion.cmake")
 set(project_config "${generated_dir}/${PROJECT_NAME}Config.cmake")
 set(pkg_config "${generated_dir}/${PROJECT_NAME}.pc")
+set(pkg_config_main "${generated_dir}/${PROJECT_NAME}_main.pc")
 set(targets_to_export benchmark benchmark_main)
 set(targets_export_name "${PROJECT_NAME}Targets")
 
@@ -105,6 +106,7 @@ write_basic_package_version_file(
 )
 
 configure_file("${PROJECT_SOURCE_DIR}/cmake/benchmark.pc.in" "${pkg_config}" @ONLY)
+configure_file("${PROJECT_SOURCE_DIR}/cmake/benchmark_main.pc.in" "${pkg_config_main}" @ONLY)
 
 export (
   TARGETS ${targets_to_export}
@@ -133,7 +135,7 @@ if (BENCHMARK_ENABLE_INSTALL)
       DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}")
 
   install(
-      FILES "${pkg_config}"
+      FILES "${pkg_config}" "${pkg_config_main}"
       DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig")
 
   install(
diff --git a/src/benchmark.cc b/src/benchmark.cc
index 3a856433d7..dc2fa4e701 100644
--- a/src/benchmark.cc
+++ b/src/benchmark.cc
@@ -152,8 +152,16 @@ BENCHMARK_EXPORT std::map<std::string, std::string>*& GetGlobalContext() {
   return global_context;
 }
 
-// FIXME: wouldn't LTO mess this up?
-void UseCharPointer(char const volatile*) {}
+static void const volatile* volatile global_force_escape_pointer;
+
+// FIXME: Verify if LTO still messes this up?
+void UseCharPointer(char const volatile* const v) {
+  // We want to escape the pointer `v` so that the compiler can not eliminate
+  // computations that produced it. To do that, we escape the pointer by storing
+  // it into a volatile variable, since generally, volatile store, is not
+  // something the compiler is allowed to elide.
+  global_force_escape_pointer = reinterpret_cast<void const volatile*>(v);
+}
 
 }  // namespace internal
 
@@ -399,7 +407,8 @@ void RunBenchmarks(const std::vector<BenchmarkInstance>& benchmarks,
       benchmarks_with_threads += (benchmark.threads() > 1);
       runners.emplace_back(benchmark, &perfcounters, reports_for_family);
       int num_repeats_of_this_instance = runners.back().GetNumRepeats();
-      num_repetitions_total += num_repeats_of_this_instance;
+      num_repetitions_total +=
+          static_cast<size_t>(num_repeats_of_this_instance);
       if (reports_for_family)
         reports_for_family->num_runs_total += num_repeats_of_this_instance;
     }
@@ -746,7 +755,13 @@ int InitializeStreams() {
 
 }  // end namespace internal
 
-std::string GetBenchmarkVersion() { return {BENCHMARK_VERSION}; }
+std::string GetBenchmarkVersion() {
+#ifdef BENCHMARK_VERSION
+  return {BENCHMARK_VERSION};
+#else
+  return {""};
+#endif
+}
 
 void PrintDefaultHelp() {
   fprintf(stdout,
diff --git a/src/benchmark_register.cc b/src/benchmark_register.cc
index e447c9a2d3..8ade048225 100644
--- a/src/benchmark_register.cc
+++ b/src/benchmark_register.cc
@@ -482,8 +482,9 @@ int Benchmark::ArgsCnt() const {
 
 const char* Benchmark::GetArgName(int arg) const {
   BM_CHECK_GE(arg, 0);
-  BM_CHECK_LT(arg, static_cast<int>(arg_names_.size()));
-  return arg_names_[arg].c_str();
+  size_t uarg = static_cast<size_t>(arg);
+  BM_CHECK_LT(uarg, arg_names_.size());
+  return arg_names_[uarg].c_str();
 }
 
 TimeUnit Benchmark::GetTimeUnit() const {
diff --git a/src/benchmark_register.h b/src/benchmark_register.h
index 53367c707c..be50265f72 100644
--- a/src/benchmark_register.h
+++ b/src/benchmark_register.h
@@ -24,7 +24,7 @@ typename std::vector<T>::iterator AddPowers(std::vector<T>* dst, T lo, T hi,
   static const T kmax = std::numeric_limits<T>::max();
 
   // Space out the values in multiples of "mult"
-  for (T i = static_cast<T>(1); i <= hi; i *= static_cast<T>(mult)) {
+  for (T i = static_cast<T>(1); i <= hi; i = static_cast<T>(i * mult)) {
     if (i >= lo) {
       dst->push_back(i);
     }
@@ -52,7 +52,7 @@ void AddNegatedPowers(std::vector<T>* dst, T lo, T hi, int mult) {
 
   const auto it = AddPowers(dst, hi_complement, lo_complement, mult);
 
-  std::for_each(it, dst->end(), [](T& t) { t *= -1; });
+  std::for_each(it, dst->end(), [](T& t) { t = static_cast<T>(t * -1); });
   std::reverse(it, dst->end());
 }
 
diff --git a/src/benchmark_runner.cc b/src/benchmark_runner.cc
index dcddb437e3..a74bdadd3e 100644
--- a/src/benchmark_runner.cc
+++ b/src/benchmark_runner.cc
@@ -235,7 +235,7 @@ BenchmarkRunner::BenchmarkRunner(
       has_explicit_iteration_count(b.iterations() != 0 ||
                                    parsed_benchtime_flag.tag ==
                                        BenchTimeType::ITERS),
-      pool(b.threads() - 1),
+      pool(static_cast<size_t>(b.threads() - 1)),
       iters(has_explicit_iteration_count
                 ? ComputeIters(b_, parsed_benchtime_flag)
                 : 1),
diff --git a/src/cycleclock.h b/src/cycleclock.h
index eff563e7fa..a25843760b 100644
--- a/src/cycleclock.h
+++ b/src/cycleclock.h
@@ -70,7 +70,7 @@ inline BENCHMARK_ALWAYS_INLINE int64_t Now() {
   // frequency scaling).  Also note that when the Mac sleeps, this
   // counter pauses; it does not continue counting, nor does it
   // reset to zero.
-  return mach_absolute_time();
+  return static_cast<int64_t>(mach_absolute_time());
 #elif defined(BENCHMARK_OS_EMSCRIPTEN)
   // this goes above x86-specific code because old versions of Emscripten
   // define __x86_64__, although they have nothing to do with it.
@@ -82,7 +82,7 @@ inline BENCHMARK_ALWAYS_INLINE int64_t Now() {
 #elif defined(__x86_64__) || defined(__amd64__)
   uint64_t low, high;
   __asm__ volatile("rdtsc" : "=a"(low), "=d"(high));
-  return (high << 32) | low;
+  return static_cast<int64_t>((high << 32) | low);
 #elif defined(__powerpc__) || defined(__ppc__)
   // This returns a time-base, which is not always precisely a cycle-count.
 #if defined(__powerpc64__) || defined(__ppc64__)
@@ -181,10 +181,11 @@ inline BENCHMARK_ALWAYS_INLINE int64_t Now() {
 #elif defined(__s390__)  // Covers both s390 and s390x.
   // Return the CPU clock.
   uint64_t tsc;
-#if defined(BENCHMARK_OS_ZOS) && defined(COMPILER_IBMXL)
-  // z/OS XL compiler HLASM syntax.
+#if defined(BENCHMARK_OS_ZOS)
+  // z/OS HLASM syntax.
   asm(" stck %0" : "=m"(tsc) : : "cc");
 #else
+  // Linux on Z syntax.
   asm("stck %0" : "=Q"(tsc) : : "cc");
 #endif
   return tsc;
diff --git a/src/internal_macros.h b/src/internal_macros.h
index 8dd7d0c650..f4894ba8e6 100644
--- a/src/internal_macros.h
+++ b/src/internal_macros.h
@@ -11,11 +11,7 @@
 #endif
 
 #if defined(__clang__)
-  #if defined(__ibmxl__)
-    #if !defined(COMPILER_IBMXL)
-      #define COMPILER_IBMXL
-    #endif
-  #elif !defined(COMPILER_CLANG)
+  #if !defined(COMPILER_CLANG)
     #define COMPILER_CLANG
   #endif
 #elif defined(_MSC_VER)
diff --git a/src/perf_counters.cc b/src/perf_counters.cc
index d466e27e86..2eb97eb46a 100644
--- a/src/perf_counters.cc
+++ b/src/perf_counters.cc
@@ -39,7 +39,8 @@ size_t PerfCounterValues::Read(const std::vector<int>& leaders) {
     auto read_bytes = ::read(lead, ptr, size);
     if (read_bytes >= ssize_t(sizeof(uint64_t))) {
       // Actual data bytes are all bytes minus initial padding
-      std::size_t data_bytes = read_bytes - sizeof(uint64_t);
+      std::size_t data_bytes =
+          static_cast<std::size_t>(read_bytes) - sizeof(uint64_t);
       // This should be very cheap since it's in hot cache
       std::memmove(ptr, ptr + sizeof(uint64_t), data_bytes);
       // Increment our counters
diff --git a/src/statistics.cc b/src/statistics.cc
index 261dcb299a..16b60261fd 100644
--- a/src/statistics.cc
+++ b/src/statistics.cc
@@ -97,7 +97,7 @@ std::vector<BenchmarkReporter::Run> ComputeStats(
   auto error_count = std::count_if(reports.begin(), reports.end(),
                                    [](Run const& run) { return run.skipped; });
 
-  if (reports.size() - error_count < 2) {
+  if (reports.size() - static_cast<size_t>(error_count) < 2) {
     // We don't report aggregated data if there was a single run.
     return results;
   }
@@ -179,7 +179,7 @@ std::vector<BenchmarkReporter::Run> ComputeStats(
     // Similarly, if there are N repetitions with 1 iterations each,
     // an aggregate will be computed over N measurements, not 1.
     // Thus it is best to simply use the count of separate reports.
-    data.iterations = reports.size();
+    data.iterations = static_cast<IterationCount>(reports.size());
 
     data.real_accumulated_time = Stat.compute_(real_accumulated_time_stat);
     data.cpu_accumulated_time = Stat.compute_(cpu_accumulated_time_stat);
diff --git a/src/string_util.cc b/src/string_util.cc
index c69e40a813..9ba63a700a 100644
--- a/src/string_util.cc
+++ b/src/string_util.cc
@@ -56,7 +56,7 @@ void ToExponentAndMantissa(double val, int precision, double one_k,
       scaled /= one_k;
       if (scaled <= big_threshold) {
         mantissa_stream << scaled;
-        *exponent = i + 1;
+        *exponent = static_cast<int64_t>(i + 1);
         *mantissa = mantissa_stream.str();
         return;
       }
diff --git a/src/sysinfo.cc b/src/sysinfo.cc
index daeb98b026..7261e2a96b 100644
--- a/src/sysinfo.cc
+++ b/src/sysinfo.cc
@@ -162,7 +162,7 @@ ValueUnion GetSysctlImp(std::string const& name) {
       mib[1] = HW_CPUSPEED;
     }
 
-    if (sysctl(mib, 2, buff.data(), &buff.Size, nullptr, 0) == -1) {
+    if (sysctl(mib, 2, buff.data(), &buff.size, nullptr, 0) == -1) {
       return ValueUnion();
     }
     return buff;
@@ -350,7 +350,7 @@ std::vector<CPUInfo::CacheInfo> GetCacheSizesWindows() {
     CPUInfo::CacheInfo C;
     C.num_sharing = static_cast<int>(b.count());
     C.level = cache.Level;
-    C.size = cache.Size;
+    C.size = static_cast<int>(cache.Size);
     C.type = "Unknown";
     switch (cache.Type) {
       case CacheUnified:
@@ -485,9 +485,8 @@ int GetNumCPUsImpl() {
   // positives.
   std::memset(&sysinfo, 0, sizeof(SYSTEM_INFO));
   GetSystemInfo(&sysinfo);
-  return sysinfo.dwNumberOfProcessors;  // number of logical
-                                        // processors in the current
-                                        // group
+  // number of logical processors in the current group
+  return static_cast<int>(sysinfo.dwNumberOfProcessors);
 #elif defined(BENCHMARK_OS_SOLARIS)
   // Returns -1 in case of a failure.
   long num_cpu = sysconf(_SC_NPROCESSORS_ONLN);
@@ -735,9 +734,9 @@ double GetCPUCyclesPerSecond(CPUInfo::Scaling scaling) {
 #endif
   unsigned long long hz = 0;
 #if defined BENCHMARK_OS_OPENBSD
-  if (GetSysctl(freqStr, &hz)) return hz * 1000000;
+  if (GetSysctl(freqStr, &hz)) return static_cast<double>(hz * 1000000);
 #else
-  if (GetSysctl(freqStr, &hz)) return hz;
+  if (GetSysctl(freqStr, &hz)) return static_cast<double>(hz);
 #endif
   fprintf(stderr, "Unable to determine clock rate from sysctl: %s: %s\n",
           freqStr, strerror(errno));
@@ -837,7 +836,7 @@ std::vector<double> GetLoadAvg() {
     !(defined(__ANDROID__) && __ANDROID_API__ < 29)
   static constexpr int kMaxSamples = 3;
   std::vector<double> res(kMaxSamples, 0.0);
-  const int nelem = getloadavg(res.data(), kMaxSamples);
+  const size_t nelem = static_cast<size_t>(getloadavg(res.data(), kMaxSamples));
   if (nelem < 1) {
     res.clear();
   } else {
diff --git a/src/timers.cc b/src/timers.cc
index 667e7b2eef..d0821f3166 100644
--- a/src/timers.cc
+++ b/src/timers.cc
@@ -245,9 +245,9 @@ std::string LocalDateTimeString() {
       tz_offset_sign = '-';
     }
 
-    tz_len =
+    tz_len = static_cast<size_t>(
         ::snprintf(tz_offset, sizeof(tz_offset), "%c%02li:%02li",
-                   tz_offset_sign, offset_minutes / 100, offset_minutes % 100);
+                   tz_offset_sign, offset_minutes / 100, offset_minutes % 100));
     BM_CHECK(tz_len == kTzOffsetLen);
     ((void)tz_len);  // Prevent unused variable warning in optimized build.
   } else {
diff --git a/test/BUILD b/test/BUILD
index e43b802350..b245fa7622 100644
--- a/test/BUILD
+++ b/test/BUILD
@@ -21,6 +21,7 @@ TEST_COPTS = [
     ## assert() are used a lot in tests upstream, which may be optimised out leading to
     ## unused-variable warning.
     "-Wno-unused-variable",
+    "-Werror=old-style-cast",
 ]
 
 # Some of the issues with DoNotOptimize only occur when optimization is enabled
diff --git a/test/benchmark_gtest.cc b/test/benchmark_gtest.cc
index 2c9e555d92..0aa2552c1e 100644
--- a/test/benchmark_gtest.cc
+++ b/test/benchmark_gtest.cc
@@ -38,7 +38,7 @@ TEST(AddRangeTest, Advanced64) {
 
 TEST(AddRangeTest, FullRange8) {
   std::vector<int8_t> dst;
-  AddRange(&dst, int8_t{1}, std::numeric_limits<int8_t>::max(), int8_t{8});
+  AddRange(&dst, int8_t{1}, std::numeric_limits<int8_t>::max(), 8);
   EXPECT_THAT(
       dst, testing::ElementsAre(int8_t{1}, int8_t{8}, int8_t{64}, int8_t{127}));
 }
diff --git a/test/complexity_test.cc b/test/complexity_test.cc
index 0c159cd27d..0729d15aa7 100644
--- a/test/complexity_test.cc
+++ b/test/complexity_test.cc
@@ -71,11 +71,11 @@ void BM_Complexity_O1(benchmark::State &state) {
   for (auto _ : state) {
     // This test requires a non-zero CPU time to avoid divide-by-zero
     benchmark::DoNotOptimize(state.iterations());
-    double tmp = state.iterations();
+    double tmp = static_cast<double>(state.iterations());
     benchmark::DoNotOptimize(tmp);
     for (benchmark::IterationCount i = 0; i < state.iterations(); ++i) {
       benchmark::DoNotOptimize(state.iterations());
-      tmp *= state.iterations();
+      tmp *= static_cast<double>(state.iterations());
       benchmark::DoNotOptimize(tmp);
     }
 
@@ -120,16 +120,16 @@ void BM_Complexity_O_N(benchmark::State &state) {
   for (auto _ : state) {
     // This test requires a non-zero CPU time to avoid divide-by-zero
     benchmark::DoNotOptimize(state.iterations());
-    double tmp = state.iterations();
+    double tmp = static_cast<double>(state.iterations());
     benchmark::DoNotOptimize(tmp);
     for (benchmark::IterationCount i = 0; i < state.iterations(); ++i) {
       benchmark::DoNotOptimize(state.iterations());
-      tmp *= state.iterations();
+      tmp *= static_cast<double>(state.iterations());
       benchmark::DoNotOptimize(tmp);
     }
 
     // 1ns per iteration per entry
-    state.SetIterationTime(state.range(0) * 42 * 1e-9);
+    state.SetIterationTime(static_cast<double>(state.range(0)) * 42 * 1e-9);
   }
   state.SetComplexityN(state.range(0));
 }
@@ -178,16 +178,16 @@ static void BM_Complexity_O_N_log_N(benchmark::State &state) {
   for (auto _ : state) {
     // This test requires a non-zero CPU time to avoid divide-by-zero
     benchmark::DoNotOptimize(state.iterations());
-    double tmp = state.iterations();
+    double tmp = static_cast<double>(state.iterations());
     benchmark::DoNotOptimize(tmp);
     for (benchmark::IterationCount i = 0; i < state.iterations(); ++i) {
       benchmark::DoNotOptimize(state.iterations());
-      tmp *= state.iterations();
+      tmp *= static_cast<double>(state.iterations());
       benchmark::DoNotOptimize(tmp);
     }
 
-    state.SetIterationTime(state.range(0) * kLog2E * std::log(state.range(0)) *
-                           42 * 1e-9);
+    state.SetIterationTime(static_cast<double>(state.range(0)) * kLog2E *
+                           std::log(state.range(0)) * 42 * 1e-9);
   }
   state.SetComplexityN(state.range(0));
 }
@@ -238,15 +238,15 @@ void BM_ComplexityCaptureArgs(benchmark::State &state, int n) {
   for (auto _ : state) {
     // This test requires a non-zero CPU time to avoid divide-by-zero
     benchmark::DoNotOptimize(state.iterations());
-    double tmp = state.iterations();
+    double tmp = static_cast<double>(state.iterations());
     benchmark::DoNotOptimize(tmp);
     for (benchmark::IterationCount i = 0; i < state.iterations(); ++i) {
       benchmark::DoNotOptimize(state.iterations());
-      tmp *= state.iterations();
+      tmp *= static_cast<double>(state.iterations());
       benchmark::DoNotOptimize(tmp);
     }
 
-    state.SetIterationTime(state.range(0) * 42 * 1e-9);
+    state.SetIterationTime(static_cast<double>(state.range(0)) * 42 * 1e-9);
   }
   state.SetComplexityN(n);
 }
diff --git a/tools/gbench/util.py b/tools/gbench/util.py
index 4d061a3a1e..1119a1a2ca 100644
--- a/tools/gbench/util.py
+++ b/tools/gbench/util.py
@@ -1,5 +1,5 @@
-"""util.py - General utilities for running, loading, and processing benchmarks
-"""
+"""util.py - General utilities for running, loading, and processing benchmarks"""
+
 import json
 import os
 import re
@@ -37,7 +37,7 @@ def is_executable_file(filename):
     elif sys.platform.startswith("win"):
         return magic_bytes == b"MZ"
     else:
-        return magic_bytes == b"\x7FELF"
+        return magic_bytes == b"\x7fELF"
 
 
 def is_json_file(filename):