What is the correct or most robust way to tell from Python if an imported module comes from a C extension as opposed to a pure Python module? This is useful, for example, i
@Cecil Curry's function is excellent. Two minor comments: firsly, the _elementtree
example raises a TypeError
with my copy of Python 3.5.6. Secondly, as @crld points out, it's also helpful to know if a module contains C extensions, but a more portable version might help. More generic versions (with Python 3.6+ f-string syntax) may therefore be:
from importlib.machinery import ExtensionFileLoader, EXTENSION_SUFFIXES
import inspect
import logging
import os
import os.path
import pkgutil
from types import ModuleType
from typing import List
log = logging.getLogger(__name__)
def is_builtin_module(module: ModuleType) -> bool:
"""
Is this module a built-in module, like ``os``?
Method is as per :func:`inspect.getfile`.
"""
return not hasattr(module, "__file__")
def is_module_a_package(module: ModuleType) -> bool:
assert inspect.ismodule(module)
return os.path.basename(inspect.getfile(module)) == "__init__.py"
def is_c_extension(module: ModuleType) -> bool:
"""
Modified from
https://stackoverflow.com/questions/20339053/in-python-how-can-one-tell-if-a-module-comes-from-a-c-extension.
``True`` only if the passed module is a C extension implemented as a
dynamically linked shared library specific to the current platform.
Args:
module: Previously imported module object to be tested.
Returns:
bool: ``True`` only if this module is a C extension.
Examples:
.. code-block:: python
from cardinal_pythonlib.modules import is_c_extension
import os
import _elementtree as et
import numpy
import numpy.core.multiarray as numpy_multiarray
is_c_extension(os) # False
is_c_extension(numpy) # False
is_c_extension(et) # False on my system (Python 3.5.6). True in the original example.
is_c_extension(numpy_multiarray) # True
""" # noqa
assert inspect.ismodule(module), f'"{module}" not a module.'
# If this module was loaded by a PEP 302-compliant CPython-specific loader
# loading only C extensions, this module is a C extension.
if isinstance(getattr(module, '__loader__', None), ExtensionFileLoader):
return True
# If it's built-in, it's not a C extension.
if is_builtin_module(module):
return False
# Else, fallback to filetype matching heuristics.
#
# Absolute path of the file defining this module.
module_filename = inspect.getfile(module)
# "."-prefixed filetype of this path if any or the empty string otherwise.
module_filetype = os.path.splitext(module_filename)[1]
# This module is only a C extension if this path's filetype is that of a
# C extension specific to the current platform.
return module_filetype in EXTENSION_SUFFIXES
def contains_c_extension(module: ModuleType,
import_all_submodules: bool = True,
include_external_imports: bool = False,
seen: List[ModuleType] = None,
verbose: bool = False) -> bool:
"""
Extends :func:`is_c_extension` by asking: is this module, or any of its
submodules, a C extension?
Args:
module: Previously imported module object to be tested.
import_all_submodules: explicitly import all submodules of this module?
include_external_imports: check modules in other packages that this
module imports?
seen: used internally for recursion (to deal with recursive modules);
should be ``None`` when called by users
verbose: show working via log?
Returns:
bool: ``True`` only if this module or one of its submodules is a C
extension.
Examples:
.. code-block:: python
import logging
import _elementtree as et
import os
import arrow
import alembic
import django
import numpy
import numpy.core.multiarray as numpy_multiarray
log = logging.getLogger(__name__)
logging.basicConfig(level=logging.DEBUG) # be verbose
contains_c_extension(os) # False
contains_c_extension(et) # False
contains_c_extension(numpy) # True -- different from is_c_extension()
contains_c_extension(numpy_multiarray) # True
contains_c_extension(arrow) # False
contains_c_extension(alembic) # False
contains_c_extension(alembic, include_external_imports=True) # True
# ... this example shows that Alembic imports hashlib, which can import
# _hashlib, which is a C extension; however, that doesn't stop us (for
# example) installing Alembic on a machine with no C compiler
contains_c_extension(django)
""" # noqa
assert inspect.ismodule(module), f'"{module}" not a module.'
if seen is None: # only true for the top-level call
seen = [] # type: List[ModuleType]
if module in seen: # modules can "contain" themselves
# already inspected; avoid infinite loops
return False
seen.append(module)
# Check the thing we were asked about
is_c_ext = is_c_extension(module)
if verbose:
log.info(f"Is module {module!r} a C extension? {is_c_ext}")
if is_c_ext:
return True
if is_builtin_module(module):
# built-in, therefore we stop searching it
return False
# Now check any children, in a couple of ways
top_level_module = seen[0]
top_path = os.path.dirname(top_level_module.__file__)
# Recurse using dir(). This picks up modules that are automatically
# imported by our top-level model. But it won't pick up all submodules;
# try e.g. for django.
for candidate_name in dir(module):
candidate = getattr(module, candidate_name)
# noinspection PyBroadException
try:
if not inspect.ismodule(candidate):
# not a module
continue
except Exception:
# e.g. a Django module that won't import until we configure its
# settings
log.error(f"Failed to test ismodule() status of {candidate!r}")
continue
if is_builtin_module(candidate):
# built-in, therefore we stop searching it
continue
candidate_fname = getattr(candidate, "__file__")
if not include_external_imports:
if os.path.commonpath([top_path, candidate_fname]) != top_path:
if verbose:
log.debug(f"Skipping, not within the top-level module's "
f"directory: {candidate!r}")
continue
# Recurse:
if contains_c_extension(
module=candidate,
import_all_submodules=False, # only done at the top level, below # noqa
include_external_imports=include_external_imports,
seen=seen):
return True
if import_all_submodules:
if not is_module_a_package(module):
if verbose:
log.debug(f"Top-level module is not a package: {module!r}")
return False
# Otherwise, for things like Django, we need to recurse in a different
# way to scan everything.
# See https://stackoverflow.com/questions/3365740/how-to-import-all-submodules. # noqa
log.debug(f"Walking path: {top_path!r}")
try:
for loader, module_name, is_pkg in pkgutil.walk_packages([top_path]): # noqa
if not is_pkg:
log.debug(f"Skipping, not a package: {module_name!r}")
continue
log.debug(f"Manually importing: {module_name!r}")
# noinspection PyBroadException
try:
candidate = loader.find_module(module_name)\
.load_module(module_name) # noqa
except Exception:
# e.g. Alembic "autogenerate" gives: "ValueError: attempted
# relative import beyond top-level package"; or Django
# "django.core.exceptions.ImproperlyConfigured"
log.error(f"Package failed to import: {module_name!r}")
continue
if contains_c_extension(
module=candidate,
import_all_submodules=False, # only done at the top level # noqa
include_external_imports=include_external_imports,
seen=seen):
return True
except Exception:
log.error("Unable to walk packages further; no C extensions "
"detected so far!")
raise
return False
# noinspection PyUnresolvedReferences,PyTypeChecker
def test() -> None:
import _elementtree as et
import arrow
import alembic
import django
import django.conf
import numpy
import numpy.core.multiarray as numpy_multiarray
log.info(f"contains_c_extension(os): "
f"{contains_c_extension(os)}") # False
log.info(f"contains_c_extension(et): "
f"{contains_c_extension(et)}") # False
log.info(f"is_c_extension(numpy): "
f"{is_c_extension(numpy)}") # False
log.info(f"contains_c_extension(numpy): "
f"{contains_c_extension(numpy)}") # True
log.info(f"contains_c_extension(numpy_multiarray): "
f"{contains_c_extension(numpy_multiarray)}") # True # noqa
log.info(f"contains_c_extension(arrow): "
f"{contains_c_extension(arrow)}") # False
log.info(f"contains_c_extension(alembic): "
f"{contains_c_extension(alembic)}") # False
log.info(f"contains_c_extension(alembic, include_external_imports=True): "
f"{contains_c_extension(alembic, include_external_imports=True)}") # True # noqa
# ... this example shows that Alembic imports hashlib, which can import
# _hashlib, which is a C extension; however, that doesn't stop us (for
# example) installing Alembic on a machine with no C compiler
django.conf.settings.configure()
log.info(f"contains_c_extension(django): "
f"{contains_c_extension(django)}") # False
if __name__ == '__main__':
logging.basicConfig(level=logging.INFO) # be verbose
test()
First, I don't think this is at all useful. It's very common for modules to be pure-Python wrappers around a C extension module—or, in some cases, pure-Python wrappers around a C extension module if it's available, or a pure Python implementation if not.
For some popular third-party examples: numpy
is pure Python, even though everything important is implemented in C; bintrees
is pure Python, even though its classes may all be implemented either in C or in Python depending on how you build it; etc.
And this is true in most of the stdlib from 3.2 on. For example, if you just import pickle
, the implementation classes will be built in C (what you used to get from cpickle
in 2.7) in CPython, while they'll be pure-Python versions in PyPy, but either way pickle
itself is pure Python.
But if you do want to do this, you actually need to distinguish three things:
sys
.cpickle
.pickle
.And that's assuming you only care about CPython; if your code runs in, say, Jython, or IronPython, the implementation could be JVM or .NET rather than native code.
You can't distinguish perfectly based on __file__
, for a number of reasons:
__file__
at all. (This is documented in a few places—e.g., the Types and members table in the inspect
docs.) Note that if you're using something like py2app
or cx_freeze
, what counts as "built-in" may be different from a standalone installation.easy_install
, less so with pip
) will have either a blank or useless __file__
.In 3.1+, the import process has been massively cleaned up, mostly rewritten in Python, and mostly exposed to the Python layer.
So, you can use the importlib module to see the chain of loaders used to load a module, and ultimately you'll get to BuiltinImporter
(builtins), ExtensionFileLoader
(.so/.pyd/etc.), SourceFileLoader
(.py), or SourcelessFileLoader
(.pyc/.pyo).
You can also see the suffixes assigned to each of the four, on the current target platform, as constants in importlib.machinery
. So, you could check that the any(pathname.endswith(suffix) for suffix in importlib.machinery.EXTENSION_SUFFIXES))
, but that won't actually help in, e.g., the egg/zip case unless you've already traveled up the chain anyway.
The best heuristics anyone has come up with for this are the ones implemented in the inspect
module, so the best thing to do is to use that.
The best choice will be one or more of getsource
, getsourcefile
, and getfile
; which is best depends on which heuristics you want.
A built-in module will raise a TypeError
for any of them.
An extension module ought to return an empty string for getsourcefile
. This seems to work in all the 2.5-3.4 versions I have, but I don't have 2.4 around. For getsource
, at least in some versions, it returns the actual bytes of the .so file, even though it should be returning an empty string or raising an IOError
. (In 3.x, you will almost certainly get a UnicodeError
or SyntaxError
, but you probably don't want to rely on that…)
Pure Python modules may return an empty string for getsourcefile
if in an egg/zip/etc. They should always return a non-empty string for getsource
if source is available, even inside an egg/zip/etc., but if they're sourceless bytecode (.pyc/etc.) they will return an empty string or raise an IOError.
The best bet is to experiment with the version you care about on the platform(s) you care about in the distribution/setup(s) you care about.
While Cecil Curry's answer works (and was very informative, as was abarnert's, I might add) it will return False for the "top level" of a module even if it includes sub-modules that use the C extension (e.g. numpy vs. numpy.core.multiarray).
While probably not as robust as it could be, the following is working for my use current use cases:
def is_c(module):
# if module is part of the main python library (e.g. os), it won't have a path
try:
for path, subdirs, files in os.walk(module.__path__[0]):
for f in files:
ftype = f.split('.')[-1]
if ftype == 'so':
is_c = True
break
return is_c
except AttributeError:
path = inspect.getfile(module)
suffix = path.split('.')[-1]
if suffix != 'so':
return False
elif suffix == 'so':
return True
is_c(os), is_c(im), is_c(et), is_c_extension(ma), is_c(numpy)
# (False, False, True, True, True)
tl;dr
See the "In Search of Perfection" subsection below for the well-tested answer.
As a pragmatic counterpoint to abarnert's helpful analysis of the subtlety involved in portably identifying C extensions, Stack Overflow Productions™ presents... an actual answer.
The capacity to reliably differentiate C extensions from non-C extensions is incredibly useful, without which the Python community would be impoverished. Real-world use cases include:
We can all agree that freezing, optimization, and minimizing end user complaints are useful. Ergo, identifying C extensions is useful.
I also disagree with abarnert's penultimate conclusion that:
The best heuristics anyone has come up with for this are the ones implemented in the
inspect
module, so the best thing to do is to use that.
No. The best heuristics anyone has come up with for this are those given below. All stdlib modules (including but not limited to inspect
) are useless for this purpose. Specifically:
inspect.getsource()
and inspect.getsourcefile()
functions ambiguously return None
for both C extensions (which understandably have no pure-Python source) and other types of modules that also have no pure-Python source (e.g., bytecode-only modules). Useless.importlib
machinery only applies to modules loadable by PEP 302-compliant loaders and hence visible to the default importlib
import algorithm. Useful, but hardly generally applicable. The assumption of PEP 302 compliance breaks down when the real world hits your package in the face repeatedly. For example, did you know that the __import__()
built-in is actually overriddable? This is how we used to customize Python's import mechanism – back when the Earth was still flat.abarnert's ultimate conclusion is also contentious:
…there is no perfect answer.
There is a perfect answer. Much like the oft-doubted Triforce of Hyrulean legend, a perfect answer exists for every imperfect question.
Let's find it.
The pure-Python function that follows returns True
only if the passed previously imported module object is a C extension: For simplicity, Python 3.x is assumed.
import inspect, os
from importlib.machinery import ExtensionFileLoader, EXTENSION_SUFFIXES
from types import ModuleType
def is_c_extension(module: ModuleType) -> bool:
'''
`True` only if the passed module is a C extension implemented as a
dynamically linked shared library specific to the current platform.
Parameters
----------
module : ModuleType
Previously imported module object to be tested.
Returns
----------
bool
`True` only if this module is a C extension.
'''
assert isinstance(module, ModuleType), '"{}" not a module.'.format(module)
# If this module was loaded by a PEP 302-compliant CPython-specific loader
# loading only C extensions, this module is a C extension.
if isinstance(getattr(module, '__loader__', None), ExtensionFileLoader):
return True
# Else, fallback to filetype matching heuristics.
#
# Absolute path of the file defining this module.
module_filename = inspect.getfile(module)
# "."-prefixed filetype of this path if any or the empty string otherwise.
module_filetype = os.path.splitext(module_filename)[1]
# This module is only a C extension if this path's filetype is that of a
# C extension specific to the current platform.
return module_filetype in EXTENSION_SUFFIXES
If it looks long, that's because docstrings, comments, and assertions are good. It's actually only six lines. Eat your elderly heart out, Guido.
Let's unit test this function with four portably importable modules:
os.__init__
module. Hopefully not a C extension.importlib.machinery
submodule. Hopefully not a C extension._elementtree
C extension.numpy.core.multiarray
C extension.To wit:
>>> import os
>>> import importlib.machinery as im
>>> import _elementtree as et
>>> import numpy.core.multiarray as ma
>>> for module in (os, im, et, ma):
... print('Is "{}" a C extension? {}'.format(
... module.__name__, is_c_extension(module)))
Is "os" a C extension? False
Is "importlib.machinery" a C extension? False
Is "_elementtree" a C extension? True
Is "numpy.core.multiarray" a C extension? True
All's well that ends.
The details of our code are quite inconsequential. Very well, where do we begin?
__loader__
attribute whose value is the loader object loading this module. Hence:
importlib.machinery.ExtensionFileLoader
class, this module is a C extension.__import__()
machinery being overridden (e.g., by a low-level bootloader running this Python application as a platform-specific frozen binary). In either case, fallback to testing whether this module's filetype is that of a C extension specific to the current platform.Eight line functions with twenty page explanations. Thas just how we rolls.