123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469 |
- """
- magic is a wrapper around the libmagic file identification library.
- See README for more information.
- Usage:
- >>> import magic
- >>> magic.from_file("testdata/test.pdf")
- 'PDF document, version 1.2'
- >>> magic.from_file("testdata/test.pdf", mime=True)
- 'application/pdf'
- >>> magic.from_buffer(open("testdata/test.pdf", "rb").read(1024))
- 'PDF document, version 1.2'
- >>>
- """
- import sys
- import glob
- import ctypes
- import ctypes.util
- import threading
- import logging
- from ctypes import c_char_p, c_int, c_size_t, c_void_p, byref, POINTER
- # avoid shadowing the real open with the version from compat.py
- _real_open = open
class MagicException(Exception):
    """Error reported by libmagic.

    The text obtained from libmagic is kept on ``message``.  It can be
    ``None``: callers in this module check for that case, which occurs when
    the library fails without supplying an error string.
    """

    def __init__(self, message):
        # Name this class (not Exception) in super() so the lookup starts
        # right after MagicException in the MRO instead of skipping Exception.
        super(MagicException, self).__init__(message)
        self.message = message
class Magic:
    """
    Magic is a wrapper around the libmagic C library.

    Each instance owns one libmagic cookie (``self.cookie``) opened with the
    flags computed in ``__init__``.  Identification calls are serialized
    through ``self.lock``.
    """

    def __init__(self, mime=False, magic_file=None, mime_encoding=False,
                 keep_going=False, uncompress=False, raw=False, extension=False):
        """
        Create a new libmagic wrapper.

        mime - if True, mimetypes are returned instead of textual descriptions
        mime_encoding - if True, codec is returned
        magic_file - use a mime database other than the system default
        keep_going - don't stop at the first match, keep going
        uncompress - Try to look inside compressed files.
        raw - Do not try to decode "non-printable" chars.
        extension - Print a slash-separated list of valid extensions for the file type found.

        Raises NotImplementedError when `extension` is requested but the
        loaded libmagic does not report a version of at least 524, and
        MagicException when the magic database cannot be loaded.
        """
        self.flags = MAGIC_NONE
        if mime:
            self.flags |= MAGIC_MIME_TYPE
        if mime_encoding:
            self.flags |= MAGIC_MIME_ENCODING
        if keep_going:
            self.flags |= MAGIC_CONTINUE
        if uncompress:
            self.flags |= MAGIC_COMPRESS
        if raw:
            self.flags |= MAGIC_RAW
        if extension:
            self.flags |= MAGIC_EXTENSION

        self.cookie = magic_open(self.flags)
        self.lock = threading.Lock()

        # Inside __init__ the name `magic_file` is the database path
        # argument, not the module-level magic_file() function.
        magic_load(self.cookie, magic_file)

        # MAGIC_EXTENSION was added in 523 or 524, so bail if
        # it doesn't appear to be available
        if extension and (not _has_version or version() < 524):
            raise NotImplementedError('MAGIC_EXTENSION is not supported in this version of libmagic')

        # For https://github.com/ahupp/python-magic/issues/190
        # libmagic has fixed internal limits that some files exceed, causing
        # an error.  We can avoid this (at least for the sample file given)
        # by bumping the limit up.  It's not clear if this is a general
        # solution or whether other internal limits should be increased, but
        # given the lack of other reports I'll assume this is rare.
        if _has_param:
            try:
                self.setparam(MAGIC_PARAM_NAME_MAX, 64)
            except MagicException:
                # some versions of libmagic fail this call,
                # so rather than fail hard just use default behavior
                pass

    def from_buffer(self, buf):
        """
        Identify the contents of `buf`

        Text strings are encoded as UTF-8 (with replacement of unencodable
        characters) before being handed to libmagic.
        """
        with self.lock:
            try:
                # if we're on python3, convert buf to bytes
                # otherwise this string is passed as wchar*
                # which is not what libmagic expects.
                # isinstance (rather than an exact type comparison) so that
                # str subclasses are encoded as well.
                # NEXTBREAK: only take bytes
                if isinstance(buf, str) and str is not bytes:
                    buf = buf.encode('utf-8', errors='replace')
                return maybe_decode(magic_buffer(self.cookie, buf))
            except MagicException as e:
                return self._handle509Bug(e)

    def from_file(self, filename):
        """
        Identify the contents of the file named `filename`.
        """
        # Open (and immediately close) the file first so that a missing or
        # unreadable file raises FileNotFoundError / IOError from Python.
        with _real_open(filename):
            pass

        with self.lock:
            try:
                return maybe_decode(magic_file(self.cookie, filename))
            except MagicException as e:
                return self._handle509Bug(e)

    def from_descriptor(self, fd):
        """
        Identify the contents of the open file descriptor `fd`.
        """
        with self.lock:
            try:
                return maybe_decode(magic_descriptor(self.cookie, fd))
            except MagicException as e:
                return self._handle509Bug(e)

    def _handle509Bug(self, e):
        """
        Map a message-less failure to a generic mimetype, else re-raise `e`.
        """
        # libmagic 5.09 has a bug where it might fail to identify the
        # mimetype of a file and returns null from magic_file (and
        # likely _buffer), but also does not return an error message.
        if e.message is None and (self.flags & MAGIC_MIME_TYPE):
            return "application/octet-stream"
        else:
            raise e

    def setparam(self, param, val):
        """Set libmagic parameter `param` (a MAGIC_PARAM_* value) to `val`."""
        return magic_setparam(self.cookie, param, val)

    def getparam(self, param):
        """Return the current value of libmagic parameter `param`."""
        return magic_getparam(self.cookie, param)

    def __del__(self):
        # no locking here because there can be no other
        # references to this object at this point.

        # during shutdown magic_close may have been cleared already so
        # make sure it exists before using it.

        # the self.cookie check should be unnecessary and was an
        # incorrect fix for a threading problem, however I'm leaving
        # it in because it's harmless and I'm slightly afraid to
        # remove it.
        if hasattr(self, 'cookie') and self.cookie and magic_close:
            magic_close(self.cookie)
            self.cookie = None
# Module-level cache of Magic objects, keyed by the `mime` argument they
# were created with.
_instances = {}


def _get_magic_type(mime):
    """Return the shared Magic instance for `mime`, creating it on first use."""
    if mime not in _instances:
        _instances[mime] = Magic(mime=mime)
    return _instances[mime]
def from_file(filename, mime=False):
    """
    Accepts a filename and returns the detected filetype.  Return
    value is the mimetype if mime=True, otherwise a human readable
    name.

    >>> magic.from_file("testdata/test.pdf", mime=True)
    'application/pdf'
    """
    m = _get_magic_type(mime)
    return m.from_file(filename)
def from_buffer(buffer, mime=False):
    """
    Identify the data held in `buffer` (a binary string).

    With mime=True the mimetype is returned; otherwise the result is a
    human readable description.

    >>> magic.from_buffer(open("testdata/test.pdf", "rb").read(1024))
    'PDF document, version 1.2'
    """
    return _get_magic_type(mime).from_buffer(buffer)
def from_descriptor(fd, mime=False):
    """
    Identify the file behind the open file descriptor `fd`.

    With mime=True the mimetype is returned; otherwise the result is a
    human readable description.

    >>> f = open("testdata/test.pdf")
    >>> magic.from_descriptor(f.fileno())
    'PDF document, version 1.2'
    """
    return _get_magic_type(mime).from_descriptor(fd)
# Type used for the libmagic cookie in the ctypes prototypes below:
# an untyped C pointer.
magic_t = c_void_p

# The shared library itself is located and opened by the sibling loader module.
from . import loader
libmagic = loader.load_lib()
def errorcheck_null(result, func, args):
    """
    ctypes errcheck hook for calls that signal failure with a NULL result.

    Returns `result` unchanged unless it is None; in that case the error
    text for the cookie in args[0] is fetched with magic_error() and raised
    as a MagicException.
    """
    if result is not None:
        return result
    raise MagicException(magic_error(args[0]))
def errorcheck_negative_one(result, func, args):
    """
    ctypes errcheck hook for calls that signal failure by returning -1.

    Returns `result` unchanged unless it equals -1; in that case the error
    text for the cookie in args[0] is fetched with magic_error() and raised
    as a MagicException.
    """
    if result != -1:
        return result
    raise MagicException(magic_error(args[0]))
# Results should be the native `str` type: decoded text on python3, and the
# untouched byte string on python2 (decoding there would yield unicode).
def maybe_decode(s):
    """Return `s` as a native str, decoding from UTF-8 where str is not bytes."""
    # NEXTBREAK: remove
    if str != bytes:
        # backslashreplace because libmagic sometimes returns metadata in the
        # charset of the file itself, which is unknown to us (e.g. the title
        # of a Word doc)
        return s.decode('utf-8', 'backslashreplace')
    return s
try:
    from os import PathLike
except ImportError:
    # os.PathLike is unavailable on this interpreter: nothing to unwrap.
    def unpath(filename):
        """Return `filename` unchanged."""
        return filename
else:
    def unpath(filename):
        """Return the filesystem path of an os.PathLike, anything else unchanged."""
        return filename.__fspath__() if isinstance(filename, PathLike) else filename
def coerce_filename(filename):
    """
    Convert `filename` into what the c_char_p prototypes should receive.

    None is passed through.  Path objects are unwrapped with unpath(), and
    text strings are encoded as UTF-8 with the 'surrogateescape' error
    handler.  Any other value is returned as is.
    """
    if filename is None:
        return None

    filename = unpath(filename)

    # ctypes will implicitly convert unicode strings to bytes with
    # .encode('ascii').  If you use the filesystem encoding
    # then you'll get inconsistent behavior (crashes) depending on the user's
    # LANG environment variable
    # NEXTBREAK: remove
    if sys.version_info[0] <= 2:
        is_unicode = isinstance(filename, unicode)
    else:
        is_unicode = isinstance(filename, str)

    return filename.encode('utf-8', 'surrogateescape') if is_unicode else filename
# Cookie lifecycle and error-reporting entry points.  No errcheck hook is
# attached to any of these, so their results reach the caller as converted
# by ctypes.

# Takes the flag bits as a C int, returns the cookie.
magic_open = libmagic.magic_open
magic_open.argtypes = [c_int]
magic_open.restype = magic_t

# Takes the cookie, returns nothing.
magic_close = libmagic.magic_close
magic_close.argtypes = [magic_t]
magic_close.restype = None

# Takes the cookie, returns a C string (None when the pointer is NULL).
magic_error = libmagic.magic_error
magic_error.argtypes = [magic_t]
magic_error.restype = c_char_p

# Takes the cookie, returns a C int.
magic_errno = libmagic.magic_errno
magic_errno.argtypes = [magic_t]
magic_errno.restype = c_int
# Raw binding: (cookie, C string) -> C string.  A NULL result is turned into
# a MagicException by errorcheck_null.
_magic_file = libmagic.magic_file
_magic_file.argtypes = [magic_t, c_char_p]
_magic_file.restype = c_char_p
_magic_file.errcheck = errorcheck_null


def magic_file(cookie, filename):
    """Call libmagic's magic_file on `filename` after coercing it for ctypes."""
    encoded = coerce_filename(filename)
    return _magic_file(cookie, encoded)
# Raw binding: (cookie, pointer, size) -> C string.  A NULL result is turned
# into a MagicException by errorcheck_null.
_magic_buffer = libmagic.magic_buffer
_magic_buffer.argtypes = [magic_t, c_void_p, c_size_t]
_magic_buffer.restype = c_char_p
_magic_buffer.errcheck = errorcheck_null


def magic_buffer(cookie, buf):
    """Call libmagic's magic_buffer on `buf`, passing len(buf) as its size."""
    size = len(buf)
    return _magic_buffer(cookie, buf, size)
# Raw binding: (cookie, C int file descriptor) -> C string.  A NULL result is
# turned into a MagicException by errorcheck_null.
#
# The foreign function is configured exactly once, under its private name;
# the public name `magic_descriptor` is the Python wrapper defined below.
_magic_descriptor = libmagic.magic_descriptor
_magic_descriptor.restype = c_char_p
_magic_descriptor.argtypes = [magic_t, c_int]
_magic_descriptor.errcheck = errorcheck_null


def magic_descriptor(cookie, fd):
    """Call libmagic's magic_descriptor on the file descriptor `fd`."""
    return _magic_descriptor(cookie, fd)
# Raw binding: (cookie, C string) -> C int.  A result of -1 is turned into a
# MagicException by errorcheck_negative_one.
_magic_load = libmagic.magic_load
_magic_load.argtypes = [magic_t, c_char_p]
_magic_load.restype = c_int
_magic_load.errcheck = errorcheck_negative_one


def magic_load(cookie, filename):
    """Call libmagic's magic_load with `filename` coerced for ctypes (None allowed)."""
    encoded = coerce_filename(filename)
    return _magic_load(cookie, encoded)
# Further libmagic entry points exposed as-is.  Each returns a C int and has
# no errcheck hook, so callers must inspect the return value themselves.

# (cookie, flags)
magic_setflags = libmagic.magic_setflags
magic_setflags.argtypes = [magic_t, c_int]
magic_setflags.restype = c_int

# (cookie, C string)
magic_check = libmagic.magic_check
magic_check.argtypes = [magic_t, c_char_p]
magic_check.restype = c_int

# (cookie, C string)
magic_compile = libmagic.magic_compile
magic_compile.argtypes = [magic_t, c_char_p]
magic_compile.restype = c_int
# The parameter API is optional: bind it only when the loaded library exports
# both functions.
_has_param = (hasattr(libmagic, 'magic_setparam') and
              hasattr(libmagic, 'magic_getparam'))
if _has_param:
    # Both take (cookie, C int parameter id, pointer to size_t) and report
    # failure with -1, which errorcheck_negative_one turns into a
    # MagicException.
    _magic_setparam = libmagic.magic_setparam
    _magic_setparam.argtypes = [magic_t, c_int, POINTER(c_size_t)]
    _magic_setparam.restype = c_int
    _magic_setparam.errcheck = errorcheck_negative_one

    _magic_getparam = libmagic.magic_getparam
    _magic_getparam.argtypes = [magic_t, c_int, POINTER(c_size_t)]
    _magic_getparam.restype = c_int
    _magic_getparam.errcheck = errorcheck_negative_one


def magic_setparam(cookie, param, val):
    """
    Set libmagic parameter `param` to `val` for `cookie`.

    Raises NotImplementedError when the library lacks the parameter API.
    """
    if not _has_param:
        raise NotImplementedError("magic_setparam not implemented")
    boxed = c_size_t(val)
    return _magic_setparam(cookie, param, byref(boxed))


def magic_getparam(cookie, param):
    """
    Return the value of libmagic parameter `param` for `cookie`.

    Raises NotImplementedError when the library lacks the parameter API.
    """
    if not _has_param:
        raise NotImplementedError("magic_getparam not implemented")
    out = c_size_t()
    _magic_getparam(cookie, param, byref(out))
    return out.value
# magic_version is optional: bind it only when the loaded library exports it.
_has_version = hasattr(libmagic, "magic_version")
if _has_version:
    # Takes no arguments and returns a C int.
    magic_version = libmagic.magic_version
    magic_version.argtypes = []
    magic_version.restype = c_int


def version():
    """
    Return the integer reported by libmagic's magic_version().

    Raises NotImplementedError when the library does not export it.
    """
    if _has_version:
        return magic_version()
    raise NotImplementedError("magic_version not implemented")
# Flag bits accepted by magic_open() / magic_setflags().
MAGIC_NONE = 0x000000  # No flags
MAGIC_DEBUG = 0x000001  # Turn on debugging
MAGIC_SYMLINK = 0x000002  # Follow symlinks
MAGIC_COMPRESS = 0x000004  # Check inside compressed files
MAGIC_DEVICES = 0x000008  # Look at the contents of devices
MAGIC_MIME_TYPE = 0x000010  # Return a mime string
MAGIC_MIME_ENCODING = 0x000400  # Return the MIME encoding
# TODO: should be
# MAGIC_MIME = MAGIC_MIME_TYPE | MAGIC_MIME_ENCODING
MAGIC_MIME = 0x000010  # Return a mime string
MAGIC_EXTENSION = 0x1000000  # Return a /-separated list of extensions

MAGIC_CONTINUE = 0x000020  # Return all matches
MAGIC_CHECK = 0x000040  # Print warnings to stderr
MAGIC_PRESERVE_ATIME = 0x000080  # Restore access time on exit
MAGIC_RAW = 0x000100  # Don't translate unprintable chars
MAGIC_ERROR = 0x000200  # Handle ENOENT etc as real errors

MAGIC_NO_CHECK_COMPRESS = 0x001000  # Don't check for compressed files
MAGIC_NO_CHECK_TAR = 0x002000  # Don't check for tar files
MAGIC_NO_CHECK_SOFT = 0x004000  # Don't check magic entries
MAGIC_NO_CHECK_APPTYPE = 0x008000  # Don't check application type
MAGIC_NO_CHECK_ELF = 0x010000  # Don't check for elf details
MAGIC_NO_CHECK_ASCII = 0x020000  # Don't check for ascii files
MAGIC_NO_CHECK_TROFF = 0x040000  # Don't check ascii/troff
MAGIC_NO_CHECK_FORTRAN = 0x080000  # Don't check ascii/fortran
MAGIC_NO_CHECK_TOKENS = 0x100000  # Don't check ascii/tokens

# Parameter ids accepted by magic_setparam() / magic_getparam().
MAGIC_PARAM_INDIR_MAX = 0  # Recursion limit for indirect magic
MAGIC_PARAM_NAME_MAX = 1  # Use count limit for name/use magic
MAGIC_PARAM_ELF_PHNUM_MAX = 2  # Max ELF program sections processed
MAGIC_PARAM_ELF_SHNUM_MAX = 3  # Max ELF sections processed
MAGIC_PARAM_ELF_NOTES_MAX = 4  # Max ELF notes processed
MAGIC_PARAM_REGEX_MAX = 5  # Length limit for regex searches
MAGIC_PARAM_BYTES_MAX = 6  # Max number of bytes to read from file
# This package name conflicts with the one provided by upstream
# libmagic.  This is a common source of confusion for users.  To
# resolve, we ship a copy of that module, and expose its functions
# wrapped in deprecation warnings.
def _add_compat(to_module):
    """
    Copy the compat module's public API into the namespace dict `to_module`.

    The four compat entry points are installed behind a wrapper that emits a
    PendingDeprecationWarning on every call.  Names in compat made up only of
    capital letters and underscores are copied as constants; a constant that
    already exists in `to_module` with a different value raises Exception,
    except for MAGIC_MIME, which is allowed to differ.
    """
    import functools
    import re
    import warnings

    from magic import compat

    def deprecation_wrapper(fn):
        # wraps() keeps the wrapped function's name and docstring visible
        # to introspection instead of the anonymous `_`.
        @functools.wraps(fn)
        def _(*args, **kwargs):
            warnings.warn(
                "Using compatibility mode with libmagic's python binding. "
                "See https://github.com/ahupp/python-magic/blob/master/COMPAT.md for details.",
                PendingDeprecationWarning,
                # attribute the warning to the caller, not to this wrapper
                stacklevel=2)

            return fn(*args, **kwargs)
        return _

    fn = ['detect_from_filename',
          'detect_from_content',
          'detect_from_fobj',
          'open']
    for fname in fn:
        to_module[fname] = deprecation_wrapper(compat.__dict__[fname])

    # copy constants over, ensuring there's no conflicts
    is_const_re = re.compile("^[A-Z_]+$")
    allowed_inconsistent = set(['MAGIC_MIME'])
    for name, value in compat.__dict__.items():
        if not is_const_re.match(name):
            continue
        if name not in to_module:
            to_module[name] = value
        elif name not in allowed_inconsistent and to_module[name] != value:
            raise Exception("inconsistent value for " + name)


_add_compat(globals())
|