__init__.py 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487
  1. """
  2. magic is a wrapper around the libmagic file identification library.
  3. See README for more information.
  4. Usage:
  5. >>> import magic
  6. >>> magic.from_file("testdata/test.pdf")
  7. 'PDF document, version 1.2'
  8. >>> magic.from_file("testdata/test.pdf", mime=True)
  9. 'application/pdf'
  10. >>> magic.from_buffer(open("testdata/test.pdf").read(1024))
  11. 'PDF document, version 1.2'
  12. >>>
  13. """
  14. import sys
  15. import glob
  16. import ctypes
  17. import ctypes.util
  18. import threading
  19. import logging
  20. from ctypes import c_char_p, c_int, c_size_t, c_void_p, byref, POINTER
  21. # avoid shadowing the real open with the version from compat.py
  22. _real_open = open
  23. class MagicException(Exception):
  24. def __init__(self, message):
  25. super(Exception, self).__init__(message)
  26. self.message = message
  27. class Magic:
  28. """
  29. Magic is a wrapper around the libmagic C library.
  30. """
  31. def __init__(self, mime=False, magic_file=None, mime_encoding=False,
  32. keep_going=False, uncompress=False, raw=False, extension=False):
  33. """
  34. Create a new libmagic wrapper.
  35. mime - if True, mimetypes are returned instead of textual descriptions
  36. mime_encoding - if True, codec is returned
  37. magic_file - use a mime database other than the system default
  38. keep_going - don't stop at the first match, keep going
  39. uncompress - Try to look inside compressed files.
  40. raw - Do not try to decode "non-printable" chars.
  41. extension - Print a slash-separated list of valid extensions for the file type found.
  42. """
  43. self.cookie = None
  44. self.flags = MAGIC_NONE
  45. if mime:
  46. self.flags |= MAGIC_MIME_TYPE
  47. if mime_encoding:
  48. self.flags |= MAGIC_MIME_ENCODING
  49. if keep_going:
  50. self.flags |= MAGIC_CONTINUE
  51. if uncompress:
  52. self.flags |= MAGIC_COMPRESS
  53. if raw:
  54. self.flags |= MAGIC_RAW
  55. if extension:
  56. self.flags |= MAGIC_EXTENSION
  57. self.cookie = magic_open(self.flags)
  58. self.lock = threading.Lock()
  59. magic_load(self.cookie, magic_file)
  60. # MAGIC_EXTENSION was added in 523 or 524, so bail if
  61. # it doesn't appear to be available
  62. if extension and (not _has_version or version() < 524):
  63. raise NotImplementedError('MAGIC_EXTENSION is not supported in this version of libmagic')
  64. # For https://github.com/ahupp/python-magic/issues/190
  65. # libmagic has fixed internal limits that some files exceed, causing
  66. # an error. We can avoid this (at least for the sample file given)
  67. # by bumping the limit up. It's not clear if this is a general solution
  68. # or whether other internal limits should be increased, but given
  69. # the lack of other reports I'll assume this is rare.
  70. if _has_param:
  71. try:
  72. self.setparam(MAGIC_PARAM_NAME_MAX, 64)
  73. except MagicException as e:
  74. # some versions of libmagic fail this call,
  75. # so rather than fail hard just use default behavior
  76. pass
  77. def from_buffer(self, buf):
  78. """
  79. Identify the contents of `buf`
  80. """
  81. with self.lock:
  82. try:
  83. # if we're on python3, convert buf to bytes
  84. # otherwise this string is passed as wchar*
  85. # which is not what libmagic expects
  86. if type(buf) == str and str != bytes:
  87. buf = buf.encode('utf-8', errors='replace')
  88. return maybe_decode(magic_buffer(self.cookie, buf))
  89. except MagicException as e:
  90. return self._handle509Bug(e)
  91. def from_file(self, filename):
  92. # raise FileNotFoundException or IOError if the file does not exist
  93. with _real_open(filename):
  94. pass
  95. with self.lock:
  96. try:
  97. return maybe_decode(magic_file(self.cookie, filename))
  98. except MagicException as e:
  99. return self._handle509Bug(e)
  100. def from_descriptor(self, fd):
  101. with self.lock:
  102. try:
  103. return maybe_decode(magic_descriptor(self.cookie, fd))
  104. except MagicException as e:
  105. return self._handle509Bug(e)
  106. def _handle509Bug(self, e):
  107. # libmagic 5.09 has a bug where it might fail to identify the
  108. # mimetype of a file and returns null from magic_file (and
  109. # likely _buffer), but also does not return an error message.
  110. if e.message is None and (self.flags & MAGIC_MIME_TYPE):
  111. return "application/octet-stream"
  112. else:
  113. raise e
  114. def setparam(self, param, val):
  115. return magic_setparam(self.cookie, param, val)
  116. def getparam(self, param):
  117. return magic_getparam(self.cookie, param)
  118. def __del__(self):
  119. # no _thread_check here because there can be no other
  120. # references to this object at this point.
  121. # during shutdown magic_close may have been cleared already so
  122. # make sure it exists before using it.
  123. # the self.cookie check should be unnecessary and was an
  124. # incorrect fix for a threading problem, however I'm leaving
  125. # it in because it's harmless and I'm slightly afraid to
  126. # remove it.
  127. if self.cookie and magic_close:
  128. magic_close(self.cookie)
  129. self.cookie = None
  130. _instances = {}
  131. def _get_magic_type(mime):
  132. i = _instances.get(mime)
  133. if i is None:
  134. i = _instances[mime] = Magic(mime=mime)
  135. return i
  136. def from_file(filename, mime=False):
  137. """"
  138. Accepts a filename and returns the detected filetype. Return
  139. value is the mimetype if mime=True, otherwise a human readable
  140. name.
  141. >>> magic.from_file("testdata/test.pdf", mime=True)
  142. 'application/pdf'
  143. """
  144. m = _get_magic_type(mime)
  145. return m.from_file(filename)
  146. def from_buffer(buffer, mime=False):
  147. """
  148. Accepts a binary string and returns the detected filetype. Return
  149. value is the mimetype if mime=True, otherwise a human readable
  150. name.
  151. >>> magic.from_buffer(open("testdata/test.pdf").read(1024))
  152. 'PDF document, version 1.2'
  153. """
  154. m = _get_magic_type(mime)
  155. return m.from_buffer(buffer)
  156. def from_descriptor(fd, mime=False):
  157. """
  158. Accepts a file descriptor and returns the detected filetype. Return
  159. value is the mimetype if mime=True, otherwise a human readable
  160. name.
  161. >>> f = open("testdata/test.pdf")
  162. >>> magic.from_descriptor(f.fileno())
  163. 'PDF document, version 1.2'
  164. """
  165. m = _get_magic_type(mime)
  166. return m.from_descriptor(fd)
  167. libmagic = None
  168. # Let's try to find magic or magic1
  169. dll = ctypes.util.find_library('magic') \
  170. or ctypes.util.find_library('magic1') \
  171. or ctypes.util.find_library('cygmagic-1') \
  172. or ctypes.util.find_library('libmagic-1') \
  173. or ctypes.util.find_library('msys-magic-1') # for MSYS2
  174. # necessary because find_library returns None if it doesn't find the library
  175. if dll:
  176. libmagic = ctypes.CDLL(dll)
  177. if not libmagic or not libmagic._name:
  178. windows_dlls = ['magic1.dll', 'cygmagic-1.dll', 'libmagic-1.dll', 'msys-magic-1.dll']
  179. platform_to_lib = {'darwin': ['/opt/local/lib/libmagic.dylib',
  180. '/usr/local/lib/libmagic.dylib'] +
  181. # Assumes there will only be one version installed
  182. glob.glob('/usr/local/Cellar/libmagic/*/lib/libmagic.dylib'), # flake8:noqa
  183. 'win32': windows_dlls,
  184. 'cygwin': windows_dlls,
  185. 'linux': ['libmagic.so.1'],
  186. # fallback for some Linuxes (e.g. Alpine) where library search does not work # flake8:noqa
  187. }
  188. platform = 'linux' if sys.platform.startswith('linux') else sys.platform
  189. for dll in platform_to_lib.get(platform, []):
  190. try:
  191. libmagic = ctypes.CDLL(dll)
  192. break
  193. except OSError:
  194. pass
  195. if not libmagic or not libmagic._name:
  196. # It is better to raise an ImportError since we are importing magic module
  197. raise ImportError('failed to find libmagic. Check your installation')
  198. magic_t = ctypes.c_void_p
  199. def errorcheck_null(result, func, args):
  200. if result is None:
  201. err = magic_error(args[0])
  202. raise MagicException(err)
  203. else:
  204. return result
  205. def errorcheck_negative_one(result, func, args):
  206. if result == -1:
  207. err = magic_error(args[0])
  208. raise MagicException(err)
  209. else:
  210. return result
  211. # return str on python3. Don't want to unconditionally
  212. # decode because that results in unicode on python2
  213. def maybe_decode(s):
  214. if str == bytes:
  215. return s
  216. else:
  217. # backslashreplace here because sometimes libmagic will return metadata in the charset
  218. # of the file, which is unknown to us (e.g the title of a Word doc)
  219. return s.decode('utf-8', 'backslashreplace')
  220. def coerce_filename(filename):
  221. if filename is None:
  222. return None
  223. # ctypes will implicitly convert unicode strings to bytes with
  224. # .encode('ascii'). If you use the filesystem encoding
  225. # then you'll get inconsistent behavior (crashes) depending on the user's
  226. # LANG environment variable
  227. is_unicode = (sys.version_info[0] <= 2 and
  228. isinstance(filename, unicode)) or \
  229. (sys.version_info[0] >= 3 and
  230. isinstance(filename, str))
  231. if is_unicode:
  232. return filename.encode('utf-8', 'surrogateescape')
  233. else:
  234. return filename
  235. magic_open = libmagic.magic_open
  236. magic_open.restype = magic_t
  237. magic_open.argtypes = [c_int]
  238. magic_close = libmagic.magic_close
  239. magic_close.restype = None
  240. magic_close.argtypes = [magic_t]
  241. magic_error = libmagic.magic_error
  242. magic_error.restype = c_char_p
  243. magic_error.argtypes = [magic_t]
  244. magic_errno = libmagic.magic_errno
  245. magic_errno.restype = c_int
  246. magic_errno.argtypes = [magic_t]
  247. _magic_file = libmagic.magic_file
  248. _magic_file.restype = c_char_p
  249. _magic_file.argtypes = [magic_t, c_char_p]
  250. _magic_file.errcheck = errorcheck_null
  251. def magic_file(cookie, filename):
  252. return _magic_file(cookie, coerce_filename(filename))
  253. _magic_buffer = libmagic.magic_buffer
  254. _magic_buffer.restype = c_char_p
  255. _magic_buffer.argtypes = [magic_t, c_void_p, c_size_t]
  256. _magic_buffer.errcheck = errorcheck_null
  257. def magic_buffer(cookie, buf):
  258. return _magic_buffer(cookie, buf, len(buf))
  259. magic_descriptor = libmagic.magic_descriptor
  260. magic_descriptor.restype = c_char_p
  261. magic_descriptor.argtypes = [magic_t, c_int]
  262. magic_descriptor.errcheck = errorcheck_null
  263. _magic_descriptor = libmagic.magic_descriptor
  264. _magic_descriptor.restype = c_char_p
  265. _magic_descriptor.argtypes = [magic_t, c_int]
  266. _magic_descriptor.errcheck = errorcheck_null
  267. def magic_descriptor(cookie, fd):
  268. return _magic_descriptor(cookie, fd)
  269. _magic_load = libmagic.magic_load
  270. _magic_load.restype = c_int
  271. _magic_load.argtypes = [magic_t, c_char_p]
  272. _magic_load.errcheck = errorcheck_negative_one
  273. def magic_load(cookie, filename):
  274. return _magic_load(cookie, coerce_filename(filename))
  275. magic_setflags = libmagic.magic_setflags
  276. magic_setflags.restype = c_int
  277. magic_setflags.argtypes = [magic_t, c_int]
  278. magic_check = libmagic.magic_check
  279. magic_check.restype = c_int
  280. magic_check.argtypes = [magic_t, c_char_p]
  281. magic_compile = libmagic.magic_compile
  282. magic_compile.restype = c_int
  283. magic_compile.argtypes = [magic_t, c_char_p]
  284. _has_param = False
  285. if hasattr(libmagic, 'magic_setparam') and hasattr(libmagic, 'magic_getparam'):
  286. _has_param = True
  287. _magic_setparam = libmagic.magic_setparam
  288. _magic_setparam.restype = c_int
  289. _magic_setparam.argtypes = [magic_t, c_int, POINTER(c_size_t)]
  290. _magic_setparam.errcheck = errorcheck_negative_one
  291. _magic_getparam = libmagic.magic_getparam
  292. _magic_getparam.restype = c_int
  293. _magic_getparam.argtypes = [magic_t, c_int, POINTER(c_size_t)]
  294. _magic_getparam.errcheck = errorcheck_negative_one
  295. def magic_setparam(cookie, param, val):
  296. if not _has_param:
  297. raise NotImplementedError("magic_setparam not implemented")
  298. v = c_size_t(val)
  299. return _magic_setparam(cookie, param, byref(v))
  300. def magic_getparam(cookie, param):
  301. if not _has_param:
  302. raise NotImplementedError("magic_getparam not implemented")
  303. val = c_size_t()
  304. _magic_getparam(cookie, param, byref(val))
  305. return val.value
  306. _has_version = False
  307. if hasattr(libmagic, "magic_version"):
  308. _has_version = True
  309. magic_version = libmagic.magic_version
  310. magic_version.restype = c_int
  311. magic_version.argtypes = []
  312. def version():
  313. if not _has_version:
  314. raise NotImplementedError("magic_version not implemented")
  315. return magic_version()
  316. MAGIC_NONE = 0x000000 # No flags
  317. MAGIC_DEBUG = 0x000001 # Turn on debugging
  318. MAGIC_SYMLINK = 0x000002 # Follow symlinks
  319. MAGIC_COMPRESS = 0x000004 # Check inside compressed files
  320. MAGIC_DEVICES = 0x000008 # Look at the contents of devices
  321. MAGIC_MIME_TYPE = 0x000010 # Return a mime string
  322. MAGIC_MIME_ENCODING = 0x000400 # Return the MIME encoding
  323. # TODO: should be
  324. # MAGIC_MIME = MAGIC_MIME_TYPE | MAGIC_MIME_ENCODING
  325. MAGIC_MIME = 0x000010 # Return a mime string
  326. MAGIC_EXTENSION = 0x1000000 # Return a /-separated list of extensions
  327. MAGIC_CONTINUE = 0x000020 # Return all matches
  328. MAGIC_CHECK = 0x000040 # Print warnings to stderr
  329. MAGIC_PRESERVE_ATIME = 0x000080 # Restore access time on exit
  330. MAGIC_RAW = 0x000100 # Don't translate unprintable chars
  331. MAGIC_ERROR = 0x000200 # Handle ENOENT etc as real errors
  332. MAGIC_NO_CHECK_COMPRESS = 0x001000 # Don't check for compressed files
  333. MAGIC_NO_CHECK_TAR = 0x002000 # Don't check for tar files
  334. MAGIC_NO_CHECK_SOFT = 0x004000 # Don't check magic entries
  335. MAGIC_NO_CHECK_APPTYPE = 0x008000 # Don't check application type
  336. MAGIC_NO_CHECK_ELF = 0x010000 # Don't check for elf details
  337. MAGIC_NO_CHECK_ASCII = 0x020000 # Don't check for ascii files
  338. MAGIC_NO_CHECK_TROFF = 0x040000 # Don't check ascii/troff
  339. MAGIC_NO_CHECK_FORTRAN = 0x080000 # Don't check ascii/fortran
  340. MAGIC_NO_CHECK_TOKENS = 0x100000 # Don't check ascii/tokens
  341. MAGIC_PARAM_INDIR_MAX = 0 # Recursion limit for indirect magic
  342. MAGIC_PARAM_NAME_MAX = 1 # Use count limit for name/use magic
  343. MAGIC_PARAM_ELF_PHNUM_MAX = 2 # Max ELF notes processed
  344. MAGIC_PARAM_ELF_SHNUM_MAX = 3 # Max ELF program sections processed
  345. MAGIC_PARAM_ELF_NOTES_MAX = 4 # # Max ELF sections processed
  346. MAGIC_PARAM_REGEX_MAX = 5 # Length limit for regex searches
  347. MAGIC_PARAM_BYTES_MAX = 6 # Max number of bytes to read from file
  348. # This package name conflicts with the one provided by upstream
  349. # libmagic. This is a common source of confusion for users. To
  350. # resolve, We ship a copy of that module, and expose it's functions
  351. # wrapped in deprecation warnings.
  352. def _add_compat(to_module):
  353. import warnings, re
  354. from magic import compat
  355. def deprecation_wrapper(fn):
  356. def _(*args, **kwargs):
  357. warnings.warn(
  358. "Using compatability mode with libmagic's python binding. "
  359. "See https://github.com/ahupp/python-magic/blob/master/COMPAT.md for details.",
  360. PendingDeprecationWarning)
  361. return fn(*args, **kwargs)
  362. return _
  363. fn = ['detect_from_filename',
  364. 'detect_from_content',
  365. 'detect_from_fobj',
  366. 'open']
  367. for fname in fn:
  368. to_module[fname] = deprecation_wrapper(compat.__dict__[fname])
  369. # copy constants over, ensuring there's no conflicts
  370. is_const_re = re.compile("^[A-Z_]+$")
  371. allowed_inconsistent = set(['MAGIC_MIME'])
  372. for name, value in compat.__dict__.items():
  373. if is_const_re.match(name):
  374. if name in to_module:
  375. if name in allowed_inconsistent:
  376. continue
  377. if to_module[name] != value:
  378. raise Exception("inconsistent value for " + name)
  379. else:
  380. continue
  381. else:
  382. to_module[name] = value
  383. _add_compat(globals())