magic.py 9.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301
  1. """
  2. magic is a wrapper around the libmagic file identification library.
  3. See README for more information.
  4. Usage:
  5. >>> import magic
  6. >>> magic.from_file("testdata/test.pdf")
  7. 'PDF document, version 1.2'
  8. >>> magic.from_file("testdata/test.pdf", mime=True)
  9. 'application/pdf'
  10. >>> magic.from_buffer(open("testdata/test.pdf", "rb").read(1024))
  11. 'PDF document, version 1.2'
  12. >>>
  13. """
  14. import sys
  15. import glob
  16. import os.path
  17. import ctypes
  18. import ctypes.util
  19. import threading
  20. from ctypes import c_char_p, c_int, c_size_t, c_void_p
  class MagicException(Exception):
      """Error raised when a libmagic call reports a failure.

      The text obtained from libmagic is kept on ``message`` so callers can
      inspect it.  It may be ``None``: the error-check hooks pass along
      whatever ``magic_error`` returned.
      """

      def __init__(self, message):
          Exception.__init__(self, message)
          # Stored explicitly because Python 3 exceptions do not provide a
          # ``message`` attribute of their own.
          self.message = message
  class Magic:
      """
      Magic is a wrapper around the libmagic C library.

      Each instance owns one libmagic cookie, configured once at construction
      time.  Every call into libmagic made through an instance is wrapped in
      that instance's lock.
      """

      def __init__(self, mime=False, magic_file=None, mime_encoding=False,
                   keep_going=False, uncompress=False):
          """
          Create a new libmagic wrapper.

          mime - if True, mimetypes are returned instead of textual descriptions
          mime_encoding - if True, codec is returned
          magic_file - use a mime database other than the system default
          keep_going - don't stop at the first match, keep going
          uncompress - Try to look inside compressed files.

          Raises MagicException if no cookie could be created or if loading
          the database fails.
          """
          # Assigned before anything can fail so that __del__ always finds
          # these attributes, even when construction aborts part-way.
          self.cookie = None
          self.lock = threading.Lock()

          self.flags = MAGIC_NONE
          if mime:
              self.flags |= MAGIC_MIME
          if mime_encoding:
              self.flags |= MAGIC_MIME_ENCODING
          if keep_going:
              self.flags |= MAGIC_CONTINUE
          if uncompress:
              self.flags |= MAGIC_COMPRESS

          self.cookie = magic_open(self.flags)
          if self.cookie is None:
              # A NULL pointer comes back from ctypes as None; do not hand it
              # on to magic_load.
              raise MagicException("magic_open failed to create a cookie")

          magic_load(self.cookie, magic_file)

      def from_buffer(self, buf):
          """
          Identify the contents of `buf`
          """
          # if we're on python3, convert buf to bytes
          # otherwise this string is passed as wchar*
          # which is not what libmagic expects
          # (isinstance rather than an exact type comparison so that str
          # subclasses are converted as well)
          if str is not bytes and isinstance(buf, str):
              buf = buf.encode('utf-8', errors='replace')

          with self.lock:
              try:
                  return maybe_decode(magic_buffer(self.cookie, buf))
              except MagicException as e:
                  return self._handle509Bug(e)

      def from_file(self, filename):
          """
          Identify the contents of the file `filename`
          """
          # raise FileNotFoundException or IOError if the file does not exist
          with open(filename):
              pass

          with self.lock:
              try:
                  return maybe_decode(magic_file(self.cookie, filename))
              except MagicException as e:
                  return self._handle509Bug(e)

      def _handle509Bug(self, e):
          """
          Return a generic mimetype for the libmagic 5.09 null-result bug,
          otherwise re-raise `e`.
          """
          # libmagic 5.09 has a bug where it might fail to identify the
          # mimetype of a file and returns null from magic_file (and
          # likely _buffer), but also does not return an error message.
          if e.message is None and (self.flags & MAGIC_MIME):
              return "application/octet-stream"
          raise e

      def __del__(self):
          # no _thread_check here because there can be no other
          # references to this object at this point.

          # getattr: the instance may never have finished (or even started)
          # __init__, in which case there is no cookie attribute to release.
          cookie = getattr(self, 'cookie', None)

          # during shutdown magic_close may have been cleared already so
          # make sure it exists before using it.
          if cookie and magic_close:
              magic_close(cookie)
              self.cookie = None
  # Cache of Magic objects used by the module-level convenience functions,
  # keyed by the ``mime`` argument they were created with.
  _instances = {}


  def _get_magic_type(mime):
      """Return the cached Magic object for `mime`, creating it on first use."""
      cached = _instances.get(mime)
      if cached is not None:
          return cached

      created = Magic(mime=mime)
      _instances[mime] = created
      return created
  def from_file(filename, mime=False):
      """
      Accepts a filename and returns the detected filetype.  Return
      value is the mimetype if mime=True, otherwise a human readable
      name.

      >>> magic.from_file("testdata/test.pdf", mime=True)
      'application/pdf'
      """
      return _get_magic_type(mime).from_file(filename)
  def from_buffer(buffer, mime=False):
      """
      Accepts a binary string and returns the detected filetype.  Return
      value is the mimetype if mime=True, otherwise a human readable
      name.

      >>> magic.from_buffer(open("testdata/test.pdf", "rb").read(1024))
      'PDF document, version 1.2'
      """
      return _get_magic_type(mime).from_buffer(buffer)
  libmagic = None

  # Let's try to find magic or magic1
  dll = (ctypes.util.find_library('magic')
         or ctypes.util.find_library('magic1')
         or ctypes.util.find_library('cygmagic-1'))

  # find_library returns None if it doesn't find the library, and a name it
  # does return can still fail to load.  In both cases fall through to the
  # per-platform candidates below rather than aborting the import with OSError.
  if dll:
      try:
          libmagic = ctypes.CDLL(dll)
      except OSError:
          libmagic = None

  if not libmagic or not libmagic._name:
      windows_dlls = ['magic1.dll', 'cygmagic-1.dll']
      platform_to_lib = {
          'darwin': (['/opt/local/lib/libmagic.dylib',
                      '/usr/local/lib/libmagic.dylib'] +
                     # Assumes there will only be one version installed
                     glob.glob('/usr/local/Cellar/libmagic/*/lib/libmagic.dylib')),
          'win32': windows_dlls,
          'cygwin': windows_dlls,
          # fallback for some Linuxes (e.g. Alpine) where library search
          # does not work
          'linux': ['libmagic.so.1'],
      }
      # sys.platform may carry a suffix on Linux (it is matched by prefix).
      platform = 'linux' if sys.platform.startswith('linux') else sys.platform
      for dll in platform_to_lib.get(platform, []):
          try:
              libmagic = ctypes.CDLL(dll)
              break
          except OSError:
              # Candidate not loadable; try the next one.
              pass

  if not libmagic or not libmagic._name:
      # It is better to raise an ImportError since we are importing magic module
      raise ImportError('failed to find libmagic. Check your installation')
  # ctypes type used for the libmagic cookie: it is the declared return type
  # of magic_open and the first argument type of the other bindings.
  magic_t = c_void_p
  def errorcheck_null(result, func, args):
      """ctypes ``errcheck`` hook: treat a None result as a libmagic failure.

      result - converted return value of the foreign call
      func   - the foreign function object (unused)
      args   - the call's arguments; args[0] is passed to magic_error

      Returns `result` unchanged when it is not None, otherwise raises
      MagicException carrying whatever magic_error returned.
      """
      if result is not None:
          return result
      raise MagicException(magic_error(args[0]))
  def errorcheck_negative_one(result, func, args):
      """ctypes ``errcheck`` hook: treat a result of -1 as a libmagic failure.

      result - converted return value of the foreign call
      func   - the foreign function object (unused)
      args   - the call's arguments; args[0] is passed to magic_error

      Returns `result` unchanged when it is not -1, otherwise raises
      MagicException carrying whatever magic_error returned.
      """
      # Compare by value: ``is -1`` tests object identity, which only works
      # by accident of small-int caching and triggers a SyntaxWarning on
      # newer Pythons.
      if result == -1:
          raise MagicException(magic_error(args[0]))
      return result
  # return str on python3. Don't want to unconditionally
  # decode because that results in unicode on python2
  def maybe_decode(s):
      """Convert a libmagic result to the native ``str`` type.

      On Python 2 (where ``str`` is ``bytes``) `s` is returned unchanged.
      On Python 3 it is decoded as UTF-8; bytes that are not valid UTF-8 are
      rendered as ``\\xNN`` escapes instead of raising UnicodeDecodeError,
      since the description may contain bytes in an encoding unknown to us.
      """
      if str == bytes:
          return s
      # NOTE: 'backslashreplace' is accepted for decoding from Python 3.5 on.
      return s.decode('utf-8', 'backslashreplace')
  def coerce_filename(filename):
      """Convert `filename` into something ctypes can pass as ``char*``.

      None is returned as None.  Objects implementing ``__fspath__`` (such
      as pathlib paths) are first converted with os.fspath where that
      function exists.  Text strings are then encoded as UTF-8 with the
      'surrogateescape' error handler; anything else (e.g. bytes) is
      returned unchanged.
      """
      if filename is None:
          return None

      # ctypes cannot convert path objects itself, so unwrap them here.
      # Guarded with hasattr so non-path values keep flowing through as before.
      fspath = getattr(os, 'fspath', None)
      if fspath is not None and hasattr(filename, '__fspath__'):
          filename = fspath(filename)

      # ctypes will implicitly convert unicode strings to bytes with
      # .encode('ascii'). If you use the filesystem encoding
      # then you'll get inconsistent behavior (crashes) depending on the user's
      # LANG environment variable
      if sys.version_info[0] >= 3:
          is_unicode = isinstance(filename, str)
      else:
          # ``unicode`` only exists on Python 2; this branch never runs on 3.
          is_unicode = isinstance(filename, unicode)  # noqa: F821

      if is_unicode:
          return filename.encode('utf-8', 'surrogateescape')
      return filename
  # Raw libmagic entry points.  argtypes/restype are declared explicitly so
  # ctypes converts arguments and results with the intended C types.

  # Takes the integer flag set and returns a cookie.
  magic_open = libmagic.magic_open
  magic_open.argtypes = [c_int]
  magic_open.restype = magic_t

  # Takes a cookie; declared as returning nothing.
  magic_close = libmagic.magic_close
  magic_close.argtypes = [magic_t]
  magic_close.restype = None

  # Takes a cookie and returns a C string (None when the pointer is NULL).
  magic_error = libmagic.magic_error
  magic_error.argtypes = [magic_t]
  magic_error.restype = c_char_p

  # Takes a cookie and returns an int.
  magic_errno = libmagic.magic_errno
  magic_errno.argtypes = [magic_t]
  magic_errno.restype = c_int
  # Raw binding; a None (NULL) result is turned into MagicException by the
  # errcheck hook.
  _magic_file = libmagic.magic_file
  _magic_file.argtypes = [magic_t, c_char_p]
  _magic_file.restype = c_char_p
  _magic_file.errcheck = errorcheck_null


  def magic_file(cookie, filename):
      """Call libmagic's magic_file on `filename` after coercing it for ctypes."""
      encoded_name = coerce_filename(filename)
      return _magic_file(cookie, encoded_name)
  # Raw binding; a None (NULL) result is turned into MagicException by the
  # errcheck hook.
  _magic_buffer = libmagic.magic_buffer
  _magic_buffer.argtypes = [magic_t, c_void_p, c_size_t]
  _magic_buffer.restype = c_char_p
  _magic_buffer.errcheck = errorcheck_null


  def magic_buffer(cookie, buf):
      """Call libmagic's magic_buffer on `buf`, passing its length explicitly."""
      size = len(buf)
      return _magic_buffer(cookie, buf, size)
  # Raw binding; a result of -1 is turned into MagicException by the
  # errcheck hook.
  _magic_load = libmagic.magic_load
  _magic_load.argtypes = [magic_t, c_char_p]
  _magic_load.restype = c_int
  _magic_load.errcheck = errorcheck_negative_one


  def magic_load(cookie, filename):
      """Call libmagic's magic_load with `filename` coerced for ctypes.

      `filename` may be None, in which case None is passed through.
      """
      encoded_name = coerce_filename(filename)
      return _magic_load(cookie, encoded_name)
  # Further raw bindings.  No errcheck hook is attached to these, so callers
  # receive the plain int result.

  # Takes a cookie and an int.
  magic_setflags = libmagic.magic_setflags
  magic_setflags.argtypes = [magic_t, c_int]
  magic_setflags.restype = c_int

  # Takes a cookie and a C string.
  magic_check = libmagic.magic_check
  magic_check.argtypes = [magic_t, c_char_p]
  magic_check.restype = c_int

  # Takes a cookie and a C string.
  magic_compile = libmagic.magic_compile
  magic_compile.argtypes = [magic_t, c_char_p]
  magic_compile.restype = c_int
  # Flag bits accepted by magic_open / magic_setflags, listed in numeric order.
  MAGIC_NONE              = 0x000000  # No flags
  MAGIC_DEBUG             = 0x000001  # Turn on debugging
  MAGIC_SYMLINK           = 0x000002  # Follow symlinks
  MAGIC_COMPRESS          = 0x000004  # Check inside compressed files
  MAGIC_DEVICES           = 0x000008  # Look at the contents of devices
  MAGIC_MIME              = 0x000010  # Return a mime string
  MAGIC_CONTINUE          = 0x000020  # Return all matches
  MAGIC_CHECK             = 0x000040  # Print warnings to stderr
  MAGIC_PRESERVE_ATIME    = 0x000080  # Restore access time on exit
  MAGIC_RAW               = 0x000100  # Don't translate unprintable chars
  MAGIC_ERROR             = 0x000200  # Handle ENOENT etc as real errors
  MAGIC_MIME_ENCODING     = 0x000400  # Return the MIME encoding

  # Flags that switch individual checks off.
  MAGIC_NO_CHECK_COMPRESS = 0x001000  # Don't check for compressed files
  MAGIC_NO_CHECK_TAR      = 0x002000  # Don't check for tar files
  MAGIC_NO_CHECK_SOFT     = 0x004000  # Don't check magic entries
  MAGIC_NO_CHECK_APPTYPE  = 0x008000  # Don't check application type
  MAGIC_NO_CHECK_ELF      = 0x010000  # Don't check for elf details
  MAGIC_NO_CHECK_ASCII    = 0x020000  # Don't check for ascii files
  MAGIC_NO_CHECK_TROFF    = 0x040000  # Don't check ascii/troff
  MAGIC_NO_CHECK_FORTRAN  = 0x080000  # Don't check ascii/fortran
  MAGIC_NO_CHECK_TOKENS   = 0x100000  # Don't check ascii/tokens