cfile_tools.c 13 KB


  1. /**
  2. cfile_tools.c
  3. A library to deal transparently with possibly compressed files.
  4. Documentation in the function headers and in cfile_tools.h
  5. Copyright (C) 2004 by Arno Wagner <arno.wagner@acm.org>
  6. Distributed under the Gnu Public License version 2 or the modified
  7. BSD license (see file COPYING)
  8. Support for gzip added by Bernhard Tellenbach <bernhard.tellenbach@gmail.com>
  9. */
  10. #define _GNU_SOURCE
  11. #define _FILE_OFFSET_BITS 64
  12. //#ifndef DONT_HAVE_BZ2
  13. //#include <bzlib.h>
  14. //#endif
  15. #include <stdlib.h>
  16. #include <string.h>
  17. #include <errno.h>
  18. #include <assert.h>
  19. #include "cfile_tools.h"
  20. // Concrete formats. remember to adjust CFR_NUM_FORMATS if changed!
  21. // Note: 0, 1 are special entries.
  22. const char * cfr_formats[CFR_NUM_FORMATS] = {
  23. "not open", // 0
  24. "uncompressed", // 1
  25. "bzip2", // 2
  26. "gzip", // 3
  27. };
  28. const char * cfr_extensions[CFR_NUM_FORMATS] = {
  29. "", // 0
  30. "", // 1
  31. ".bz2", // 2
  32. ".gz" // 3
  33. };
  34. // Prototypes of non API functions (don't use these from outside this file)
  35. const char * _cfr_compressor_strerror(int format, int err);
  36. const char * _bz2_strerror(int err);
  37. // API Functions
  38. CFRFILE *cfr_open(const char *path) {
  39. /*******************************/
  40. // Analog to 'fopen'. Error in result has to be tested using
  41. // 'cfr_error' on the result!
  42. // Note: The user needs to free the reurn value!
  43. // Opens a possibly compressed file for reading.
  44. // File type is determined by file name ending
  45. int format, ext_len, name_len;
  46. CFRFILE * retval = NULL;
  47. // determine file format
  48. name_len = strlen(path);
  49. format = 2; // skip specials 0, 1
  50. // Do action dependent on file format
  51. retval = (CFRFILE *) calloc(1,sizeof(CFRFILE));
  52. if(retval == NULL)
  53. return (NULL);
  54. retval->eof = 0;
  55. retval->error1 = 0;
  56. retval->error2 = 0;
  57. if((path == NULL) || (strcmp(path, "-") == 0)) {
  58. /* dump from stdin */
  59. gzFile f;
  60. while (format < CFR_NUM_FORMATS) {
  61. if (strcmp(cfr_extensions[format], ".gz") == 0)
  62. break;
  63. format ++;
  64. }
  65. f = gzdopen(0, "r");
  66. if(f == NULL) {
  67. free(retval);
  68. return (NULL);
  69. }
  70. retval->data2 = f;
  71. retval->format = format;
  72. return (retval);
  73. }
  74. while (format < CFR_NUM_FORMATS) {
  75. ext_len = strlen(cfr_extensions[format]);
  76. if (strncmp(cfr_extensions[format],
  77. path+(name_len-ext_len),
  78. ext_len) == 0
  79. ) break;
  80. format ++;
  81. }
  82. if (format >= CFR_NUM_FORMATS)
  83. format = 1; // uncompressed
  84. retval->format = format;
  85. switch (format) {
  86. case 1: // uncompressed
  87. {
  88. FILE * in;
  89. in = fopen(path,"r");
  90. if (in == NULL) {
  91. free(retval);
  92. return(NULL);
  93. }
  94. retval->data1 = in;
  95. return(retval);
  96. }
  97. break;
  98. case 2: // bzip2
  99. {
  100. int bzerror;
  101. BZFILE * bzin;
  102. FILE * in;
  103. retval->bz2_stream_end = 0;
  104. // get file
  105. in = fopen(path,"r");
  106. if (in == NULL) {
  107. free(retval);
  108. return(NULL);
  109. }
  110. retval->data1 = in;
  111. // bzip2ify file
  112. bzin = BZ2_bzReadOpen( &bzerror, in, 0, 0, NULL, 0);
  113. if (bzerror != BZ_OK) {
  114. errno = bzerror;
  115. BZ2_bzReadClose( &bzerror, bzin);
  116. fclose(in);
  117. free(retval);
  118. return(NULL);
  119. }
  120. retval->data2 = bzin;
  121. return(retval);
  122. }
  123. break;
  124. case 3: // gzip
  125. {
  126. gzFile f;
  127. // get file
  128. f = gzopen(path, "r");
  129. if(f == NULL) {
  130. free(retval);
  131. return (NULL);
  132. }
  133. retval->data2 = f;
  134. return (retval);
  135. }
  136. break;
  137. default: // this is an internal error, no diag yet.
  138. fprintf(stderr,"illegal format '%d' in cfr_open!\n", format);
  139. exit(1);
  140. }
  141. return NULL;
  142. }
  143. int cfr_close(CFRFILE *stream) {
  144. /**************************/
  145. // Analog to 'fclose'.
  146. // FIXME - why is stream->* set, then freed?
  147. if (stream == NULL || stream->closed) {
  148. errno = EBADF;
  149. return -1;
  150. }
  151. int retval = -1;
  152. switch (stream->format) {
  153. case 1: // uncompressed
  154. retval = fclose((FILE *)(stream->data1));
  155. stream->error1 = retval;
  156. break;
  157. case 2: // bzip2
  158. BZ2_bzReadClose( &stream->error2, (BZFILE *)stream->data2);
  159. stream->error1 = retval = fclose((FILE *)(stream->data1));
  160. break;
  161. case 3: // gzip
  162. if(stream->data2!=NULL)
  163. retval = gzclose(stream->data2);
  164. stream->error2 = retval;
  165. break;
  166. default: // internal error
  167. assert("illegal stream->format" && 0);
  168. }
  169. free(stream);
  170. return(retval);
  171. }
  172. size_t cfr_read_n(CFRFILE *stream, void *ptr, size_t bytes) {
  173. /******************************************************************/
  174. // Wrapper, will return either 'bytes' (the number of bytes to read) or 0
  175. return(cfr_read(ptr, bytes, 1, stream)*bytes);
  176. }
  177. size_t cfr_read(void *ptr, size_t size, size_t nmemb, CFRFILE *stream) {
  178. /******************************************************************/
  179. // Analog to 'fread'. Will not return with partial elements, only
  180. // full ones. Hence calling this function with one large element
  181. // size will result in a complete or no read.
  182. size_t retval = 0;
  183. if (stream == NULL) return(0);
  184. // shortcut
  185. if (stream->eof) return(0);
  186. switch (stream->format) {
  187. case 1: // uncompressed
  188. {
  189. FILE * in;
  190. in = (FILE *)(stream->data1);
  191. retval = fread(ptr, size, nmemb, in);
  192. if (retval != nmemb) {
  193. // fprintf(stderr,"short read!!!\n");
  194. stream->eof = feof(in);
  195. stream->error1 = ferror(in);
  196. retval = 0;
  197. }
  198. return (retval);
  199. }
  200. break;
  201. case 2: // bzip2
  202. {
  203. BZFILE * bzin;
  204. int bzerror;
  205. int buffsize;
  206. if (stream->bz2_stream_end == 1) {
  207. // feof-behaviour: Last read did consume last byte but not more
  208. stream->eof = 1;
  209. return(0);
  210. }
  211. bzerror = BZ_OK;
  212. bzin = (BZFILE *) (stream->data2);
  213. buffsize = size * nmemb;
  214. retval = BZ2_bzRead(&bzerror, bzin, ptr, buffsize);
  215. if (bzerror == BZ_STREAM_END ) {
  216. stream->bz2_stream_end = 1;
  217. stream->error2 = bzerror;
  218. if (retval == buffsize) {
  219. // feof-behaviour: no eof yet
  220. } else {
  221. // feof-behaviour: read past end, set eof
  222. stream->eof = 1;
  223. retval = 0;
  224. }
  225. return(retval/size);
  226. }
  227. if (bzerror == BZ_OK) {
  228. // Normal case, no error.
  229. // A short read here is an error, so catch it
  230. if (retval == buffsize) {
  231. return(retval/size);
  232. }
  233. }
  234. // Other error...
  235. stream->error2 = bzerror;
  236. BZ2_bzReadClose( &bzerror, bzin );
  237. if (bzerror != BZ_OK) {
  238. stream->error2 = bzerror;
  239. }
  240. retval = fclose((FILE *)(stream->data1));
  241. stream->error1 = retval;
  242. stream->closed = 1;
  243. return(0);
  244. }
  245. break;
  246. case 3: // gzip
  247. {
  248. gzFile in;
  249. in = (gzFile)(stream->data2);
  250. retval = gzread(in, ptr, size*nmemb);
  251. if (retval != nmemb*size) {
  252. // fprintf(stderr,"short read!!!\n");
  253. stream->eof = gzeof(in);
  254. stream->error2 = errno;
  255. retval = 0;
  256. }
  257. return (retval/size);
  258. }
  259. break;
  260. default: // this is an internal error, no diag yet.
  261. fprintf(stderr,"illegal format '%d' in cfr_read!\n",stream->format);
  262. exit(1);
  263. }
  264. }
  265. ssize_t cfr_getline(char **lineptr, size_t *n, CFRFILE *stream) {
  266. /************************************************************/
  267. // May not be very efficient, since it uses single-char reads
  268. // for formats where there is no native getline in the library.
  269. // For bzip2 the speedup for additional buffering was only 5%
  270. // so I dropped it.
  271. // Returns -1 in case of an error.
  272. char *tmp;
  273. if (stream == NULL) return(-1);
  274. switch (stream->format) {
  275. case 1: // uncompressed
  276. {
  277. if (fgets(*lineptr, *n, (FILE *)(stream->data1)) == NULL) {
  278. stream->error1 = errno;
  279. return -1;
  280. }
  281. return 0;
  282. }
  283. break;
  284. case 2: // bzip2
  285. {
  286. size_t count;
  287. char c;
  288. size_t ret;
  289. //bzin = (BZFILE *) (stream->data2);
  290. // allocate initial buffer if none was passed or size was zero
  291. if (*lineptr == NULL) {
  292. *lineptr = (char *) calloc(120, 1);
  293. if(*lineptr == NULL) {
  294. stream->error1 = errno;
  295. return(-1);
  296. }
  297. *n = 120;
  298. }
  299. if (*n == 0) {
  300. *n = 120;
  301. tmp = (char *) realloc(*lineptr, *n); // to avoid memory-leaks
  302. if(tmp == NULL) {
  303. stream->error1 = errno;
  304. return(-1);
  305. }
  306. *lineptr = tmp;
  307. }
  308. count = 0;
  309. // read until '\n'
  310. do {
  311. ret = cfr_read(&c, 1, 1, stream);
  312. if (ret != 1) {
  313. return(-1);
  314. }
  315. count ++;
  316. if (count >= *n) {
  317. *n = 2 * *n;
  318. tmp = (char *) realloc(*lineptr, *n);
  319. if (tmp == NULL) {
  320. stream->error1 = errno;
  321. return(-1);
  322. }
  323. *lineptr = tmp;
  324. }
  325. (*lineptr)[count-1] = c;
  326. } while (c != '\n');
  327. (*lineptr)[count] = 0;
  328. return(count);
  329. }
  330. break;
  331. case 3: // gzip
  332. {
  333. char * return_ptr = gzgets((gzFile)(stream->data2), *lineptr, *n );
  334. if (return_ptr == Z_NULL) {
  335. stream->error2 = errno;
  336. return(-1);
  337. }
  338. return *n;
  339. }
  340. break;
  341. default: // this is an internal error, no diag yet.
  342. fprintf(stderr,"illegal format '%d' in cfr_getline!\n",stream->format);
  343. exit(1);
  344. return(-1);
  345. }
  346. }
  347. int cfr_eof(CFRFILE *stream) {
  348. // Returns true on end of file/end of compressed data.
  349. // The end of the compressed data is regarded as end of file
  350. // in this library, embedded or multiple compressed data per
  351. // file is not supported by this library.
  352. //
  353. // Note: The sematics is that cfr_eof is true only after
  354. // the first byte after the end of file was read. Some compressors
  355. // report EOF already when the last availale character has been
  356. // read (far more sensible IMO), but for consistency we follow the
  357. // convention of the standard c library here.
  358. return(stream->eof);
  359. }
  360. int cfr_error(CFRFILE *stream) {
  361. // Returns true on error.
  362. // Errors can be ordinary errors from fopen.fclose/fread
  363. // or can originate from the underlying compression.
  364. // This function just returns 0 when there is no error or
  365. // 1 in case of error.
  366. // To get a more detailed report cfr_strerror will try to
  367. // come up with a description of the whole situation.
  368. // For numeric details, more query functions would need to be
  369. // implemented.
  370. if (stream == NULL) return(1);
  371. return(stream->error1 || stream->error2);
  372. }
  373. char * cfr_strerror(CFRFILE *stream) {
  374. // Result is "stream-i/o: <stream-error> <compressor>[: <compressor error>]"
  375. // Do not modify result.
  376. // Result may change on subsequent call to this function.
  377. static char res[120];
  378. int ret;
  379. char * msg, * msg2;
  380. if (stream == NULL) {
  381. snprintf(res, sizeof(res), "%s", "Error: stream is NULL, i.e. not opened");
  382. return(res);
  383. }
  384. ret = asprintf(&msg,
  385. "stream-i/o: %s, %s [%s]",
  386. stream->eof?"EOF":"",
  387. strerror(stream->error1),
  388. cfr_compressor_str(stream));
  389. if (ret == -1)
  390. goto oom;
  391. if (stream->format == 2) {
  392. ret = asprintf(&msg2,
  393. "%s: %s",
  394. msg,
  395. _cfr_compressor_strerror(stream->format, stream->error2));
  396. free(msg);
  397. if (ret == -1)
  398. goto oom;
  399. msg = msg2;
  400. }
  401. if (stream->format == 3) {
  402. ret = asprintf(&msg2,
  403. "%s: %s",
  404. msg,
  405. gzerror((gzFile)(stream->data2), &(stream->error2)));
  406. free(msg);
  407. if (ret == -1)
  408. goto oom;
  409. msg = msg2;
  410. }
  411. snprintf(res, sizeof(res), "%s", msg);
  412. free(msg);
  413. return(res);
  414. oom:
  415. snprintf(res, sizeof(res), "%s", "Error: asprintf: out of memory");
  416. return(res);
  417. }
  418. const char * cfr_compressor_str(CFRFILE *stream) {
  419. // Returns the name of the compressor used
  420. if ((stream->format < 0) || (stream->format >= CFR_NUM_FORMATS)) {
  421. return("undefined compression type");
  422. } else {
  423. return (cfr_formats[stream->format]);
  424. }
  425. }
  426. // Utility functions for compressor errors.
  427. // * Not part of the API, do not call directly as they may change! *
  const char * _cfr_compressor_strerror(int format, int err) {
    // Maps a compressor error code to a string, for any format number.
    // 'err' is only consulted for bzip2 (format 2). Format 3 (gzip) has no
    // text here and yields NULL.
    if (format == 0)
      return "file not open";
    if (format == 1)
      return "file not compressed";
    if (format == 2)
      return _bz2_strerror(err);
    if (format == 3)
      return NULL;
    return "unknowen compressor code";
  }
  447. const char * _bz2_strerror(int err) {
  448. // Since bzlib does not have strerror, we do it here manually.
  449. // This works for version 1.0 of 21 March 2000 of bzlib.h
  450. switch (err) {
  451. case BZ_OK: return("BZ_OK");
  452. case BZ_RUN_OK: return("BZ_RUN_OK");
  453. case BZ_FLUSH_OK: return("BZ_FLUSH_OK");
  454. case BZ_FINISH_OK: return("BZ_FINISH_OK");
  455. case BZ_STREAM_END: return("BZ_STREAM_END");
  456. case BZ_SEQUENCE_ERROR: return("BZ_SEQUENCE_ERROR");
  457. case BZ_PARAM_ERROR: return("BZ_PARAM_ERROR");
  458. case BZ_MEM_ERROR: return("BZ_MEM_ERROR");
  459. case BZ_DATA_ERROR: return("BZ_DATA_ERROR");
  460. case BZ_DATA_ERROR_MAGIC: return("BZ_DATA_ERROR_MAGIC");
  461. case BZ_IO_ERROR: return("BZ_IO_ERROR");
  462. case BZ_UNEXPECTED_EOF: return("BZ_UNEXPECTED_EOF");
  463. case BZ_OUTBUFF_FULL: return("BZ_OUTBUFF_FULL");
  464. case BZ_CONFIG_ERROR: return("BZ_CONFIG_ERROR");
  465. default: return("unknowen bzip2 error code");
  466. }
  467. }