cfile_tools.c 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551
  1. /**
  2. cfile_tools.c
  3. A library to deal transparently with possibly compressed files.
  4. Documentation in the function headers and in cfile_tools.h
  5. Copyright (C) 2004 by Arno Wagner <arno.wagner@acm.org>
  6. Distributed under the Gnu Public License version 2 or the modified
  7. BSD license (see file COPYING)
  8. Support for gzip added by Bernhard Tellenbach <bernhard.tellenbach@gmail.com>
  9. */
  10. #define _GNU_SOURCE
  11. #define _FILE_OFFSET_BITS 64
  12. //#ifndef DONT_HAVE_BZ2
  13. //#include <bzlib.h>
  14. //#endif
  15. #include <stdlib.h>
  16. #include <string.h>
  17. #include <errno.h>
  18. #include <assert.h>
  19. #include "cfile_tools.h"
  20. // Concrete formats. remember to adjust CFR_NUM_FORMATS if changed!
  21. // Note: 0, 1 are special entries.
  22. const char * cfr_formats[CFR_NUM_FORMATS] = {
  23. "not open", // 0
  24. "uncompressed", // 1
  25. #ifndef DONT_HAVE_BZ2
  26. "bzip2", // 2
  27. #endif
  28. #ifndef DONT_HAVE_GZ
  29. "gzip", // 3
  30. #endif
  31. };
  32. const char * cfr_extensions[CFR_NUM_FORMATS] = {
  33. "", // 0
  34. "", // 1
  35. #ifndef DONT_HAVE_BZ2
  36. ".bz2", // 2
  37. #endif
  38. #ifndef DONT_HAVE_GZ
  39. ".gz" // 3
  40. #endif
  41. };
  42. // Prototypes of non API functions (don't use these from outside this file)
  43. const char * _cfr_compressor_strerror(int format, int err);
  44. const char * _bz2_strerror(int err);
  45. // API Functions
  46. CFRFILE *cfr_open(const char *path) {
  47. /*******************************/
  48. // Analog to 'fopen'. Error in result has to be tested using
  49. // 'cfr_error' on the result!
  50. // Note: The user needs to free the reurn value!
  51. // Opens a possibly compressed file for reading.
  52. // File type is determined by file name ending
  53. int format, ext_len, name_len;
  54. CFRFILE * retval = NULL;
  55. // determine file format
  56. name_len = strlen(path);
  57. format = 2; // skip specials 0, 1
  58. // Do action dependent on file format
  59. retval = (CFRFILE *) calloc(1,sizeof(CFRFILE));
  60. retval->eof = 0;
  61. retval->error1 = 0;
  62. retval->error2 = 0;
  63. #ifndef DONT_HAVE_GZ
  64. if((path == NULL) || (strcmp(path, "-") == 0)) {
  65. /* dump from stdin */
  66. gzFile *f;
  67. while (format < CFR_NUM_FORMATS) {
  68. if (strcmp(cfr_extensions[format], ".gz") == 0)
  69. break;
  70. format ++;
  71. }
  72. f = gzdopen(0, "r");
  73. if(f == NULL) {
  74. free(retval);
  75. return (NULL);
  76. }
  77. retval->data2 = f;
  78. retval->format = format;
  79. return (retval);
  80. }
  81. #endif
  82. while (format < CFR_NUM_FORMATS) {
  83. ext_len = strlen(cfr_extensions[format]);
  84. if (strncmp(cfr_extensions[format],
  85. path+(name_len-ext_len),
  86. ext_len) == 0
  87. ) break;
  88. format ++;
  89. }
  90. if (format >= CFR_NUM_FORMATS)
  91. format = 1; // uncompressed
  92. retval->format = format;
  93. switch (format) {
  94. case 1: // uncompressed
  95. {
  96. FILE * in;
  97. in = fopen(path,"r");
  98. if (in == NULL) {
  99. free(retval);
  100. return(NULL);
  101. }
  102. retval->data1 = in;
  103. return(retval);
  104. }
  105. break;
  106. #ifndef DONT_HAVE_BZ2
  107. case 2: // bzip2
  108. {
  109. int bzerror;
  110. BZFILE * bzin;
  111. FILE * in;
  112. retval->bz2_stream_end = 0;
  113. // get file
  114. in = fopen(path,"r");
  115. if (in == NULL) {
  116. free(retval);
  117. return(NULL);
  118. }
  119. retval->data1 = in;
  120. // bzip2ify file
  121. bzin = BZ2_bzReadOpen( &bzerror, in, 0, 0, NULL, 0);
  122. if (bzerror != BZ_OK) {
  123. errno = bzerror;
  124. BZ2_bzReadClose( &bzerror, bzin);
  125. fclose(in);
  126. free(retval);
  127. return(NULL);
  128. }
  129. retval->data2 = bzin;
  130. return(retval);
  131. }
  132. break;
  133. #endif
  134. #ifndef DONT_HAVE_GZ
  135. case 3: // gzip
  136. {
  137. gzFile *f;
  138. // get file
  139. f = gzopen(path, "r");
  140. if(f == NULL) {
  141. free(retval);
  142. return (NULL);
  143. }
  144. retval->data2 = f;
  145. return (retval);
  146. }
  147. break;
  148. #endif
  149. default: // this is an internal error, no diag yet.
  150. fprintf(stderr,"illegal format '%d' in cfr_open!\n", format);
  151. exit(1);
  152. }
  153. return NULL;
  154. }
  155. int cfr_close(CFRFILE *stream) {
  156. /**************************/
  157. // Analog to 'fclose'.
  158. // FIXME - why is stream->* set, then freed?
  159. if (stream == NULL || stream->closed) {
  160. errno = EBADF;
  161. return -1;
  162. }
  163. int retval = -1;
  164. switch (stream->format) {
  165. case 1: // uncompressed
  166. retval = fclose((FILE *)(stream->data1));
  167. stream->error1 = retval;
  168. break;
  169. case 2: // bzip2
  170. #ifndef DONT_HAVE_BZ2
  171. BZ2_bzReadClose( &stream->error2, (BZFILE *)stream->data2);
  172. stream->error1 = retval = fclose((FILE *)(stream->data1));
  173. break;
  174. #endif
  175. case 3: // gzip
  176. #ifndef DONT_HAVE_GZ
  177. if(stream->data2!=NULL)
  178. retval = gzclose(stream->data2);
  179. stream->error2 = retval;
  180. break;
  181. #endif
  182. default: // internal error
  183. assert("illegal stream->format" && 0);
  184. }
  185. free(stream);
  186. return(retval);
  187. }
  188. size_t cfr_read_n(CFRFILE *stream, void *ptr, size_t bytes) {
  189. /******************************************************************/
  190. // Wrapper, will return either 'bytes' (the number of bytes to read) or 0
  191. return(cfr_read(ptr, bytes, 1, stream)*bytes);
  192. }
  193. size_t cfr_read(void *ptr, size_t size, size_t nmemb, CFRFILE *stream) {
  194. /******************************************************************/
  195. // Analog to 'fread'. Will not return with partial elements, only
  196. // full ones. Hence calling this function with one large element
  197. // size will result in a complete or no read.
  198. size_t retval = 0;
  199. if (stream == NULL) return(0);
  200. // shortcut
  201. if (stream->eof) return(0);
  202. switch (stream->format) {
  203. case 1: // uncompressed
  204. {
  205. FILE * in;
  206. in = (FILE *)(stream->data1);
  207. retval = fread(ptr, size, nmemb, in);
  208. if (retval != nmemb) {
  209. // fprintf(stderr,"short read!!!\n");
  210. stream->eof = feof(in);
  211. stream->error1 = ferror(in);
  212. retval = 0;
  213. }
  214. return (retval);
  215. }
  216. break;
  217. #ifndef DONT_HAVE_BZ2
  218. case 2: // bzip2
  219. {
  220. BZFILE * bzin;
  221. int bzerror;
  222. int buffsize;
  223. if (stream->bz2_stream_end == 1) {
  224. // feof-behaviour: Last read did consume last byte but not more
  225. stream->eof = 1;
  226. return(0);
  227. }
  228. bzerror = BZ_OK;
  229. bzin = (BZFILE *) (stream->data2);
  230. buffsize = size * nmemb;
  231. retval = BZ2_bzRead(&bzerror, bzin, ptr, buffsize);
  232. if (bzerror == BZ_STREAM_END ) {
  233. stream->bz2_stream_end = 1;
  234. stream->error2 = bzerror;
  235. if (retval == buffsize) {
  236. // feof-behaviour: no eof yet
  237. } else {
  238. // feof-behaviour: read past end, set eof
  239. stream->eof = 1;
  240. retval = 0;
  241. }
  242. return(retval/size);
  243. }
  244. if (bzerror == BZ_OK) {
  245. // Normal case, no error.
  246. // A short read here is an error, so catch it
  247. if (retval == buffsize) {
  248. return(retval/size);
  249. }
  250. }
  251. // Other error...
  252. stream->error2 = bzerror;
  253. BZ2_bzReadClose( &bzerror, bzin );
  254. if (bzerror != BZ_OK) {
  255. stream->error2 = bzerror;
  256. }
  257. retval = fclose((FILE *)(stream->data1));
  258. stream->error1 = retval;
  259. stream->closed = 1;
  260. return(0);
  261. }
  262. break;
  263. #endif
  264. #ifndef DONT_HAVE_GZ
  265. case 3: // gzip
  266. {
  267. gzFile * in;
  268. in = (gzFile *)(stream->data2);
  269. retval = gzread(in, ptr, size*nmemb);
  270. if (retval != nmemb*size) {
  271. // fprintf(stderr,"short read!!!\n");
  272. stream->eof = gzeof(in);
  273. stream->error2 = errno;
  274. retval = 0;
  275. }
  276. return (retval/size);
  277. }
  278. break;
  279. #endif
  280. default: // this is an internal error, no diag yet.
  281. fprintf(stderr,"illegal format '%d' in cfr_read!\n",stream->format);
  282. exit(1);
  283. }
  284. }
  285. ssize_t cfr_getline(char **lineptr, size_t *n, CFRFILE *stream) {
  286. /************************************************************/
  287. // May not be very efficient, since it uses single-char reads
  288. // for formats where there is no native getline in the library.
  289. // For bzip2 the speedup for additional buffering was only 5%
  290. // so I dropped it.
  291. // Returns -1 in case of an error.
  292. if (stream == NULL) return(-1);
  293. switch (stream->format) {
  294. case 1: // uncompressed
  295. {
  296. if (fgets(*lineptr, *n, (FILE *)(stream->data1)) == NULL) {
  297. stream->error1 = errno;
  298. return -1;
  299. }
  300. return 0;
  301. }
  302. break;
  303. #ifndef DONT_HAVE_BZ2
  304. case 2: // bzip2
  305. {
  306. size_t count;
  307. char c;
  308. size_t ret;
  309. //bzin = (BZFILE *) (stream->data2);
  310. // allocate initial buffer if none was passed or size was zero
  311. if (*lineptr == NULL) {
  312. *lineptr = (char *) calloc(120, 1);
  313. *n = 120;
  314. }
  315. if (*n == 0) {
  316. *n = 120;
  317. *lineptr = (char *) realloc(*lineptr, *n); // to avoid memory-leaks
  318. }
  319. count = 0;
  320. // read until '\n'
  321. do {
  322. ret = cfr_read(&c, 1, 1, stream);
  323. if (ret != 1) {
  324. return(-1);
  325. }
  326. count ++;
  327. if (count >= *n) {
  328. *n = 2 * *n;
  329. *lineptr = (char *) realloc(*lineptr, *n);
  330. if (*lineptr == NULL) {
  331. stream->error1 = errno;
  332. return(-1);
  333. }
  334. }
  335. (*lineptr)[count-1] = c;
  336. } while (c != '\n');
  337. (*lineptr)[count] = 0;
  338. return(count);
  339. }
  340. break;
  341. #endif
  342. #ifndef DONT_HAVE_GZ
  343. case 3: // gzip
  344. {
  345. char * return_ptr = gzgets((gzFile *)(stream->data2), *lineptr, *n );
  346. if (return_ptr == Z_NULL) {
  347. stream->error2 = errno;
  348. return(-1);
  349. }
  350. return *n;
  351. }
  352. break;
  353. #endif
  354. default: // this is an internal error, no diag yet.
  355. fprintf(stderr,"illegal format '%d' in cfr_getline!\n",stream->format);
  356. exit(1);
  357. return(-1);
  358. }
  359. }
  360. int cfr_eof(CFRFILE *stream) {
  361. // Returns true on end of file/end of compressed data.
  362. // The end of the compressed data is regarded as end of file
  363. // in this library, embedded or multiple compressed data per
  364. // file is not supported by this library.
  365. //
  366. // Note: The sematics is that cfr_eof is true only after
  367. // the first byte after the end of file was read. Some compressors
  368. // report EOF already when the last availale character has been
  369. // read (far more sensible IMO), but for consistency we follow the
  370. // convention of the standard c library here.
  371. return(stream->eof);
  372. }
  373. int cfr_error(CFRFILE *stream) {
  374. // Returns true on error.
  375. // Errors can be ordinary errors from fopen.fclose/fread
  376. // or can originate from the underlying compression.
  377. // This function just returns 0 when there is no error or
  378. // 1 in case of error.
  379. // To get a more detailed report cfr_strerror will try to
  380. // come up with a description of the whole situation.
  381. // For numeric details, more query functions would need to be
  382. // implemented.
  383. if (stream == NULL) return(1);
  384. return(stream->error1 || stream->error2);
  385. }
  386. char * cfr_strerror(CFRFILE *stream) {
  387. // Result is "stream-i/o: <stream-error> <compressor>[: <compressor error>]"
  388. // Do not modify result.
  389. // Result may change on subsequent call to this function.
  390. static char res[120];
  391. char * msg, * msg2;
  392. if (stream == NULL) {
  393. asprintf(&msg,"Error: stream is NULL, i.e. not opened");
  394. return(msg);
  395. }
  396. asprintf(&msg,
  397. "stream-i/o: %s, %s [%s]",
  398. stream->eof?"EOF":"",
  399. strerror(stream->error1),
  400. cfr_compressor_str(stream));
  401. if (stream->format == 2) {
  402. asprintf(&msg2,
  403. "%s: %s",
  404. msg,
  405. _cfr_compressor_strerror(stream->format, stream->error2));
  406. free(msg);
  407. msg = msg2;
  408. }
  409. if (stream->format == 3) {
  410. asprintf(&msg2,
  411. "%s: %s",
  412. msg,
  413. gzerror((gzFile*)(stream->data2), &(stream->error2)));
  414. free(msg);
  415. msg = msg2;
  416. }
  417. snprintf(res, 120, "%s", msg);
  418. res[119] = 0;
  419. free(msg);
  420. return(res);
  421. }
  422. const char * cfr_compressor_str(CFRFILE *stream) {
  423. // Returns the name of the compressor used
  424. if ((stream->format < 0) || (stream->format >= CFR_NUM_FORMATS)) {
  425. return("undefined compression type");
  426. } else {
  427. return (cfr_formats[stream->format]);
  428. }
  429. }
  430. // Utility functions for compressor errors.
  431. // * Not part of the API, do not call directly as they may change! *
  /*
   * Translate a compressor error code into text for the given format code.
   *
   * Not part of the API. Always returns a non-NULL constant string, so the
   * result can be passed straight to a "%s" conversion. gzip errors cannot
   * be decoded from the number alone (zlib's gzerror() needs the file
   * handle), so a generic text is returned for that format.
   */
  const char *_cfr_compressor_strerror(int format, int err) {
      switch (format) {
      case 0:
          return "file not open";
      case 1:
          return "file not compressed";
  #ifndef DONT_HAVE_BZ2
      case 2:
          return _bz2_strerror(err);
  #endif
  #ifndef DONT_HAVE_GZ
      case 3:
          (void)err; /* needs the gzFile handle, see gzerror() */
          return "gzip error (details via gzerror)";
  #endif
      default:
          return "unknowen compressor code";
      }
  }
  455. #ifndef DONT_HAVE_BZ2
  456. const char * _bz2_strerror(int err) {
  457. // Since bzlib does not have strerror, we do it here manually.
  458. // This works for version 1.0 of 21 March 2000 of bzlib.h
  459. switch (err) {
  460. case BZ_OK: return("BZ_OK");
  461. case BZ_RUN_OK: return("BZ_RUN_OK");
  462. case BZ_FLUSH_OK: return("BZ_FLUSH_OK");
  463. case BZ_FINISH_OK: return("BZ_FINISH_OK");
  464. case BZ_STREAM_END: return("BZ_STREAM_END");
  465. case BZ_SEQUENCE_ERROR: return("BZ_SEQUENCE_ERROR");
  466. case BZ_PARAM_ERROR: return("BZ_PARAM_ERROR");
  467. case BZ_MEM_ERROR: return("BZ_MEM_ERROR");
  468. case BZ_DATA_ERROR: return("BZ_DATA_ERROR");
  469. case BZ_DATA_ERROR_MAGIC: return("BZ_DATA_ERROR_MAGIC");
  470. case BZ_IO_ERROR: return("BZ_IO_ERROR");
  471. case BZ_UNEXPECTED_EOF: return("BZ_UNEXPECTED_EOF");
  472. case BZ_OUTBUFF_FULL: return("BZ_OUTBUFF_FULL");
  473. case BZ_CONFIG_ERROR: return("BZ_CONFIG_ERROR");
  474. default: return("unknowen bzip2 error code");
  475. }
  476. }
  477. #endif