#------------------------------------------------------------------------------
# $File: ispell,v 1.10 2023/10/23 19:49:58 christos Exp $
# ispell: file(1) magic for ispell, MySpell, Hunspell and aspell
#
# Ispell 3.0 has a magic of 0x9601 and ispell 3.1 has 0x9602. This magic
# will match 0x9600 through 0x9603 in *both* little endian and big endian.
# (No other current magic entries collide.)
#
# Updated by Daniel Quinlan (quinlan@yggdrasil.com)
#
  11. 0 leshort&0xFFFC 0x9600 little endian ispell
  12. >0 byte 0 hash file (?),
  13. >0 byte 1 3.0 hash file,
  14. >0 byte 2 3.1 hash file,
  15. >0 byte 3 hash file (?),
  16. >2 leshort 0x00 8-bit, no capitalization, 26 flags
  17. >2 leshort 0x01 7-bit, no capitalization, 26 flags
  18. >2 leshort 0x02 8-bit, capitalization, 26 flags
  19. >2 leshort 0x03 7-bit, capitalization, 26 flags
  20. >2 leshort 0x04 8-bit, no capitalization, 52 flags
  21. >2 leshort 0x05 7-bit, no capitalization, 52 flags
  22. >2 leshort 0x06 8-bit, capitalization, 52 flags
  23. >2 leshort 0x07 7-bit, capitalization, 52 flags
  24. >2 leshort 0x08 8-bit, no capitalization, 128 flags
  25. >2 leshort 0x09 7-bit, no capitalization, 128 flags
  26. >2 leshort 0x0A 8-bit, capitalization, 128 flags
  27. >2 leshort 0x0B 7-bit, capitalization, 128 flags
  28. >2 leshort 0x0C 8-bit, no capitalization, 256 flags
  29. >2 leshort 0x0D 7-bit, no capitalization, 256 flags
  30. >2 leshort 0x0E 8-bit, capitalization, 256 flags
  31. >2 leshort 0x0F 7-bit, capitalization, 256 flags
  32. >4 leshort >0 and %d string characters
  33. 0 beshort&0xFFFC 0x9600 big endian ispell
  34. >1 byte 0 hash file (?),
  35. >1 byte 1 3.0 hash file,
  36. >1 byte 2 3.1 hash file,
  37. >1 byte 3 hash file (?),
  38. >2 beshort 0x00 8-bit, no capitalization, 26 flags
  39. >2 beshort 0x01 7-bit, no capitalization, 26 flags
  40. >2 beshort 0x02 8-bit, capitalization, 26 flags
  41. >2 beshort 0x03 7-bit, capitalization, 26 flags
  42. >2 beshort 0x04 8-bit, no capitalization, 52 flags
  43. >2 beshort 0x05 7-bit, no capitalization, 52 flags
  44. >2 beshort 0x06 8-bit, capitalization, 52 flags
  45. >2 beshort 0x07 7-bit, capitalization, 52 flags
  46. >2 beshort 0x08 8-bit, no capitalization, 128 flags
  47. >2 beshort 0x09 7-bit, no capitalization, 128 flags
  48. >2 beshort 0x0A 8-bit, capitalization, 128 flags
  49. >2 beshort 0x0B 7-bit, capitalization, 128 flags
  50. >2 beshort 0x0C 8-bit, no capitalization, 256 flags
  51. >2 beshort 0x0D 7-bit, no capitalization, 256 flags
  52. >2 beshort 0x0E 8-bit, capitalization, 256 flags
  53. >2 beshort 0x0F 7-bit, capitalization, 256 flags
  54. >4 beshort >0 and %d string characters
  55. # ispell 4.0 hash files kromJx <kromJx@crosswinds.net>
  56. # Ispell 4.0
  57. 0 string ISPL ispell
  58. >4 long x hash file version %d,
  59. >8 long x lexletters %d,
  60. >12 long x lexsize %d,
  61. >16 long x hashsize %d,
  62. >20 long x stblsize %d
  63. # Summary: affixes defition text files for Ispell/MySpell/Hunspell
  64. # From: Joerg Jenderek
  65. # URL: https://www.openoffice.org/lingucomponent/affix.readme
  66. # https://man.archlinux.org/man/hunspell.5.en
  67. # Reference: http://mark0.net/download/triddefs_xml.7z/defs/a/affix.trid.xml
  68. # Note: called "Affix file" by TrID
  69. # variant starting with comment character
  70. 0 ubyte 0x23
  71. # look for SET character command followed by whitespace (seems to be often 1 space character) like in:
  72. # /usr/share/calibre/dictionaries/en-GB/en-GB.aff
  73. >0 search/60459 SET\040
  74. # skip scripts like /bin/affixcompress /bin/setupcon /bin/imdbpy2sql.py by checking for valid character SET argument
  75. # character SET argument like: UTF-8
  76. >>&0 string UTF-8
  77. >>>0 use spell-aff
  78. # character SET argument like: ISO8859-1 - ISO8859-10 ISO8859-13 - ISO8859-15
  79. >>&0 string ISO8859-
  80. >>>0 use spell-aff
  81. # character SET argument for Russian with Cyrillic alphabet like: KOI8-R KOI8-U
  82. # no russian support until war against ukraine
  83. >>&0 string KOI8-
  84. #>>>0 use spell-aff
  85. # character SET argument for languages with Cyrillic alphabet like: cp1251
  86. # no cyrillic support until russia war against ukraine
  87. >>&0 string cp1251
  88. #>>>0 use spell-aff
  89. # character SET argument for Indian Script Code for Information Interchange (ISCII) like: ISCII-DEVANAGARI
  90. >>&0 string ISCII-
  91. # no example found
  92. >>>0 use spell-aff
  93. # not "real" affix rule files but found as tests unit inside thunderbird sources like:
  94. # 1463589.aff 1695964.aff 2970240.aff
  95. >0 default x
  96. # look for suffix SFX command followed by whitespace like in:
  97. # 1695964.aff
  98. >>0 search/164 SFX\040
  99. >>>0 use spell-aff
  100. # if not real Hunspell/MySpell affix look for ispell variant
  101. >>0 default x
  102. # URL: https://manpages.debian.org/testing/ispell/ispell.5.en.html
  103. # look for ispell declaration like in: /usr/lib/ispell/espanol.aff
  104. >>>0 search/8251 defstringtype
  105. # defstringtype declaration start with unique name (like "list" "lat" "utf8" "iso" "nroff" often like formatter name)
  106. # followed by formatter name (like "nroff" "tex")
  107. # followed by suffix list (like ".mm" ".ms" ".me" ".man" ".NeXT" ".txt" ".list")
  108. #>>>>&1 string x DECLARATION=%s
  109. >>>>0 use spell-aff
  110. # ispell variant without declaration like in: /usr/lib/ispell/bulgarian.aff /usr/lib/ispell/russian.aff
  111. >>>0 default x
  112. # skip /etc/nilfs_cleanerd.conf by looking for ispell suffix section
  113. >>>>0 search/3233 suffixes\n
  114. >>>>>0 use spell-aff
  115. # variant starting with empty line and comment character at the beginning of 2nd line like in: /usr/lib/ispell/polish.aff
  116. 0 ubeshort 0x0a23
  117. # skip /etc/discover-modprobe.conf by looking for ispell declaration
  118. >2 search/3118 defstringtype
  119. >>0 use spell-aff
  120. # starting with UTF-8 Byte Order Mark (BOM) https://en.wikipedia.org/wiki/Byte_order_mark
  121. 0 string \xEF\xBB\xBF
  122. # starting with UTF-8 Byte Order Mark (BOM) followed by comment starting character
  123. >3 string \x23
  124. # starting with UTF-8 BOM and with SET character command followed by whitespace
  125. # like in: /opt/Wolfram/WolframEngine/13.1/SystemFiles/Components/SpellingData/SpellingDictionaries/lt.aff
  126. # look for character SET command used in MySpell and Hunspell
  127. >3 search/9883 SET\040
  128. >>0 use spell-aff
  129. # look for FLAG type command used in MySpell and Hunspell
  130. 0 string FLAG
  131. # followed by space character like in
  132. # /opt/Wolfram/WolframEngine/13.1/SystemFiles/Components/SpellingData/SpellingDictionaries/en_US.aff
  133. >4 ubyte 0x20
  134. >>0 use spell-aff
  135. # or followed by tabulator character like in
  136. # /opt/Wolfram/WolframEngine/13.1/SystemFiles/Components/SpellingData/SpellingDictionaries/ar.aff
  137. >4 ubyte 0x09
  138. >>0 use spell-aff
  139. # starting with character SET command used in MySpell and Hunspell like in: org/languagetool/resource/sv/hunspell/sv_SE.aff
  140. 0 string SET\040
  141. >0 use spell-aff
  142. # starting with language code LANG used in MySpell and Hunspell like in: /usr/share/hunspell/tr_TR.aff
  143. 0 string LANG\040
  144. >0 use spell-aff
  145. # starting with affix flag command AF used in MySpell and Hunspell like in: /usr/lib/thunderbird/extensions/langpack-hu@thunderbird.mozilla.org/dictionaries/hu.aff
  146. 0 string AF\040
  147. # look for number of flag vector aliases
  148. >3 regex [0-9]{1,4}
  149. >>0 use spell-aff
  150. # display information (encoding,language,...) about affixes rules text for Ispell/MySpell/Hunspell
  151. 0 name spell-aff
  152. >1 ubeshort x affix definition
  153. #!:mime text/plain
  154. !:mime text/x-affix
  155. !:ext aff
  156. # GRR: need extra test so that default clause works
  157. >0 ubyte x
  158. # look for ispell declaration
  159. >>0 search/8251 defstringtype for Ispell
  160. # ispell variant without declaration
  161. >>0 default x
  162. # look for ispell suffixes command
  163. >>>0 search/3233 suffixes
  164. # skip "suffixes used to create first part of a compound" by checking for flag argument like in: languagetool\resource\sv\hunspell\sv_SE.aff
  165. >>>>&0 search/2 flag for Ispell
  166. >>>>&0 default x for MySpell/Hunspell
  167. # without suffixes keyword
  168. >>>0 default x for MySpell/Hunspell
  169. # look for language code command used in MySpell and Hunspell
  170. # like in: /usr/share/hunspell/de_AT.aff /usr/share/hunspell/it_IT.aff /usr/share/hunspell/tr_TR.aff /usr/lib/firefox/browser/extensions/langpack-hu@firefox.mozilla.org/dictionaries/hu.aff
  171. >>0 search/1117643 LANG\040 \b, language
  172. # language code argument like: de_DE hu_HU it_IT mn_MN tr_TR
  173. >>>&0 string x %s
  174. # look for character SET command used in MySpell and Hunspell
  175. >>0 search/1117729 SET
  176. # skip SETTINGS like in /usr/lib/ispell/ngerman.aff
  177. # SET command followed often by space character (0x20) or tabulator (0x09) like in
  178. # /opt/Wolfram/WolframEngine/13.1/SystemFiles/Components/SpellingData/SpellingDictionaries/ar.aff
  179. >>>&0 ubyte&0xD6 =0x00
  180. # skip SSET # schosS in /usr/lib/ispell/ogerman.aff
  181. >>>>&0 ubyte >0x48 \b,
  182. # character SET argument like: cp1251 ISCII-DEVANAGAR ISO8859-1 - ISO8859-10 ISO8859-13 - ISO8859-15 KOI8-R KOI8-U UTF-8
  183. >>>>>&-1 string x "%s" encoded
  184. # for control reasons show first non empty lines for ASCII or ISO-8859 text variant
  185. >1 ubeshort !0xBBBF
  186. # 1st line starting with 0x0A like in /usr/src/dicts/sjp-ispell-pl-20140213/polish.aff
  187. >>0 ubyte =0x0A
  188. >>>1 ubyte !0x0A \b, 2nd line
  189. >>>>&-1 string x "%s"
  190. # 3rd line starting with 0x0A like in polish.aff
  191. >>>>>&1 ubyte =0x0A
  192. >>>>>>&0 string x \b, 4th line "%s"
  193. # 1st line starting with ASCII text like:
  194. # this is the affix file of the de_DE Hunspell dictionary
  195. >>0 ubyte !0x0A
  196. >>>0 string x \b, 1st line "%s"
  197. >>>>&1 ubyte >0x1F \b, 2nd line
  198. >>>>>&-1 string x "%s"
  199. # 2nd line starting with 0x0A like in /usr/lib/ispell/bulgarian.aff
  200. >>>>&1 ubyte =0x0A \b, 3rd line
  201. >>>>>&0 string x "%s"
  202. # for control reasons show first lines for variant starting with ByteOrderMark (BOM=\xEF\xBB\xBF)
  203. >1 ubeshort =0xBBBF \b, with BOM
  204. >>3 string x \b, 1st line "%s"
  205. >>>&1 ubyte >0x1F \b, 2nd line
  206. >>>>&-1 string x "%s"
  207. # From: Joerg Jenderek
  208. # URL: https://en.wikipedia.org/wiki/GNU_Aspell
  209. # https://manpages.ubuntu.com/manpages/trusty/en/man8/aspell-autobuildhash.8.html
  210. # Reference: http://mark0.net/download/triddefs_xml.7z/defs/r/rws-aspell.trid.xml
  211. # https://ftp.gnu.org/gnu/aspell/aspell-0.60.8.tar.gz
  212. # aspell-0.60.8/modules/speller/default/data.cpp
  213. # aspell-0.60.8/modules/speller/default/readonly_ws.cpp
  214. # Note: called "aspell dictionary" by TrID
  215. 0 string aspell\040default\040speller\040rowl aspell dictionary
  216. #!:mime application/octet-stream
  217. !:mime application/x-aspell-dictionary
  218. !:ext rws
  219. # version like: 1.10 1.4
  220. >28 string x \b, version %s
  221. # u32int endian_check; 12345678=00BC614Eh
  222. #>64 ulelong x \b, endian_check=%u
  223. >>64 ulelong 12345678 \b, little endian
  224. # not tested
  225. >>64 ubelong 12345678 \b, big endian
  226. # older aspell version not like 0.60.8
  227. >>64 default x \b, old
  228. # URL: https://en.wikipedia.org/wiki/GNU_Aspell
  229. # Reference http://aspell.net/man-html/Format-of-the-Personal-and-Replacement-Dictionaries.html
  230. # personal_ws-1.1 lang num [encoding]
  231. 0 string personal_ aspell personal
  232. # Reference: http://mark0.net/download/triddefs_xml.7z/defs/p/pws-aspell.trid.xml
  233. # Note: called "aspell Personal dictionary" by TrID
  234. >9 string ws- dictionary
  235. #!:mime text/plain
  236. !:mime text/x-aspell-dictionary
  237. # like: ~/.aspell.en.pws ~/.aspell.de_DE.pws ~/.aspell.it.pws
  238. !:ext pws
  239. # Reference: http://mark0.net/download/triddefs_xml.7z/defs/p/prepl-aspell.trid.xml
  240. # Note: called "aspell Personal Replacement dictionary" by TrID
  241. # personal_repl-1.1 lang num [encoding]
  242. >9 string repl- replacement dictionary
  243. #!:mime text/plain
  244. !:mime text/x-aspell-dictionary
  245. # like: ~/.aspell.en.prepl ~/.aspell.de_DE.prepl ~/.aspell.it.prepl
  246. !:ext prepl