Mstdlib-1.24.0
m_textcodec.h
1/* The MIT License (MIT)
2 *
3 * Copyright (c) 2018 Monetra Technologies, LLC.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a copy
6 * of this software and associated documentation files (the "Software"), to deal
7 * in the Software without restriction, including without limitation the rights
8 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 * copies of the Software, and to permit persons to whom the Software is
10 * furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice shall be included in
13 * all copies or substantial portions of the Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 * THE SOFTWARE.
22 */
23
24#ifndef __M_TEXTCODEC_H__
25#define __M_TEXTCODEC_H__
26
27/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
28
29#include <mstdlib/base/m_defs.h>
30#include <mstdlib/base/m_types.h>
31
32/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
33
34__BEGIN_DECLS
35
36/*! \addtogroup m_textcodec Text Encoding Conversion
37 * \ingroup m_text
38 *
39 * Text codec conversion. E.g. utf-8 to X and X to utf-8.
40 *
41 * utf-8 is used as the base codec. Input for encode should be utf-8 and
42 * output from decode will be utf-8.
43 *
44 *
45 * Codec | Name | Alias
46 * -----------------|-----------------------------------|------
47 * UTF8 | utf8 | utf-8, utf_8
48 * ASCII | ascii | us-ascii
49 * CP037 | cp037 | ibm037, ibm-037, ibm039, ibm-039
50 * CP500 | cp500 | ibm500, ibm-500, ebcdic-cp-be, ebcdic-cp-ch
51 * CP874 | cp874 | windows-874
52 * CP1250 | cp1250 | windows-1250
53 * CP1251 | cp1251 | windows-1251
54 * CP1252 | cp1252 | windows-1252
55 * CP1253 | cp1253 | windows-1253
56 * CP1254 | cp1254 | windows-1254
57 * CP1255 | cp1255 | windows-1255
58 * CP1256 | cp1256 | windows-1256
59 * CP1257 | cp1257 | windows-1257
60 * CP1258 | cp1258 | windows-1258
61 * ISO8859_1 | latin_1 | latin-1, latin1, latin 1, latin, l1, iso-8859-1, iso8859-1, iso8859_1, iso88591, 8859, 88591, cp819
62 * ISO8859_2 | latin_2 | latin-2, latin2, latin 2, l2, iso-8859-2, iso8859-2, iso8859_2, iso88592, 88592
63 * ISO8859_3 | latin_3 | latin-3, latin3, latin 3, l3, iso-8859-3, iso8859-3, iso8859_3, iso88593, 88593
64 * ISO8859_4 | latin_4 | latin-4, latin4, latin 4, l4, iso-8859-4, iso8859-4, iso8859_4, iso88594, 88594
65 * ISO8859_5 | cyrillic | iso-8859-5, iso8859-5, iso8859_5, iso88595, 88595
66 * ISO8859_6 | arabic | iso-8859-6, iso8859-6, iso8859_6, iso88596, 88596
67 * ISO8859_7 | greek | iso-8859-7, iso8859-7, iso8859_7, iso88597, 88597
68 * ISO8859_8 | hebrew | iso-8859-8, iso8859-8, iso8859_8, iso88598, 88598
69 * ISO8859_9 | latin_5 | latin-5, latin5, latin 5, l5, iso-8859-9, iso8859-9, iso8859_9, iso88599, 88599
70 * ISO8859_10 | latin_6 | latin-6, latin6, latin 6, l6, iso-8859-10, iso8859-10, iso8859_10, iso885910, 885910
71 * ISO8859_11 | thai | iso-8859-11, iso8859-11, iso8859_11, iso885911, 885911
72 * ISO8859_13 | latin_7 | latin-7, latin7, latin 7, l7, iso-8859-13, iso8859-13, iso8859_13, iso885913, 885913
73 * ISO8859_14 | latin_8 | latin-8, latin8, latin 8, l8, iso-8859-14, iso8859-14, iso8859_14, iso885914, 885914
74 * ISO8859_15 | latin_9 | latin-9, latin9, latin 9, l9, iso-8859-15, iso8859-15, iso8859_15, iso885915, 885915
75 * ISO8859_16 | latin_10 | latin-10, latin10, latin 10, l10, iso-8859-16, iso8859-16, iso8859_16, iso885916, 885916
76 * PERCENT_URL | percent | url
77 * PERCENT_FORM | application/x-www-form-urlencoded | x-www-form-urlencoded, www-form-urlencoded, form-urlencoded, percent_plus, url_plus, percent-plus, url-plus, percentplus, urlplus
78 * PERCENT_URLMIN | percent_min | url_min
79 * PERCENT_FORMMIN | form_min | form-urlencoded-min
80 * PUNYCODE | punycode | puny
81 * QUOTED_PRINTABLE | quoted-printable | qp
82 *
83 * If validating UTF-8 strings, use M_utf8_is_valid().
84 *
85 * UTF-8 to UTF-8 conversion for decode and encode is supported and intended to be used with
86 * the replace error handler. Specifically when dealing with UTF-8 strings that are known to be
87 * or could be invalid and need to be "sanitized" for continued use. The difference between
88 * encode and decode with UTF-8 to UTF-8 is the replacement character.
89 *
90 * @{
91 */
92
93
94/*! Error handling logic. */
95typedef enum {
96 M_TEXTCODEC_EHANDLER_FAIL, /*!< Errors should be considered a hard failure. */
97 M_TEXTCODEC_EHANDLER_REPLACE, /*!< Encode replace with ?. Decode replace with U+FFFD. */
98 M_TEXTCODEC_EHANDLER_IGNORE /*!< Ignore data that cannot be encoded or decoded in the codec. */
100
101
102/*! Text codecs that can be used for encoding and decoding. */
103typedef enum {
104 M_TEXTCODEC_UNKNOWN, /*!< Unknown / invalid codec. */
105 M_TEXTCODEC_UTF8, /*!< Utf-8. */
106 M_TEXTCODEC_ASCII, /*!< Ascii. */
107 M_TEXTCODEC_CP037, /*!< EBCDIC US Canada. */
108 M_TEXTCODEC_CP500, /*!< EBCDIC International. */
109 M_TEXTCODEC_CP874, /*!< Windows code page 874, Thai. */
110 M_TEXTCODEC_CP1250, /*!< Windows code page 1250, Central and Eastern Europe. */
111 M_TEXTCODEC_CP1251, /*!< Windows code page 1251, Bulgarian, Byelorussian, Macedonian, Russian, Serbian. */
112 M_TEXTCODEC_CP1252, /*!< Windows code page 1252, Western Europe. */
113 M_TEXTCODEC_CP1253, /*!< Windows code page 1253, Greek. */
114 M_TEXTCODEC_CP1254, /*!< Windows code page 1254, Turkish. */
115 M_TEXTCODEC_CP1255, /*!< Windows code page 1255, Hebrew. */
116 M_TEXTCODEC_CP1256, /*!< Windows code page 1256, Arabic. */
117 M_TEXTCODEC_CP1257, /*!< Windows code page 1257, Baltic languages. */
118 M_TEXTCODEC_CP1258, /*!< Windows code page 1258, Vietnamese. */
119 M_TEXTCODEC_ISO8859_1, /*!< ISO-8859-1. Latin 1, Western Europe. */
120 M_TEXTCODEC_ISO8859_2, /*!< ISO-8859-2. Latin 2, Central and Eastern Europe. */
121 M_TEXTCODEC_ISO8859_3, /*!< ISO-8859-3. Latin 3, Esperanto, Maltese. */
122 M_TEXTCODEC_ISO8859_4, /*!< ISO-8859-4. Latin 4, Baltic languages. */
123 M_TEXTCODEC_ISO8859_5, /*!< ISO-8859-5. Cyrillic. */
124 M_TEXTCODEC_ISO8859_6, /*!< ISO-8859-6. Arabic. */
125 M_TEXTCODEC_ISO8859_7, /*!< ISO-8859-7. Greek. */
126 M_TEXTCODEC_ISO8859_8, /*!< ISO-8859-8. Hebrew. */
127 M_TEXTCODEC_ISO8859_9, /*!< ISO-8859-9. Latin 5, Turkish. */
128 M_TEXTCODEC_ISO8859_10, /*!< ISO-8859-10. Latin 6, Nordic languages. */
129 M_TEXTCODEC_ISO8859_11, /*!< ISO-8859-11. Thai. */
130 M_TEXTCODEC_ISO8859_13, /*!< ISO-8859-13. Latin 7, Baltic languages. */
131 M_TEXTCODEC_ISO8859_14, /*!< ISO-8859-14. Latin 8, Celtic languages. */
132 M_TEXTCODEC_ISO8859_15, /*!< ISO-8859-15. Latin 9, Western Europe. */
133 M_TEXTCODEC_ISO8859_16, /*!< ISO-8859-16. Latin 10, South-Eastern Europe. */
134 M_TEXTCODEC_PERCENT_URL, /*!< Percent encoding for use in a URL. Must be utf-8. */
135 M_TEXTCODEC_PERCENT_FORM, /*!< Percent encoding suitable for use as form data. Space as + and ~ encoded. Must be utf-8. */
136 M_TEXTCODEC_PERCENT_URLMIN, /*!< Minimal percent encoding. space and non ascii characters will be encoded but
137 all other reserved characters are not encoded. This is intended as a fix up
138 for URLs that have already been built. Typically built by hand. Must be utf-8. */
139 M_TEXTCODEC_PERCENT_FORMMIN, /*!< Minimal percent encoding suitable for use as form data. Space as + and ~ encoded.
140 Space and non-ascii characters are encoded. All other reserved characters are not
141 encoded. This is intended as a fix up. Must be utf-8. */
142 M_TEXTCODEC_PUNYCODE, /*!< IDNA Punycode (RFC 3492). Primarily used for DNS.
143 Error handlers will be ignored and all error conditions are failures. */
144 M_TEXTCODEC_QUOTED_PRINTABLE /*!< RFC 2045 quoted printable content transfer encoding. */
146
147
148/*! Result of a codec conversion. */
149typedef enum {
150 M_TEXTCODEC_ERROR_SUCCESS, /*!< Successfully converted. */
151 M_TEXTCODEC_ERROR_SUCCESS_EHANDLER, /*!< Successfully converted based on error handling logic. */
152 M_TEXTCODEC_ERROR_FAIL, /*!< Failure to convert. */
153 M_TEXTCODEC_ERROR_BADINPUT, /*!< Input not in specified encoding. This cannot always be detected and
154 should not be used as a means of determining input encoding. */
155 M_TEXTCODEC_ERROR_INVALID_PARAM, /*!< Invalid parameter. */
157
158
159/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
160
161/*! Encode a utf-8 string using the requested text encoding.
162 *
163 * \param[out] out Encoded string.
164 * \param[in] in Input utf-8 string.
165 * \param[in] ehandler Error handling logic to use.
166 * \param[in] codec Encoding to use for output.
167 *
168 * \return Result.
169 */
170M_API M_textcodec_error_t M_textcodec_encode(char **out, const char *in, M_textcodec_ehandler_t ehandler, M_textcodec_codec_t codec) M_WARN_UNUSED_RESULT M_WARN_NONNULL(1);
171
172
173/*! Encode a utf-8 string into an M_buf_t using the requested text encoding.
174 *
175 * \param[in] buf Buffer to put encoded string data.
176 * \param[in] in Input utf-8 string.
177 * \param[in] ehandler Error handling logic to use.
178 * \param[in] codec Encoding to use for output.
179 *
180 * \return Result.
181 */
182M_API M_textcodec_error_t M_textcodec_encode_buf(M_buf_t *buf, const char *in, M_textcodec_ehandler_t ehandler, M_textcodec_codec_t codec) M_WARN_UNUSED_RESULT M_WARN_NONNULL(1);
183
184
185/*! Encode a utf-8 string into an M_parser_t using the requested text encoding.
186 *
187 * \param[in] parser Parser to put encoded string data.
188 * \param[in] in Input utf-8 string.
189 * \param[in] ehandler Error handling logic to use.
190 * \param[in] codec Encoding to use for output.
191 *
192 * \return Result.
193 */
194M_API M_textcodec_error_t M_textcodec_encode_parser(M_parser_t *parser, const char *in, M_textcodec_ehandler_t ehandler, M_textcodec_codec_t codec) M_WARN_UNUSED_RESULT M_WARN_NONNULL(1);
195
196
197/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
198
199/*! Decode a string to utf-8.
200 *
201 * \param[out] out utf-8 string.
202 * \param[in] in Input encoded string.
203 * \param[in] ehandler Error handling logic to use.
204 * \param[in] codec Encoding of the input string.
205 *
206 * \return Result.
207 */
208M_API M_textcodec_error_t M_textcodec_decode(char **out, const char *in, M_textcodec_ehandler_t ehandler, M_textcodec_codec_t codec) M_WARN_UNUSED_RESULT M_WARN_NONNULL(1);
209
210
211/*! Decode a string to utf-8 into a M_buf_t.
212 *
213 * \param[in] buf Buffer to put decoded utf-8 string data.
214 * \param[in] in Input encoded string.
215 * \param[in] ehandler Error handling logic to use.
216 * \param[in] codec Encoding of the input string.
217 *
218 * \return Result.
219 */
220M_API M_textcodec_error_t M_textcodec_decode_buf(M_buf_t *buf, const char *in, M_textcodec_ehandler_t ehandler, M_textcodec_codec_t codec) M_WARN_UNUSED_RESULT M_WARN_NONNULL(1);
221
222
223/*! Decode a string to utf-8 into a M_parser_t.
224 *
225 * \param[in] parser Parser to put decoded utf-8 string data.
226 * \param[in] in Input encoded string.
227 * \param[in] ehandler Error handling logic to use.
228 * \param[in] codec Encoding of the input string.
229 *
230 * \return Result.
231 */
232M_API M_textcodec_error_t M_textcodec_decode_parser(M_parser_t *parser, const char *in, M_textcodec_ehandler_t ehandler, M_textcodec_codec_t codec) M_WARN_UNUSED_RESULT M_WARN_NONNULL(1);
233
234
235/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
236
237/*! Returns whether the error code is a failure.
238 *
239 * \param[in] err Error to evaluate
240 *
241 * \return M_TRUE if error, M_FALSE if not.
242 */
244
245
246/*! Get the codec from the string name.
247 *
248 * \param[in] s Codec as a string.
249 *
250 * \return Codec.
251 */
253
254
255/*! Convert the codec to its string name.
256 *
257 * \param[in] codec Codec.
258 *
259 * \return String.
260 */
262
263/*! @} */
264
265__END_DECLS
266
267#endif /* __M_TEXTCODEC_H__ */
struct M_buf M_buf_t
Definition: m_buf.h:77
struct M_parser M_parser_t
Definition: m_parser.h:52
const char * M_textcodec_codec_to_str(M_textcodec_codec_t codec)
M_textcodec_error_t M_textcodec_encode_buf(M_buf_t *buf, const char *in, M_textcodec_ehandler_t ehandler, M_textcodec_codec_t codec) M_WARN_UNUSED_RESULT
M_textcodec_error_t M_textcodec_decode(char **out, const char *in, M_textcodec_ehandler_t ehandler, M_textcodec_codec_t codec) M_WARN_UNUSED_RESULT
M_textcodec_error_t M_textcodec_decode_parser(M_parser_t *parser, const char *in, M_textcodec_ehandler_t ehandler, M_textcodec_codec_t codec) M_WARN_UNUSED_RESULT
M_textcodec_error_t M_textcodec_encode_parser(M_parser_t *parser, const char *in, M_textcodec_ehandler_t ehandler, M_textcodec_codec_t codec) M_WARN_UNUSED_RESULT
M_textcodec_codec_t
Definition: m_textcodec.h:103
M_textcodec_error_t
Definition: m_textcodec.h:149
M_bool M_textcodec_error_is_error(M_textcodec_error_t err)
M_textcodec_error_t M_textcodec_decode_buf(M_buf_t *buf, const char *in, M_textcodec_ehandler_t ehandler, M_textcodec_codec_t codec) M_WARN_UNUSED_RESULT
M_textcodec_error_t M_textcodec_encode(char **out, const char *in, M_textcodec_ehandler_t ehandler, M_textcodec_codec_t codec) M_WARN_UNUSED_RESULT
M_textcodec_ehandler_t
Definition: m_textcodec.h:95
M_textcodec_codec_t M_textcodec_codec_from_str(const char *s)
@ M_TEXTCODEC_PUNYCODE
Definition: m_textcodec.h:142
@ M_TEXTCODEC_UNKNOWN
Definition: m_textcodec.h:104
@ M_TEXTCODEC_UTF8
Definition: m_textcodec.h:105
@ M_TEXTCODEC_ISO8859_3
Definition: m_textcodec.h:121
@ M_TEXTCODEC_ISO8859_10
Definition: m_textcodec.h:128
@ M_TEXTCODEC_ISO8859_16
Definition: m_textcodec.h:133
@ M_TEXTCODEC_ISO8859_13
Definition: m_textcodec.h:130
@ M_TEXTCODEC_CP037
Definition: m_textcodec.h:107
@ M_TEXTCODEC_PERCENT_FORMMIN
Definition: m_textcodec.h:139
@ M_TEXTCODEC_CP1257
Definition: m_textcodec.h:117
@ M_TEXTCODEC_PERCENT_URLMIN
Definition: m_textcodec.h:136
@ M_TEXTCODEC_ISO8859_4
Definition: m_textcodec.h:122
@ M_TEXTCODEC_CP1256
Definition: m_textcodec.h:116
@ M_TEXTCODEC_ISO8859_1
Definition: m_textcodec.h:119
@ M_TEXTCODEC_PERCENT_FORM
Definition: m_textcodec.h:135
@ M_TEXTCODEC_ISO8859_6
Definition: m_textcodec.h:124
@ M_TEXTCODEC_ISO8859_7
Definition: m_textcodec.h:125
@ M_TEXTCODEC_ISO8859_14
Definition: m_textcodec.h:131
@ M_TEXTCODEC_ISO8859_15
Definition: m_textcodec.h:132
@ M_TEXTCODEC_CP1258
Definition: m_textcodec.h:118
@ M_TEXTCODEC_CP1254
Definition: m_textcodec.h:114
@ M_TEXTCODEC_CP1252
Definition: m_textcodec.h:112
@ M_TEXTCODEC_PERCENT_URL
Definition: m_textcodec.h:134
@ M_TEXTCODEC_ASCII
Definition: m_textcodec.h:106
@ M_TEXTCODEC_ISO8859_2
Definition: m_textcodec.h:120
@ M_TEXTCODEC_ISO8859_11
Definition: m_textcodec.h:129
@ M_TEXTCODEC_ISO8859_8
Definition: m_textcodec.h:126
@ M_TEXTCODEC_CP1255
Definition: m_textcodec.h:115
@ M_TEXTCODEC_CP1253
Definition: m_textcodec.h:113
@ M_TEXTCODEC_ISO8859_5
Definition: m_textcodec.h:123
@ M_TEXTCODEC_CP1250
Definition: m_textcodec.h:110
@ M_TEXTCODEC_QUOTED_PRINTABLE
Definition: m_textcodec.h:144
@ M_TEXTCODEC_ISO8859_9
Definition: m_textcodec.h:127
@ M_TEXTCODEC_CP874
Definition: m_textcodec.h:109
@ M_TEXTCODEC_CP1251
Definition: m_textcodec.h:111
@ M_TEXTCODEC_CP500
Definition: m_textcodec.h:108
@ M_TEXTCODEC_ERROR_BADINPUT
Definition: m_textcodec.h:153
@ M_TEXTCODEC_ERROR_FAIL
Definition: m_textcodec.h:152
@ M_TEXTCODEC_ERROR_INVALID_PARAM
Definition: m_textcodec.h:155
@ M_TEXTCODEC_ERROR_SUCCESS
Definition: m_textcodec.h:150
@ M_TEXTCODEC_ERROR_SUCCESS_EHANDLER
Definition: m_textcodec.h:151
@ M_TEXTCODEC_EHANDLER_IGNORE
Definition: m_textcodec.h:98
@ M_TEXTCODEC_EHANDLER_REPLACE
Definition: m_textcodec.h:97
@ M_TEXTCODEC_EHANDLER_FAIL
Definition: m_textcodec.h:96