lws-tokenize.h source code [include/libwebsockets/lws-tokenize.h]

1	/*
2	* libwebsockets - small server side websockets and web server implementation
3	*
4	* Copyright (C) 2010 - 2019 Andy Green <andy@warmcat.com>
5	*
6	* Permission is hereby granted, free of charge, to any person obtaining a copy
7	* of this software and associated documentation files (the "Software"), to
8	* deal in the Software without restriction, including without limitation the
9	* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10	* sell copies of the Software, and to permit persons to whom the Software is
11	* furnished to do so, subject to the following conditions:
12	*
13	* The above copyright notice and this permission notice shall be included in
14	* all copies or substantial portions of the Software.
15	*
16	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21	* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
22	* IN THE SOFTWARE.
23	*/
24
/*
 * Option flags for lws_tokenize; OR them together into the `flags` member of
 * struct lws_tokenize (or pass them to lws_tokenize_init()).
 */

/* Do not treat - as a terminal character, so "my-token" is one token */
#define LWS_TOKENIZE_F_MINUS_NONTERM	(1 << 0)
/* Separately report aggregate colon-delimited tokens */
#define LWS_TOKENIZE_F_AGG_COLON	(1 << 1)
/* Enforce sequencing for a simple token , token , token ... list */
#define LWS_TOKENIZE_F_COMMA_SEP_LIST	(1 << 2)
/* Allow more characters in the tokens and fewer delimiters... default is
 * only alphanumeric + underscore in tokens */
#define LWS_TOKENIZE_F_RFC7230_DELIMS	(1 << 3)
/* Do not treat . as a terminal character, so "warmcat.com" is one token */
#define LWS_TOKENIZE_F_DOT_NONTERM	(1 << 4)
/* If something starts looking like a float, like 1.2, force to be string token.
 * This lets you receive dotted-quads like 192.168.0.1 as string tokens, and
 * avoids illegal float format detection like 1.myserver.com */
#define LWS_TOKENIZE_F_NO_FLOATS	(1 << 5)
/* Instead of LWS_TOKZE_INTEGER, report integers as any other string token */
#define LWS_TOKENIZE_F_NO_INTEGERS	(1 << 6)
/* # makes the rest of the line a comment */
#define LWS_TOKENIZE_F_HASH_COMMENT	(1 << 7)
/* Do not treat / as a terminal character, so "multipart/related" is one token */
#define LWS_TOKENIZE_F_SLASH_NONTERM	(1 << 8)
/* Do not treat * as a terminal character, so "myfile*" is one token */
#define LWS_TOKENIZE_F_ASTERISK_NONTERM	(1 << 9)
/* Do not treat = as a terminal character, so "x=y" is one token */
#define LWS_TOKENIZE_F_EQUALS_NONTERM	(1 << 10)
50
/*
 * Result codes for the tokenizer: negative values are errors, 0 means the
 * input ended, and positive values identify the kind of element found.
 */
typedef enum {

	LWS_TOKZE_ERRS			=  5, /* the number of errors defined */

	LWS_TOKZE_ERR_BROKEN_UTF8	= -5, /* malformed or partial utf8 */
	LWS_TOKZE_ERR_UNTERM_STRING	= -4, /* ended while we were in "" */
	LWS_TOKZE_ERR_MALFORMED_FLOAT	= -3, /* like 0..1 or 0.1.1 */
	LWS_TOKZE_ERR_NUM_ON_LHS	= -2, /* like 123= or 0.1= */
	LWS_TOKZE_ERR_COMMA_LIST	= -1, /* like ",tok", or, "tok,," */

	LWS_TOKZE_ENDED			=  0, /* no more content */

	/* Note: results have ordinal 1+, EOT is 0 and errors are < 0 */

	LWS_TOKZE_DELIMITER,		/* a delimiter appeared */
	LWS_TOKZE_TOKEN,		/* a token appeared */
	LWS_TOKZE_INTEGER,		/* an integer appeared */
	LWS_TOKZE_FLOAT,		/* a float appeared */
	LWS_TOKZE_TOKEN_NAME_EQUALS,	/* token [whitespace] = */
	LWS_TOKZE_TOKEN_NAME_COLON,	/* token [whitespace] : (only with
					 * LWS_TOKENIZE_F_AGG_COLON flag) */
	LWS_TOKZE_QUOTED_STRING,	/* "*", where * may have any char */

} lws_tokenize_elem;
75
/*
 * helper enums to allow caller to enforce legal delimiter sequencing, eg
 * disallow "token,,token", "token,", and ",token"
 */

enum lws_tokenize_delimiter_tracking {
	LWSTZ_DT_NEED_FIRST_CONTENT,	/* nothing seen yet: content must come first */
	LWSTZ_DT_NEED_DELIM,		/* content was seen: a delimiter must follow */
	LWSTZ_DT_NEED_NEXT_CONTENT,	/* delimiter was seen: content must follow */
};
86
/*
 * Tokenizer state: the caller describes the input with start / len / flags,
 * and each lws_tokenize() call reports its result through token / token_len.
 */
typedef struct lws_tokenize {
	const char *start; /**< set to the start of the string to tokenize */
	const char *token; /**< the start of an identified token or delimiter */
	size_t len;	   /**< set to the length of the string to tokenize */
	size_t token_len;  /**< the length of the identified token or delimiter */

	uint16_t flags;	   /**< optional LWS_TOKENIZE_F_ flags, or 0 */
	uint8_t delim;	   /**< delimiter tracking state; lws_tokenize_init()
			    *   sets it to LWSTZ_DT_NEED_FIRST_CONTENT */

	int8_t e;	   /**< convenient for storing lws_tokenize return */
} lws_tokenize_t;
98
99	/**
100	* lws_tokenize() - breaks down a string into tokens and delimiters in-place
101	*
102	* \param ts: the lws_tokenize struct to init
103	* \param start: the string to tokenize
104	* \param flags: LWS_TOKENIZE_F_ option flags
105	*
106	* This initializes the tokenize struct to point to the given string, and
107	* sets the length to 2GiB - 1 (so there must be a terminating NUL)... you can
108	* override this requirement by setting ts.len yourself before using it.
109	*
110	* .delim is also initialized to LWSTZ_DT_NEED_FIRST_CONTENT.
111	*/
112
113	LWS_VISIBLE LWS_EXTERN void
114	lws_tokenize_init(struct lws_tokenize ts, const* char start, int* flags);
115
116	/**
117	* lws_tokenize() - breaks down a string into tokens and delimiters in-place
118	*
119	* \param ts: the lws_tokenize struct with information and state on what to do
120	*
121	* The \p ts struct should have its start, len and flags members initialized to
122	* reflect the string to be tokenized and any options.
123	*
124	* Then `lws_tokenize()` may be called repeatedly on the struct, returning one
125	* of `lws_tokenize_elem` each time, and with the struct's `token` and
126	* `token_len` members set to describe the content of the delimiter or token
127	* payload each time.
128	*
129	* There are no allocations during the process.
130	*
131	* returns lws_tokenize_elem that was identified (LWS_TOKZE_ENDED means reached
132	* the end of the string).
133	*/
134
135	LWS_VISIBLE LWS_EXTERN lws_tokenize_elem
136	lws_tokenize(struct lws_tokenize *ts);
137
138	/**
139	* lws_tokenize_cstr() - copy token string to NUL-terminated buffer
140	*
141	* \param ts: pointer to lws_tokenize struct to operate on
142	* \param str: destination buffer
143	* \pparam max: bytes in destination buffer
144	*
145	* returns 0 if OK or nonzero if the string + NUL won't fit.
146	*/
147
148	LWS_VISIBLE LWS_EXTERN int
149	lws_tokenize_cstr(struct lws_tokenize ts, char* *str, size_t max);
150
151
/*
 * lws_strexp: flexible string expansion helper api
 *
 * This stateful helper can handle multiple separate input chunks and multiple
 * output buffer loads with arbitrary boundaries between literals and expanded
 * symbols.  This allows it to handle fragmented input as well as arbitrarily
 * long symbol expansions that are bigger than the output buffer itself.
 *
 * A user callback is used to convert symbol names to the symbol value.
 *
 * A single byte buffer for input and another for output can process any
 * length substitution then.  The state object is around 64 bytes on a 64-bit
 * system and it only uses 8 bytes stack.
 */

/*
 * Expansion callback: converts the symbol \p name into its value.
 *
 * \p out may be NULL when only the output length is wanted; the callback
 * must then avoid producing output.  For an expansion that fills the output
 * buffer, the callback returns LSTRX_FILLED_OUT and is called again with
 * \p *exp_ofs set appropriately to continue.
 *
 * NOTE(review): \p priv is presumably the pointer given to lws_strexp_init(),
 * and \p pos / \p olen the write position and size of \p out -- confirm
 * against the implementation.
 */
typedef int (*lws_strexp_expand_cb)(void *priv, const char *name, char *out,
				    size_t *pos, size_t olen, size_t *exp_ofs);

/*
 * State for one expansion session; set it up with lws_strexp_init().
 * NOTE(review): the per-member notes are inferred from the member names and
 * the API documentation in this header, not from the implementation.
 */
typedef struct lws_strexp {
	char			name[32];	/* presumably the symbol name being collected */
	lws_strexp_expand_cb	cb;		/* callback that expands named symbols */
	void			*priv;		/* user pointer passed to the callback */
	char			*out;		/* output buffer start, or NULL for length only */
	size_t			olen;		/* output buffer length in bytes */
	size_t			pos;		/* presumably the next write position in out */

	size_t			exp_ofs;	/* presumably the progress through a large expansion */

	uint8_t			name_pos;	/* presumably the fill level of name[] */
	char			state;		/* internal parser state */
} lws_strexp_t;
184
/* Status codes for lws_strexp: >= 0 is progress, < 0 is a fatal error */
enum {
	LSTRX_DONE,			/* it completed OK */
	LSTRX_FILLED_OUT,		/* out buf filled and needs resetting */
	LSTRX_FATAL_NAME_TOO_LONG = -1,	/* fatal */
	LSTRX_FATAL_NAME_UNKNOWN  = -2,	/* fatal */
};
191
192
193	/**
194	* lws_strexp_init() - initialize an lws_strexp_t for use
195	*
196	* \p exp: the exp object to init
197	* \p priv: the user's object pointer to pass to callback
198	* \p cb: the callback to expand named objects
199	* \p out: the start of the output buffer, or NULL just to get the length
200	* \p olen: the length of the output buffer in bytes
201	*
202	* Prepares an lws_strexp_t for use and sets the initial output buffer
203	*
204	* If \p out is NULL, substitution proceeds normally, but no output is produced,
205	* only the length is returned. olen should be set to the largest feasible
206	* overall length. To use this mode, the substitution callback must also check
207	* for NULL \p out and avoid producing the output.
208	*/
209	LWS_VISIBLE LWS_EXTERN void
210	lws_strexp_init(lws_strexp_t exp, void* *priv, lws_strexp_expand_cb cb,
211	char *out, size_t olen);
212
213	/**
214	* lws_strexp_reset_out() - reset the output buffer on an existing strexp
215	*
216	* \p exp: the exp object to init
217	* \p out: the start of the output buffer, or NULL to just get length
218	* \p olen: the length of the output buffer in bytes
219	*
220	* Provides a new output buffer for lws_strexp_expand() to continue to write
221	* into. It can be the same as the old one if it has been copied out or used.
222	* The position of the next write will be reset to the start of the given buf.
223	*
224	* If \p out is NULL, substitution proceeds normally, but no output is produced,
225	* only the length is returned. \p olen should be set to the largest feasible
226	* overall length. To use this mode, the substitution callback must also check
227	* for NULL \p out and avoid producing the output.
228	*/
229	LWS_VISIBLE LWS_EXTERN void
230	lws_strexp_reset_out(lws_strexp_t exp, char* *out, size_t olen);
231
232	/**
233	* lws_strexp_expand() - copy / expand a string into the output buffer
234	*
235	* \p exp: the exp object for the copy / expansion
236	* \p in: the start of the next input data
237	* \p len: the length of the input data
238	* \p pused_in: pointer to write the amount of input used
239	* \p pused_out: pointer to write the amount of output used
240	*
241	* Copies in to the output buffer set in exp, expanding any ${name} tokens using
242	* the callback. \p *pused_in is set to the number of input chars used and
243	* \p *pused_out the number of output characters used
244	*
245	* May return LSTRX_FILLED_OUT early with *pused < len if the output buffer is
246	* filled. Handle the output buffer and reset it with lws_strexp_reset_out()
247	* before calling again with adjusted in / len to continue.
248	*
249	* In the case of large expansions, the expansion itself may fill the output
250	* buffer, in which case the expansion callback returns the LSTRX_FILLED_OUT
251	* and will be called again to continue with its *exp_ofs parameter set
252	* appropriately.
253	*/
254	LWS_VISIBLE LWS_EXTERN int
255	lws_strexp_expand(lws_strexp_t exp, const* char *in, size_t len,
256	size_t pused_in, size_t pused_out);
257
258	/**
259	* lws_strcmp_wildcard() - strcmp but the first arg can have wildcards
260	*
261	* \p wildcard: a string that may contain zero to three *, and may lack a NUL
262	* \p wlen: length of the wildcard string
263	* \p check: string to test to see if it matches wildcard
264	* \p clen: length of check string
265	*
266	* Like strcmp, but supports patterns like "a", "ab", "ab" etc
267	* where a and b are arbitrary substrings. Both the wc and check strings need
268	* not be NUL terminated, but are specified by lengths.
269	*/
270	LWS_VISIBLE LWS_EXTERN int
271	lws_strcmp_wildcard(const char wildcard, size_t wlen, const* char *check,
272	size_t clen);
273

Browse the source code of include/libwebsockets/lws-tokenize.h