|
3 | 3 |
|
4 | 4 | See http://www.python-ldap.org/ for details. |
5 | 5 |
|
6 | | -\$Id: tokenizer.py,v 1.14 2017/02/18 15:32:01 stroeder Exp $ |
| 6 | +\$Id: tokenizer.py,v 1.15 2017/02/20 10:25:47 stroeder Exp $ |
7 | 7 | """ |
8 | 8 |
|
| 9 | +import re |
| 10 | + |
# Scanner for schema description strings.  Bound directly to ``findall`` so
# each match is a 5-tuple (opar, cpar, unquoted, quoted, residue) in which
# exactly one element is non-empty, because every alternative consumes at
# least one character.
TOKENS_FINDALL = re.compile(
    r"(\()"            # opening parenthesis
    r"|"               # or
    r"(\))"            # closing parenthesis
    r"|"               # or
    r"([^'$()\s]+)"    # string of length >= 1 without '$() or whitespace
    r"|"               # or
    r"('.*?'(?!\w))"   # any string or empty string surrounded by single quotes
                       # except if right quote is succeeded by alphanumeric char
    r"|"               # or
    r"([^\s]+?)",      # residue, all non-whitespace strings
).findall


def split_tokens(s):
    """
    Returns list of syntax elements with quotes and spaces
    stripped.

    Parentheses are returned as separate '(' and ')' items, quoted
    strings are returned without their surrounding single quotes (an
    empty quoted string yields ''), and '$' separators are dropped.

    Raises ValueError if

    * a ')' appears without a matching preceding '(' or a '(' is
      never closed,
    * a '$' separator appears outside of parentheses,
    * any other character cannot be tokenized (in practice a single
      quote which does not start a properly terminated quoted string).
    """
    parts = []
    parens = 0  # current nesting depth of parentheses
    for opar, cpar, unquoted, quoted, residue in TOKENS_FINDALL(s):
        if unquoted:
            parts.append(unquoted)
        elif quoted:
            # strip the surrounding single quotes
            parts.append(quoted[1:-1])
        elif opar:
            parens += 1
            parts.append(opar)
        elif cpar:
            parens -= 1
            if parens < 0:
                # A closing parenthesis without a matching opener must be
                # rejected right away: a later '(' would bring the counter
                # back to zero and hide the error from the final check.
                raise ValueError('Unbalanced parenthesis in %r' % (s,))
            parts.append(cpar)
        elif residue == '$':
            # '$' only separates items of a parenthesized list
            if not parens:
                raise ValueError("'$' outside parenthesis in %r" % (s,))
        else:
            # The residue group matches one character at a time, so this is
            # the first character the other alternatives could not consume.
            raise ValueError(residue, s)
    if parens:
        raise ValueError('Unbalanced parenthesis in %r' % (s,))
    return parts
50 | 51 |
|
51 | 52 |
|
52 | 53 | def extract_tokens(l,known_tokens): |
@@ -82,4 +83,3 @@ def extract_tokens(l,known_tokens): |
82 | 83 | else: |
83 | 84 | i += 1 # Consume unrecognized item |
84 | 85 | return result |
85 | | - |
0 commit comments