-
Notifications
You must be signed in to change notification settings - Fork 134
Expand file tree
/
Copy pathtokenizer.py
More file actions
84 lines (77 loc) · 2.39 KB
/
Copy pathtokenizer.py
File metadata and controls
84 lines (77 loc) · 2.39 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
"""
ldap.schema.tokenizer - Low-level parsing functions for schema element strings
See https://www.python-ldap.org/ for details.
"""
import re
# Alternatives of the schema tokenizer, listed in match-priority order.
# Every alternative is exactly one capturing group, so findall() yields
# 5-tuples (opar, cpar, unquoted, quoted, residue) in which only the group
# of the alternative that matched is non-empty.
_TOKEN_ALTERNATIVES = (
    r"(\()",  # opening parenthesis
    r"(\))",  # closing parenthesis
    r"([^'$()\s]+)",  # one or more chars, none of them ' $ ( ) or whitespace
    # Possibly empty string enclosed in single quotes. A backslash escapes
    # the character following it, and the closing quote must not be directly
    # followed by a word character.
    r"('(?:[^'\\]|\\.)*'(?!\w))",
    r"([^\s]+?)",  # residue: any remaining non-whitespace, one char per match
)

# Bound findall() of the compiled tokenizer expression.
TOKENS_FINDALL = re.compile("|".join(_TOKEN_ALTERNATIVES)).findall

# Matches a backslash escape; group 1 is the escaped character.
UNESCAPE_PATTERN = re.compile(r"\\(.)")
def split_tokens(s):
    """
    Returns list of syntax elements with quotes and spaces stripped.

    s
        Schema element string to be split.

    The result contains, in input order: "(" and ")" as separate items,
    unquoted words as-is, and single-quoted strings with the enclosing
    quotes removed and backslash escapes resolved. Whitespace and the
    '$' list separator are not part of the result.

    Raises ValueError if '$' appears outside of parentheses, if the
    parentheses are not balanced, or if the string contains a character
    sequence which is not a valid token.
    """
    parts = []
    parens = 0  # current parenthesis nesting depth
    for opar, cpar, unquoted, quoted, residue in TOKENS_FINDALL(s):
        if unquoted:
            parts.append(unquoted)
        elif quoted:
            # Strip enclosing quotes, then turn each \x into x
            parts.append(UNESCAPE_PATTERN.sub(r'\1', quoted[1:-1]))
        elif opar:
            parens += 1
            parts.append(opar)
        elif cpar:
            if not parens:
                # Closing parenthesis without matching opening one. Must be
                # rejected here: a later "(" would bring the counter back
                # to zero and hide the error from the final check.
                raise ValueError("Unbalanced parenthesis in %r" % (s,))
            parens -= 1
            parts.append(cpar)
        elif residue == '$':
            # '$' is only a separator within a list; it is checked for
            # position but deliberately not added to the result
            if not parens:
                raise ValueError("'$' outside parenthesis in %r" % (s,))
        else:
            raise ValueError(residue, s)
    if parens:
        raise ValueError("Unbalanced parenthesis in %r" % (s,))
    return parts
def extract_tokens(l,known_tokens):
    """
    Returns dictionary of known tokens with all values

    l
        List of the syntax elements of a single schema element. The first
        item must be "(" and the last item must be ")".
    known_tokens
        Dictionary with the recognized token names as keys and their
        default values. It is copied and never modified.

    Every known token found in l is mapped to a tuple: the empty tuple
    if the token has no value, a 1-tuple if it has a single value, or a
    tuple of all items of a parenthesized list with the '$' separators
    removed. Known tokens not found in l keep their default value. Items
    which are neither a known token nor the value of one are skipped.

    Raises AssertionError if l is not enclosed in parentheses (the check
    is not executed when Python runs with -O).
    """
    # NOTE: deliberately left as an assert so that callers keep seeing the
    # same exception type as before.
    assert l[0].strip()=="(" and l[-1].strip()==")",ValueError(l)
    result = dict(known_tokens)
    i = 0
    l_len = len(l)
    while i < l_len:
        token = l[i]
        i += 1  # Consume token or unrecognized item
        if token not in result or i >= l_len:
            continue
        following = l[i]
        if following in result or following == ")":
            # non-valued: directly followed by the next token or by the
            # closing parenthesis of the schema element, which must not be
            # mistaken for the token's value. The following item is not
            # consumed here, the next iteration deals with it.
            result[token] = ()
        elif following == "(":
            # multi-valued
            i += 1  # Consume left parentheses
            start = i
            while i < l_len and l[i] != ")":
                i += 1
            result[token] = tuple(v for v in l[start:i] if v != '$')
            i += 1  # Consume right parentheses
        else:
            # single-valued
            result[token] = (following,)
            i += 1  # Consume single value
    return result