Skip to content

Commit adb6b47

Browse files
author
stroeder
committed
faster implementation of ldap.schema.tokenizer.split_tokens()
1 parent 909b2ea commit adb6b47

2 files changed

Lines changed: 44 additions & 42 deletions

File tree

CHANGES

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22
Released 2.4.33 2017-02-xx
33

44
Lib/
5+
* faster implementation of ldap.schema.tokenizer.split_tokens()
6+
(thanks to Christian Heimes)
57
* removed unused 2nd argument of ldap.schema.tokenizer.split_tokens()
68

79
Tests/
@@ -1345,4 +1347,4 @@ Released 2.0.0pre02 2002-02-01
13451347
----------------------------------------------------------------
13461348
Released 1.10alpha3 2000-09-19
13471349

1348-
$Id: CHANGES,v 1.411 2017/02/19 12:36:21 stroeder Exp $
1350+
$Id: CHANGES,v 1.412 2017/02/20 10:25:47 stroeder Exp $

Lib/ldap/schema/tokenizer.py

Lines changed: 41 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -3,50 +3,51 @@
33
44
See http://www.python-ldap.org/ for details.
55
6-
\$Id: tokenizer.py,v 1.14 2017/02/18 15:32:01 stroeder Exp $
6+
\$Id: tokenizer.py,v 1.15 2017/02/20 10:25:47 stroeder Exp $
77
"""
88

9+
import re
10+
11+
TOKENS_FINDALL = re.compile(
12+
r"(\()" # opening parenthesis
13+
r"|" # or
14+
r"(\))" # closing parenthesis
15+
r"|" # or
16+
r"([^'$()\s]+)" # string of length >= 1 without '$() or whitespace
17+
r"|" # or
18+
r"('.*?'(?!\w))" # any string or empty string surrounded by single quotes
19+
# (a quote followed by a word character, \w, does not close the string)
20+
r"|" # or
21+
r"([^\s]+?)", # residue, all non-whitespace strings
22+
).findall
23+
924

1025
def split_tokens(s):
11-
"""
12-
Returns list of syntax elements with quotes and spaces
13-
stripped.
14-
"""
15-
result = []
16-
result_append = result.append
17-
s_len = len(s)
18-
i = 0
19-
while i<s_len:
20-
start = i
21-
while i<s_len and s[i]!="'":
22-
if s[i]=="(" or s[i]==")":
23-
if i>start:
24-
result_append(s[start:i])
25-
result_append(s[i])
26-
i +=1 # Consume parentheses
27-
start = i
28-
elif s[i]==" " or s[i]=="$":
29-
if i>start:
30-
result_append(s[start:i])
31-
i +=1
32-
# Consume more space chars
33-
while i<s_len and s[i]==" ":
34-
i +=1
35-
start = i
36-
else:
37-
i +=1
38-
if i>start:
39-
result_append(s[start:i])
40-
i +=1
41-
if i>=s_len:
42-
break
43-
start = i
44-
while i<s_len and s[i]!="'":
45-
i +=1
46-
if i>=start:
47-
result_append(s[start:i])
48-
i +=1
49-
return result # split_tokens()
26+
"""
27+
Returns list of syntax elements with quotes and spaces
28+
stripped.
29+
"""
30+
parts = []
31+
parens = 0
32+
for opar, cpar, unquoted, quoted, residue in TOKENS_FINDALL(s):
33+
if unquoted:
34+
parts.append(unquoted)
35+
elif quoted:
36+
parts.append(quoted[1:-1])
37+
elif opar:
38+
parens += 1
39+
parts.append(opar)
40+
elif cpar:
41+
parens -= 1
42+
parts.append(cpar)
43+
elif residue == '$':
44+
if not parens:
45+
raise ValueError("'$' outside parenthesis")
46+
else:
47+
raise ValueError(residue, s)
48+
if parens:
49+
raise ValueError('Unbalanced parenthesis in %r' % s)
50+
return parts
5051

5152

5253
def extract_tokens(l,known_tokens):
@@ -82,4 +83,3 @@ def extract_tokens(l,known_tokens):
8283
else:
8384
i += 1 # Consume unrecognized item
8485
return result
85-

0 commit comments

Comments
 (0)