faster implementation of ldap.schema.tokenizer.split_tokens()

stroeder · stroeder · commit adb6b471507a · 2017-02-20T10:25:47.000Z
diff --git a/CHANGES b/CHANGES
@@ -2,6 +2,8 @@
 Released 2.4.33 2017-02-xx
 
 Lib/
+* faster implementation of ldap.schema.tokenizer.split_tokens()
+  (thanks to Christian Heimes)
 * removed unused 2nd argument of ldap.schema.tokenizer.split_tokens()
 
 Tests/
@@ -1345,4 +1347,4 @@ Released 2.0.0pre02 2002-02-01
 ----------------------------------------------------------------
 Released 1.10alpha3 2000-09-19
 
-$Id: CHANGES,v 1.411 2017/02/19 12:36:21 stroeder Exp $
+$Id: CHANGES,v 1.412 2017/02/20 10:25:47 stroeder Exp $
diff --git a/Lib/ldap/schema/tokenizer.py b/Lib/ldap/schema/tokenizer.py
@@ -3,50 +3,51 @@
 
 See http://www.python-ldap.org/ for details.
 
-\$Id: tokenizer.py,v 1.14 2017/02/18 15:32:01 stroeder Exp $
+\$Id: tokenizer.py,v 1.15 2017/02/20 10:25:47 stroeder Exp $
 """
 
+import re
+
+TOKENS_FINDALL = re.compile(
+    r"(\()"           # opening parenthesis
+    r"|"              # or
+    r"(\))"           # closing parenthesis
+    r"|"              # or
+    r"([^'$()\s]+)"   # string of length >= 1 without '$() or whitespace
+    r"|"              # or
+    r"('.*?'(?!\w))"  # any string or empty string surrounded by single quotes
+                      # unless the closing quote is followed by a word char (\w)
+    r"|"              # or
+    r"([^\s]+?)",     # residue: one leftover non-whitespace char (e.g. '$')
+).findall
+
 
 def split_tokens(s):
-  """
-  Returns list of syntax elements with quotes and spaces
-  stripped.
-  """
-  result = []
-  result_append = result.append
-  s_len = len(s)
-  i = 0
-  while i<s_len:
-    start = i
-    while i<s_len and s[i]!="'":
-      if s[i]=="(" or s[i]==")":
-        if i>start:
-          result_append(s[start:i])
-        result_append(s[i])
-        i +=1 # Consume parentheses
-        start = i
-      elif s[i]==" " or s[i]=="$":
-        if i>start:
-          result_append(s[start:i])
-        i +=1
-        # Consume more space chars
-        while i<s_len and s[i]==" ":
-          i +=1
-        start = i
-      else:
-        i +=1
-    if i>start:
-      result_append(s[start:i])
-    i +=1
-    if i>=s_len:
-      break
-    start = i
-    while i<s_len and s[i]!="'":
-      i +=1
-    if i>=start:
-      result_append(s[start:i])
-    i +=1
-  return result # split_tokens()
+    """
+    Returns list of syntax elements with quotes and spaces
+    stripped.
+    """
+    parts = []
+    parens = 0
+    for opar, cpar, unquoted, quoted, residue in TOKENS_FINDALL(s):
+        if unquoted:
+            parts.append(unquoted)
+        elif quoted:
+            parts.append(quoted[1:-1])
+        elif opar:
+            parens += 1
+            parts.append(opar)
+        elif cpar:
+            parens -= 1
+            parts.append(cpar)
+        elif residue == '$':
+            if not parens:
+                raise ValueError("'$' outside parenthesis")
+        else:
+            raise ValueError(residue, s)
+    if parens:
+        raise ValueError('Unbalanced parenthesis in %r' % s)
+    return parts
 
 
 def extract_tokens(l,known_tokens):
@@ -82,4 +83,3 @@ def extract_tokens(l,known_tokens):
     else:
       i += 1 # Consume unrecognized item
   return result
-