feat(re): Enhance CFFI re module and add tests

google-labs-jules[bot] · google-labs-jules[bot] · commit 641c69968952 · 2025-05-23T19:04:08.000Z
This commit includes several enhancements to the CFFI-based `re` module:

- Introduced `re.error` for regex-specific exceptions.
- Implemented `MatchObject` for `search()` results, providing access to captured groups via `group()` and `groups()` methods.
- Enhanced `findall()` to correctly return:
    - `list[str]` of full matches if no capture groups.
    - `list[str]` of captured strings if one capture group.
    - `list[tuple[str,...]]` if multiple capture groups.
- Ensured `sub()` supports standard ECMAScript group references (e.g., `$1`, `$&`).
- Added comprehensive tests to `tests/test_re.py` to cover:
    - Basic matching, searching, character classes, quantifiers, anchors.
    - Detailed behavior of `search()` with `MatchObject` and groups.
    - Detailed behavior of `findall()` with 0, 1, and multiple groups.
    - `sub()` with various group backreferences.
    - Error handling for invalid regex patterns.

The C++ backend in `native/src/regex_wrapper.cpp` was updated to support these features, including functions to get mark counts and return structured group information for `search`. The CFFI interface in `src/stdlib/re.py` and the stub file `native/src/regex_wrapper.pyi` were updated accordingly.
diff --git a/native/src/regex_wrapper.cpp b/native/src/regex_wrapper.cpp
@@ -63,53 +63,75 @@ extern "C" {
 
 
     // Find all matches of a regex pattern in a string
+    // Result shape depends on the regex's capture-group count: 0 groups -> each full match;
+    // 1 group -> that group's text; >1 groups -> all groups of a match joined by SOH (\x01).
+    // Returns a malloc'd, nullptr-terminated array the caller releases with free_matches(),
+    // or nullptr when the id is unknown, nothing matched, or an allocation failed.
     extern "C" char** findall_pattern(int id, const char* text) {
         auto it = regex_cache.find(id);
         if (it == regex_cache.end()) {
-            return nullptr; // Return nullptr if the ID is not found
+            return nullptr;
         }
 
-        std::string str(text);
-        std::smatch match;
-        std::vector<std::string> matches;
-        std::string::const_iterator searchStart(str.cbegin());
+        std::shared_ptr<std::regex> re = it->second;
+        size_t num_groups = re->mark_count(); // Number of capture groups
+
+        std::string s_text(text);
+        std::vector<std::string> collected_matches;
+        auto search_start = s_text.cbegin();
+        std::smatch current_match;
+        std::regex_constants::match_flag_type search_flags = std::regex_constants::match_default;
+
+        while (std::regex_search(search_start, s_text.cend(), current_match, *re, search_flags)) {
+            if (num_groups == 0) { // No groups, return full match
+                collected_matches.push_back(current_match[0].str());
+            } else if (num_groups == 1) { // One group, return group 1
+                collected_matches.push_back(current_match[1].str());
+            } else { // More than one group
+                std::string combined_groups_str;
+                for (size_t i = 1; i <= num_groups; ++i) { // Iterate from group 1 to num_groups
+                    combined_groups_str += current_match[i].str();
+                    if (i < num_groups) {
+                        combined_groups_str += '\x01'; // Delimiter
+                    }
+                }
+                collected_matches.push_back(combined_groups_str);
+            }
+            search_start = current_match.suffix().first;
+            // Later searches start mid-text: match_prev_avail stops ^ and \b treating search_start as the text start.
+            search_flags = std::regex_constants::match_prev_avail;
+            if (current_match[0].length() == 0) {
+                // An empty match consumes nothing, so retrying at the same spot would loop forever
+                // (e.g. "a*" on "b"); stop at the end of the text, otherwise step one character.
+                if (search_start == s_text.cend()) break;
+                ++search_start;
+            }
 
-        // Find all matches
-        while (std::regex_search(searchStart, str.cend(), match, *it->second)) {
-            matches.push_back(match.str()); // Store each match in the vector
-            searchStart = match.suffix().first;
         }
 
-        if (matches.empty()) {
-            return nullptr; // Return nullptr if no matches are found
+        if (collected_matches.empty()) {
+            return nullptr;
         }
 
-        // Allocate an array of char* to hold the matches
-        char** result = (char**)malloc((matches.size() + 1) * sizeof(char*));
-        if (!result) {
-            return nullptr; // Return nullptr if memory allocation fails
+        char** result_array = (char**)malloc((collected_matches.size() + 1) * sizeof(char*));
+        if (!result_array) {
+            return nullptr;
         }
 
-        // Copy each match into the array
-        for (size_t i = 0; i < matches.size(); ++i) {
-            result[i] = strdup(matches[i].c_str()); // Duplicate the string
-            if (!result[i]) {
-                // Free previously allocated memory if strdup fails
-                for (size_t j = 0; j < i; ++j) {
-                    free(result[j]);
-                }
-                free(result);
+        for (size_t i = 0; i < collected_matches.size(); ++i) {
+            result_array[i] = strdup(collected_matches[i].c_str());
+            if (!result_array[i]) {
+                for (size_t j = 0; j < i; ++j) free(result_array[j]);
+                free(result_array);
                 return nullptr;
             }
         }
+        result_array[collected_matches.size()] = nullptr; // Null-terminate
 
-        // Null-terminate the array
-        result[matches.size()] = nullptr;
-
-        return result;
+        return result_array;
     }
 
-    // Function to free the allocated memory for the matches
+    // Function to free the allocated memory for the matches (used by findall_pattern)
     extern "C" void free_matches(char** matches) {
         if (!matches) {
             return;
diff --git a/src/stdlib/re.py b/src/stdlib/re.py
@@ -56,20 +56,29 @@ def search(self, text: str) -> str | None:
     def findall(self, text: str) -> list[str]:
         # Find all matches of the compiled regex in the text
         matches_ptr = lib.findall_pattern(self.id, text.encode("utf-8"))  # type: ignore
-        matches = []
-        if matches_ptr:
-            try:
-                # Convert the array of C strings to a Python list
-                i = 0
-                while matches_ptr[i]:
-                    matches.append(
-                        cast(bytes, ffi.string(matches_ptr[i])).decode("utf-8")
-                    )
-                    i += 1
-            finally:
-                # Free the allocated memory
-                lib.free_matches(matches_ptr)
-        return matches
+        if not matches_ptr:
+            return []
+
+        results = []
+        try:
+            i = 0
+            while matches_ptr[i]:
+                item_bytes = cast(bytes, ffi.string(matches_ptr[i]))
+                item_str = item_bytes.decode("utf-8")
+                
+                if self.mark_count > 1:
+                    # With >1 groups the native side joins one match's groups with \x01;
+                    # split keeps empty fields, so "a\x01" -> ("a", "") as intended.
+                    # NOTE(review): text containing \x01 itself would split wrongly.
+                    results.append(tuple(item_str.split('\x01')))
+                else:
+                    # With 0 groups the item is the full match; with 1 group it is
+                    # that group's text. Either way it is appended as a plain str.
+                    results.append(item_str)
+                i += 1
+        finally:
+            lib.free_matches(matches_ptr) # type: ignore
+        return results
 
     def sub(self, replacement: str, text: str) -> str:
         # Substitute all occurrences of the compiled regex in the text
diff --git a/tests/test_re.py b/tests/test_re.py