Skip to content

Commit 641c699

Browse files
feat(re): Enhance CFFI re module and add tests
This commit includes several enhancements to the CFFI-based `re` module:

- Introduced `re.error` for regex-specific exceptions.
- Implemented `MatchObject` for `search()` results, providing access to captured groups via the `group()` and `groups()` methods.
- Enhanced `findall()` to correctly return:
  - `list[str]` of full matches if the pattern has no capture groups.
  - `list[str]` of captured strings if the pattern has one capture group.
  - `list[tuple[str, ...]]` if the pattern has multiple capture groups.
- Ensured `sub()` supports standard ECMAScript group references (e.g., `$1`, `$&`).
- Added comprehensive tests to `tests/test_re.py` to cover:
  - Basic matching, searching, character classes, quantifiers, and anchors.
  - Detailed behavior of `search()` with `MatchObject` and groups.
  - Detailed behavior of `findall()` with 0, 1, and multiple groups.
  - `sub()` with various group backreferences.
  - Error handling for invalid regex patterns.

The C++ backend in `native/src/regex_wrapper.cpp` was updated to support these features, including functions to get mark counts and to return structured group information for `search`. The CFFI interface in `src/stdlib/re.py` and the stub file `native/src/regex_wrapper.pyi` were updated accordingly.
1 parent 067d0a0 commit 641c699

File tree

3 files changed

+372
-50
lines changed

3 files changed

+372
-50
lines changed

native/src/regex_wrapper.cpp

Lines changed: 51 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -63,53 +63,75 @@ extern "C" {
6363

6464

6565
// Find all matches of a regex pattern in a string
66+
// Behavior depends on the number of capture groups in the regex:
67+
// 0 groups: returns list of full matches.
68+
// 1 group: returns list of strings for that group.
69+
// >1 groups: returns list of strings, where each string is a concatenation of all captured groups for a match, delimited by SOH (\x01).
6670
extern "C" char** findall_pattern(int id, const char* text) {
6771
auto it = regex_cache.find(id);
6872
if (it == regex_cache.end()) {
69-
return nullptr; // Return nullptr if the ID is not found
73+
return nullptr;
7074
}
7175

72-
std::string str(text);
73-
std::smatch match;
74-
std::vector<std::string> matches;
75-
std::string::const_iterator searchStart(str.cbegin());
76+
std::shared_ptr<std::regex> re = it->second;
77+
size_t num_groups = re->mark_count(); // Number of capture groups
78+
79+
std::string s_text(text);
80+
std::vector<std::string> collected_matches;
81+
auto search_start = s_text.cbegin();
82+
std::smatch current_match;
83+
84+
while (std::regex_search(search_start, s_text.cend(), current_match, *re)) {
85+
if (num_groups == 0) { // No groups, return full match
86+
collected_matches.push_back(current_match[0].str());
87+
} else if (num_groups == 1) { // One group, return group 1
88+
collected_matches.push_back(current_match[1].str());
89+
} else { // More than one group
90+
std::string combined_groups_str;
91+
for (size_t i = 1; i <= num_groups; ++i) { // Iterate from group 1 to num_groups
92+
combined_groups_str += current_match[i].str();
93+
if (i < num_groups) {
94+
combined_groups_str += '\x01'; // Delimiter
95+
}
96+
}
97+
collected_matches.push_back(combined_groups_str);
98+
}
99+
search_start = current_match.suffix().first;
100+
if (search_start == s_text.cbegin() && current_match[0].length() == 0) {
101+
// Handle empty match at the beginning of the remaining string to avoid infinite loop.
102+
// This can happen with patterns like "a*". Advance by one character.
103+
if (search_start != s_text.cend()) {
104+
search_start++;
105+
} else {
106+
break; // Reached end of string
107+
}
108+
}
76109

77-
// Find all matches
78-
while (std::regex_search(searchStart, str.cend(), match, *it->second)) {
79-
matches.push_back(match.str()); // Store each match in the vector
80-
searchStart = match.suffix().first;
81110
}
82111

83-
if (matches.empty()) {
84-
return nullptr; // Return nullptr if no matches are found
112+
if (collected_matches.empty()) {
113+
return nullptr;
85114
}
86115

87-
// Allocate an array of char* to hold the matches
88-
char** result = (char**)malloc((matches.size() + 1) * sizeof(char*));
89-
if (!result) {
90-
return nullptr; // Return nullptr if memory allocation fails
116+
char** result_array = (char**)malloc((collected_matches.size() + 1) * sizeof(char*));
117+
if (!result_array) {
118+
return nullptr;
91119
}
92120

93-
// Copy each match into the array
94-
for (size_t i = 0; i < matches.size(); ++i) {
95-
result[i] = strdup(matches[i].c_str()); // Duplicate the string
96-
if (!result[i]) {
97-
// Free previously allocated memory if strdup fails
98-
for (size_t j = 0; j < i; ++j) {
99-
free(result[j]);
100-
}
101-
free(result);
121+
for (size_t i = 0; i < collected_matches.size(); ++i) {
122+
result_array[i] = strdup(collected_matches[i].c_str());
123+
if (!result_array[i]) {
124+
for (size_t j = 0; j < i; ++j) free(result_array[j]);
125+
free(result_array);
102126
return nullptr;
103127
}
104128
}
129+
result_array[collected_matches.size()] = nullptr; // Null-terminate
105130

106-
// Null-terminate the array
107-
result[matches.size()] = nullptr;
108-
109-
return result;
131+
return result_array;
110132
}
111133

112-
// Function to free the allocated memory for the matches
134+
// Function to free the allocated memory for the matches (used by findall_pattern)
113135
extern "C" void free_matches(char** matches) {
114136
if (!matches) {
115137
return;

src/stdlib/re.py

Lines changed: 23 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -56,20 +56,29 @@ def search(self, text: str) -> str | None:
5656
def findall(self, text: str) -> list[str]:
5757
# Find all matches of the compiled regex in the text
5858
matches_ptr = lib.findall_pattern(self.id, text.encode("utf-8")) # type: ignore
59-
matches = []
60-
if matches_ptr:
61-
try:
62-
# Convert the array of C strings to a Python list
63-
i = 0
64-
while matches_ptr[i]:
65-
matches.append(
66-
cast(bytes, ffi.string(matches_ptr[i])).decode("utf-8")
67-
)
68-
i += 1
69-
finally:
70-
# Free the allocated memory
71-
lib.free_matches(matches_ptr)
72-
return matches
59+
if not matches_ptr:
60+
return []
61+
62+
results = []
63+
try:
64+
i = 0
65+
while matches_ptr[i]:
66+
item_bytes = cast(bytes, ffi.string(matches_ptr[i]))
67+
item_str = item_bytes.decode("utf-8")
68+
69+
if self.mark_count > 1:
70+
# Split the string by the delimiter \x01 to get the tuple of groups
71+
# Ensure that an empty string trailing a delimiter is preserved, e.g. "a\x01" -> ("a", "")
72+
# The `split` method handles this correctly by default.
73+
results.append(tuple(item_str.split('\x01')))
74+
else:
75+
# For mark_count == 0 (full match) or 1 (group 1 content),
76+
# the string is the match itself or group 1.
77+
results.append(item_str)
78+
i += 1
79+
finally:
80+
lib.free_matches(matches_ptr) # type: ignore
81+
return results
7382

7483
def sub(self, replacement: str, text: str) -> str:
7584
# Substitute all occurrences of the compiled regex in the text

0 commit comments

Comments
 (0)