Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
80 changes: 51 additions & 29 deletions native/src/regex_wrapper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -63,53 +63,75 @@ extern "C" {


// Find all non-overlapping matches of a cached regex pattern in a string.
//
// The content of each returned string depends on the number of capture
// groups in the regex:
//   0 groups:  the full match.
//   1 group:   the text captured by that group.
//   >1 groups: every captured group of the match, in group order, joined by
//              SOH (\x01). A group that did not participate contributes an
//              empty string, so each entry always has (groups - 1) delimiters.
//
// Parameters:
//   id   - key of a compiled pattern in regex_cache.
//   text - NUL-terminated subject string; nullptr is treated as "no matches".
//
// Returns a malloc'ed, nullptr-terminated array of strdup'ed C strings that
// the caller releases with free_matches(). Returns nullptr when the id is
// unknown, text is nullptr, nothing matched, an allocation failed, or the
// regex engine raised an exception.
extern "C" char** findall_pattern(int id, const char* text) {
    if (text == nullptr) {
        return nullptr;
    }

    auto it = regex_cache.find(id);
    if (it == regex_cache.end()) {
        return nullptr;
    }

    // Hold our own reference so the regex stays alive for the whole scan.
    std::shared_ptr<std::regex> re = it->second;
    const size_t num_groups = re->mark_count();  // Number of capture groups
    const char group_delimiter = '\x01';

    std::vector<std::string> collected_matches;

    // No C++ exception may cross the extern "C" boundary, so anything thrown
    // while matching (e.g. std::regex_error, std::bad_alloc) becomes nullptr.
    try {
        const std::string subject(text);
        const std::sregex_iterator no_more_matches;

        // std::sregex_iterator always makes progress after a zero-length
        // match (patterns such as "a*") and sets match_prev_avail on every
        // search after the first, so '^' and '\b' are not re-evaluated as if
        // each resume point were the start of the subject.
        for (std::sregex_iterator cur(subject.cbegin(), subject.cend(), *re);
             cur != no_more_matches; ++cur) {
            const std::smatch& current_match = *cur;

            if (num_groups == 0) {
                collected_matches.push_back(current_match[0].str());
            } else if (num_groups == 1) {
                collected_matches.push_back(current_match[1].str());
            } else {
                std::string combined_groups;
                for (size_t g = 1; g <= num_groups; ++g) {
                    if (g > 1) {
                        combined_groups += group_delimiter;
                    }
                    combined_groups += current_match[g].str();
                }
                collected_matches.push_back(combined_groups);
            }
        }
    } catch (...) {
        return nullptr;
    }

    if (collected_matches.empty()) {
        return nullptr;
    }

    // malloc/strdup (not new) because the result is handed to C callers and
    // is released through free_matches().
    const size_t count = collected_matches.size();
    char** result_array = static_cast<char**>(malloc((count + 1) * sizeof(char*)));
    if (!result_array) {
        return nullptr;
    }

    for (size_t i = 0; i < count; ++i) {
        result_array[i] = strdup(collected_matches[i].c_str());
        if (!result_array[i]) {
            // Roll back everything duplicated so far; nothing may leak.
            for (size_t j = 0; j < i; ++j) {
                free(result_array[j]);
            }
            free(result_array);
            return nullptr;
        }
    }
    result_array[count] = nullptr;  // Null-terminate so callers can find the end

    return result_array;
}

// Function to free the allocated memory for the matches
// Function to free the allocated memory for the matches (used by findall_pattern)
extern "C" void free_matches(char** matches) {
if (!matches) {
return;
Expand Down
37 changes: 23 additions & 14 deletions src/stdlib/re.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,20 +56,29 @@ def search(self, text: str) -> str | None:
def findall(self, text: str) -> list[str]:
    """Return every match of the compiled regex in ``text``.

    The element type follows ``self.mark_count``:

    * ``mark_count`` of 0 or 1: each element is a ``str`` exactly as returned
      by the native ``findall_pattern`` call.
    * ``mark_count`` greater than 1: each element is a ``tuple`` of ``str``
      obtained by splitting the native string on the ``\\x01`` delimiter.

    An empty list is returned when the native call yields a null pointer.

    NOTE(review): the ``list[str]`` annotation does not cover the tuple case
    above; confirm with callers before widening it.
    NOTE(review): a captured group that itself contains ``\\x01`` would be
    split into extra tuple fields; the delimiter protocol cannot express it.
    """
    matches_ptr = lib.findall_pattern(self.id, text.encode("utf-8"))  # type: ignore
    if not matches_ptr:
        return []

    results = []
    try:
        # Walk the array until its terminating null entry.
        i = 0
        while matches_ptr[i]:
            item_bytes = cast(bytes, ffi.string(matches_ptr[i]))
            item_str = item_bytes.decode("utf-8")

            if self.mark_count > 1:
                # str.split keeps empty fields, e.g. "a\x01" -> ("a", ""),
                # so empty groups stay in their positions.
                results.append(tuple(item_str.split("\x01")))
            else:
                results.append(item_str)
            i += 1
    finally:
        # Always hand the native array back, even if decoding raised.
        lib.free_matches(matches_ptr)  # type: ignore
    return results

def sub(self, replacement: str, text: str) -> str:
# Substitute all occurrences of the compiled regex in the text
Expand Down
Loading
Loading