diff --git a/.gitignore b/.gitignore index 09a178b1d..40d7ea4f3 100644 --- a/.gitignore +++ b/.gitignore @@ -8,6 +8,9 @@ script.R *~ \#*\# +*.o +*.so + *.Rcheck lintr_*.tar.gz testthat-problems.rds diff --git a/DESCRIPTION b/DESCRIPTION index 873b05caf..a7dc63be7 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -83,6 +83,7 @@ Collate: 'expect_type_linter.R' 'extract.R' 'extraction_operator_linter.R' + 'fixed_regex_linter.R' 'function_left_parentheses.R' 'get_source_expressions.R' 'ids_with_token.R' diff --git a/NAMESPACE b/NAMESPACE index b0a6e945f..1ba306723 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -55,6 +55,7 @@ export(expect_s4_class_linter) export(expect_true_false_linter) export(expect_type_linter) export(extraction_operator_linter) +export(fixed_regex_linter) export(function_left_parentheses_linter) export(get_source_expressions) export(ids_with_token) @@ -123,3 +124,4 @@ importFrom(utils,relist) importFrom(utils,tail) importFrom(xml2,as_list) importFrom(xml2,xml_find_all) +useDynLib(lintr, .registration = TRUE, .fixes = "lintr_") diff --git a/R/fixed_regex_linter.R b/R/fixed_regex_linter.R new file mode 100644 index 000000000..1db39b51c --- /dev/null +++ b/R/fixed_regex_linter.R @@ -0,0 +1,108 @@ +#' Require usage of `fixed=TRUE` in regular expressions where appropriate +#' +#' Invoking a regular expression engine is overkill for cases when the search +#' pattern only involves static patterns. +#' +#' NB: for `stringr` functions, that means wrapping the pattern in `stringr::fixed()`. +#' +#' NB: This linter is likely not able to distinguish every possible case when +#' a fixed regular expression is preferable, rather it seeks to identify +#' likely cases. It should _never_ report false positives, however; please +#' report false positives as an error. +#' +#' @evalRd rd_tags("fixed_regex_linter") +#' @seealso [linters] for a complete list of linters available in lintr. +#' @export +fixed_regex_linter <- function() { + Linter(function(source_expression) { + if (length(source_expression$xml_parsed_content) == 0L) { + return(list()) + } + + xml <- source_expression$xml_parsed_content + + # regular expression pattern is the first argument + pos_1_regex_funs <- xp_text_in_table(c( + "grep", "gsub", "sub", "regexec", "grepl", "regexpr", "gregexpr" + )) + + # regular expression pattern is the second argument + pos_2_regex_funs <- xp_text_in_table(c( + "strsplit", "tstrsplit", + # stringr functions. even though the user action is different + # (setting fixed=TRUE vs. 
wrapping stringr::fixed()), + # detection of the lint is the same + "str_count", "str_detect", "str_ends", "str_extract", "str_extract_all", + "str_locate", "str_locate_all", "str_match", "str_match_all", + "str_remove", "str_remove_all", "str_replace", "str_replace_all", + "str_split", "str_starts", "str_subset", + "str_view", "str_view_all", "str_which" + )) + + # NB: strsplit doesn't have an ignore.case argument + # NB: we intentionally exclude cases like gsub(x, c("a" = "b")), where "b" is fixed + xpath <- glue::glue("//expr[ + SYMBOL_FUNCTION_CALL[ {pos_1_regex_funs} ] + and not(following-sibling::SYMBOL_SUB[ + (text() = 'fixed' or text() = 'ignore.case') + and following-sibling::expr[1][NUM_CONST[text() = 'TRUE'] or SYMBOL[text() = 'T']] + ]) + ] + /following-sibling::expr[1][STR_CONST and not(EQ_SUB)] + | + //expr[ + SYMBOL_FUNCTION_CALL[ {pos_2_regex_funs} ] + and not(following-sibling::SYMBOL_SUB[ + text() = 'fixed' + and following-sibling::expr[1][NUM_CONST[text() = 'TRUE'] or SYMBOL[text() = 'T']] + ]) + ] + /following-sibling::expr[2][STR_CONST and not(EQ_SUB)] + ") + + patterns <- xml2::xml_find_all(xml, xpath) + + return(lapply( + patterns[is_not_regex(xml2::xml_text(patterns))], + xml_nodes_to_lint, + source_expression = source_expression, + lint_message = paste( + "For static regular expression patterns, set `fixed = TRUE`.", + "Note that this includes regular expressions that can be expressed as", + "fixed patterns, e.g. [.] is really just . and \\$ is really just $", + "if there are no other regular expression specials. For functions from", + "the 'stringr' package, the way to declare a static string is to", + "wrap the pattern in stringr::fixed().", + "If this is being used in a dbplyr context (i.e., translated to sql),", + "replace the regular expression with the `LIKE` operator using the", + "`%LIKE%` infix function.", + "Lastly, take care to remember that the `replacement` argument of", + "`gsub()` is affected by the `fixed` argument as well." + ), + type = "warning" + )) + }) +} + +#' Determine whether a regex pattern actually uses regex patterns +#' +#' Note that this applies to the strings that are found on the XML parse tree, +#' _not_ plain strings. This is important for backslash escaping, which +#' happens at different layers of escaping than one might expect. So testing +#' this function is best done through testing the expected results of a lint +#' on a given file, rather than passing strings to this function, which can +#' be confusing. +#' +#' NB: Tried implementing this at the R level, but the backslash escaping was +#' becoming nightmarish -- after changing to a character-based approach in R, +#' the code looked 95% similar to what it would look like in C++, so moved +#' the logic there to get the efficiency boost as well. +#' +#' @param str A character vector. +#' @return A logical vector, `TRUE` wherever `str` could be replaced by a +#' string with `fixed = TRUE`.
+#' @noRd +#' @useDynLib lintr, .registration = TRUE, .fixes = "lintr_" +is_not_regex <- function(str, skip_start = FALSE, skip_end = FALSE) { + .Call(lintr_is_not_regex, str, skip_start, skip_end) +} diff --git a/inst/lintr/linters.csv b/inst/lintr/linters.csv index 9277bffec..06df7206a 100644 --- a/inst/lintr/linters.csv +++ b/inst/lintr/linters.csv @@ -26,6 +26,7 @@ expect_s4_class_linter,package_development best_practices expect_true_false_linter,package_development best_practices readability expect_type_linter,package_development best_practices extraction_operator_linter,style best_practices +fixed_regex_linter,best_practices readability efficiency function_left_parentheses_linter,style readability default ifelse_censor_linter,best_practices efficiency implicit_integer_linter,style consistency best_practices diff --git a/man/best_practices_linters.Rd b/man/best_practices_linters.Rd index 5f4d61347..d3321cf0e 100644 --- a/man/best_practices_linters.Rd +++ b/man/best_practices_linters.Rd @@ -30,6 +30,7 @@ The following linters are tagged with 'best_practices': \item{\code{\link{expect_true_false_linter}}} \item{\code{\link{expect_type_linter}}} \item{\code{\link{extraction_operator_linter}}} +\item{\code{\link{fixed_regex_linter}}} \item{\code{\link{ifelse_censor_linter}}} \item{\code{\link{implicit_integer_linter}}} \item{\code{\link{literal_coercion_linter}}} diff --git a/man/efficiency_linters.Rd b/man/efficiency_linters.Rd index fd1a9165c..bd3c36cd9 100644 --- a/man/efficiency_linters.Rd +++ b/man/efficiency_linters.Rd @@ -14,6 +14,7 @@ The following linters are tagged with 'efficiency': \itemize{ \item{\code{\link{any_duplicated_linter}}} \item{\code{\link{any_is_na_linter}}} +\item{\code{\link{fixed_regex_linter}}} \item{\code{\link{ifelse_censor_linter}}} \item{\code{\link{inner_combine_linter}}} \item{\code{\link{literal_coercion_linter}}} diff --git a/man/fixed_regex_linter.Rd b/man/fixed_regex_linter.Rd new file mode 100644 index 000000000..12f508dc9 --- /dev/null +++ b/man/fixed_regex_linter.Rd @@ -0,0 +1,26 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/fixed_regex_linter.R +\name{fixed_regex_linter} +\alias{fixed_regex_linter} +\title{Require usage of \code{fixed=TRUE} in regular expressions where appropriate} +\usage{ +fixed_regex_linter() +} +\description{ +Invoking a regular expression engine is overkill for cases when the search +pattern only involves static patterns. +} +\details{ +NB: for \code{stringr} functions, that means wrapping the pattern in \code{stringr::fixed()}. + +NB: This linter is likely not able to distinguish every possible case when +a fixed regular expression is preferable, rather it seeks to identify +likely cases. It should \emph{never} report false positives, however; please +report false positives as an error. +} +\seealso{ +\link{linters} for a complete list of linters available in lintr. +} +\section{Tags}{ +\link[=best_practices_linters]{best_practices}, \link[=efficiency_linters]{efficiency}, \link[=readability_linters]{readability} +} diff --git a/man/linters.Rd b/man/linters.Rd index 9f58fce23..a98e19e98 100644 --- a/man/linters.Rd +++ b/man/linters.Rd @@ -17,16 +17,16 @@ see also \code{\link[=available_tags]{available_tags()}}. 
\section{Tags}{ The following tags exist: \itemize{ -\item{\link[=best_practices_linters]{best_practices} (35 linters)} +\item{\link[=best_practices_linters]{best_practices} (36 linters)} \item{\link[=common_mistakes_linters]{common_mistakes} (6 linters)} \item{\link[=configurable_linters]{configurable} (19 linters)} \item{\link[=consistency_linters]{consistency} (16 linters)} \item{\link[=correctness_linters]{correctness} (7 linters)} \item{\link[=default_linters]{default} (24 linters)} \item{\link[=deprecated_linters]{deprecated} (4 linters)} -\item{\link[=efficiency_linters]{efficiency} (14 linters)} +\item{\link[=efficiency_linters]{efficiency} (15 linters)} \item{\link[=package_development_linters]{package_development} (14 linters)} -\item{\link[=readability_linters]{readability} (35 linters)} +\item{\link[=readability_linters]{readability} (36 linters)} \item{\link[=robustness_linters]{robustness} (12 linters)} \item{\link[=style_linters]{style} (35 linters)} } @@ -61,6 +61,7 @@ The following linters exist: \item{\code{\link{expect_true_false_linter}} (tags: best_practices, package_development, readability)} \item{\code{\link{expect_type_linter}} (tags: best_practices, package_development)} \item{\code{\link{extraction_operator_linter}} (tags: best_practices, style)} +\item{\code{\link{fixed_regex_linter}} (tags: best_practices, efficiency, readability)} \item{\code{\link{function_left_parentheses_linter}} (tags: default, readability, style)} \item{\code{\link{ifelse_censor_linter}} (tags: best_practices, efficiency)} \item{\code{\link{implicit_integer_linter}} (tags: best_practices, consistency, style)} diff --git a/man/readability_linters.Rd b/man/readability_linters.Rd index 91c38b71b..80483a5bf 100644 --- a/man/readability_linters.Rd +++ b/man/readability_linters.Rd @@ -23,6 +23,7 @@ The following linters are tagged with 'readability': \item{\code{\link{expect_named_linter}}} \item{\code{\link{expect_not_linter}}} \item{\code{\link{expect_true_false_linter}}} +\item{\code{\link{fixed_regex_linter}}} \item{\code{\link{function_left_parentheses_linter}}} \item{\code{\link{infix_spaces_linter}}} \item{\code{\link{inner_combine_linter}}} diff --git a/src/fixed_regex.c b/src/fixed_regex.c new file mode 100644 index 000000000..24ad8f2f4 --- /dev/null +++ b/src/fixed_regex.c @@ -0,0 +1,311 @@ +#include "fixed_regex.h" +#include +#include + +#ifndef LOGICAL_RO +#define LOGICAL_RO LOGICAL +#endif + +SEXP is_not_regex(SEXP patterns, SEXP skip_start, SEXP skip_end) { + if (TYPEOF(patterns) != STRSXP) { + Rf_error("Internal error: patterns should be a character vector, got %s", + Rf_type2char(TYPEOF(patterns))); + } + if (TYPEOF(skip_start) != LGLSXP || TYPEOF(skip_end) != LGLSXP) { + Rf_error("Internal error: skip_start and skip_end must be logical"); + } + + const int n_patterns = Rf_length(patterns); + const int n_starts = Rf_length(skip_start); + + if (n_starts != 1 && n_starts != n_patterns) { + Rf_error( + "Internal error: skip_start should be a scalar or same length as " + "patterns"); + } + // NB: this precludes length(skip_end)=1, length(skip_start)=N, + // but that flexibility doesn't do anything for us as of now. 
+ if (n_starts != Rf_length(skip_end)) { + Rf_error("Internal error: skip_start & skip_end must be same length"); + } + + SEXP out = PROTECT(Rf_allocVector(LGLSXP, n_patterns)); + int *out_pointer = LOGICAL(out); + const int *skip_start_pointer = LOGICAL_RO(skip_start); + const int *skip_end_pointer = LOGICAL_RO(skip_end); + + for (R_len_t i = 0; i < n_patterns; ++i) { + const SEXP pattern_i = STRING_ELT(patterns, i); + + int pattern_i_size = Rf_length(pattern_i); + const char *letters = CHAR(pattern_i); + + out_pointer[i] = 1; // default value + // string as received here is the string literal as read by the R parser, + // so it will start & end with either ' or ", which we can skip. + // skip_start --> ignore the initial character, too. + // `% n_starts` mask unifies the code for the length=1 and length=n_patterns + int m = 1 + skip_start_pointer[i % n_starts]; + /* Two possible ways to be a regular expression: + * (1) Uses an unescaped special character + * (2) Uses a metacharacter like \s or \d + */ + while (m < pattern_i_size - 1 - skip_end_pointer[i % n_starts]) { + if (is_special(&letters[m])) { // case (1) + out_pointer[i] = 0; + break; + } + /* [a-z], [._], [0a] etc are regex, but [.], [*], etc. + * can be replaced by a fixed regex + */ + if (letters[m] == '[') { + int char_class_width = get_single_char_class_width(&letters[m + 1], pattern_i_size - m); + if (char_class_width) { + m += 2 + char_class_width; + continue; + } else { + out_pointer[i] = 0; + break; + } + } + /* NB: we're passing through several levels of backslash escapes here, + * among C, R, XML, and regex. Confusing. But note that \ cannot be + * the last character of a STR_CONST (it will always be interpreted as + * an escape in that case). + */ + if (letters[m] == '\\') { + int n = m + 1; + // consume backslashes + while (n < pattern_i_size && letters[n] == '\\') { + ++n; + } + if (n == pattern_i_size) { + break; // shouldn't really be possible, but if so, out[i]=TRUE + } + if ((n - m) % 4 == 2) { // 4k+2 slashes: a regex escape like \\s + if (is_escape_char(&letters[n])) { // case (2) + out_pointer[i] = 0; + break; + } else { // something escaped that needn't be, e.g. \\: or \\/ + m = n + 1; + } + } else { // 4k+0,1,3 slashes: escaped pairs + maybe an escape like \n + // unicode escapes like \u{HHHH} or \U{HHHHHHHH}. note that we only care about + // the em-braced version here -- versions like \U0123 will get recognized + // as non-regex regardless because the other hex characters are fixed + if ((n - m) % 2 == 1 && n + 3 < pattern_i_size && (letters[n] == 'u' || letters[n] == 'U') && letters[n + 1] == '{') { + int right_brace = n + 2; + while (right_brace < pattern_i_size && letters[right_brace] != '}') { + ++right_brace; + } + if (right_brace == pattern_i_size) { + break; // shouldn't really be possible, but if so, out[i]=TRUE + } + m = right_brace + 1; + } else { + m = n; + } + } + } else { + ++m; + } + } + } + UNPROTECT(1); // out + return out; +} + +// See ?regex and third_party/R/R/R_3_6_3/src/extra/tre/tre-parse.c +// We can skip: +// [ -- treated separately +// ] ) } -- only special when paired +// - , = -- only special within contexts matched by { or ( +// # -- only applies to extended regex, we don't set +// ~ -- I don't know what this is. 
Some search suggests it can +// be used to delimit a regex, e.g.: +// https://stackoverflow.com/q/938100/3576984 +// However that doesn't apply to R: grep("~HI~", "HI") is empty +bool is_special(const char *s) { + return *s == '^' || *s == '$' || *s == '{' || *s == '(' || *s == '.' || + *s == '*' || *s == '+' || *s == '?' || *s == '|'; +} + +// See ?regex -- we're looking for anything that can be used like \ and +// have special meaning in a regex. Most common cases are \s, \b, \d, \w, +// but ?regex lists *many* obscure examples, so we conservatively match any +// alphanumeric and < and >. Implicitly relies on ASCII-ish encoding in `s` +// insofar as the letters and numbers appear consecutively. +bool is_escape_char(const char *s) { + return (*s - 'a' >= 0 && *s - 'a' < 26) || (*s - 'A' >= 0 && *s - 'A' < 26) || + (*s - '0' >= 0 && *s - '0' < 10) || *s == '<' || *s == '>'; +} + +// Return the width of a valid one-character character class, +// e.g. [.], [\n], [\1], [\U{01234567}], etc. If the input +// is not a one-character class, return 0. +// max_width includes the terminal ']' to make sure we don't read out-of-bounds. +int get_single_char_class_width(const char *s, int max_width) { + if (max_width < 2) { + return 0; + } + // be sure to include []] and [[], but not [][] + if (*(s + 1) == ']') { + return 1; + } + // per ?Quote, all other valid single characters are \-escaped + if (*s != '\\') { + return 0; + } + if (max_width < 3) { + return 0; + } + // escaped single chars like [\n] or [\t] or [\"] + if (*(s + 2) == ']') { + return 2; + } + // hex escapes + if (*(s + 1) == 'x') { + if (max_width < 4 || !is_valid_hex(s + 2)) { + return 0; + } + if (*(s + 3) == ']') { + return 3; + } + if (max_width < 5 || !is_valid_hex(s + 3)) { + return 0; + } + if (*(s + 4) == ']') { + return 4; + } + return 0; + } + // octal escapes (NB: \777 is not technically valid, but the R parser handles this) + if( *(s + 1) - '0' >= 0 && *(s + 1) - '0' < 8) { + if (*(s + 2) == ']') { + return 2; + } + if (max_width < 4 || *(s + 2) - '0' < 0 || *(s + 2) - '0' >= 8) { + return 0; + } + if (*(s + 3) == ']') { + return 3; + } + if (max_width < 5 || *(s + 3) - '0' < 0 || *(s + 3) - '0' >= 8) { + return 0; + } + if (*(s + 4) == ']') { + return 4; + } + return 0; + } + // hex escapes like \uHHHH or \UHHHHHHHH + if (max_width < 4 || (*(s + 1) != 'u' && *(s + 1) != 'U')) { + return 0; + } + if (is_valid_hex(s + 2)) { + if (*(s + 3) == ']') { + return 3; + } + if (max_width < 5 || !is_valid_hex(s + 3)) { + return 0; + } + if (*(s + 4) == ']') { + return 4; + } + if (max_width < 6 || !is_valid_hex(s + 4)) { + return 0; + } + if (*(s + 5) == ']') { + return 5; + } + if (max_width < 7 || !is_valid_hex(s + 5)) { + return 0; + } + if (*(s + 6) == ']') { + return 6; + } + // only \u12345678 is parsed as \u{1234}5678, i.e. 
5 characters + if (*(s + 1) != 'U' || max_width < 8 || !is_valid_hex(s + 6)) { + return 0; + } + if (*(s + 7) == ']') { + return 7; + } + if (max_width < 9 || !is_valid_hex(s + 7)) { + return 0; + } + if (*(s + 8) == ']') { + return 8; + } + if (max_width < 10 || !is_valid_hex(s + 8)) { + return 0; + } + if (*(s + 9) == ']') { + return 9; + } + if (max_width < 11 || !is_valid_hex(s + 9)) { + return 0; + } + if (*(s + 10) == ']') { + return 10; + } + return 0; + } + // hex escapes like \u{HHHH} or \U{HHHHHHHH} + if (*(s + 2) == '{') { + if (max_width >= 6 && is_valid_hex(s + 3)) { + if (*(s + 4) == '}' && *(s + 5) == ']') { + return 5; + } + if (max_width < 7 || !is_valid_hex(s + 4)) { + return 0; + } + if (*(s + 5) == '}' && *(s + 6) == ']') { + return 6; + } + if (max_width < 8 || !is_valid_hex(s + 5)) { + return 0; + } + if (*(s + 6) == '}' && *(s + 7) == ']') { + return 7; + } + if (max_width < 9 || !is_valid_hex(s + 6)) { + return 0; + } + if (*(s + 7) == '}' && *(s + 8) == ']') { + return 8; + } + if (*(s + 1) != 'U' || max_width < 10 || !is_valid_hex(s + 7)) { + return 0; + } + if (*(s + 8) == '}' && *(s + 9) == ']') { + return 9; + } + if (max_width < 11 || !is_valid_hex(s + 8)) { + return 0; + } + if (*(s + 9) == '}' && *(s + 10) == ']') { + return 10; + } + if (max_width < 12 || !is_valid_hex(s + 9)) { + return 0; + } + if (*(s + 10) == '}' && *(s + 11) == ']') { + return 11; + } + if (max_width < 13 || !is_valid_hex(s + 10)) { + return 0; + } + if (*(s + 11) == '}' && *(s + 12) == ']') { + return 12; + } + return 0; + } + return 0; + } + return 0; +} + +bool is_valid_hex(const char *s) { + return *s - '0' < 10 || *s - 'A' < 6 || *s - 'a' < 6; +} diff --git a/src/fixed_regex.h b/src/fixed_regex.h new file mode 100644 index 000000000..a88625ce2 --- /dev/null +++ b/src/fixed_regex.h @@ -0,0 +1,13 @@ +#ifndef LINTR_SRC_FIXED_REGEX_H_ +#define LINTR_SRC_FIXED_REGEX_H_ + +#include +#include + +SEXP is_not_regex(SEXP patterns, SEXP skip_start, SEXP skip_end); +bool is_special(const char *s); +bool is_escape_char(const char *s); +int get_single_char_class_width(const char *s, int max_width); +bool is_valid_hex(const char *s); + +#endif // LINTR_SRC_FIXED_REGEX_H_ diff --git a/src/init.c b/src/init.c new file mode 100644 index 000000000..a259962cd --- /dev/null +++ b/src/init.c @@ -0,0 +1,18 @@ +#include "fixed_regex.h" +#include // for R_registerRoutines and R_CallMethodDef + +static const R_CallMethodDef call_methods[] = { + {"is_not_regex", (DL_FUNC) &is_not_regex, 3}, + {NULL, NULL, 0} +}; + +void R_init_lintr(DllInfo *info) { + R_registerRoutines(info, + /*.C*/ NULL, + /*.Call*/ call_methods, + /*.Fortran*/ NULL, + /*.External*/ NULL); + R_useDynamicSymbols(info, FALSE); + R_forceSymbols(info, TRUE); +} + diff --git a/tests/testthat/test-fixed_regex_linter.R b/tests/testthat/test-fixed_regex_linter.R new file mode 100644 index 000000000..fb30730cc --- /dev/null +++ b/tests/testthat/test-fixed_regex_linter.R @@ -0,0 +1,214 @@ +# NB: escaping is confusing. We have to double-escape everything -- the first +# escape creates a string that will be parse()d, the second escape is normal +# escaping that would be done in R code. E.g. in "\\\\.", the R code would +# read like "\\.", but in order to create those two slashes, we need to write +# "\\\\." in the string here. 
+ +test_that("fixed_regex_linter skips allowed usages", { + linter <- fixed_regex_linter() + + expect_lint("gsub('^x', '', y)", NULL, linter) + expect_lint("grep('x$', '', y)", NULL, linter) + expect_lint("sub('[a-zA-Z]', '', y)", NULL, linter) + expect_lint("grepl(fmt, y)", NULL, linter) + expect_lint("regexec('\\\\s', '', y)", NULL, linter) + expect_lint("grep('a(?=b)', x, perl = TRUE)", NULL, linter) + expect_lint("grep('0+1', x, perl = TRUE)", NULL, linter) + expect_lint("grep('1*2', x)", NULL, linter) + expect_lint("grep('a|b', x)", NULL, linter) + expect_lint("grep('\\\\[|\\\\]', x)", NULL, linter) + + # if fixed=TRUE is already set, regex patterns don't matter + expect_lint("gsub('\\\\.', '', y, fixed = TRUE)", NULL, linter) + + # ignore.case=TRUE implies regex interpretation + expect_lint("gsub('abcdefg', '', y, ignore.case = TRUE)", NULL, linter) + + # char classes starting with [] might contain other characters -> not fixed + expect_lint("sub('[][]', '', y)", NULL, linter) + expect_lint("sub('[][ ]', '', y)", NULL, linter) + expect_lint("sub('[],[]', '', y)", NULL, linter) +}) + +test_that("fixed_regex_linter blocks simple disallowed usages", { + linter <- fixed_regex_linter() + msg <- rex::rex("For static regular expression patterns, set `fixed = TRUE`.") + + expect_lint("gsub('\\\\.', '', x)", msg, linter) + expect_lint("grepl('abcdefg', x)", msg, linter) + expect_lint("gregexpr('a-z', y)", msg, linter) + expect_lint("regexec('\\\\$', x)", msg, linter) + expect_lint("grep('\n', x)", msg, linter) + + # naming the argument doesn't matter (if it's still used positionally) + expect_lint("gregexpr(pattern = 'a-z', y)", msg, linter) +}) + +test_that("fixed_regex_linter catches regex like [.] or [$]", { + linter <- fixed_regex_linter() + msg <- rex::rex("For static regular expression patterns, set `fixed = TRUE`.") + + expect_lint("grep('[.]', x)", msg, linter) + expect_lint("grepl('a[*]b', x)", msg, linter) + + # also catch char classes for [ and ] + expect_lint("gregexpr('[]]', x)", msg, linter) +}) + +test_that("fixed_regex_linter catches null calls to strsplit as well", { + linter <- fixed_regex_linter() + + expect_lint("strsplit(x, '^x')", NULL, linter) + expect_lint("tstrsplit(x, '[a-zA-Z]')", NULL, linter) + expect_lint("tstrsplit(x, fmt)", NULL, linter) + expect_lint("strsplit(x, '\\\\s')", NULL, linter) + expect_lint("strsplit(x, 'a(?=b)', perl = TRUE)", NULL, linter) + expect_lint("strsplit(x, '0+1', perl = TRUE)", NULL, linter) + expect_lint("tstrsplit(x, '1*2')", NULL, linter) + expect_lint("strsplit(x, 'a|b')", NULL, linter) + + # if fixed=TRUE is already set, regex patterns don't matter + expect_lint("strsplit(x, '\\\\.', fixed = TRUE)", NULL, linter) + expect_lint("strsplit(x, '\\\\.', fixed = T)", NULL, linter) +}) + +test_that("fixed_regex_linter catches calls to strsplit as well", { + linter <- fixed_regex_linter() + msg <- rex::rex("For static regular expression patterns, set `fixed = TRUE`.") + + expect_lint("strsplit(x, '\\\\.')", msg, linter) + expect_lint("tstrsplit(x, 'abcdefg')", msg, linter) + expect_lint("strsplit(x, '[.]')", msg, linter) +}) + +test_that("fixed_regex_linter is more exact about distinguishing \\s from \\:", { + linter <- fixed_regex_linter() + msg <- rex::rex("For static regular expression patterns, set `fixed = TRUE`.") + + expect_lint("grep('\\\\s', '', x)", NULL, linter) + expect_lint("grep('\\\\:', '', x)", msg, linter) +}) + +## tests for stringr functions +test_that("fixed_regex_linter skips allowed stringr usages", { + linter <- 
fixed_regex_linter() + + expect_lint("str_replace(y, '[a-zA-Z]', '')", NULL, linter) + expect_lint("str_replace_all(y, '^x', '')", NULL, linter) + expect_lint("str_detect(y, fmt)", NULL, linter) + expect_lint("str_extract(y, '\\\\s')", NULL, linter) + expect_lint("str_extract_all(y, '\\\\s')", NULL, linter) + expect_lint("str_which(x, '1*2')", NULL, linter) + + # if fixed() is already set, regex patterns don't matter + expect_lint("str_replace(y, fixed('\\\\.'), '')", NULL, linter) + + # namespace qualification doesn't matter + expect_lint("stringr::str_replace(y, stringr::fixed('abcdefg'), '')", NULL, linter) +}) + +test_that("fixed_regex_linter blocks simple disallowed usages of stringr functions", { + linter <- fixed_regex_linter() + msg <- rex::rex("For static regular expression patterns, set `fixed = TRUE`.") + + expect_lint("str_replace_all(x, '\\\\.', '')", msg, linter) + expect_lint("str_detect(x, 'abcdefg')", msg, linter) + expect_lint("str_locate(y, 'a-z')", msg, linter) + expect_lint("str_subset(x, '\\\\$')", msg, linter) + expect_lint("str_which(x, '\n')", msg, linter) + + # named, positional arguments are still caught + expect_lint("str_locate(y, pattern = 'a-z')", msg, linter) + # nor do other named arguments throw things off + expect_lint("str_starts(x, '\\\\.', negate = TRUE)", msg, linter) +}) + +test_that("fixed_regex_linter catches calls to str_split as well", { + linter <- fixed_regex_linter() + msg <- rex::rex("For static regular expression patterns, set `fixed = TRUE`.") + + expect_lint("str_split(x, '^x')", NULL, linter) + expect_lint("str_split(x, fmt)", NULL, linter) + + # if fixed() is already set, regex patterns don't matter + expect_lint("str_split(x, fixed('\\\\.'))", NULL, linter) + expect_lint("str_split(x, '\\\\.')", msg, linter) + expect_lint("str_split(x, '[.]')", msg, linter) +}) + +test_that("str_replace_all's multi-replacement version is handled", { + linter <- fixed_regex_linter() + + # While each of the replacements is fixed, and this _could_ in principle be replaced by + # a pipeline where each step does one of the replacements and fixed() is used, this is overkill. 
+ # Instead, ensure that no lint is returned for this case + expect_lint('str_replace_all(x, c("one" = "1", "two" = "2", "three" = "3"))', NULL, linter) + expect_lint('grepl(c("a" = "b"), x)', NULL, linter) +}) + +test_that("1- or 2-width octal escape sequences are handled", { + linter <- fixed_regex_linter() + msg <- rex::rex("For static regular expression patterns, set `fixed = TRUE`.") + + expect_lint('strsplit(x, "\\1")', msg, linter) +}) + +test_that("one-character character classes with escaped characters are caught", { + linter <- fixed_regex_linter() + msg <- rex::rex("For static regular expression patterns, set `fixed = TRUE`.") + + expect_lint("gsub('[\\n]', '', x)", msg, linter) + expect_lint("gsub('[\\\"]', '', x)", msg, linter) + expect_lint("str_split(x, '[\\1]')", msg, linter) + expect_lint("str_split(x, '[\\12]')", msg, linter) + expect_lint("str_split(x, '[\\123]')", msg, linter) + expect_lint("str_split(x, '[\\xa]')", msg, linter) + expect_lint("str_split(x, '[\\xA7]')", msg, linter) + expect_lint("str_split(x, '[\\uF]')", msg, linter) + expect_lint("str_split(x, '[\\u01]')", msg, linter) + expect_lint("str_split(x, '[\\u012]')", msg, linter) + expect_lint("str_split(x, '[\\u0123]')", msg, linter) + expect_lint("str_split(x, '[\\U8]')", msg, linter) + expect_lint("str_split(x, '[\\U1d4d7]')", msg, linter) + expect_lint("str_split(x, '[\\u{1}]')", msg, linter) + expect_lint("str_split(x, '[\\U{F7D5}]')", msg, linter) + expect_lint("str_split(x, '[\\U{1D4D7}]')", msg, linter) +}) + +test_that("bracketed unicode escapes are caught", { + linter <- fixed_regex_linter() + msg <- rex::rex("For static regular expression patterns, set `fixed = TRUE`.") + + expect_lint('gsub("\\u{A0}", " ", out, useBytes = TRUE)', msg, linter) + expect_lint('gsub("abc\\U{A0DEF}ghi", " ", out, useBytes = TRUE)', msg, linter) + expect_lint('gsub("\\u{A0}\\U{0001d4d7}", " ", out, useBytes = TRUE)', msg, linter) +}) + +test_that("escaped characters are handled correctly", { + linter <- fixed_regex_linter() + expect_lint("gsub('\\n+', '', sql)", NULL, linter) + expect_lint('gsub("\\n{2,}", "\n", D)', NULL, linter) + expect_lint('gsub("[\\r\\n]", "", x)', NULL, linter) + expect_lint('gsub("\\n $", "", y)', NULL, linter) + expect_lint('gsub("```\\n*```r*\\n*", "", x)', NULL, linter) + expect_lint('strsplit(x, "(;|\n)")', NULL, linter) + expect_lint('strsplit(x, "(;|\\n)")', NULL, linter) +}) + +# make sure the logic is properly vectorized +test_that("single expression with multiple regexes is OK", { + expect_lint('c(grep("^a", x), grep("b$", x))', NULL, fixed_regex_linter()) +}) + +# TODO(michaelchirico): one difference for stringr functions vs. base is that +# stringr is much friendlier to piping, so that +# str %>% str_replace_all("x$", "y") +# actually doesn't need fixed(), but the logic now is only looking at "y" +# since it's the second argument and a non-regex string. Similarly, +# str %>% str_detect("x") +# is a false negative. thankfully there appear to be few false positives here + +# TODO(michaelchirico): we could in principle build in logic to detect whether +# perl=TRUE and interpret "regex or not" accordingly. One place +# up in practice is for '\<', which is a special character in default +# regex but not in PCRE. Empirically relevant for HTML-related regex e.g. \\
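The notes below are editorial additions for reviewers and are not part of the diff. First, a minimal sketch of the before/after usage the lint message describes; `x` is a placeholder object, and the calls mirror cases from the included tests:

```r
library(stringr)

x <- c("a.b.c", "abcdefg", "abc123")

# Flagged: the pattern only ever matches literal text
gsub("\\.", "", x)       # "\\." is just a literal dot
grepl("abcdefg", x)      # no regex metacharacters at all
str_detect(x, "[.]")     # [.] is a one-character class, i.e. a literal dot

# Suggested replacements: declare the pattern as fixed
gsub(".", "", x, fixed = TRUE)
grepl("abcdefg", x, fixed = TRUE)
str_detect(x, fixed("."))

# Not flagged: genuine regular expressions
grepl("^abc[0-9]+$", x)
strsplit(x, "a|b")
```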
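Second, the multi-layer escaping called out in the test-file header and in the `is_not_regex()` docs is easier to see with a concrete string; `test_literal` below is a made-up name and the snippet is purely illustrative:

```r
# The string literal as written in the test file:
test_literal <- "gsub('\\\\.', '', x)"

# What the R parser (and therefore the linter's XML parse tree) sees:
cat(test_literal, "\n")
#> gsub('\\.', '', x)

# When that code runs, the pattern handed to the regex engine is \. --
# an escaped dot, i.e. a literal dot -- so the fixed equivalent is:
#   gsub(".", "", x, fixed = TRUE)
```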
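Third, the character-class handling in `get_single_char_class_width()` reduces to: a bracket expression counts as fixed only when it can match exactly one literal character. A quick reference assembled from the included tests (`x` and `y` are placeholders):

```r
x <- c("a.b", "a*b", "line1\nline2")
y <- c("[a]", "x,y")

# Counts as fixed (would be flagged): the class matches exactly one character
grepl("[.]", x)        # just a dot
grepl("a[*]b", x)      # just an asterisk
gsub("[\n]", "", x)    # a single escaped character

# Left alone: the class can match more than one character
sub("[a-zA-Z]", "", y) # a range
sub("[][]", "", y)     # ']' or '['
sub("[],[]", "", y)    # ']', ',' or '['
```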