Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,14 @@
# vroom (development version)

* `locale()` gains a `date_order` argument to control the component order used
when parsing dates and date-times (e.g. `"mdy"`, `"dmy"`, `"ymd_hms"`). This
makes it possible to read year-last formats such as `10/02/2024` that the
automatic type guesser would otherwise treat as character.

* Date and date-time auto-detection now accepts any non-alphanumeric separator
between components (e.g. `2024.10.02`, `2024/10/02`), and falls back to a
year-last heuristic so unambiguous `D/M/YYYY` values are recognised as dates.

# vroom 1.7.1

* Internal changes requested by CRAN for forward compatibility with clang 22.
Expand Down
42 changes: 40 additions & 2 deletions R/locale.R
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,10 @@
#' DST. It is *not* Eastern Standard Time. It's better to use
#' "US/Eastern", "US/Central" etc.
#' @param encoding Default encoding.
#' @param date_order Order of date components for auto-detection. One of
#' `"ymd"`, `"ydm"`, `"mdy"`, `"myd"`, `"dmy"`, or `"dym"`. The orders
#' `"ymd"`, `"ydm"`, `"mdy"`, and `"dmy"` may also be combined with a time
#' suffix: `"_hms"`, `"_hm"`, or `"_h"` (e.g. `"mdy_hms"`).
#' Use `NULL` (default) for automatic detection.
#' @export
#' @examples
#' locale()
Expand All @@ -42,7 +46,8 @@ locale <- function(
decimal_mark = ".",
grouping_mark = ",",
tz = "UTC",
encoding = "UTF-8"
encoding = "UTF-8",
date_order = NULL
) {
if (is.character(date_names)) {
date_names <- date_names_lang(date_names)
Expand Down Expand Up @@ -73,6 +78,35 @@ locale <- function(
tz <- check_tz(tz)
check_encoding(encoding)

valid_date_orders <- c(
"ymd",
"ydm",
"mdy",
"myd",
"dmy",
"dym",
"ymd_hms",
"ymd_hm",
"ymd_h",
"mdy_hms",
"mdy_hm",
"mdy_h",
"dmy_hms",
"dmy_hm",
"dmy_h",
"ydm_hms",
"ydm_hm",
"ydm_h"
)
if (!is.null(date_order)) {
check_string(date_order)
}
if (!is.null(date_order) && !date_order %in% valid_date_orders) {
cli::cli_abort(
"{.arg date_order} must be NULL or one of: {.val {valid_date_orders}}"
)
}

structure(
list(
date_names = date_names,
Expand All @@ -81,7 +115,8 @@ locale <- function(
decimal_mark = decimal_mark,
grouping_mark = grouping_mark,
tz = tz,
encoding = encoding
encoding = encoding,
date_order = date_order
),
class = "locale"
)
Expand All @@ -108,6 +143,9 @@ print.locale <- function(x, ...) {
cat("Formats: ", x$date_format, " / ", x$time_format, "\n", sep = "")
cat("Timezone: ", x$tz, "\n", sep = "")
cat("Encoding: ", x$encoding, "\n", sep = "")
if (!is.null(x$date_order)) {
cat("Date order: ", x$date_order, "\n", sep = "")
}
print(x$date_names)
}

Expand Down
3 changes: 2 additions & 1 deletion R/path.R
Original file line number Diff line number Diff line change
Expand Up @@ -197,7 +197,8 @@ connection_or_filepath <- function(path, write = FALSE, call = caller_env()) {
extension <- split_path_ext(extension)$extension
formats <- archive_formats(extension)
}
needs_archive <- !is.null(formats) && (write || extension != "zip")
needs_archive <- !is.null(formats) &&
(write || extension != "zip" || requireNamespace("archive", quietly = TRUE))

if (needs_archive) {
reason <- glue(
Expand Down
8 changes: 7 additions & 1 deletion man/locale.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

179 changes: 173 additions & 6 deletions src/DateTimeParser.h
Original file line number Diff line number Diff line change
Expand Up @@ -99,14 +99,16 @@ class DateTimeParser {
// parsing with a format string so it doesn't seem necessary to add individual
// parsers for other common formats.
bool parseISO8601(bool /* partial */ = true) {
// Date: YYYY-MM-DD, YYYYMMDD
// Date: YYYY-MM-DD, YYYYMMDD, YYYY/MM/DD, YYYY.MM.DD, etc.
// Accepts any non-alphanumeric separator between date components,
// similar to lubridate's ymd() flexible parsing.
if (!consumeInteger(4, &year_))
return false;
if (consumeThisChar('-'))
if (consumeDateSeparator())
compactDate_ = false;
if (!consumeInteger(2, &mon_))
return false;
if (!compactDate_ && !consumeThisChar('-'))
if (!compactDate_ && !consumeDateSeparator())
return false;
if (!consumeInteger(2, &day_))
return false;
Expand Down Expand Up @@ -164,21 +166,173 @@ class DateTimeParser {
}

// Parse a date-only value laid out as YYYY<sep>MM<sep>DD, e.g. YYYY-MM-DD,
// YYYY/MM/DD or YYYY.MM.DD. Each <sep> is a single character accepted by
// consumeDateSeparator() (anything that is neither alphanumeric nor
// whitespace), which already covers the '-' and '/' separators that used to be
// checked explicitly.
//
// Returns true only if all three components were read and the whole input was
// consumed.
bool parseDate() {
  if (!consumeInteger(4, &year_))
    return false;
  if (!consumeDateSeparator())
    return false;
  if (!consumeInteger(2, &mon_))
    return false;
  if (!consumeDateSeparator())
    return false;
  if (!consumeInteger(2, &day_))
    return false;

  return isComplete();
}

// Parse a date, optionally followed by a time, using an explicit component
// order such as "mdy", "dmy", "ymd", "mdy_hms", "dmy_hm" or "ymd_h".
//
// The part of `order` before the first '_' lists the date components:
//   y = year (read by consumeYearFlexible), m = month, d = day
//   (month and day are read as 1-2 digit integers).
// Components must be separated by one character accepted by
// consumeDateSeparator(). Any other letter in the date part fails the parse.
//
// The part after the '_' selects the time layout, which must follow the date
// after a single 'T' or ' ':
//   "h"   = HH
//   "hm"  = HH[:]MM
//   "hms" = HH[:]MM[:]SS, optionally followed by a timezone offset
//
// Returns true only if the entire input was consumed in the requested layout.
bool parseDateOrder(const std::string& order) {
  // Everything before the first '_' is the date layout; whatever follows is
  // the time layout. No '_' (or nothing after it) means date-only.
  const size_t underscore = order.find('_');
  const bool hasUnderscore = underscore != std::string::npos;
  const size_t dateLen = hasUnderscore ? underscore : order.size();
  const std::string timePart =
      hasUnderscore ? order.substr(underscore + 1) : std::string();

  for (size_t i = 0; i < dateLen; ++i) {
    // A separator is required between components, but not before the first.
    if (i > 0 && !consumeDateSeparator())
      return false;

    switch (order[i]) {
    case 'y':
      if (!consumeYearFlexible())
        return false;
      break;
    case 'm':
      if (!consumeInteger(2, &mon_, false))
        return false;
      break;
    case 'd':
      if (!consumeInteger(2, &day_, false))
        return false;
      break;
    default:
      return false;
    }
  }

  // Date-only: nothing may follow the last date component.
  if (timePart.empty())
    return isComplete();

  // Date and time are joined by exactly one 'T' or space.
  char next;
  if (!consumeChar(&next))
    return false;
  if (next != 'T' && next != ' ')
    return false;

  // The hour is present in every time layout.
  if (!consumeInteger(2, &hour_))
    return false;
  if (timePart == "h")
    return isComplete();

  // "hm" and "hms": the ':' is optional, the minutes are not.
  consumeThisChar(':');
  if (!consumeInteger(2, &min_))
    return false;
  if (timePart == "hm")
    return isComplete();

  // "hms": the result of consumeSeconds() is intentionally not checked, the
  // same way parseYearLastHeuristicDateTime() treats the seconds field.
  consumeThisChar(':');
  consumeSeconds(&sec_, &psec_);

  if (isComplete())
    return true;

  // Trailing input must be a valid timezone offset. A failed offset parse is
  // an error in its own right rather than something isComplete() decides,
  // matching parseYearLastHeuristicDateTime().
  tz_ = "UTC";
  if (!consumeTzOffset(&tzOffsetHours_, &tzOffsetMinutes_))
    return false;

  return isComplete();
}

// Consume a year that may be 2 or 4 digits. 2-digit years use the same pivot
// as the %y format specifier (00-68 -> 2000s, 69-99 -> 1900s). 3-digit values
// (100-999) are implausible and rejected. (Issue #36088)
bool consumeYearFlexible() {
if (!consumeInteger(4, &year_, false)) return false;
if (year_ < 100) {
year_ += (year_ < 69) ? 2000 : 1900;
} else if (year_ < 1000) {
return false;
}
return true;
}

// Decide which of the two leading components of a year-last date is the month
// and which is the day, storing the result in mon_ and day_.
//
// A first component above 12 cannot be a month, so the date is read as
// day/month. In every other case the US month/day reading is used, including
// when both components are <= 12 and the date is genuinely ambiguous.
//
// Returns false if the chosen month is outside 1-12 or the day is outside
// 1-31.
bool disambiguateDayMonth(int part1, int part2) {
  const bool dayFirst = part1 > 12;

  mon_ = dayFirst ? part2 : part1;
  day_ = dayFirst ? part1 : part2;

  const bool monthInRange = mon_ >= 1 && mon_ <= 12;
  const bool dayInRange = day_ >= 1 && day_ <= 31;
  return monthInRange && dayInRange;
}

// Heuristic parser for date-only values whose year comes last: D/M/Y or M/D/Y.
// Expected shape: up to two digits, separator, up to two digits, separator,
// then a year read by consumeYearFlexible(), with nothing after it.
//
// The day/month order is resolved by disambiguateDayMonth(): a first component
// above 12 means day-first, anything else is read as month-first.
bool parseYearLastHeuristic() {
  int first = 0;
  int second = 0;

  // && short-circuits, so each step runs only if the previous one succeeded.
  const bool shapeMatches = consumeInteger(2, &first, false) &&
                            consumeDateSeparator() &&
                            consumeInteger(2, &second, false) &&
                            consumeDateSeparator() &&
                            consumeYearFlexible() &&
                            isComplete();

  return shapeMatches && disambiguateDayMonth(first, second);
}

// Heuristic parser for date-times whose date part has the year last (M/D/Y or
// D/M/Y). The date must be followed by a single 'T' or ' ' and an hour; the
// minutes, seconds and a trailing timezone offset are optional.
//
// The date-only form is handled by parseYearLastHeuristic(), so a time portion
// is mandatory here.
bool parseYearLastHeuristicDateTime() {
  int first = 0;
  int second = 0;

  // Date portion; && short-circuits, so the steps run strictly in order.
  const bool dateOk = consumeInteger(2, &first, false) &&
                      consumeDateSeparator() &&
                      consumeInteger(2, &second, false) &&
                      consumeDateSeparator() &&
                      consumeYearFlexible() &&
                      disambiguateDayMonth(first, second);
  if (!dateOk)
    return false;

  // Exactly one 'T' or space must introduce the time.
  char sep;
  if (!consumeChar(&sep) || (sep != 'T' && sep != ' '))
    return false;

  // Only the hour is mandatory; the results of the remaining field reads are
  // deliberately not checked.
  if (!consumeInteger(2, &hour_))
    return false;
  consumeThisChar(':');
  consumeInteger(2, &min_);
  consumeThisChar(':');
  consumeSeconds(&sec_, &psec_);

  // Anything left over has to be a valid timezone offset.
  if (!isComplete()) {
    tz_ = "UTC";
    if (!consumeTzOffset(&tzOffsetHours_, &tzOffsetMinutes_))
      return false;
  }

  return isComplete();
}

bool isComplete() { return dateItr_ == dateEnd_; }

void setDate(const char* start, const char* end) {
Expand Down Expand Up @@ -490,6 +644,19 @@ class DateTimeParser {
return true;
}

// Consume one character as a date separator.
//
// Any character that is neither alphanumeric nor whitespace is accepted, which
// covers '-', '/', '.', ',', ';' and other punctuation. Digits, letters and
// whitespace are rejected to avoid false positives.
//
// Returns true and advances past the character on success; returns false and
// leaves the position untouched at end of input or on a rejected character.
inline bool consumeDateSeparator() {
  if (dateItr_ == dateEnd_)
    return false;

  // The <cctype> classifiers have undefined behavior for negative arguments
  // other than EOF, so convert the byte to unsigned char first. This matters
  // for bytes >= 0x80 on platforms where plain char is signed.
  const unsigned char c = static_cast<unsigned char>(*dateItr_);
  if (std::isalnum(c) || std::isspace(c))
    return false;

  ++dateItr_;
  return true;
}

inline bool consumeNonDigit() {
if (dateItr_ == dateEnd_ || std::isdigit(*dateItr_))
return false;
Expand Down
6 changes: 6 additions & 0 deletions src/LocaleInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,5 +26,11 @@ LocaleInfo::LocaleInfo(const cpp11::list& x)
dateFormat_ = cpp11::as_cpp<std::string>(x["date_format"]);
timeFormat_ = cpp11::as_cpp<std::string>(x["time_format"]);

// date_order is optional (NULL in R becomes empty string)
SEXP date_order_sexp = x["date_order"];
dateOrder_ = (date_order_sexp == R_NilValue)
? ""
: cpp11::as_cpp<std::string>(date_order_sexp);

tz_ = cpp11::as_cpp<std::string>(x["tz"]);
}
1 change: 1 addition & 0 deletions src/LocaleInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ class LocaleInfo {
// LC_TIME
std::vector<std::string> mon_, monAb_, day_, dayAb_, amPm_;
std::string dateFormat_, timeFormat_;
std::string dateOrder_; // date component order (e.g. "mdy", "dmy_hms"), empty = auto

// LC_NUMERIC
std::string decimalMark_, groupingMark_;
Expand Down
Loading
Loading