Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
16 commits
Select commit Hold shift + click to select a range
cf971fe
EXP-001: 4-digit SWAR follow-up in ffc_loop_parse_if_eight_digits
fcostaoliveira May 26, 2026
bac793d
EXP-006: use local vars in too_many_digits path, allow GCC DSE of str…
fcostaoliveira May 26, 2026
647e29e
perf: FFC_IMPL_INLINE — force inline ffc_from_chars_double at call sites
fcostaoliveira May 26, 2026
f791ca4
docs: update benchmark results with 2026-05-26 metal VM numbers
fcostaoliveira May 26, 2026
9d2b7ca
perf: combined exponent range check in ffc_clinger_fast_path_impl (EX…
fcostaoliveira May 26, 2026
8cdf4cd
Unroll fraction byte-by-byte tail: 3 nested ifs replace while loop
fcostaoliveira May 27, 2026
6e2c993
perf: straight-line integer scan — nested-ifs replace while loop for …
fcostaoliveira May 27, 2026
de871b3
perf: extend integer nested-ifs to 5 levels (EXP-028)
fcostaoliveira May 27, 2026
9cf5785
perf: FFC_ROUNDS_TO_NEAREST macro eliminates FCMP chain (EXP-030)
fcostaoliveira May 27, 2026
c88481a
EXP-033: early exit for exponent==0 in ffc_from_chars_advanced
fcostaoliveira May 27, 2026
43e22b3
fix: guard zero mantissa under FE_DOWNWARD in exponent==0 fast path
fcostaoliveira May 27, 2026
6ccc765
EXP-039/042/044: checkpoint accepted shift-add acc10 + 2x SWAR unroll
fcostaoliveira May 31, 2026
88eeecd
fix: use caller's vk (not hardcoded DOUBLE) in ffc_negative_digit_comp
fcostaoliveira Jun 2, 2026
3bcf31a
review: address PR feedback — dedup always_inline macro, docs, commen…
fcostaoliveira Jun 3, 2026
3314128
Merge upstream/main (4-digit SWAR follow-up, #23) into perf/force-inl…
fcostaoliveira Jun 3, 2026
c901d3e
Merge remote-tracking branch 'upstream/main' into perf/force-inline-f…
fcostaoliveira Jun 15, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
137 changes: 109 additions & 28 deletions ffc.h
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,16 @@ extern "C" {
#include <stddef.h>
#include <stdint.h>

/* always_inline marker, defined here so FFC_IMPL_INLINE (below) can reuse it.
* common.h re-uses this same definition under an #ifndef guard. */
#if defined(_MSC_VER)
#define ffc_inline __forceinline
#elif defined(__GNUC__) || defined(__clang__)
#define ffc_inline __attribute__((always_inline)) inline
#else
#define ffc_inline inline
#endif

typedef uint32_t ffc_outcome;
enum ffc_outcome_bits {
FFC_OUTCOME_OK = 0,
Expand Down Expand Up @@ -227,8 +237,19 @@ ffc_result ffc_parse_double(size_t len, const char *input, double *out);
* `fast_float::chars_format::general` which allows both `fixed` and
* `scientific`.
*/
ffc_result ffc_from_chars_double(const char *start, const char *end, double* out);
ffc_result ffc_from_chars_double_options(const char *start, const char *end, double* out, ffc_parse_options options);
/* When included from an FFC_IMPL translation unit, the critical-path API
* functions are declared always_inline (via ffc_inline) so compilers such as
* GCC inline them at call sites in the same TU. In non-FFC_IMPL TUs the
* declarations are plain extern: FFC_IMPL_INLINE must expand to nothing there
* so the symbols keep external linkage. */
#ifdef FFC_IMPL
# define FFC_IMPL_INLINE ffc_inline
#else
# define FFC_IMPL_INLINE
#endif

FFC_IMPL_INLINE ffc_result ffc_from_chars_double(const char *start, const char *end, double* out);
FFC_IMPL_INLINE ffc_result ffc_from_chars_double_options(const char *start, const char *end, double* out, ffc_parse_options options);

/*
* A simplified API; the result will be 0.0 on error, not uninitialized.
Expand Down Expand Up @@ -295,13 +316,15 @@ ffc_result ffc_parse_json_number(const char *start, const char *end, ffc_json_nu
#define ffc_internal static
#endif

#ifndef ffc_inline
#if defined(_MSC_VER)
#define ffc_inline __forceinline
#elif defined(__GNUC__) || defined(__clang__)
#define ffc_inline __attribute__((always_inline)) inline
#else
#define ffc_inline inline
#endif
#endif

#if FFC_DEBUG
#include <stdio.h>
Expand Down Expand Up @@ -808,6 +831,13 @@ bool ffc_strncasecmp5(char *actual_mixedcase, char const *expected_lowercase, si

ffc_internal ffc_inline
bool ffc_rounds_to_nearest(void) {
#if defined(FFC_ROUNDS_TO_NEAREST)
Comment thread
kolemannix marked this conversation as resolved.
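// FFC_ROUNDS_TO_NEAREST is the user's promise that IEEE 754 round-to-nearest
// mode is always in effect; it is not verified here.
// We can simply return true; this is an optimization that eliminates the
// volatile float FCMP chain below. Note that this early return also skips the
// FLT_EVAL_METHOD check that follows.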
return true;
#endif
// https://lemire.me/blog/2020/06/26/gcc-not-nearest/
#if (FLT_EVAL_METHOD != 1) && (FLT_EVAL_METHOD != 0)
return false;
Expand Down Expand Up @@ -1121,10 +1151,18 @@ bool ffc_simd_parse_if_eight_digits_unrolled_simd(uint16_t const *chars, uint64_

#endif // FFC_HAS_SIMD

// Compute acc*10 + d_expr using add+lsl on AArch64/Clang.
// GCC already strength-reduces i*10 to shift-adds naturally; the asm is
// Clang-only. The non-Clang path is a plain macro so GCC sees the original
// expression and can fuse the pointer increment with surrounding code.
// Compute acc*10 + d_expr.
//
// On AArch64, Clang emits `smaddl` (3-cycle latency) for `acc*10 + d`, which
// sits on the digit-accumulation critical path. Forcing the `add + lsl` form
// via inline asm shortens it and is worth ~+9% on Clang/AArch64.
//
// The asm is intentionally narrow: it is only correct/beneficial on AArch64,
// and *only* for Clang. GCC already strength-reduces `acc*10` to shift-adds
// and schedules them optimally; routing GCC through this asm measurably
// regressed it (canada -5.3%). Every other compiler therefore uses the plain
// expression below, which is also what makes this safe to leave unconditional
// at the call sites.
#if defined(__aarch64__) && defined(__clang__)
ffc_internal ffc_inline uint64_t
ffc_digit_acc10(uint64_t acc, uint64_t d) {
Expand Down Expand Up @@ -1256,7 +1294,7 @@ ffc_parsed ffc_parse_number_string(
}
} else {
// a sign must be followed by an integer or the dot
if (!ffc_is_integer(*p) && (*p != decimal_point)) {
if (!ffc_is_integer(*p) && (*p != decimal_point)) {
return ffc_report_parse_error(p, FFC_PARSE_OUTCOME_MISSING_INTEGER_OR_DOT_AFTER_SIGN);
}
}
Expand All @@ -1267,13 +1305,26 @@ ffc_parsed ffc_parse_number_string(

uint64_t i = 0; // an unsigned int avoids signed overflows (which are bad)

while ((p != pend) && ffc_is_integer(*p)) {
// Horner's method: only ever multiplies by the constant 10
// avoiding variable power-of-10 multiplies

// might overflow, we will handle the overflow later
i = FFC_DIGIT_ACC10(i, *p - '0');
++p;
// Integer scan: the first 5 digits are read straight-line, longer runs
// continue in the while loop. The common 1-5 digit case stays branch-light.
if ((p != pend) && ffc_is_integer(*p)) {
i = (uint64_t)(*p++ - '0');
if ((p != pend) && ffc_is_integer(*p)) {
i = FFC_DIGIT_ACC10(i, *p++ - '0');
if ((p != pend) && ffc_is_integer(*p)) {
i = FFC_DIGIT_ACC10(i, *p++ - '0');
if ((p != pend) && ffc_is_integer(*p)) {
i = FFC_DIGIT_ACC10(i, *p++ - '0');
if ((p != pend) && ffc_is_integer(*p)) {
i = FFC_DIGIT_ACC10(i, *p++ - '0');
while ((p != pend) && ffc_is_integer(*p)) {
i = FFC_DIGIT_ACC10(i, *p - '0'); // might overflow, handled later
++p;
}
}
}
}
}
}

char const *const end_of_integer_part = p;
Expand All @@ -1296,16 +1347,28 @@ ffc_parsed ffc_parse_number_string(
int64_t exponent = 0;
bool const has_decimal_point = (p != pend) && (*p == decimal_point);

// Fraction-part bounds, kept as locals so the too_many_digits re-scan below
// can reuse them without reloading the answer.fraction_part_* fields.
char const *before = NULL;
char const *frac_end_local = NULL;

/* post-decimal exponential part (calculates a negative exponent) */
if (has_decimal_point) {
++p;
char const *before = p;
before = p;
// can occur at most twice without overflowing, but let it occur more, since
// for integers with many digits, digit parsing is the primary bottleneck.
ffc_loop_parse_if_eight_digits(&p, pend, &i);

while ((p != pend) && ffc_is_integer(*p)) {
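// manual unroll for the 1-3 digit case
// NOTE(review): at most 3 digits are consumed here; this presumably relies on
// ffc_loop_parse_if_eight_digits (8-digit loop plus 4-digit follow-up) leaving
// fewer than 4 digits unread — confirm.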
if (p != pend && ffc_is_integer(*p)) {
i = FFC_DIGIT_ACC10(i, (uint8_t)(*p++ - (char)('0'))); // in rare cases overflows, ok
if (p != pend && ffc_is_integer(*p)) {
i = FFC_DIGIT_ACC10(i, (uint8_t)(*p++ - (char)('0')));
if (p != pend && ffc_is_integer(*p)) {
i = FFC_DIGIT_ACC10(i, (uint8_t)(*p++ - (char)('0')));
}
}
}

// pre: i = 123, digit_count = 3
Expand All @@ -1317,6 +1380,7 @@ ffc_parsed ffc_parse_number_string(
// i = 123456
// digit_count = 3 - (-3) = 6
exponent = before - p;
frac_end_local = p; // capture before p advances into explicit exponent
answer.fraction_part_start = (char*)before;
answer.fraction_part_len = (size_t)(p - before);
digit_count -= exponent;
Expand Down Expand Up @@ -1405,12 +1469,12 @@ ffc_parsed ffc_parse_number_string(

if (digit_count > 19) {
answer.too_many_digits = true;
// Let us start again, this time, avoiding overflows.
// We don't need to call if is_integer, since we use the
// pre-tokenized spans from above.
// Re-scan the digits into i, this time stopping before overflow. Reads
// from the local digit-range pointers (start_digits, end_of_integer_part,
// before, frac_end_local) rather than the answer struct fields.
i = 0;
p = answer.int_part_start;
char const *int_end = p + answer.int_part_len;
p = (char*)start_digits;
char const *int_end = (char*)end_of_integer_part;
uint64_t const minimal_nineteen_digit_integer = 1000000000000000000;
while ((i < minimal_nineteen_digit_integer) && (p != int_end)) {
i = i * 10 + (uint64_t)(*p - '0');
Expand All @@ -1419,13 +1483,13 @@ ffc_parsed ffc_parse_number_string(
if (i >= minimal_nineteen_digit_integer) { // We have a big integer
exponent = end_of_integer_part - p + exp_number;
} else { // We have a value with a fractional component.
p = answer.fraction_part_start;
char const *frac_end = p + answer.fraction_part_len;
p = (char*)before;
char const *frac_end = (char*)frac_end_local;
while ((i < minimal_nineteen_digit_integer) && (p != frac_end)) {
i = i * 10 + (uint64_t)(*p - '0');
++p;
}
exponent = answer.fraction_part_start - p + exp_number;
exponent = before - p + exp_number;
}
// We have now corrected both exponent and i, to a truncated value
}
Expand Down Expand Up @@ -3031,8 +3095,10 @@ bool ffc_clinger_fast_path_impl(uint64_t mantissa, int64_t exponent, bool is_neg
// selected on the thread.
// We proceed optimistically, assuming that detail::rounds_to_nearest()
// returns true.
if (ffc_const(value_kind, MIN_EXPONENT_FAST_PATH) <= exponent &&
exponent <= ffc_const(value_kind, MAX_EXPONENT_FAST_PATH)) {
// Single unsigned range check replaces two signed comparisons, matching
// fast_float's layout: (uint64_t)(e - MIN) <= (MAX - MIN) in one compare.
if ((uint64_t)((int64_t)exponent - (int64_t)ffc_const(value_kind, MIN_EXPONENT_FAST_PATH)) <=
(uint64_t)((int64_t)ffc_const(value_kind, MAX_EXPONENT_FAST_PATH) - (int64_t)ffc_const(value_kind, MIN_EXPONENT_FAST_PATH))) {
// Unfortunately, the conventional Clinger's fast path is only possible
// when the system rounds to the nearest float.
//
Expand Down Expand Up @@ -3099,6 +3165,19 @@ ffc_result ffc_from_chars_advanced(ffc_parsed const pns, ffc_value* value, ffc_v
answer.outcome = FFC_OUTCOME_OK; // be optimistic :')
answer.ptr = (char*)pns.lastmatch;

if (!pns.too_many_digits && pns.exponent == 0 &&
pns.mantissa <= ffc_const(vk, MAX_MANTISSA_FAST_PATH)) {
#if defined(__clang__) || defined(FFC_32BIT)
if (pns.mantissa == 0) {
ffc_set_value(value, vk, pns.negative ? -0. : 0.);
return answer;
}
#endif
ffc_set_value(value, vk, pns.mantissa);
if (pns.negative) { ffc_set_value(value, vk, -ffc_read_value(value, vk)); }
return answer;
}
Comment on lines +3168 to +3179

Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looks like a bugfix? just making a note to review more closely

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good eye — two things are colocated here. The -0.0-under-FE_DOWNWARD guard is a genuine bugfix (under Clang/32-bit a zero mantissa was mapped to -0.0 for negative inputs in downward rounding mode; caught by the supplemental corpus) and is already its own commit, 43e22b3, separate from the exponent==0 fast-path early-exit (c88481a). You can review the bugfix hunk standalone there — and I'm happy to cherry-pick it into its own small precursor PR so it can merge ahead of the perf work if that eases review. Just say the word.


if (!pns.too_many_digits &&
ffc_clinger_fast_path_impl(pns.mantissa, pns.exponent, pns.negative, value, vk)) {
ffc_debug("fast path hit");
Expand Down Expand Up @@ -3171,7 +3250,9 @@ ffc_result ffc_from_chars(char* first, char* last, ffc_parse_options options, ff
return ffc_from_chars_advanced(pns, value, vk);
}

ffc_result ffc_from_chars_double_options(const char *start, const char *end, double* out, ffc_parse_options options) {
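/* extern FFC_IMPL_INLINE gives GCC the always_inline directive while also
* requesting external linkage so non-FFC_IMPL TUs can link these symbols:
* under C99 inline semantics, `extern` on an inline definition makes this TU
* emit the out-of-line external definition.
* NOTE(review): GNU89 inline semantics (-std=gnu89 / -fgnu89-inline) give
* `extern inline` the opposite meaning (no out-of-line copy is emitted) —
* confirm which build modes are supported. */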
extern FFC_IMPL_INLINE ffc_result ffc_from_chars_double_options(const char *start, const char *end, double* out, ffc_parse_options options) {
Comment thread
kolemannix marked this conversation as resolved.
// It would be UB to directly use *out as our ffc_value, even though it's the same layout
ffc_value out_value = {0};

Expand All @@ -3180,7 +3261,7 @@ ffc_result ffc_from_chars_double_options(const char *start, const char *end, dou
*out = out_value.d;
return result;
}
ffc_result ffc_from_chars_double(char const* first, char const* last, double* out) {
/* Parse a double from the character range [first, last) using the default
 * parse options. Thin wrapper: it forwards to ffc_from_chars_double_options
 * and returns that call's result unchanged. */
extern FFC_IMPL_INLINE ffc_result ffc_from_chars_double(char const* first, char const* last, double* out) {
  return ffc_from_chars_double_options(first, last, out, ffc_parse_options_default());
}
Expand Down
11 changes: 11 additions & 0 deletions readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,17 @@ int main(void) {
For use within a larger parser, where you don't expect to reach the end of input, use
the non-simple variants as the `ffc_result` includes the stopping point, just like in fast_float

## Configuration Macros

Define these before including `ffc.h` to control compilation:

- `FFC_IMPL` — Include the implementation (define in exactly one translation unit).
- `FFC_ROUNDS_TO_NEAREST` — Promise, at compile time, that the floating-point
environment always uses IEEE 754 round-to-nearest mode (the default on virtually all
platforms). The library does not verify this promise. When defined, the runtime
rounding-mode check (including its `FLT_EVAL_METHOD` test) is elided, removing a
volatile-load/FCMP chain from the fast path. Only define this if you never change
the rounding mode (e.g. via `fesetround`) before parsing.

## API

### Float Parsing
Expand Down
25 changes: 23 additions & 2 deletions src/api.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,16 @@
#include <stddef.h>
#include <stdint.h>

/* always_inline marker, defined here so FFC_IMPL_INLINE (below) can reuse it.
* common.h re-uses this same definition under an #ifndef guard. */
#if defined(_MSC_VER)
#define ffc_inline __forceinline
#elif defined(__GNUC__) || defined(__clang__)
#define ffc_inline __attribute__((always_inline)) inline
#else
#define ffc_inline inline
#endif

typedef uint32_t ffc_outcome;
enum ffc_outcome_bits {
FFC_OUTCOME_OK = 0,
Expand Down Expand Up @@ -115,8 +125,19 @@ ffc_result ffc_parse_double(size_t len, const char *input, double *out);
* `fast_float::chars_format::general` which allows both `fixed` and
* `scientific`.
*/
ffc_result ffc_from_chars_double(const char *start, const char *end, double* out);
ffc_result ffc_from_chars_double_options(const char *start, const char *end, double* out, ffc_parse_options options);
/* When included from an FFC_IMPL translation unit, the critical-path API
* functions are declared always_inline (via ffc_inline) so compilers such as
* GCC inline them at call sites in the same TU. In non-FFC_IMPL TUs the
* declarations are plain extern: FFC_IMPL_INLINE must expand to nothing there
* so the symbols keep external linkage. */
#ifdef FFC_IMPL
# define FFC_IMPL_INLINE ffc_inline
#else
# define FFC_IMPL_INLINE
#endif

FFC_IMPL_INLINE ffc_result ffc_from_chars_double(const char *start, const char *end, double* out);
FFC_IMPL_INLINE ffc_result ffc_from_chars_double_options(const char *start, const char *end, double* out, ffc_parse_options options);

/*
* A simplified API; the result will be 0.0 on error, not uninitialized.
Expand Down
9 changes: 9 additions & 0 deletions src/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,15 @@
#define ffc_internal static
#endif

#ifndef ffc_inline
#if defined(_MSC_VER)
#define ffc_inline __forceinline
#elif defined(__GNUC__) || defined(__clang__)
#define ffc_inline __attribute__((always_inline)) inline
#else
#define ffc_inline inline
#endif
#endif

#if FFC_DEBUG
#include <stdio.h>
Expand Down Expand Up @@ -524,6 +526,13 @@ bool ffc_strncasecmp5(char *actual_mixedcase, char const *expected_lowercase, si

ffc_internal ffc_inline
bool ffc_rounds_to_nearest(void) {
#if defined(FFC_ROUNDS_TO_NEAREST)
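// FFC_ROUNDS_TO_NEAREST is the user's promise that IEEE 754 round-to-nearest
// mode is always in effect; it is not verified here.
// We can simply return true; this is an optimization that eliminates the
// volatile float FCMP chain below. Note that this early return also skips the
// FLT_EVAL_METHOD check that follows.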
return true;
#endif
// https://lemire.me/blog/2020/06/26/gcc-not-nearest/
#if (FLT_EVAL_METHOD != 1) && (FLT_EVAL_METHOD != 0)
return false;
Expand Down
Loading
Loading