kolemannix · fcostaoliveira · May 26, 2026 · May 26, 2026 · May 26, 2026 · May 26, 2026
diff --git a/ffc.h b/ffc.h
@@ -227,8 +227,23 @@ ffc_result ffc_parse_double(size_t len, const char *input, double *out);
  * `fast_float::chars_format::general` which allows both `fixed` and
  * `scientific`.
  */
-ffc_result ffc_from_chars_double(const char *start, const char *end, double* out);
-ffc_result ffc_from_chars_double_options(const char *start, const char *end, double* out, ffc_parse_options options);
+/* When included from a FFC_IMPL translation unit, the critical-path API
+ * functions are declared always_inline so GCC inlines them at call sites
+ * in the same TU. In non-FFC_IMPL TUs the declarations are plain extern. */
+#ifdef FFC_IMPL
+#  if defined(__GNUC__) || defined(__clang__)
+#    define FFC_IMPL_INLINE __attribute__((always_inline)) inline
+#  elif defined(_MSC_VER)
+#    define FFC_IMPL_INLINE __forceinline
+#  else
+#    define FFC_IMPL_INLINE inline
+#  endif
+#else
+#  define FFC_IMPL_INLINE
+#endif
+
+FFC_IMPL_INLINE ffc_result ffc_from_chars_double(const char *start, const char *end, double* out);
+FFC_IMPL_INLINE ffc_result ffc_from_chars_double_options(const char *start, const char *end, double* out, ffc_parse_options options);
 
 /*
  * A simplified API; the result will be 0.0 on error, not uninitialized.
@@ -808,6 +823,11 @@ bool ffc_strncasecmp5(char *actual_mixedcase, char const *expected_lowercase, si
 
 ffc_internal ffc_inline
 bool ffc_rounds_to_nearest(void) {
+#if defined(FFC_ROUNDS_TO_NEAREST)
+  // Compile-time constant: caller guarantees IEEE 754 round-to-nearest mode.
+  // Eliminates the volatile float FCMP chain entirely.
+  return true;
+#endif
   // https://lemire.me/blog/2020/06/26/gcc-not-nearest/
 #if (FLT_EVAL_METHOD != 1) && (FLT_EVAL_METHOD != 0)
   return false;
@@ -1121,17 +1141,73 @@ bool ffc_simd_parse_if_eight_digits_unrolled_simd(uint16_t const *chars, uint64_
 
 #endif // FFC_HAS_SIMD
 
+// Compute acc*10 + d_expr using add+lsl on AArch64/Clang.
+// GCC already strength-reduces i*10 to shift-adds naturally; the asm is
+// Clang-only. The non-Clang path is a plain macro so GCC sees the original
+// expression and can fuse the pointer increment with surrounding code.
+#if defined(__aarch64__) && defined(__clang__)
+ffc_internal ffc_inline uint64_t
+ffc_digit_acc10(uint64_t acc, uint64_t d) {
+  uint64_t result;
+  __asm__("add %0, %2, %2, lsl #2\n\t"
+          "add %0, %1, %0, lsl #1"
+          : "=&r"(result) : "r"(d), "r"(acc));
+  return result;
+}
+#define FFC_DIGIT_ACC10(acc, d_expr) ffc_digit_acc10((acc), (uint64_t)(d_expr))
+#else
+#define FFC_DIGIT_ACC10(acc, d_expr) ((acc) * 10 + (uint64_t)(d_expr))
+#endif
+
 ffc_internal ffc_inline void
 ffc_loop_parse_if_eight_digits(char const **p, char const *const pend,
                            uint64_t* i) {
-  // optimizes better than parse_if_eight_digits_unrolled() for char.
-  while ((pend - *p >= 8) &&
-         ffc_is_made_of_eight_digits_fast(ffc_read8_to_u64(*p))) {
-    *i = (*i * 100000000) +
-        ffc_parse_eight_digits_unrolled_swar(ffc_read8_to_u64(*p)); 
-        // in rare cases, this will overflow, but that's ok
+#if defined(__aarch64__) && defined(__clang__)
+  // Clang/AArch64: manual 2x unroll converts the hot path from a while loop
+  // to a single linear block for typical float fractions (<=16 digits).
+  // Eliminating the back-edge allows Clang to keep SWAR constants in registers
+  // rather than rematerializing them on each iteration; also, the 8-digit
+  // block becomes an if (no back-edge) so constants are always in a straight-
+  // line context. GCC auto-unrolls this naturally so we keep its while loop.
+  while (pend - *p >= 16) {
+    uint64_t val1 = ffc_read8_to_u64(*p);
+    if (!ffc_is_made_of_eight_digits_fast(val1)) { break; }
+    uint64_t val2 = ffc_read8_to_u64(*p + 8);
+    if (!ffc_is_made_of_eight_digits_fast(val2)) {
+      *i = (*i * 100000000) + ffc_parse_eight_digits_unrolled_swar(val1); // in rare cases overflows, ok
+      *p += 8;
+      break;
+    }
+    uint64_t s1 = ffc_parse_eight_digits_unrolled_swar(val1);
+    uint64_t s2 = ffc_parse_eight_digits_unrolled_swar(val2);
+    *i = (*i * 100000000ULL + s1) * 100000000ULL + s2; // in rare cases overflows, ok
+    *p += 16;
+  }
+  if (pend - *p >= 8) {
+    uint64_t val = ffc_read8_to_u64(*p);
+    if (ffc_is_made_of_eight_digits_fast(val)) {
+      *i = (*i * 100000000) + ffc_parse_eight_digits_unrolled_swar(val); // in rare cases overflows, ok
+      *p += 8;
+    }
+  }
+#else
+  // GCC and other compilers: original while loop that GCC auto-unrolls well.
+  while (pend - *p >= 8) {
+    uint64_t val = ffc_read8_to_u64(*p);
+    if (!ffc_is_made_of_eight_digits_fast(val)) { break; }
+    *i = (*i * 100000000) + ffc_parse_eight_digits_unrolled_swar(val); // in rare cases, this will overflow, but that's ok
     *p += 8;
   }
+#endif
+  // 4-digit follow-up: handles sub-8 remainders (e.g. 7-digit fractions)
+  // without falling all the way to byte-by-byte for the first 4 digits.
+  if (pend - *p >= 4) {
+    uint32_t val4 = ffc_read4_to_u32(*p);
+    if (ffc_is_made_of_four_digits_fast(val4)) {
+      *i = (*i * 10000) + ffc_parse_four_digits_unrolled(val4);
+      *p += 4;
+    }
+  }
 }
 
 /* section end: read digits */
@@ -1200,7 +1276,7 @@ ffc_parsed ffc_parse_number_string(
       }
     } else {
       // a sign must be followed by an integer or the dot
-      if (!ffc_is_integer(*p) && (*p != decimal_point)) { 
+      if (!ffc_is_integer(*p) && (*p != decimal_point)) {
         return ffc_report_parse_error(p, FFC_PARSE_OUTCOME_MISSING_INTEGER_OR_DOT_AFTER_SIGN);
       }
     }
@@ -1211,14 +1287,27 @@ ffc_parsed ffc_parse_number_string(
 
   uint64_t i = 0; // an unsigned int avoids signed overflows (which are bad)
 
-  while ((p != pend) && ffc_is_integer(*p)) {
-    // Horner's method: only ever multiplies by the constant 10
-    // avoiding variable power-of-10 multiplies
-
-    // might overflow, we will handle the overflow later
-    uint64_t digit_value = (uint64_t)(*p - '0');
-    i = (10 * i) + digit_value; 
-    ++p;
+  // Straight-line integer scan — eliminates back-branches for the common
+  // 1–5 digit integer case (random "0", mesh 1–3 digits, canada 2–3 digits).
+  // Falls back to while loop for 6+ digits.
+  if ((p != pend) && ffc_is_integer(*p)) {
+    i = (uint64_t)(*p++ - '0');
+    if ((p != pend) && ffc_is_integer(*p)) {
+      i = FFC_DIGIT_ACC10(i, *p++ - '0');
+      if ((p != pend) && ffc_is_integer(*p)) {
+        i = FFC_DIGIT_ACC10(i, *p++ - '0');
+        if ((p != pend) && ffc_is_integer(*p)) {
+          i = FFC_DIGIT_ACC10(i, *p++ - '0');
+          if ((p != pend) && ffc_is_integer(*p)) {
+            i = FFC_DIGIT_ACC10(i, *p++ - '0');
+            while ((p != pend) && ffc_is_integer(*p)) {
+              i = FFC_DIGIT_ACC10(i, *p - '0'); // might overflow, handled later
+              ++p;
+            }
+          }
+        }
+      }
+    }
   }
 
   char const *const end_of_integer_part = p;
@@ -1241,18 +1330,30 @@ ffc_parsed ffc_parse_number_string(
   int64_t exponent = 0;
   bool const has_decimal_point = (p != pend) && (*p == decimal_point);
 
+  // Hoisted out of the if-block so the too_many_digits path can use them as
+  // local variables rather than reading back from the answer struct, which
+  // lets GCC DSE the answer.fraction_part_* stores on non-JSON callers.
+  char const *before = NULL;
+  char const *frac_end_local = NULL;
+
   /* post-decimal exponential part (calculates a negative exponent) */
   if (has_decimal_point) {
     ++p;
-    char const *before = p; 
+    before = p;
     // can occur at most twice without overflowing, but let it occur more, since
     // for integers with many digits, digit parsing is the primary bottleneck.
     ffc_loop_parse_if_eight_digits(&p, pend, &i);
 
-    while ((p != pend) && ffc_is_integer(*p)) {
-      uint8_t digit = (uint8_t)(*p - (char)('0'));
-      ++p;
-      i = i * 10 + digit; // in rare cases, this will overflow, but that's ok
+    // ffc_loop_parse_if_eight_digits handles 8+4 digits, so at most 3 remain.
+    // Straight-line tail eliminates the back-branch for the common 1-3 digit case.
+    if (p != pend && ffc_is_integer(*p)) {
+      i = FFC_DIGIT_ACC10(i, (uint8_t)(*p++ - (char)('0'))); // in rare cases overflows, ok
+      if (p != pend && ffc_is_integer(*p)) {
+        i = FFC_DIGIT_ACC10(i, (uint8_t)(*p++ - (char)('0')));
+        if (p != pend && ffc_is_integer(*p)) {
+          i = FFC_DIGIT_ACC10(i, (uint8_t)(*p++ - (char)('0')));
+        }
+      }
     }
 
     // pre: i = 123, digit_count = 3
@@ -1264,6 +1365,7 @@ ffc_parsed ffc_parse_number_string(
     // i = 123456
     // digit_count = 3 - (-3) = 6
     exponent = before - p;
+    frac_end_local = p; // capture before p advances into explicit exponent
     answer.fraction_part_start = (char*)before;
     answer.fraction_part_len = (size_t)(p - before);
     digit_count -= exponent;
@@ -1312,7 +1414,7 @@ ffc_parsed ffc_parse_number_string(
       while ((p != pend) && ffc_is_integer(*p)) {
         uint8_t digit = (uint8_t)(*p - '0');
         if (exp_number < 0x10000000) {
-          exp_number = 10 * exp_number + digit;
+          exp_number = FFC_DIGIT_ACC10(exp_number, digit);
         }
         ++p;
       }
@@ -1353,11 +1455,14 @@ ffc_parsed ffc_parse_number_string(
     if (digit_count > 19) {
       answer.too_many_digits = true;
       // Let us start again, this time, avoiding overflows.
-      // We don't need to call if is_integer, since we use the
-      // pre-tokenized spans from above.
+      // Use local variables (start_digits, end_of_integer_part, before,
+      // frac_end_local) instead of reading back from answer struct fields.
+      // This allows GCC DSE to eliminate the answer.int_part_* and
+      // answer.fraction_part_* stores on non-JSON callers (where those fields
+      // are removed from the return ABI by ISRA).
       i = 0;
-      p = answer.int_part_start;
-      char const *int_end = p + answer.int_part_len;
+      p = (char*)start_digits;
+      char const *int_end = (char*)end_of_integer_part;
       uint64_t const minimal_nineteen_digit_integer = 1000000000000000000;
       while ((i < minimal_nineteen_digit_integer) && (p != int_end)) {
         i = i * 10 + (uint64_t)(*p - '0');
@@ -1366,13 +1471,13 @@ ffc_parsed ffc_parse_number_string(
       if (i >= minimal_nineteen_digit_integer) { // We have a big integer
         exponent = end_of_integer_part - p + exp_number;
       } else { // We have a value with a fractional component.
-        p = answer.fraction_part_start;
-        char const *frac_end = p + answer.fraction_part_len;
+        p = (char*)before;
+        char const *frac_end = (char*)frac_end_local;
         while ((i < minimal_nineteen_digit_integer) && (p != frac_end)) {
           i = i * 10 + (uint64_t)(*p - '0');
           ++p;
         }
-        exponent = answer.fraction_part_start - p + exp_number;
+        exponent = before - p + exp_number;
       }
       // We have now corrected both exponent and i, to a truncated value
     }
@@ -2978,8 +3083,10 @@ bool ffc_clinger_fast_path_impl(uint64_t mantissa, int64_t exponent, bool is_neg
   // selected on the thread.
   // We proceed optimistically, assuming that detail::rounds_to_nearest()
   // returns true.
-  if (ffc_const(value_kind, MIN_EXPONENT_FAST_PATH) <= exponent &&
-      exponent <= ffc_const(value_kind, MAX_EXPONENT_FAST_PATH)) {
+  // Single unsigned range check replaces two signed comparisons, matching
+  // fast_float's layout: (uint64_t)(e - MIN) <= (MAX - MIN) in one compare.
+  if ((uint64_t)((int64_t)exponent - (int64_t)ffc_const(value_kind, MIN_EXPONENT_FAST_PATH)) <=
+      (uint64_t)((int64_t)ffc_const(value_kind, MAX_EXPONENT_FAST_PATH) - (int64_t)ffc_const(value_kind, MIN_EXPONENT_FAST_PATH))) {
     // Unfortunately, the conventional Clinger's fast path is only possible
     // when the system rounds to the nearest float.
     //
@@ -3046,6 +3153,19 @@ ffc_result ffc_from_chars_advanced(ffc_parsed const pns, ffc_value* value, ffc_v
   answer.outcome = FFC_OUTCOME_OK; // be optimistic :')
   answer.ptr = (char*)pns.lastmatch;
 
+  if (!pns.too_many_digits && pns.exponent == 0 &&
+      pns.mantissa <= ffc_const(vk, MAX_MANTISSA_FAST_PATH)) {
+#if defined(__clang__) || defined(FFC_32BIT)
+    if (pns.mantissa == 0) {
+      ffc_set_value(value, vk, pns.negative ? -0. : 0.);
+      return answer;
+    }
+#endif
+    ffc_set_value(value, vk, pns.mantissa);
+    if (pns.negative) { ffc_set_value(value, vk, -ffc_read_value(value, vk)); }
+    return answer;
+  }
+
   if (!pns.too_many_digits &&
       ffc_clinger_fast_path_impl(pns.mantissa, pns.exponent, pns.negative, value, vk)) {
     ffc_debug("fast path hit");
@@ -3118,7 +3238,9 @@ ffc_result ffc_from_chars(char* first, char* last, ffc_parse_options options, ff
   return ffc_from_chars_advanced(pns, value, vk);
 }
 
-ffc_result ffc_from_chars_double_options(const char *start, const char *end, double* out, ffc_parse_options options) {
+/* extern FFC_IMPL_INLINE gives GCC the always_inline directive while also
+ * requesting external linkage so non-FFC_IMPL TUs can link these symbols. */
+extern FFC_IMPL_INLINE ffc_result ffc_from_chars_double_options(const char *start, const char *end, double* out, ffc_parse_options options) {
   // It would be UB to directly use *out as our ffc_value, even though its the same layout
   ffc_value out_value = {0};
 
@@ -3127,7 +3249,7 @@ ffc_result ffc_from_chars_double_options(const char *start, const char *end, dou
   *out = out_value.d;
   return result;
 }
-ffc_result ffc_from_chars_double(char const* first, char const* last, double* out) {
+extern FFC_IMPL_INLINE ffc_result ffc_from_chars_double(char const* first, char const* last, double* out) {
   ffc_parse_options options = ffc_parse_options_default();
   return ffc_from_chars_double_options(first, last, out, options);
 }

diff --git a/src/api.h b/src/api.h
@@ -115,8 +115,23 @@ ffc_result ffc_parse_double(size_t len, const char *input, double *out);
  * `fast_float::chars_format::general` which allows both `fixed` and
  * `scientific`.
  */
-ffc_result ffc_from_chars_double(const char *start, const char *end, double* out);
-ffc_result ffc_from_chars_double_options(const char *start, const char *end, double* out, ffc_parse_options options);
+/* When included from a FFC_IMPL translation unit, the critical-path API
+ * functions are declared always_inline so GCC inlines them at call sites
+ * in the same TU. In non-FFC_IMPL TUs the declarations are plain extern. */
+#ifdef FFC_IMPL
+#  if defined(__GNUC__) || defined(__clang__)
+#    define FFC_IMPL_INLINE __attribute__((always_inline)) inline
+#  elif defined(_MSC_VER)
+#    define FFC_IMPL_INLINE __forceinline
+#  else
+#    define FFC_IMPL_INLINE inline
+#  endif
+#else
+#  define FFC_IMPL_INLINE
+#endif
+
+FFC_IMPL_INLINE ffc_result ffc_from_chars_double(const char *start, const char *end, double* out);
+FFC_IMPL_INLINE ffc_result ffc_from_chars_double_options(const char *start, const char *end, double* out, ffc_parse_options options);
 
 /*
  * A simplified API; the result will be 0.0 on error, not uninitialized.

diff --git a/src/common.h b/src/common.h
@@ -524,6 +524,11 @@ bool ffc_strncasecmp5(char *actual_mixedcase, char const *expected_lowercase, si
 
 ffc_internal ffc_inline
 bool ffc_rounds_to_nearest(void) {
+#if defined(FFC_ROUNDS_TO_NEAREST)
+  // Compile-time constant: caller guarantees IEEE 754 round-to-nearest mode.
+  // Eliminates the volatile float FCMP chain entirely.
+  return true;
+#endif
   // https://lemire.me/blog/2020/06/26/gcc-not-nearest/
 #if (FLT_EVAL_METHOD != 1) && (FLT_EVAL_METHOD != 0)
   return false;

diff --git a/src/ffc.h b/src/ffc.h
@@ -228,8 +228,10 @@ bool ffc_clinger_fast_path_impl(uint64_t mantissa, int64_t exponent, bool is_neg
   // selected on the thread.
   // We proceed optimistically, assuming that detail::rounds_to_nearest()
   // returns true.
-  if (ffc_const(value_kind, MIN_EXPONENT_FAST_PATH) <= exponent &&
-      exponent <= ffc_const(value_kind, MAX_EXPONENT_FAST_PATH)) {
+  // Single unsigned range check replaces two signed comparisons, matching
+  // fast_float's layout: (uint64_t)(e - MIN) <= (MAX - MIN) in one compare.
+  if ((uint64_t)((int64_t)exponent - (int64_t)ffc_const(value_kind, MIN_EXPONENT_FAST_PATH)) <=
+      (uint64_t)((int64_t)ffc_const(value_kind, MAX_EXPONENT_FAST_PATH) - (int64_t)ffc_const(value_kind, MIN_EXPONENT_FAST_PATH))) {
     // Unfortunately, the conventional Clinger's fast path is only possible
     // when the system rounds to the nearest float.
     //
@@ -296,6 +298,19 @@ ffc_result ffc_from_chars_advanced(ffc_parsed const pns, ffc_value* value, ffc_v
   answer.outcome = FFC_OUTCOME_OK; // be optimistic :')
   answer.ptr = (char*)pns.lastmatch;
 
+  if (!pns.too_many_digits && pns.exponent == 0 &&
+      pns.mantissa <= ffc_const(vk, MAX_MANTISSA_FAST_PATH)) {
+#if defined(__clang__) || defined(FFC_32BIT)
+    if (pns.mantissa == 0) {
+      ffc_set_value(value, vk, pns.negative ? -0. : 0.);
+      return answer;
+    }
+#endif
+    ffc_set_value(value, vk, pns.mantissa);
+    if (pns.negative) { ffc_set_value(value, vk, -ffc_read_value(value, vk)); }
+    return answer;
+  }
+
   if (!pns.too_many_digits &&
       ffc_clinger_fast_path_impl(pns.mantissa, pns.exponent, pns.negative, value, vk)) {
     ffc_debug("fast path hit");
@@ -368,7 +383,9 @@ ffc_result ffc_from_chars(char* first, char* last, ffc_parse_options options, ff
   return ffc_from_chars_advanced(pns, value, vk);
 }
 
-ffc_result ffc_from_chars_double_options(const char *start, const char *end, double* out, ffc_parse_options options) {
+/* extern FFC_IMPL_INLINE gives GCC the always_inline directive while also
+ * requesting external linkage so non-FFC_IMPL TUs can link these symbols. */
+extern FFC_IMPL_INLINE ffc_result ffc_from_chars_double_options(const char *start, const char *end, double* out, ffc_parse_options options) {
   // It would be UB to directly use *out as our ffc_value, even though its the same layout
   ffc_value out_value = {0};
 
@@ -377,7 +394,7 @@ ffc_result ffc_from_chars_double_options(const char *start, const char *end, dou
   *out = out_value.d;
   return result;
 }
-ffc_result ffc_from_chars_double(char const* first, char const* last, double* out) {
+extern FFC_IMPL_INLINE ffc_result ffc_from_chars_double(char const* first, char const* last, double* out) {
   ffc_parse_options options = ffc_parse_options_default();
   return ffc_from_chars_double_options(first, last, out, options);
 }