-
Notifications
You must be signed in to change notification settings - Fork 11
Nine profiled micro-optimizations: +63% ARM mesh, +26% ARM canada, +17% ARM random vs baseline #24
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 12 commits
cf971fe
bac793d
647e29e
f791ca4
9d2b7ca
8cdf4cd
6e2c993
de871b3
9cf5785
c88481a
43e22b3
6ccc765
88eeecd
3bcf31a
3314128
c901d3e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -227,8 +227,23 @@ ffc_result ffc_parse_double(size_t len, const char *input, double *out); | |
| * `fast_float::chars_format::general` which allows both `fixed` and | ||
| * `scientific`. | ||
| */ | ||
| ffc_result ffc_from_chars_double(const char *start, const char *end, double* out); | ||
| ffc_result ffc_from_chars_double_options(const char *start, const char *end, double* out, ffc_parse_options options); | ||
| /* When included from a FFC_IMPL translation unit, the critical-path API | ||
| * functions are declared always_inline so GCC inlines them at call sites | ||
| * in the same TU. In non-FFC_IMPL TUs the declarations are plain extern. */ | ||
| #ifdef FFC_IMPL | ||
| # if defined(__GNUC__) || defined(__clang__) | ||
| # define FFC_IMPL_INLINE __attribute__((always_inline)) inline | ||
| # elif defined(_MSC_VER) | ||
| # define FFC_IMPL_INLINE __forceinline | ||
| # else | ||
| # define FFC_IMPL_INLINE inline | ||
| # endif | ||
| #else | ||
| # define FFC_IMPL_INLINE | ||
| #endif | ||
|
|
||
| FFC_IMPL_INLINE ffc_result ffc_from_chars_double(const char *start, const char *end, double* out); | ||
| FFC_IMPL_INLINE ffc_result ffc_from_chars_double_options(const char *start, const char *end, double* out, ffc_parse_options options); | ||
|
|
||
| /* | ||
| * A simplified API; the result will be 0.0 on error, not uninitialized. | ||
|
|
@@ -808,6 +823,11 @@ bool ffc_strncasecmp5(char *actual_mixedcase, char const *expected_lowercase, si | |
|
|
||
| ffc_internal ffc_inline | ||
| bool ffc_rounds_to_nearest(void) { | ||
| #if defined(FFC_ROUNDS_TO_NEAREST) | ||
| // Compile-time constant: caller guarantees IEEE 754 round-to-nearest mode. | ||
| // Eliminates the volatile float FCMP chain entirely. | ||
|
kolemannix marked this conversation as resolved.
Outdated
|
||
| return true; | ||
| #endif | ||
| // https://lemire.me/blog/2020/06/26/gcc-not-nearest/ | ||
| #if (FLT_EVAL_METHOD != 1) && (FLT_EVAL_METHOD != 0) | ||
| return false; | ||
|
|
@@ -1121,17 +1141,73 @@ bool ffc_simd_parse_if_eight_digits_unrolled_simd(uint16_t const *chars, uint64_ | |
|
|
||
| #endif // FFC_HAS_SIMD | ||
|
|
||
| // Compute acc*10 + d_expr using add+lsl on AArch64/Clang. | ||
| // GCC already strength-reduces i*10 to shift-adds naturally; the asm is | ||
| // Clang-only. The non-Clang path is a plain macro so GCC sees the original | ||
| // expression and can fuse the pointer increment with surrounding code. | ||
| #if defined(__aarch64__) && defined(__clang__) | ||
| ffc_internal ffc_inline uint64_t | ||
| ffc_digit_acc10(uint64_t acc, uint64_t d) { | ||
| uint64_t result; | ||
| __asm__("add %0, %2, %2, lsl #2\n\t" | ||
| "add %0, %1, %0, lsl #1" | ||
| : "=&r"(result) : "r"(d), "r"(acc)); | ||
| return result; | ||
| } | ||
| #define FFC_DIGIT_ACC10(acc, d_expr) ffc_digit_acc10((acc), (uint64_t)(d_expr)) | ||
| #else | ||
| #define FFC_DIGIT_ACC10(acc, d_expr) ((acc) * 10 + (uint64_t)(d_expr)) | ||
| #endif | ||
|
|
||
| ffc_internal ffc_inline void | ||
| ffc_loop_parse_if_eight_digits(char const **p, char const *const pend, | ||
| uint64_t* i) { | ||
| // optimizes better than parse_if_eight_digits_unrolled() for char. | ||
| while ((pend - *p >= 8) && | ||
| ffc_is_made_of_eight_digits_fast(ffc_read8_to_u64(*p))) { | ||
| *i = (*i * 100000000) + | ||
| ffc_parse_eight_digits_unrolled_swar(ffc_read8_to_u64(*p)); | ||
| // in rare cases, this will overflow, but that's ok | ||
| #if defined(__aarch64__) && defined(__clang__) | ||
|
Owner
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can we just unconditionally do this? I hate the complexity of the compiler-specific flagging; if its faster and more direct and correct, let's just do it this way. The branches can explode combinatorially and make things harder to optimize later
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I hear you on the combinatorial branches, and I've tightened the comment so the gating is self-explanatory. It can't be made unconditional, though — the body is literal AArch64 asm. And on GCC/x86 the plain
Owner
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I mean |
||
| // Clang/AArch64: manual 2x unroll converts the hot path from a while loop | ||
| // to a single linear block for typical float fractions (<=16 digits). | ||
| // Eliminating the back-edge allows Clang to keep SWAR constants in registers | ||
| // rather than rematerializing them on each iteration; also, the 8-digit | ||
| // block becomes an if (no back-edge) so constants are always in a straight- | ||
| // line context. GCC auto-unrolls this naturally so we keep its while loop. | ||
| while (pend - *p >= 16) { | ||
| uint64_t val1 = ffc_read8_to_u64(*p); | ||
| if (!ffc_is_made_of_eight_digits_fast(val1)) { break; } | ||
| uint64_t val2 = ffc_read8_to_u64(*p + 8); | ||
| if (!ffc_is_made_of_eight_digits_fast(val2)) { | ||
| *i = (*i * 100000000) + ffc_parse_eight_digits_unrolled_swar(val1); // in rare cases overflows, ok | ||
| *p += 8; | ||
| break; | ||
| } | ||
| uint64_t s1 = ffc_parse_eight_digits_unrolled_swar(val1); | ||
| uint64_t s2 = ffc_parse_eight_digits_unrolled_swar(val2); | ||
| *i = (*i * 100000000ULL + s1) * 100000000ULL + s2; // in rare cases overflows, ok | ||
| *p += 16; | ||
| } | ||
| if (pend - *p >= 8) { | ||
| uint64_t val = ffc_read8_to_u64(*p); | ||
| if (ffc_is_made_of_eight_digits_fast(val)) { | ||
| *i = (*i * 100000000) + ffc_parse_eight_digits_unrolled_swar(val); // in rare cases overflows, ok | ||
| *p += 8; | ||
| } | ||
| } | ||
| #else | ||
| // GCC and other compilers: original while loop that GCC auto-unrolls well. | ||
| while (pend - *p >= 8) { | ||
| uint64_t val = ffc_read8_to_u64(*p); | ||
| if (!ffc_is_made_of_eight_digits_fast(val)) { break; } | ||
| *i = (*i * 100000000) + ffc_parse_eight_digits_unrolled_swar(val); // in rare cases, this will overflow, but that's ok | ||
| *p += 8; | ||
| } | ||
| #endif | ||
| // 4-digit follow-up: handles sub-8 remainders (e.g. 7-digit fractions) | ||
| // without falling all the way to byte-by-byte for the first 4 digits. | ||
| if (pend - *p >= 4) { | ||
| uint32_t val4 = ffc_read4_to_u32(*p); | ||
| if (ffc_is_made_of_four_digits_fast(val4)) { | ||
| *i = (*i * 10000) + ffc_parse_four_digits_unrolled(val4); | ||
| *p += 4; | ||
| } | ||
| } | ||
| } | ||
|
|
||
| /* section end: read digits */ | ||
|
|
@@ -1200,7 +1276,7 @@ ffc_parsed ffc_parse_number_string( | |
| } | ||
| } else { | ||
| // a sign must be followed by an integer or the dot | ||
| if (!ffc_is_integer(*p) && (*p != decimal_point)) { | ||
| if (!ffc_is_integer(*p) && (*p != decimal_point)) { | ||
| return ffc_report_parse_error(p, FFC_PARSE_OUTCOME_MISSING_INTEGER_OR_DOT_AFTER_SIGN); | ||
| } | ||
| } | ||
|
|
@@ -1211,14 +1287,27 @@ ffc_parsed ffc_parse_number_string( | |
|
|
||
| uint64_t i = 0; // an unsigned int avoids signed overflows (which are bad) | ||
|
|
||
| while ((p != pend) && ffc_is_integer(*p)) { | ||
| // Horner's method: only ever multiplies by the constant 10 | ||
| // avoiding variable power-of-10 multiplies | ||
|
|
||
| // might overflow, we will handle the overflow later | ||
| uint64_t digit_value = (uint64_t)(*p - '0'); | ||
| i = (10 * i) + digit_value; | ||
| ++p; | ||
| // Straight-line integer scan — eliminates back-branches for the common | ||
| // 1–5 digit integer case (random "0", mesh 1–3 digits, canada 2–3 digits). | ||
| // Falls back to while loop for 6+ digits. | ||
| if ((p != pend) && ffc_is_integer(*p)) { | ||
| i = (uint64_t)(*p++ - '0'); | ||
| if ((p != pend) && ffc_is_integer(*p)) { | ||
| i = FFC_DIGIT_ACC10(i, *p++ - '0'); | ||
| if ((p != pend) && ffc_is_integer(*p)) { | ||
| i = FFC_DIGIT_ACC10(i, *p++ - '0'); | ||
| if ((p != pend) && ffc_is_integer(*p)) { | ||
| i = FFC_DIGIT_ACC10(i, *p++ - '0'); | ||
| if ((p != pend) && ffc_is_integer(*p)) { | ||
| i = FFC_DIGIT_ACC10(i, *p++ - '0'); | ||
| while ((p != pend) && ffc_is_integer(*p)) { | ||
| i = FFC_DIGIT_ACC10(i, *p - '0'); // might overflow, handled later | ||
| ++p; | ||
| } | ||
| } | ||
| } | ||
| } | ||
| } | ||
| } | ||
|
|
||
| char const *const end_of_integer_part = p; | ||
|
|
@@ -1241,18 +1330,30 @@ ffc_parsed ffc_parse_number_string( | |
| int64_t exponent = 0; | ||
| bool const has_decimal_point = (p != pend) && (*p == decimal_point); | ||
|
|
||
| // Hoisted out of the if-block so the too_many_digits path can use them as | ||
| // local variables rather than reading back from the answer struct, which | ||
| // lets GCC DSE the answer.fraction_part_* stores on non-JSON callers. | ||
|
kolemannix marked this conversation as resolved.
Outdated
|
||
| char const *before = NULL; | ||
| char const *frac_end_local = NULL; | ||
|
|
||
| /* post-decimal exponential part (calculates a negative exponent) */ | ||
| if (has_decimal_point) { | ||
| ++p; | ||
| char const *before = p; | ||
| before = p; | ||
| // can occur at most twice without overflowing, but let it occur more, since | ||
| // for integers with many digits, digit parsing is the primary bottleneck. | ||
| ffc_loop_parse_if_eight_digits(&p, pend, &i); | ||
|
|
||
| while ((p != pend) && ffc_is_integer(*p)) { | ||
| uint8_t digit = (uint8_t)(*p - (char)('0')); | ||
| ++p; | ||
| i = i * 10 + digit; // in rare cases, this will overflow, but that's ok | ||
| // ffc_loop_parse_if_eight_digits handles 8+4 digits, so at most 3 remain. | ||
| // Straight-line tail eliminates the back-branch for the common 1-3 digit case. | ||
|
kolemannix marked this conversation as resolved.
Outdated
|
||
| if (p != pend && ffc_is_integer(*p)) { | ||
| i = FFC_DIGIT_ACC10(i, (uint8_t)(*p++ - (char)('0'))); // in rare cases overflows, ok | ||
| if (p != pend && ffc_is_integer(*p)) { | ||
| i = FFC_DIGIT_ACC10(i, (uint8_t)(*p++ - (char)('0'))); | ||
| if (p != pend && ffc_is_integer(*p)) { | ||
| i = FFC_DIGIT_ACC10(i, (uint8_t)(*p++ - (char)('0'))); | ||
| } | ||
| } | ||
| } | ||
|
|
||
| // pre: i = 123, digit_count = 3 | ||
|
|
@@ -1264,6 +1365,7 @@ ffc_parsed ffc_parse_number_string( | |
| // i = 123456 | ||
| // digit_count = 3 - (-3) = 6 | ||
| exponent = before - p; | ||
| frac_end_local = p; // capture before p advances into explicit exponent | ||
| answer.fraction_part_start = (char*)before; | ||
| answer.fraction_part_len = (size_t)(p - before); | ||
| digit_count -= exponent; | ||
|
|
@@ -1312,7 +1414,7 @@ ffc_parsed ffc_parse_number_string( | |
| while ((p != pend) && ffc_is_integer(*p)) { | ||
| uint8_t digit = (uint8_t)(*p - '0'); | ||
| if (exp_number < 0x10000000) { | ||
| exp_number = 10 * exp_number + digit; | ||
| exp_number = FFC_DIGIT_ACC10(exp_number, digit); | ||
| } | ||
| ++p; | ||
| } | ||
|
|
@@ -1353,11 +1455,14 @@ ffc_parsed ffc_parse_number_string( | |
| if (digit_count > 19) { | ||
| answer.too_many_digits = true; | ||
| // Let us start again, this time, avoiding overflows. | ||
| // We don't need to call if is_integer, since we use the | ||
| // pre-tokenized spans from above. | ||
| // Use local variables (start_digits, end_of_integer_part, before, | ||
| // frac_end_local) instead of reading back from answer struct fields. | ||
| // This allows GCC DSE to eliminate the answer.int_part_* and | ||
| // answer.fraction_part_* stores on non-JSON callers (where those fields | ||
| // are removed from the return ABI by ISRA). | ||
|
kolemannix marked this conversation as resolved.
Outdated
|
||
| i = 0; | ||
| p = answer.int_part_start; | ||
| char const *int_end = p + answer.int_part_len; | ||
| p = (char*)start_digits; | ||
| char const *int_end = (char*)end_of_integer_part; | ||
| uint64_t const minimal_nineteen_digit_integer = 1000000000000000000; | ||
| while ((i < minimal_nineteen_digit_integer) && (p != int_end)) { | ||
| i = i * 10 + (uint64_t)(*p - '0'); | ||
|
|
@@ -1366,13 +1471,13 @@ ffc_parsed ffc_parse_number_string( | |
| if (i >= minimal_nineteen_digit_integer) { // We have a big integer | ||
| exponent = end_of_integer_part - p + exp_number; | ||
| } else { // We have a value with a fractional component. | ||
| p = answer.fraction_part_start; | ||
| char const *frac_end = p + answer.fraction_part_len; | ||
| p = (char*)before; | ||
| char const *frac_end = (char*)frac_end_local; | ||
| while ((i < minimal_nineteen_digit_integer) && (p != frac_end)) { | ||
| i = i * 10 + (uint64_t)(*p - '0'); | ||
| ++p; | ||
| } | ||
| exponent = answer.fraction_part_start - p + exp_number; | ||
| exponent = before - p + exp_number; | ||
| } | ||
| // We have now corrected both exponent and i, to a truncated value | ||
| } | ||
|
|
@@ -2978,8 +3083,10 @@ bool ffc_clinger_fast_path_impl(uint64_t mantissa, int64_t exponent, bool is_neg | |
| // selected on the thread. | ||
| // We proceed optimistically, assuming that detail::rounds_to_nearest() | ||
| // returns true. | ||
| if (ffc_const(value_kind, MIN_EXPONENT_FAST_PATH) <= exponent && | ||
| exponent <= ffc_const(value_kind, MAX_EXPONENT_FAST_PATH)) { | ||
| // Single unsigned range check replaces two signed comparisons, matching | ||
| // fast_float's layout: (uint64_t)(e - MIN) <= (MAX - MIN) in one compare. | ||
| if ((uint64_t)((int64_t)exponent - (int64_t)ffc_const(value_kind, MIN_EXPONENT_FAST_PATH)) <= | ||
| (uint64_t)((int64_t)ffc_const(value_kind, MAX_EXPONENT_FAST_PATH) - (int64_t)ffc_const(value_kind, MIN_EXPONENT_FAST_PATH))) { | ||
| // Unfortunately, the conventional Clinger's fast path is only possible | ||
| // when the system rounds to the nearest float. | ||
| // | ||
|
|
@@ -3046,6 +3153,19 @@ ffc_result ffc_from_chars_advanced(ffc_parsed const pns, ffc_value* value, ffc_v | |
| answer.outcome = FFC_OUTCOME_OK; // be optimistic :') | ||
| answer.ptr = (char*)pns.lastmatch; | ||
|
|
||
| if (!pns.too_many_digits && pns.exponent == 0 && | ||
| pns.mantissa <= ffc_const(vk, MAX_MANTISSA_FAST_PATH)) { | ||
| #if defined(__clang__) || defined(FFC_32BIT) | ||
| if (pns.mantissa == 0) { | ||
| ffc_set_value(value, vk, pns.negative ? -0. : 0.); | ||
| return answer; | ||
| } | ||
| #endif | ||
| ffc_set_value(value, vk, pns.mantissa); | ||
| if (pns.negative) { ffc_set_value(value, vk, -ffc_read_value(value, vk)); } | ||
| return answer; | ||
| } | ||
|
Comment on lines
+3168
to
+3179
Owner
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Looks like a bugfix? just making a note to review more closely
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Good eye — two things are colocated here. The |
||
|
|
||
| if (!pns.too_many_digits && | ||
| ffc_clinger_fast_path_impl(pns.mantissa, pns.exponent, pns.negative, value, vk)) { | ||
| ffc_debug("fast path hit"); | ||
|
|
@@ -3118,7 +3238,9 @@ ffc_result ffc_from_chars(char* first, char* last, ffc_parse_options options, ff | |
| return ffc_from_chars_advanced(pns, value, vk); | ||
| } | ||
|
|
||
| ffc_result ffc_from_chars_double_options(const char *start, const char *end, double* out, ffc_parse_options options) { | ||
| /* extern FFC_IMPL_INLINE gives GCC the always_inline directive while also | ||
| * requesting external linkage so non-FFC_IMPL TUs can link these symbols. */ | ||
| extern FFC_IMPL_INLINE ffc_result ffc_from_chars_double_options(const char *start, const char *end, double* out, ffc_parse_options options) { | ||
|
kolemannix marked this conversation as resolved.
|
||
| // It would be UB to directly use *out as our ffc_value, even though its the same layout | ||
| ffc_value out_value = {0}; | ||
|
|
||
|
|
@@ -3127,7 +3249,7 @@ ffc_result ffc_from_chars_double_options(const char *start, const char *end, dou | |
| *out = out_value.d; | ||
| return result; | ||
| } | ||
| ffc_result ffc_from_chars_double(char const* first, char const* last, double* out) { | ||
| extern FFC_IMPL_INLINE ffc_result ffc_from_chars_double(char const* first, char const* last, double* out) { | ||
| ffc_parse_options options = ffc_parse_options_default(); | ||
| return ffc_from_chars_double_options(first, last, out, options); | ||
| } | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.