flintlib
diff --git a/doc/source/mpn_extras.rst
Lines changed: 11 additions & 0 deletions b/doc/source/mpn_extras.rst
Lines changed: 11 additions & 0 deletions
diff --git a/doc/source/mpn_mod.rst
Lines changed: 10 additions & 0 deletions b/doc/source/mpn_mod.rst
Lines changed: 10 additions & 0 deletions
diff --git a/src/gr_poly/tune/cutoffs.c
Lines changed: 3 additions & 3 deletions b/src/gr_poly/tune/cutoffs.c
Lines changed: 3 additions & 3 deletions
diff --git a/src/mpn_extras.h
Lines changed: 4 additions & 0 deletions b/src/mpn_extras.h
Lines changed: 4 additions & 0 deletions
diff --git a/src/mpn_extras/mulmod_precond.c
Lines changed: 80 additions & 4 deletions b/src/mpn_extras/mulmod_precond.c
Lines changed: 80 additions & 4 deletions
diff --git a/src/mpn_extras/mulmod_preinvn.c
Lines changed: 133 additions & 0 deletions b/src/mpn_extras/mulmod_preinvn.c
Lines changed: 133 additions & 0 deletions
diff --git a/src/mpn_extras/test/main.c
Lines changed: 4 additions & 0 deletions b/src/mpn_extras/test/main.c
Lines changed: 4 additions & 0 deletions
@@ -367,6 +367,13 @@ Division and modular arithmetic with precomputed inverses
     The behavior is not exactly the same: `a` and `b` are assumed to
     be unshifted, and the output is unshifted.
 
+.. function:: void flint_mpn_fmmamod_preinvn(mp_ptr r, mp_srcptr a1, mp_srcptr b1, mp_srcptr a2, mp_srcptr b2, mp_size_t n, mp_srcptr dnormed, mp_srcptr dinv, ulong norm)
+              void flint_mpn_fmmamod_preinvn_2(mp_ptr r, mp_srcptr a1, mp_srcptr b1, mp_srcptr a2, mp_srcptr b2, mp_srcptr dnormed, mp_srcptr dinv, ulong norm)
+
+    Given ``dnormed`` containing a normalised integer `d 2^{norm}` with precomputed inverse ``dinv``
+    provided by ``flint_mpn_preinvn``, computes `a_1 b_1 + a_2 b_2 \pmod{d}`. We require
+    all operands to be reduced modulo `d`. The ``_2`` version is specialised for `n = 2`.
+
 .. function:: void flint_mpn_mulmod_precond(mp_ptr rp, mp_srcptr apre, mp_srcptr b, mp_size_t n, mp_srcptr dnormed, mp_srcptr dinv, ulong norm)
 
     Given ``dnormed`` containing a normalised integer `d 2^{norm}` with precomputed inverse ``dinv``
@@ -405,6 +412,10 @@ Division and modular arithmetic with precomputed inverses
     :func:`flint_mpn_mulmod_precond_precompute`
     given a modulus with `n` limbs.
 
+.. function:: void flint_mpn_fmmamod_precond(mp_ptr rp, mp_srcptr a1pre, mp_srcptr b1, mp_srcptr a2pre, mp_srcptr b2, mp_size_t n, mp_srcptr dnormed, mp_srcptr dinv, ulong norm)
+
+    Analogous to :func:`flint_mpn_mulmod_precond`, but computes `a_1 b_1 + a_2 b_2` modulo `d`.
+
 GCD
 --------------------------------------------------------------------------------
 
 
@@ -137,6 +137,7 @@ Basic operations and arithmetic
               int mpn_mod_submul_ui(nn_ptr res, nn_srcptr x, ulong y, gr_ctx_t ctx)
               int mpn_mod_submul_si(nn_ptr res, nn_srcptr x, slong y, gr_ctx_t ctx)
               int mpn_mod_submul_fmpz(nn_ptr res, nn_srcptr x, const fmpz_t y, gr_ctx_t ctx)
+              int mpn_mod_fmma(nn_ptr res, nn_srcptr x1, nn_srcptr y1, nn_srcptr x2, nn_srcptr y2, gr_ctx_t ctx)
               int mpn_mod_sqr(nn_ptr res, nn_srcptr x, gr_ctx_t ctx)
               int mpn_mod_inv(nn_ptr res, nn_srcptr x, gr_ctx_t ctx)
               int mpn_mod_div(nn_ptr res, nn_srcptr x, nn_srcptr y, gr_ctx_t ctx)
@@ -162,6 +163,7 @@ Vector functions
               int _mpn_mod_vec_mul_scalar(nn_ptr res, nn_srcptr x, slong len, nn_srcptr y, gr_ctx_t ctx)
               int _mpn_mod_scalar_mul_vec(nn_ptr res, nn_srcptr y, nn_srcptr x, slong len, gr_ctx_t ctx)
               int _mpn_mod_vec_addmul_scalar(nn_ptr res, nn_srcptr x, slong len, nn_srcptr y, gr_ctx_t ctx)
+              int _mpn_mod_vec_submul_scalar(nn_ptr res, nn_srcptr x, slong len, nn_srcptr y, gr_ctx_t ctx)
               int _mpn_mod_vec_dot(nn_ptr res, nn_srcptr initial, int subtract, nn_srcptr vec1, nn_srcptr vec2, slong len, gr_ctx_t ctx)
               int _mpn_mod_vec_dot_rev(nn_ptr res, nn_srcptr initial, int subtract, nn_srcptr vec1, nn_srcptr vec2, slong len, gr_ctx_t ctx)
 
@@ -268,6 +270,14 @@ Division
     Polynomial division with remainder implemented using the basecase
     algorithm with delayed reductions.
 
+.. function:: int _mpn_mod_poly_divrem_q1_preinv1_fmma(nn_ptr Q, nn_ptr R, nn_srcptr A, slong lenA, nn_srcptr B, slong lenB, nn_srcptr invL, gr_ctx_t ctx)
+              int _mpn_mod_poly_divrem_q1_preinv1_fmma_precond(nn_ptr Q, nn_ptr R, nn_srcptr A, slong lenA, nn_srcptr B, slong lenB, nn_srcptr invL, gr_ctx_t ctx)
+              int _mpn_mod_poly_divrem_q1_preinv1_karatsuba_precond(nn_ptr Q, nn_ptr R, nn_srcptr A, slong lenA, nn_srcptr B, slong lenB, nn_srcptr invL, gr_ctx_t ctx)
+              int _mpn_mod_poly_divrem_q1_preinv1(nn_ptr Q, nn_ptr R, nn_srcptr A, slong lenA, nn_srcptr B, slong lenB, nn_srcptr invL, gr_ctx_t ctx)
+
+    Algorithms for polynomial division in the special case where
+    `lenA = lenB + 1`. Require `lenB \ge 2`.
+
 .. function:: int _mpn_mod_poly_divrem(nn_ptr Q, nn_ptr R, nn_srcptr A, slong lenA, nn_srcptr B, slong lenB, gr_ctx_t ctx)
               int _mpn_mod_poly_div(nn_ptr Q, nn_srcptr A, slong lenA, nn_srcptr B, slong lenB, gr_ctx_t ctx)
 
 
@@ -61,7 +61,7 @@ void _nmod_poly_mul_mid_default_mpn_ctx(nn_ptr res, slong zl, slong zh, nn_srcpt
 #endif
 
 #if 1
-#define INIT_CTX fmpz_t t; fmpz_init(t); fmpz_ui_pow_ui(t, 2, bits - 1); fmpz_add_ui(t, t, 1); fmpz_nextprime(t, t, 0); GR_MUST_SUCCEED(gr_ctx_init_mpn_mod(ctx, t)); fmpz_clear(t);
+#define INIT_CTX fmpz_t t; fmpz_init(t); fmpz_ui_pow_ui(t, 2, bits - 1); fmpz_add_ui(t, t, 1); fmpz_nextprime(t, t, 0); GR_MUST_SUCCEED(gr_ctx_init_mpn_mod(ctx, t)); mpn_mod_ctx_set_is_field(ctx, T_TRUE); fmpz_clear(t);
 #define RANDCOEFF(t, ctx) fmpz_mod_rand(t, state, gr_ctx_data_as_ptr(ctx));
 #define STEP_BITS for (bits = 80, j = 0; bits <= 1024; bits = bits + 16, j++)
 #endif
@@ -212,7 +212,7 @@ void _nmod_poly_mul_mid_default_mpn_ctx(nn_ptr res, slong zl, slong zh, nn_srcpt
                _nmod_poly_mul_mid_default_mpn_ctx(C->coeffs, 0, B->length, A->coeffs, A->length, B->coeffs, B->length, ((nmod_t *) gr_ctx_data_ptr(ctx))[0]);
 #endif
 
-#if 1
+#if 0
 #define INFO "divexact (basecase -> bidirectional)"
 #define SETUP random_input(C, state, len, ctx); \
               random_input(B, state, len, ctx); \
@@ -221,7 +221,7 @@ void _nmod_poly_mul_mid_default_mpn_ctx(nn_ptr res, slong zl, slong zh, nn_srcpt
 #define CASE_B GR_IGNORE(gr_poly_divexact_bidirectional(C, A, B, ctx));
 #endif
 
-#if 0
+#if 1
 #define INFO "gcd"
 #define SETUP random_input(A, state, len, ctx); \
               random_input(B, state, len, ctx);
 
@@ -895,6 +895,10 @@ int flint_mpn_mulmod_want_precond(mp_size_t n, slong num);
 void flint_mpn_mulmod_precond_precompute(mp_ptr apre, mp_srcptr a, mp_size_t n, mp_srcptr d, mp_srcptr dinv, ulong norm);
 void flint_mpn_mulmod_precond(mp_ptr rp, mp_srcptr apre, mp_srcptr b, mp_size_t n, mp_srcptr d, mp_srcptr dinv, ulong norm);
 
+void flint_mpn_fmmamod_preinvn(mp_ptr r, mp_srcptr a, mp_srcptr b, mp_srcptr e, mp_srcptr f, mp_size_t n, mp_srcptr d, mp_srcptr dinv, ulong norm);
+void flint_mpn_fmmamod_preinvn_2(mp_ptr r, mp_srcptr a, mp_srcptr b, mp_srcptr e, mp_srcptr f, mp_srcptr d, mp_srcptr dinv, ulong norm);
+void flint_mpn_fmmamod_precond(mp_ptr rp, mp_srcptr apre1, mp_srcptr b1, mp_srcptr apre2, mp_srcptr b2, mp_size_t n, mp_srcptr d, mp_srcptr dinv, ulong norm);
+
 int flint_mpn_mulmod_2expp1_basecase(mp_ptr xp, mp_srcptr yp, mp_srcptr zp, int c, flint_bitcnt_t b, mp_ptr tp);
 
 /* miscellaneous *************************************************************/
 
@@ -112,21 +112,23 @@ flint_mpn_mulmod_precond(mp_ptr rp, mp_srcptr apre, mp_srcptr b, mp_size_t n, mp
     */
 
     mp_ptr tmp;
-    mp_limb_t cy;
+    mp_limb_t cy, cy1, cy2;
     slong i, rn;
     TMP_INIT;
 
     TMP_START;
     tmp = TMP_ALLOC((n + 2) * sizeof(mp_limb_t));
 
-    tmp[n] = mpn_mul_1(tmp, apre, n, b[0]);
-    tmp[n + 1] = 0;
+    cy1 = mpn_mul_1(tmp, apre, n, b[0]);
+    cy2 = 0;
     for (i = 1; i < n; i++)
     {
         cy = mpn_addmul_1(tmp, apre + i * n, n, b[i]);
-        add_ssaaaa(tmp[n + 1], tmp[n], tmp[n + 1], tmp[n], 0, cy);
+        add_ssaaaa(cy2, cy1, cy2, cy1, 0, cy);
     }
 
+    tmp[n] = cy1;
+    tmp[n + 1] = cy2;
     rn = (n + 2) - (tmp[n + 1] == 0);
 
 #if 0
@@ -143,3 +145,77 @@ flint_mpn_mulmod_precond(mp_ptr rp, mp_srcptr apre, mp_srcptr b, mp_size_t n, mp
     TMP_END;
 }
 
+void
+flint_mpn_fmmamod_precond(mp_ptr rp, mp_srcptr apre1, mp_srcptr b1, mp_srcptr apre2, mp_srcptr b2, mp_size_t n, mp_srcptr d, mp_srcptr dinv, ulong norm)
+{   /* Two-product variant of flint_mpn_mulmod_precond. Accumulates
+       sum_i apre1[i*n .. i*n+n) * b1[i]  +  sum_i apre2[i*n .. i*n+n) * b2[i]
+       into an (n + 2)-limb temporary, reduces that temporary by the n-limb
+       d with flint_mpn_mod_preinv1, and writes n limbs to rp (shifted right
+       by norm bits when norm != 0).
+
+       apre1 and apre2 are each read as n consecutive rows of n limbs, and
+       b1, b2 as n limbs. Presumably the tables come from
+       flint_mpn_mulmod_precond_precompute and d is the normalised modulus
+       -- TODO confirm against callers.
+       Only dinv[n - 1] is used by the reduction path that is enabled. */
+    mp_ptr tmp;
+    mp_limb_t cy, cy1, cy2;
+    slong i, rn;
+    TMP_INIT;
+
+    /* Something like this if we want a special case for n = 2.
+       NOTE(review): the sketch below is not usable as written. tmp[3] is
+       read by the first add_ssssaaaaaaaa before anything has been stored
+       in it, and the b2[0] product uses apre2[3], apre2[2] (the same limbs
+       as the b2[1] product) where the apre1 pattern suggests apre2[1],
+       apre2[0]. */
+    /*
+    if (n == 2)
+    {
+        ulong tmp[4];
+        ulong ump[4];
+
+        FLINT_MPN_MUL_2X1(tmp[2], tmp[1], tmp[0], apre1[1], apre1[0], b1[0]);
+        FLINT_MPN_MUL_2X1(ump[2], ump[1], ump[0], apre1[3], apre1[2], b1[1]);
+        add_ssssaaaaaaaa(tmp[3], tmp[2], tmp[1], tmp[0], tmp[3], tmp[2], tmp[1], tmp[0], 0, ump[2], ump[1], ump[0]);
+        FLINT_MPN_MUL_2X1(ump[2], ump[1], ump[0], apre2[3], apre2[2], b2[0]);
+        add_ssssaaaaaaaa(tmp[3], tmp[2], tmp[1], tmp[0], tmp[3], tmp[2], tmp[1], tmp[0], 0, ump[2], ump[1], ump[0]);
+        FLINT_MPN_MUL_2X1(ump[2], ump[1], ump[0], apre2[3], apre2[2], b2[1]);
+        add_ssssaaaaaaaa(tmp[3], tmp[2], tmp[1], tmp[0], tmp[3], tmp[2], tmp[1], tmp[0], 0, ump[2], ump[1], ump[0]);
+
+        rn = (n + 2) - (tmp[n + 1] == 0);
+        flint_mpn_mod_preinv1(tmp, rn, d, n, dinv[n - 1]);
+
+        if (norm)
+        {
+            rp[0] = (tmp[0] >> norm) | (tmp[1] << (FLINT_BITS - norm));
+            rp[1] = (tmp[1] >> norm);
+        }
+        else
+        {
+            rp[0] = tmp[0];
+            rp[1] = tmp[1];
+        }
+
+        return;
+    }
+    */
+
+    TMP_START;
+    tmp = TMP_ALLOC((n + 2) * sizeof(mp_limb_t));
+
+    cy1 = mpn_mul_1(tmp, apre1, n, b1[0]);  /* row 0 overwrites tmp[0 .. n), so tmp needs no zeroing */
+    cy2 = 0;  /* (cy2, cy1) are the two limbs above tmp[0 .. n), kept in locals until the end */
+    for (i = 1; i < n; i++)
+    {
+        cy = mpn_addmul_1(tmp, apre1 + i * n, n, b1[i]);
+        add_ssaaaa(cy2, cy1, cy2, cy1, 0, cy);
+    }
+    for (i = 0; i < n; i++)  /* second product starts at row 0: tmp already holds the first product */
+    {
+        cy = mpn_addmul_1(tmp, apre2 + i * n, n, b2[i]);
+        add_ssaaaa(cy2, cy1, cy2, cy1, 0, cy);
+    }
+
+    tmp[n] = cy1;
+    tmp[n + 1] = cy2;
+    rn = (n + 2) - (tmp[n + 1] == 0);  /* n + 1 limbs when the top limb is zero, otherwise n + 2 */
+
+#if 0
+    flint_mpn_mod_preinvn(tmp, tmp, rn, d, n, dinv);
+#else
+    flint_mpn_mod_preinv1(tmp, rn, d, n, dinv[n - 1]);
+#endif
+
+    if (norm == 0)  /* mpn_rshift is not called with a zero shift count */
+        flint_mpn_copyi(rp, tmp, n);
+    else
+        mpn_rshift(rp, tmp, n, norm);
+
+    TMP_END;
+}
@@ -160,3 +160,136 @@ void flint_mpn_mulmod_preinvn_2(mp_ptr r,
         r[1] = r1;
     }
 }
+
+void flint_mpn_fmmamod_preinvn(mp_ptr r,
+        mp_srcptr a, mp_srcptr b,
+        mp_srcptr e, mp_srcptr f,
+        mp_size_t n,
+        mp_srcptr d, mp_srcptr dinv, ulong norm)
+{   /* Forms a*b + e*f from n-limb operands, shifts the sum left by norm
+       bits, reduces it modulo the n-limb d with the help of dinv, and
+       stores the n-limb remainder, shifted right by norm bits, in r.
+       The reference documentation for this function requires d to be the
+       normalised modulus, dinv to come from flint_mpn_preinvn, and a, b,
+       e, f to be reduced modulo the unshifted modulus.
+
+       Scratch layout (7n limbs):
+         t[0 .. 2n)   a*b, then the (shifted) sum
+         t[2n .. 3n]  low limbs of (quotient estimate) * d
+         t[3n .. 5n)  (high half of the sum) * dinv
+         t[5n .. 7n)  u = e*f */
+    mp_ptr t, u;
+    ulong cy;
+    TMP_INIT;
+
+    TMP_START;
+    t = TMP_ALLOC((7 * n) * sizeof(mp_limb_t));
+    u = t + (5 * n);
+
+    if (a == b)  /* pointer equality: the same operand twice is a squaring */
+        flint_mpn_sqr(t, a, n);
+    else
+        flint_mpn_mul_n(t, a, b, n);
+
+    if (e == f)
+        flint_mpn_sqr(u, e, n);
+    else
+        flint_mpn_mul_n(u, e, f, n);
+
+    if (norm)
+    {
+        mpn_add_n(t, t, u, 2 * n);  /* carry-out discarded; presumably it cannot occur for reduced operands when norm >= 1 -- TODO confirm */
+        cy = mpn_lshift(t, t, 2 * n, norm);
+    }
+    else
+    {
+        cy = mpn_add_n(t, t, u, 2 * n);
+    }
+
+    if (cy != 0 || mpn_cmp(t + n, d, n) >= 0)  /* bring the high half below d; any borrow cancels against cy */
+    {
+        mpn_sub_n(t + n, t + n, d, n);
+    }
+
+    flint_mpn_mul_or_mulhigh_n(t + 3 * n, t + n, dinv, n);  /* only the high n limbs, at t + 4n, are read afterwards */
+    mpn_add_n(t + 4 * n, t + 4 * n, t + n, n);  /* presumably accounts for an implicit leading one in dinv -- TODO confirm */
+
+    /* note: we rely on the fact that mul_or_mullow_n actually
+           writes at least n + 1 limbs; limb t[3n] of that product is
+           read in the next statement */
+    flint_mpn_mul_or_mullow_n(t + 2 * n, t + 4 * n, d, n);
+    cy = t[n] - t[3 * n] - mpn_sub_n(r, t, t + 2 * n, n);  /* limb n of the difference; the low n limbs go to r */
+
+    while (cy > 0)  /* the estimate was too small: keep subtracting d until the extra limb is cleared */
+        cy -= mpn_sub_n(r, r, d, n);
+
+    if (mpn_cmp(r, d, n) >= 0)
+        mpn_sub_n(r, r, d, n);
+
+    FLINT_ASSERT(mpn_cmp(r, d, n) < 0);
+
+    if (norm)  /* undo the left shift applied to the sum */
+        mpn_rshift(r, r, n, norm);
+
+    TMP_END;
+}
+
+void flint_mpn_fmmamod_preinvn_2(mp_ptr r,
+        mp_srcptr a, mp_srcptr b,
+        mp_srcptr e, mp_srcptr f,
+        mp_srcptr d, mp_srcptr dinv, ulong norm)
+{   /* Two-limb counterpart of flint_mpn_fmmamod_preinvn, written with
+       fixed-size macros on local arrays instead of mpn calls and a
+       temporary allocation. Reads two limbs from each of a, b, e, f, d
+       and dinv, and writes two limbs to r. The inline comments name the
+       generic-routine step each statement corresponds to, with n = 2.
+
+       t uses the same offsets as the generic scratch buffer (t + k*n with
+       n = 2). t[7] and the t[6] written by the dinv product are never
+       read; t[6] is overwritten by the later 3-limb product. */
+    mp_limb_t cy, b0, b1, r0, r1;
+    mp_limb_t f0, f1;
+    mp_limb_t t[10], u[4];
+
+    if (norm)
+    {
+        /* mpn_lshift(b, b, n, norm), and the same for f: here one factor of
+           each product is shifted before multiplying, whereas the generic
+           routine shifts the sum of the two products */
+        b0 = (b[0] << norm);
+        b1 = (b[1] << norm) | (b[0] >> (FLINT_BITS - norm));
+        f0 = (f[0] << norm);
+        f1 = (f[1] << norm) | (f[0] >> (FLINT_BITS - norm));
+    }
+    else
+    {
+        b0 = b[0];
+        b1 = b[1];
+        f0 = f[0];
+        f1 = f[1];
+    }
+
+    /* mpn_mul_n(t, a, b, n) */
+    FLINT_MPN_MUL_2X2(t[3], t[2], t[1], t[0], a[1], a[0], b1, b0);
+    /* mpn_mul_n(u, e, f, n) */
+    FLINT_MPN_MUL_2X2(u[3], u[2], u[1], u[0], e[1], e[0], f1, f0);
+    add_sssssaaaaaaaaaa(cy, t[3], t[2], t[1], t[0],
+                         0, t[3], t[2], t[1], t[0],
+                         0, u[3], u[2], u[1], u[0]);  /* cy receives the fifth limb of the sum */
+    if (cy || mpn_cmp(t + 2, d, 2) >= 0)  /* bring the high half below d; any borrow cancels against cy */
+        sub_ddmmss(t[3], t[2], t[3], t[2], d[1], d[0]);
+
+    /* mpn_mul_n(t + 3*n, t + n, dinv, n) */
+    FLINT_MPN_MUL_2X2(t[9], t[8], t[7], t[6], t[3], t[2], dinv[1], dinv[0]);
+
+    /* mpn_add_n(t + 4*n, t + 4*n, t + n, n) */
+    add_ssaaaa(t[9], t[8], t[9], t[8], t[3], t[2]);
+
+    /* mpn_mul_n(t + 2*n, t + 4*n, d, n); only the low three limbs are formed */
+    FLINT_MPN_MUL_3P2X2(t[6], t[5], t[4], t[9], t[8], d[1], d[0]);
+
+    /* cy = t[n] - t[3*n] - mpn_sub_n(r, t, t + 2*n, n) */
+    sub_dddmmmsss(cy, r1, r0, t[2], t[1], t[0], t[6], t[5], t[4]);
+
+    while (cy > 0)
+    {
+        /* cy -= mpn_sub_n(r, r, d, n) */
+        sub_dddmmmsss(cy, r1, r0, cy, r1, r0, 0, d[1], d[0]);
+    }
+
+    if ((r1 > d[1]) || (r1 == d[1] && r0 >= d[0]))  /* two-limb form of mpn_cmp(r, d, n) >= 0 */
+    {
+        /* mpn_sub_n(r, r, d, n) */
+        sub_ddmmss(r1, r0, r1, r0, d[1], d[0]);
+    }
+
+    if (norm)  /* two-limb right shift by norm; the else branch avoids a shift by FLINT_BITS */
+    {
+        r[0] = (r0 >> norm) | (r1 << (FLINT_BITS - norm));
+        r[1] = (r1 >> norm);
+    }
+    else
+    {
+        r[0] = r0;
+        r[1] = r1;
+    }
+}
+
@@ -15,6 +15,8 @@
 #include "t-divides.c"
 #include "t-divrem_preinv1.c"
 #include "t-divrem_preinvn.c"
+#include "t-fmmamod_precond.c"
+#include "t-fmmamod_preinvn.c"
 #include "t-fmms1.c"
 #include "t-gcd_full.c"
 #include "t-mod_preinvn.c"
@@ -44,6 +46,8 @@ test_struct tests[] =
     TEST_FUNCTION(flint_mpn_divides),
     TEST_FUNCTION(flint_mpn_divrem_preinv1),
     TEST_FUNCTION(flint_mpn_divrem_preinvn),
+    TEST_FUNCTION(flint_mpn_fmmamod_precond),
+    TEST_FUNCTION(flint_mpn_fmmamod_preinvn),
     TEST_FUNCTION(flint_mpn_fmms1),
     TEST_FUNCTION(flint_mpn_gcd_full),
     TEST_FUNCTION(flint_mpn_mod_preinvn),