// file kernel/n/x86-64/smod.S: operations on residues modulo BASE^n - 1
/*-----------------------------------------------------------------------+
 |  Copyright 2005-2006, Michel Quercia (michel.quercia@prepas.org)      |
 |                                                                       |
 |  This file is part of Numerix. Numerix is free software; you can      |
 |  redistribute it and/or modify it under the terms of the GNU Lesser   |
 |  General Public License as published by the Free Software Foundation; |
 |  either version 2.1 of the License, or (at your option) any later     |
 |  version.                                                             |
 |                                                                       |
 |  The Numerix Library is distributed in the hope that it will be       |
 |  useful, but WITHOUT ANY WARRANTY; without even the implied warranty  |
 |  of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU  |
 |  Lesser General Public License for more details.                      |
 |                                                                       |
 |  You should have received a copy of the GNU Lesser General Public     |
 |  License along with the GNU MP Library; see the file COPYING. If not, |
 |  write to the Free Software Foundation, Inc., 59 Temple Place -       |
 |  Suite 330, Boston, MA 02111-1307, USA.                               |
 +-----------------------------------------------------------------------+
 |                                                                       |
 |                     Arithmtique modulo BASE^n - 1                    |
 |                                                                       |
 +-----------------------------------------------------------------------*/

                    # +----------------------------------+
                    # |  Soustraction modulo BASE^n - 1  |
                    # +----------------------------------+

# void xn(ssub)(chiffre *a, long la, chiffre *b, long lb)
#
# entre :
# a = naturel de longueur la
# b = naturel de longueur n > 0
#
# sortie :
# b <- (a - b) mod (BASE^n - 1), non normalis

#ifdef assembly_sn_ssub
#undef L
#define L(x) .Lsn_ssub_##x
ENTER(sn_ssub)

        # variables locales
        #undef _b_
        #undef _la_
	#undef _lb_
        #define _la_ %r8
        #define _lb_ %r11
        #define _b_  %r12

	movq   %rsi,    _la_
	movq   %rdi,    %rsi            # rsi <- &a
	movq   %rcx,    _lb_            # lb <- -lb
	negq   _lb_
        movq   %rdx,    %rdi            # rdi <- &b
        movq   %rdx,    %rbx            # rbx <- &b
	leaq   (%rdx,%rcx,8), _b_       # b <- &b[lb]
	addq   _lb_,    _la_            # la -= lb
	jns    L(big_a)

        # cas a petit (ne pas optimiser, ce cas est inutilis)

        # soustrait les chiffres communs
	addq   _la_,    %rcx            # rcx <- la
	clc
        jrcxz  1f
        call   .Lsn_fsub_1
1:
        # soustrait le reste de b de 0
	movq   _la_,    %rcx            # rcx <- la-lb
        jrcxz  3f
        ALIGN(8)
2:
        movq   $0,      %rax
        sbbq   (_b_,%rcx,8), %rax
        movq   %rax,   (_b_,%rcx,8)
        incq   %rcx
        jne    2b
3:
        # recycle la retenue ngative
        jnb    L(done)
4:
        movq   _lb_,    %rcx            # rcx <- -lb
5:
        subq   $1,      (_b_,%rcx,8)
        jnb    L(done)
        incq   %rcx
        jne    5b
        jmp    4b
        
        # cas la > lb
        ALIGN(8)
L(big_a):

        # soustrait les chiffres communs et recycle la retenue ngative
        call   .Lsn_fsub_1
        movq   %rsi,    %rbx            # rbx <- &a[lb]
        jnb    L(next)
1:
        movq   _lb_,    %rcx            # rcx <- -lb
2:
        subq   $1,      (%rdi,%rcx,8)
        jnb    L(next)
        incq   %rcx
        jne    2b
        jmp    1b

        # additionne les lb chiffres suivants et recycle la retenue positive
        ALIGN(8)
L(loop):
	movq   %rsi,    %rdi            # rdi <- &b
        movq   _lb_,    %rcx            # rcx <- lb
	negq   %rcx
        call  .Lsn_fadd_1
        jnc    L(next)
1:
        movq   _lb_,    %rcx            # rcx <- -lb
2:
        incq   (%rsi,%rcx,8)
        jne    L(next)
        incq   %rcx
        jne    2b
        jmp    1b
        
        # tranche suivante
        ALIGN(8)
L(next):
	leaq   (_b_,_lb_,8), %rsi       # rsi <- &b
        addq   _lb_,    _la_            # la -= lb
        jc     L(loop)

        # dernire tranche, incomplte
        movq   _lb_,    %rdx            # rdx <- lb
	negq   %rdx
	leaq   (_la_,%rdx,1), %rcx      # rcx <- la
	jrcxz  L(done)
        call   .Lsn_finc
        jnc    L(done)
1:
        movq   _lb_,    %rcx            # rcx <- -lb
2:
        incq   (_b_,%rcx,8)
        jne    L(done)
        incq   %rcx
        jne    2b
        jmp    1b

        # Termin       
        ALIGN(8)
L(done):
        RETURN_WITH_SP

#endif /* assembly_sn_ssub */

        

                      # +-------------------------------+
                      # |  Rduction modulo BASE^n - 1  |
                      # +-------------------------------+

# entre :
#   a = naturel de longueur la    rsi = &a,    rdx = la
#   b = naturel de longueur n     rdi = &b,    rcx = n
#
# contraintes : n > 0, la >= 0
#
# sortie :
#   b <- a mod BASE^n - 1
#
# registres modifis :
#   rax,rbx,rcx,rdx,rsi,rdi,rbp,r8,r9,r10,r11,r12 <- ind.

        ALIGN(32)
.Lsn_fsred:
        
#undef L
#define L(x) .Lsn_fsred_##x

        # si la <= n, copie a dans b et complte par des zros
        cld
        cmpq   %rdx,    %rcx
        jb     L(big)
        xchgq  %rcx,    %rdx            # rcx <- la
        subq   %rcx,    %rdx            # rdx <- n-la
        jrcxz  1f
        rep movsq                       # b <- a
1:
        movq   %rdx,    %rcx
        jrcxz  L(done)
        xorq   %rax,    %rax
        rep    stosq                    # complte avec n-la zros
L(done):
        ret

        # variables locales
        #undef  _b_
        #undef  _la_
        #undef  _n_
        #undef  _r_
        #define _b_  %r9
        #define _la_ %r10
        #define _n_  %r11
        #define _r_  %r12

        ALIGN(8)
L(big):
        movq   %rdi,    _b_             # sauve &b
        subq   %rcx,    %rdx            # la -= n
        movq   %rdx,    _la_            # sauve la
        movq   %rcx,    _n_             # sauve n
        xorq   _r_,     _r_             # init retenue
        rep    movsq                    # copie les n premiers chiffres de a dans b
        movq   %rsi,    %rbx            # rbx <- &a[n]

        # cumule par blocs de n chiffres
L(loop):
        movq   _n_,    %rcx
        movq   _b_,    %rsi
        movq   _la_,   %rdx
        subq   %rcx,   %rdx
        jbe    L(last)
        movq   %rdx,   _la_             # la -= n
	movq   %rsi,    %rdi
        call   .Lsn_fadd_1              # ajoute un bloc
        adcq   %rcx,   _r_              # maj retenue
        jmp    L(loop)
        ALIGN(8)

        # dernier bloc
L(last):
        movq   _la_,   %rcx
	movq   _n_,    %rdx
	call   .Lsn_finc                # ajoute le dernier bloc
	leaq   (_b_,_n_,8), %rsi        # rsi <- &b[n]
        movq   _n_,    %rcx
        not    %rcx
        incq   %rcx
        adcq   _r_,   (%rsi,%rcx,8)    # recycle la retenue
        jnc    L(done)
        incq   %rcx
1:
        incq   (%rsi,%rcx,8)
        jne    L(done)
        incq   %rcx
        jne    1b
        movq   _n_,     %rcx
        negq   %rcx
        jmp    1b
 
             # +-------------------------------------------------+
             # |  Dcomposition modulo BASE^p - 1 et BASE^p + 1  |
             # +-------------------------------------------------+

# entre :
#  a = naturel de longueur 2p       rsi = &a,   rdx = p
#  b = naturel de longueur p        rdi = &b
#  c = naturel de longueur p+1      rbx = &c
#
# contrainte :  p > 0, a,b,c non confondus
#
# sortie :
#   b  <- a mod BASE^p - 1
#   c  <- a mod BASE^p + 1
#
# registres modifis :
#   rax,rbx,rcx,rdx,rsi,rdi,rbp,r8,r9,r10 <- ind.

#if defined(assembly_sn_smul) || defined(assembly_sn_ssqr)
        ALIGN(32)
.Lsn_fsplit_even:

        # variables locales
        #undef  _a_
        #undef  _c_
        #undef  _p_
        #define _a_  %r8
        #define _c_  %r9
        #define _p_  %r10

        movq    %rsi,  _a_
        movq    %rbx,  _c_
        movq    %rdx,  _p_

        # b[p..2p-1] <- a mod BASE^p - 1
	movq   %rdx,     %rcx
        leaq   (%rsi,%rcx,8), %rbx      # rbx <- &a[p]
        call   .Lsn_fadd_1              # b <- a0 + a1
        jnc    2f
        movq   _p_,     %rcx            # recycle la retenue
        not    %rcx
1:
        incq   %rcx
        incq  (%rdi,%rcx,8)
        jz     1b
2:

        # c <- a mod BASE^p + 1
        movq   %rsi,    %rbx            # rbx <- &a[p]
        movq   _a_,     %rsi
        movq   _c_,     %rdi
        movq   _p_,     %rcx
        call   .Lsn_fsub_1              # c[0..p-1] <- a0 - a1
        movq   %rcx,   (%rdi)           # c[p] <- 0
        jnc    2f
        not    _p_                      # propage la retenue
1:
        incq   _p_
        incq  (%rdi,_p_,8)
        jz     1b
2:
        ret

#endif /* defined(assembly_sn_smul) || defined(assembly_sn_ssqr) */
        
       # +-------------------------------------------------------------+
       # |  Dcomposition modulo BASE^(p+1/2) - 1 et BASE^(p+1/2) + 1  |
       # +-------------------------------------------------------------+

# entre :
#  a = naturel de longueur 2p+1     rsi = &a,   rdx = p
#  b = naturel de longueur p+1      rdi = &b
#  c = naturel de longueur p+1      rbx = &c
#
# contrainte :  p > 0, a,b,c non confondus
#
# sortie :
#   b <- a mod BASE^(p+1/2) - 1
#   c <- a mod BASE^(p+1/2) + 1
#
# registres modifis :
#   rax,rbx,rcx,rdx,rsi,rdi,rbp,r8,r9,r10,r11 <- ind.

#if defined(assembly_sn_smul) || defined(assembly_sn_ssqr)
        ALIGN(32)
.Lsn_fsplit_odd:

        # variables locales
        #undef  _a_
        #undef  _b_
        #undef  _c_
        #undef  _p_
        #define _a_  %r8
        #define _b_  %r9
        #define _c_  %r10
        #define _p_  %r11
	
        movq    %rsi,   _a_
        movq    %rdi,   _b_
        movq    %rbx,   _c_
        movq    %rdx,   _p_

        # b <- a1 << 32
        leaq   8(%rsi,%rdx,8), %rsi     # rsi <- &a[p+1]
        movq   $32,     %rcx
        call   .Lsn_fshift_up
        shrq   %cl,     %rax
        movq   %rax,   (%rdi)

        # c <- a mod BASE^(p+1/2) + 1
        movq   _a_,     %rsi
        movq   _b_,     %rbx
        movq   _c_,     %rdi
        leaq   1(_p_),  %rcx
        call   .Lsn_fsub_1
        jnb    2f
        movq   $0xffffffff, %rax        # recycle la retenue
	andq   %rax,    -8(%rdi)
        movq   _p_,     %rcx
        not    %rcx
1:
        incq   %rcx
        incq   -8(%rdi,%rcx,8)
        jz     1b
2:

        # b <- a mod BASE^(p+1/2) - 1
        movq   _b_,     %rsi
	movq   _b_,     %rdi
        movq   _a_,     %rbx
        leaq   1(_p_),  %rcx
        call   .Lsn_fadd_1
        jnc    2f
        negq   _p_                      # recycle la retenue
	movq   $1,      %rax
	shlq   $32,     %rax
        addq   %rax,    -8(%rsi,_p_,8)
        jnc    2f
1:
        incq   _p_
        incq   -8(%rsi,_p_,8)
        jz     1b
2:
        ret

#endif /* defined(assembly_sn_smul) || defined(assembly_sn_ssqr) */
        

                   # +------------------------------------+
                   # |  Multiplication modulo BASE^n - 1  |
                   # +------------------------------------+

# entre :
#   a = naturel de longueur n     rsi = &a,   rcx = n
#   b = naturel de longueur n     rbx = &b
#   c = naturel de longueur n     rdi = &c
#
# contrainte : n > 0
#   
# sortie :
#   c <- a*b mod BASE^n - 1
#
# registres modifis :
#   rax,rbx,rcx,rdx,rsi,rdi,rbp,r8,r9,r10,r11 <- ind.

#ifdef assembly_sn_smul
        ALIGN(32)
.Lsn_fsmul:

#undef L
#define L(x) .Lsn_fsmul_##x

        # aiguillage selon la longueur et la parit
        movq    %rcx,   %rdx
        shrq    $1,     %rdx            # rdx <- p = n/2
        jnc     1f
        cmpq    $smul_lim_odd, %rcx
        jbe     L(small)
        jmp     L(big_odd)
        ALIGN(8)
1:
        cmpq    $smul_lim_even, %rcx
        jg      L(big_even)
        
	# petite multiplication => Toom
L(small):
	movq   %rcx,    %rax
        shlq   $4,      %rax
	ALLOCA                          # rserve 2n chiffres dans la pile
        pushq  %rcx                     # sauve n
        pushq  %rdi                     # sauve &c
        movq   %rcx,    %rdx            # rdx <- n
        leaq   16(%rsp), %rdi           # rdi <- &d
        call   .Lsn_ftoommul            # d <- a*b
        
        # point de chute pour fssqr
.Lsn_smul_aux_small:
        popq   %rdi                     # rcupre &c
	popq   %rcx                     # rcupre n
	movq   %rcx,    %r8             # r8 <- n
        movq   %rsp,    %rsi            # rsi <- &d
        leaq   (%rsp,%rcx,8), %rbx      # rbx <- &d[n]
        call   .Lsn_fadd_1              # c <- d[0..n-1] + d[n..2n-1]
        jnc    3f
1:
        movq   %r8,     %rcx            # recycle la retenue
        negq   %rcx
2:
        incq   (%rdi,%rcx,8)
        jne    3f
        incq   %rcx
        jne    2b
        jmp    1b
        ALIGN(8)
3:
        leaq   (%rsp,%r8,8), %rsp       # nettoie la pile
        leaq   (%rsp,%r8,8), %rsp
        ret
        ALIGN(8)

        # cas n grand pair : dcompose en deux produits modulaires
L(big_even):

        # variables locales
        #undef  _c_
        #undef  _d_
        #undef  _p_
        #undef  _r_
        #define _d_ 24(%rsp)
        #define _r_ 16(%rsp)
        #define _c_  8(%rsp)
        #define _p_   (%rsp)
        
        leaq  1(%rdx,%rdx,2), %rax      # rserve 3p+4 chiffres dans la pile
	leaq  24(,%rax,8), %rax
	ALLOCA
	movq   %rbx,   %r11             # sauve &b
        movq   %rdi,   _c_              # sauve &c
        movq   %rdx,   _p_              # sauve p

        # dcompose a et b
        movq   %rdi,    %rbx
        leaq 32(%rsp,%rdx,8), %rdi      # rdi <- &d[p+1]
        call  .Lsn_fsplit_even
	
        movq   %r11,    %rsi            # rsi <- &b
        movq   _p_,     %rdx
        leaq   _d_,     %rbx
        leaq  8(%rbx,%rdx,8), %rdi      # rdi <- &d[2p+1]
        leaq   (%rdi,%rdx,8), %rdi
        call  .Lsn_fsplit_even

        # c[0..p] <- a*b mod BASE^p + 1
        movq   _c_,     %rdi
        leaq   _d_,     %rsi
        movq   _p_,     %rdx
        call   .Lsn_mmul

        # c[p..2p-1] <- a*b mod BASE^p - 1
        movq   _c_,     %rdi
        movq   _p_,     %rcx
        leaq   (%rdi,%rcx,8), %rdi      # rdi <- &c[p]
        leaq 32(%rsp,%rcx,8), %rsi      # rsi <- &d[p+1]
        leaq   (%rsi,%rcx,8), %rbx      # rbx <- &d[2p+1]
        movq   (%rdi),  %rax
        movq   %rax,    _r_             # r <- c[p]
        call   .Lsn_fsmul

        # point de chute pour ssqr
.Lsn_smul_aux_big_even:

	# raffectation des variables locales
        #undef  _c_
        #undef  _d_
        #undef  _p_
        #undef  _r_
        #define _d_ %rsp
        #define _r_ %r9
        #define _c_ %r10
        #define _p_ %r11

	popq    _p_
	popq    _c_
	popq    _r_

        # c[p..2p-1] <- (c[p..2p-1] - c[0..p])/2 mod BASE^p - 1
        movq   _c_,     %rbx
        movq   _p_,     %rcx
        leaq   (_c_,_p_,8), %rsi        # rsi <- &c[p]
	movq   %rsi,    %rdi
        call   .Lsn_fsub_1              # c[p..2p-1] -= c[0..p-1]
        movq   _r_,     %rax            # rax <- sauvegarde de c[p]
1:
        movq   _p_,     %rdx
        not    %rdx
        incq   %rdx
2:
        sbbq   %rax,    (%rsi,%rdx,8)   # la retranche et propage la retenue
        jnb    3f
        movq   %rcx,     %rax
        incq   %rdx
        jne    2b
        jmp    1b
        ALIGN(8)
3:
        movq   _c_,     %rbx
        movq   _p_,     %rcx
        leaq   (_c_,_p_,8),  %rsi       # rsi <- &c[p]
        leaq   (%rsi,_p_,8), %rdi       # rdi <- &c[2p]
        call   .Lsn_fhalf               # c[p..2p-1] /= 2
        jnc    4f
        btsq   $63,     -8(%rdi)        # recycle la retenue
4:

        # c += c[p..2p-1]
        xchgq  %rsi,    %rbx            # rsi <- &c, rbx <- &c[p]
        movq   _p_,     %rcx
	movq   %rsi,    %rdi
        call   .Lsn_fadd_1
        adcq   _r_,    (%rsi)           # rincorpore la sauvegarde de c[p]
        jnc    2f
        movq   _p_,     %rcx            # recycle la retenue
        negq   %rcx
        incq   %rcx
1:
        incq   (%rbx,%rcx,8)
        jne    2f
        incq   %rcx
        jne    1b
        leaq   (,_p_,2), %rcx
        negq   %rcx
        jmp    1b
        ALIGN(8)
2:

        # termin
        leaq   1(_p_,_p_,2), %rcx       # nettoie la pile
        leaq   (%rsp,%rcx,8), %rsp
        ret

        # cas n grand impair : dcompose en deux produits
        ALIGN(8)
L(big_odd):
        
        # variables locales
        #undef  _c_
        #undef  _d_
        #undef  _p_
        #define _d_ 16(%rsp)
        #define _c_  8(%rsp)
        #define _p_   (%rsp)

        leaq   3(%rcx,%rcx,2), %rax     # rax <- 6p + 6
        leaq   16(,%rax,8), %rax   
        ALLOCA                          # rserve 6p+8 chiffres dans la pile
	movq   %rbx,     %r12           # sauve &b
        movq   %rdi,     _c_            # sauve &c
        movq   %rdx,     _p_            # sauve p

        # dcompose a et b
        leaq   _d_,     %rdi
        leaq  16(%rdi,%rdx,8), %rbx     # rbx <- &d[2p+2]
        leaq    (%rbx,%rdx,8), %rbx
        call  .Lsn_fsplit_odd
        movq   %r12,    %rsi
        movq   _p_,     %rdx
        leaq  24(%rsp,%rdx,8), %rdi     # rdi <- &d[p+1]
        leaq  16(%rdi,%rdx,8), %rbx     # rbx <- &d[3p+3]
        leaq    (%rbx,%rdx,8), %rbx
        call  .Lsn_fsplit_odd

        # d[4p+4..6p+5] <- a*b mod BASE^(p+1/2) + 1
        movq   _p_,     %rdx
        incq   %rdx                     # rdx <- p+1
        leaq   (%rdx,%rdx,1), %rcx      # rcx <- 2p+2
        leaq 16(%rsp,%rcx,8), %rsi      # rsi <- &d[2p+2]
        leaq   (%rsi,%rcx,8), %rdi      # rdi <- &d[4p+4]
        leaq   (%rsi,%rcx,4), %rbx      # rbx <- &d[3p+3]
	movq   %rdx,    %rcx            # rcx <- p+1
        call   .Lsn_ftoommul
        movq   _p_,     %rcx
        leaq   1(,%rcx,2), %rcx         # rcx <- 2p+1
	leaq  40(%rsp,%rcx,8), %rdi     # rdi <- &d[6p+6]
	leaq    (%rdi,%rcx,8), %rdi
	leaq    (%rdi,%rcx,8), %rdi
        movq   -8(%rdi), %rax
        negq   %rcx
        addq   %rax, -8(%rdi,%rcx,8)    # rinjecte le chiffre de rang 2p+1
        jnc    2f
1:
        incq   (%rdi,%rcx,8)
        jne    2f
        incq   %rcx
        jne    1b
	movq   _p_,    %rcx
        leaq   2(,%rcx,2), %rcx
        negq   %rcx
        jmp    1b
        ALIGN(8)
2:
        
        # d[2p+2..4p+3] <- a*b mod BASE^(p+1/2) - 1
        movq   _p_,     %rcx
        incq   %rcx                     # rcx <- p+1
        movq   %rcx,    %rdx            # rdx <- p+1
        leaq   _d_,     %rsi
        leaq   (%rsi,%rcx,8), %rbx      # rbx <- &d[p+1]
        leaq   (%rbx,%rcx,8), %rdi      # rdi <- &d[2p+2]
        call   .Lsn_ftoommul
        
        ALIGN(32)                       # point de chute pous ssqr
.Lsn_smul_aux_big_odd:
        
	# raffectation des variables locales
        #undef  _c_
        #undef  _d_
        #undef  _p_
        #define _d_ %rsp
        #define _c_ %r10
        #define _p_ %r9
	
	popq   _p_
	popq   _c_
	
        leaq   1(,_p_,2), %rcx          # rcx <- 2p+1
	leaq  16(_d_,%rcx,8), %rdi      # rdi <- &d[4p+4]
	leaq   (%rdi,%rcx,8), %rdi
        movq   -8(%rdi), %rax
        negq   %rcx
        addq   %rax, -8(%rdi,%rcx,8)    # rinjecte le chiffre de rang 2p+1
        jnc    2f
1:
        incq   (%rdi,%rcx,8)
        jne    2f
        incq   %rcx
        jne    1b
        leaq   2(,_p_,2), %rcx
        negq   %rcx
        jmp    1b
        ALIGN(8)
2:
        # d[2p+2..4p+2] <- (d[2p+2..4p+2] - d[4p+4..6p+4])/2 mod BASE^(2p+1) - 1
        leaq   1(,_p_,2), %rcx          # rcx <- 2p+1
        leaq   8(_d_,%rcx,8),  %rsi     # rsi <- &d[2p+2]
        leaq   8(%rsi,%rcx,8), %rbx     # rbx <- &d[4p+4]
	movq   %rsi,    %rdi
        call   .Lsn_fsub_1              # soustrait les deux rsidus
        jnb    3f
1:
        leaq   1(,_p_,2), %rcx          # recycle la retenue
        negq   %rcx
2:
        subq   $1, (%rsi,%rcx,8)
        jnb    3f
        incq   %rcx
        jne    2b
        jmp    1b
        ALIGN(8)
3:
        leaq   1(,_p_,2), %rcx          # rcx <- 2p+1
        leaq   8(%rsi), %rbx            # rbx <- &d[4p+4]
        leaq   8(_d_,%rcx,8), %rsi      # rsi <- &d[2p+2]
        call   .Lsn_fhalf               # d[2p+2..4p+2] /= 2
        jnc    4f
        btsq   $63,   -16(%rbx)         # recycle la retenue
4:

        # c <- d[2p+2..4p+2] + d[4p+4..6p+4]
        movq   _c_,     %rdi
        leaq   1(,_p_,2), %rcx          # rcx <- 2p+1
        call   .Lsn_fadd_1
        jnc    2f
        movq   _c_,     %rdi
1:
        incq   (%rdi)
        leaq   8(%rdi), %rdi
        jz     1b
2:

        # d[p+2..3p+2] <- d[2p+2..4p+2] << 32
        leaq   1(,_p_,2), %rdx          # rdx <- 2p+1
        leaq   8(_d_,%rdx,8), %rsi      # rsi <- &d[2p+2]
        movq   %rsi,    %rdi            # rdi <- &d[2p+2]
        movq   $32,     %rcx
        call   .Lsn_fshift_up
        shrq   %cl,     %rax
        movq   _p_,     %rcx
        leaq   16(_d_,_p_,8), %rdi      # rdi <- &d[p+2]
        orq    %rax, (%rdi,_p_,8)       # rinjecte les bits sortis
        leaq   8(%rdi,_p_,8), %rsi      # rsi <- &d[3p+3]
        leaq    (%rsi,_p_,8), %rsi
        cld;   rep movsq                # d[p+2..2p+1] <- d[3p+3..4p+2]

        # c <- c + d[p+2..3p+2]
        leaq   16(_d_,_p_,8), %rbx      # rbx <- &d[p+2]
        leaq   1(,_p_,2), %rcx          # rcx <- 2p+1
        movq   _c_,     %rsi
	movq   %rsi,    %rdi
        call   .Lsn_fadd_1
        jnc    2f
        leaq   1(,_p_,2), %rcx          # recycle la retenue
        not    %rcx
1:
        incq   %rcx
        incq   (%rsi,%rcx,8)
        jz     1b
2:

        # termin
        leaq   3(_p_,_p_,2), %rcx       # rcx <- 3p+3
        leaq   (%rsp,%rcx,8),  %rsp     # nettoie la pile
        leaq   (%rsp,%rcx,8),  %rsp
        ret
        
        
                              # +---------------+
                              # |  Interface C  |
                              # +---------------+
                
# void xn(smul) (chiffre *a, long la, chiffre *b, long lb, chiffre *c, long n)
#
# entre :
# a = naturel de longueur la
# b = naturel de longueur lb
# c = naturel de longueur n
#
# contraintes : n > 0, 0 <= lb <= la
#
# sortie :
# c <- a*b mod (BASE^n - 1)

#ifdef debug_smul
ENTER(sn_smul_buggy)
#else
ENTER(sn_smul)
#endif


	movq   %r9,     %rax            # rserve 2n chiffres dans la pile
	shlq   $4,      %rax
	ALLOCA
	pushq  %r9                      # sauve n
	pushq  %r8                      # sauve &c
	pushq  %rcx                     # sauve lb
	pushq  %rdx                     # sauve &b

	movq   %rsi,    %rdx            # rdx <- la
	movq   %rdi,    %rsi            # rsi <- &a
	leaq   32(%rsp,%r9,8), %rdi     # rdi <- &x
	movq   %r9,     %rcx
        call   .Lsn_fsred               # x <- a mod BASE^n - 1

	popq   %rsi                     # rsi <- &b
	popq   %rdx                     # rdx <- lb
	leaq   16(%rsp), %rdi           # rdi <- &y
	movq   8(%rsp),  %rcx           # rcx <- n
        call   .Lsn_fsred               # y <- b mod BASE^n - 1
	
	popq   %rdi                     # rdi <- &c
	movq  (%rsp),    %rcx           # rcx <- n
	leaq 8(%rsp),    %rbx           # rbx <- &y
        leaq 8(%rsp,%rcx,8), %rsi       # rsi <- &x
        call   .Lsn_fsmul               # multipie les rsidus
	
        popq   %rax                     # nettoie la pile
	leaq  (%rsp,%rax,8), %rsp
	leaq  (%rsp,%rax,8), %rsp
        RETURN_WITH_SP
        
#endif /* assembly_sn_smul */   
        
        

                        # +---------------------------+
                        # |  Carr modulo BASE^n - 1  |
                        # +---------------------------+

# entre :
#   a = naturel de longueur n     rsi = &a,   rcx = n
#   c = naturel de longueur n     rdi = &c
#
# contrainte : n > 0
#   
# sortie :
#   c <- a^2 mod BASE^n - 1
#
# registres modifis :
#   rax,rbx,rcx,rdx,rsi,rdi,rbp,r8,r9,r10,r11 <- ind.

#ifdef assembly_sn_ssqr
        ALIGN(32)
.Lsn_fssqr:

#undef L
#define L(x) .Lsn_fssqr_##x

        # aiguillage selon la longueur et la parit
        movq    %rcx,   %rdx
        shrq    $1,     %rdx            # rdx <- p = n/2
        jnc     1f
        cmpq    $ssqr_lim_odd, %rcx
        jbe     L(small)
        jmp     L(big_odd)
        ALIGN(8)
1:
        cmpq    $ssqr_lim_even, %rcx
        jg      L(big_even)
        
        # petit carr => Toom
L(small):
	movq   %rcx,    %rax
        shlq   $4,      %rax
	ALLOCA                          # rserve 2n chiffres dans la pile
        pushq  %rcx                     # sauve n
        pushq  %rdi                     # sauve &c
        movq   %rcx,    %rdx            # rdx <- n
        leaq   16(%rsp), %rdi           # rdi <- &d
        call   .Lsn_ftoomsqr            # d <- a^2
        jmp    .Lsn_smul_aux_small      # continue avec smul
        ALIGN(8)

        # cas n grand pair : dcompose en deux carrs modulaires
L(big_even):

        # variables locales
        #undef  _c_
        #undef  _d_
        #undef  _p_
        #undef  _r_
        #define _d_ 24(%rsp)
        #define _r_ 16(%rsp)
        #define _c_  8(%rsp)
        #define _p_   (%rsp)
        
        leaq  1(%rdx,%rdx,2), %rax      # rserve 3p+1 chiffres dans la pile
	leaq  24(,%rax,8), %rax
	ALLOCA
        movq   %rdi,   _c_              # sauve &c
        movq   %rdx,   _p_              # sauve p

        # dcompose a
        movq   %rdi,    %rbx
        leaq 32(%rsp,%rdx,8), %rdi      # rdi <- &d[p+1]
        call   .Lsn_fsplit_even

        # c[0..p] <- a^2 mod BASE^p + 1
        movq   _c_,     %rdi
        movq   _p_,     %rsi
        call  .Lsn_msqr

        # c[p..2p-1] <- a^2 mod BASE^p - 1
        movq   _c_,     %rdi
        movq   _p_,     %rcx
        leaq   (%rdi,%rcx,8), %rdi      # rdi <- &c[p]
        leaq 32(%rsp,%rcx,8), %rsi      # rsi <- &d[p+1]
        movq   (%rdi),  %rax
        movq   %rax,    _r_             # r <- c[p]
        call   .Lsn_fssqr
        jmp    .Lsn_smul_aux_big_even   # continue avec smul
        ALIGN(8)

        # cas n grand impair : dcompose en deux carrs
L(big_odd):     

        # variables locales
        #undef  _c_
        #undef  _d_
        #undef  _p_
        #define _d_ 16(%rsp)
        #define _c_  8(%rsp)
        #define _p_   (%rsp)

        leaq   3(%rcx,%rcx,2), %rax     # rax <- 6p + 6
        leaq  16(,%rax,8), %rax
        ALLOCA                          # rserve 6p+8 chiffres dans la pile
        movq   %rdi,     _c_            # sauve &c
        movq   %rdx,     _p_            # sauve p

        # dcompose a
        leaq   _d_,     %rdi
        leaq  16(%rdi,%rdx,8), %rbx     # rbx <- &d[2p+2]
        leaq    (%rbx,%rdx,8), %rbx
        call  .Lsn_fsplit_odd

        # d[4p+4..6p+5] <- a^2 mod BASE^(p+1/2) + 1
        movq   _p_,     %rdx
        incq   %rdx                     # rdx <- p+1
        leaq   (%rdx,%rdx,1), %rcx      # rcx <- 2p+2
        leaq 16(%rsp,%rcx,8), %rsi      # rsi <- &d[2p+2]
        leaq   (%rsi,%rcx,8), %rdi      # rdi <- &d[4p+4]
	movq   %rdx,    %rcx            # rcx <- p+1
        call   .Lsn_ftoomsqr
        movq   _p_,     %rcx
        leaq   1(,%rcx,2), %rcx         # rcx <- 2p+1
	leaq  40(%rsp,%rcx,8), %rdi     # rdi <- &d[6p+6]
	leaq    (%rdi,%rcx,8), %rdi
	leaq    (%rdi,%rcx,8), %rdi
        movq   -8(%rdi), %rax
        negq   %rcx
        addq   %rax, -8(%rdi,%rcx,8)    # rinjecte le chiffre de rang 2p+1
        jnc    2f
1:
        incq   (%rdi,%rcx,8)
        jne    2f
        incq   %rcx
        jne    1b
        movq   _p_,     %rcx
        leaq   2(,%rcx,2), %rcx
        negq   %rcx
        jmp    1b
        ALIGN(8)
2:
        
        # d[2p+2..4p+3] <- a^2 mod BASE^(p+1/2) - 1
        movq   _p_,     %rcx
        incq   %rcx                     # rcx <- p+1
        movq   %rcx,    %rdx            # rdx <- p+1
        leaq   _d_,     %rsi
        leaq   (%rsi,%rcx,8), %rbx      # rbx <- &d[p+1]
        leaq   (%rbx,%rcx,8), %rdi      # rdi <- &d[2p+2]
        call   .Lsn_ftoomsqr
        jmp    .Lsn_smul_aux_big_odd    # continue avec smul
        
        
                              # +---------------+
                              # |  Interface C  |
                              # +---------------+
                
# void xn(ssqr) (chiffre *a, long la, chiffre *b, long n)
#
# entre :
# a = naturel de longueur la
# b = naturel de longueur n
#
# contraintes : n > 0, la >= 0
#
# sortie :
# b <- a^2 mod (BASE^n - 1)

#ifdef debug_smul
ENTER(sn_ssqr_buggy)
#else
ENTER(sn_ssqr)
#endif

	leaq   (,%rcx,8), %rax          # rserve n chiffres dans la pile
	ALLOCA
	pushq  %rcx                     # sauve n
	pushq  %rdx                     # sauve &b
	
	movq   %rsi,    %rdx            # rdx <- la
	movq   %rdi,    %rsi            # rsi <- &a
	leaq   16(%rsp), %rdi           # rdi <- &x
        call   .Lsn_fsred               # x <- a mod BASE^n - 1

	popq   %rdi                     # rdi <- &b
	movq  (%rsp),    %rcx           # rcx <- n
	leaq 8(%rsp),    %rsi           # rbx <- &x
        call   .Lsn_fssqr               # lve au carr

        popq   %rax                     # nettoie la pile
	leaq  (%rsp,%rax,8), %rsp
        RETURN_WITH_SP

        
#endif /* assembly_sn_ssqr */   
        

                       # +----------------------------+
                       # |  Combinaison de 3 rsidus  |
                       # +----------------------------+

# void xn(sjoin3)(chiffre *a, long h, long k)
#
#  entre :
#  a = naturel de longueur n+p+q
#  n = (2h+2)k, p = (2h+1)k, q = (2h)k
#
#  contraintes : h >= 2, k >= 2
#
#  sortie :
#  a <- x mod ppcm(BASE^n - 1, BASE^p - 1, BASE^q - 1) normalis
#  avec
#    a[0..n-1]       = x mod (BASE^n - 1),
#    a[n..n+p-1]     = x mod (BASE^p - 1),
#    a[n+p..n+p+q-1] = x mod (BASE^q - 1)
#
#  remarque : ppcm = produit/(BASE^k - 1)/(BASE^(2k) - 1)

#ifdef assembly_sn_sjoin3
#undef L
#define L(x) .Lsn_sjoin3_##x
#ifdef debug_sjoin
ENTER(sn_sjoin3_buggy)
#else
ENTER(sn_sjoin3)
#endif

        # variables locales
        #undef _a_
        #undef _b_
        #undef _c_
        #undef _h_
        #undef _k_
        #undef _n_
        #undef _p_
        #undef _q_
        #undef _r_
	#define _k_  %r15
        #define _a_  %r14
        #define _q_  %r13
        #define _p_  %r12
        #define _b_  %r11
        #define _c_  %r10
        #define _r_  %r9

	movq   %rdx,    _k_
	movq   %rdi,    _a_
	leaq   (%rsi,%rsi,1), %rax      # rax <- 2h
        mulq   _k_
	movq   %rax,    _q_             # q <- 2*h*k
	leaq   (_k_,_q_,1), _p_         # p <- q+k
        leaq   (_k_,_p_,1), %rdx        # rdx <- n = p+k
        leaq   (_a_,%rdx,8), _b_
        leaq   (_b_,_p_,8),  _c_
	xorq   _r_,     _r_             # r <- 0
        cld
        
        # normalise a
        movq   $-1,     %rax
        movq   %rdx,    %rcx            # rcx <- n
        repe   scasq                    # a = BASE^n-1 ?
        jne    1f
        incq   %rax                     # alors a <- 0
        movq   %rdx,    %rcx
        movq   _a_,     %rdi
        rep    stosq
        ALIGN(8)
1:
        
        # b <- (a - b) mod (BASE^p - 1)
	movq   _a_,     %rsi
        movq   _b_,     %rbx
        movq   _b_,     %rdi
        movq   _p_,     %rcx
        call   .Lsn_fsub_1              # retranche les p premiers chiffres
        sbbq   %rcx,    _r_             # r <- retenue
        movq   %rsi,    %rbx            # rbx <- &a[p]
        movq   _b_,     %rsi
        movq   _k_,     %rcx
        movq   _p_,     %rdx
        call   .Lsn_finc                # ajoute les k derniers chiffres
	movq   _c_,     %rsi
        adcq   _r_,     %rcx            # rcx <- somme des retenues
        jz     5f

#if 0
#----------------------------------------------------------------------
# il n est pas possible d avoir une retenue positive, car ceci impliquerait
# a = BASE^n - 1 et b = 0, en contradiction avec la normalisation de a.
# code conserv pour le cas o, mais non test pour cause d impossibilit
        js     3f

1:                                       # recycle la retenue positive
        movq   _p_,     %rcx
        negq   %rcx
2:
        incq   (%rsi,%rcx,8)
        jnz    5f
        incq   %rcx
        jne    2b
        jmp    1b
        ALIGN(8)
#----------------------------------------------------------------------
#endif
3:                                       # recycle la retenue ngative
        movq   _p_,     %rcx
        negq   %rcx
4:
        subq   $1,      (%rsi,%rcx,8)
        jnb    5f
        incq   %rcx
        jne    4b
# on ne peut pas arriver ici
        jmp    3b
        ALIGN(8)
5:

        # normalise b vers le haut
        movq  _b_,      %rdi
        movq  _p_,      %rcx
        xorq  %rax,     %rax
        repe  scasq                     # b = 0 ?
        jne   1f
        decq  %rax                      # alors b <- BASE^p - 1
        movq  _b_,      %rdi
        movq  _p_,      %rcx
        repe  stosq
        ALIGN(8)
1:
        
        # c <- (c - a) + (BASE^k + 1)*b - (BASE^(2*k) - 1) mod (BASE^q - 1)
        movq   _c_,     %rsi
        movq   _c_,     %rdi
        movq   _a_,     %rbx
        movq   _q_,     %rcx
        call   .Lsn_fsub_1              # c <- c - a[0..q-1]
        sbbq   %rcx,    %rcx
        movq   %rcx,    _r_             # r <- retenue
        movq   _c_,     %rsi
        leaq   (,_k_,2), %rcx           # rcx <- 2k
        movq   _q_,     %rdx
        call   .Lsn_fdec                # c <- c - a[q..n-1]
        sbbq   %rcx,    _r_             # maj retenue
        movq   _c_,     %rsi
        movq   _c_,     %rdi
        movq   _q_,     %rcx
        call   .Lsn_fadd_1              # c <- c + b[0..q-1]
        adcq   %rcx,    _r_             # maj retenue
        movq   _c_,     %rsi
        movq   _k_,     %rcx
        movq   _q_,     %rdx
        call   .Lsn_finc                # c <- c + b[q..p-1]
        adcq   %rcx,    _r_             # maj retenue
        movq   _b_,     %rbx
        movq   _q_,     %rcx
        movq   _k_,     %rdx
	leaq   (_c_,_k_,8), %rsi        # rsi <- &c[k]
	movq   %rsi,    %rdi            # rdi <- &c[k]
        subq   %rdx,    %rcx            # rcx <- q-k
        call   .Lsn_fadd_1              # c[k..q-1] += b[0..q-k-1]
        adcq   %rcx,    _r_             # maj retenue
        movq   _c_,     %rsi
        movq   _c_,     %rdi
        leaq   (,_k_,2), %rcx           # rcx <- 2k
        call   .Lsn_fadd_1              # c <- c + b[q-k..p-1]
	leaq   (_c_,_q_,8), %rsi        # rsi <- &c[q]
        jc     2f
	leaq   (,_k_,2), %rcx           # rcx <- 2k-q
	subq   _q_,      %rcx
1:
        subq   $1,  (%rsi,%rcx,8)
        jnb    2f
        incq   %rcx
        jne    1b
        decq   _r_                      # maj retenue
        ALIGN(8)
2:
        movq   _q_,     %rcx            # rcx <- -q
        negq   %rcx
        movq   _r_,     %rax
        incq   %rax                     # rax <- retenue + 1
        jz     6f
        js     4f

        addq   %rax,    (%rsi,%rcx,8)   # recycle la retenue positive
        jnc    6f
        incq   %rcx
3:
        incq   (%rsi,%rcx,8)
        jne    6f
        incq   %rcx
        jne    3b
        movq   _q_,     %rcx
        negq   %rcx
        jmp    3b
        ALIGN(8)
4:
	addq   %rax,   (%rsi,%rcx,8)   # recycle la retenue ngative
        jc     6f
        incq   %rcx
5:
        subq   $1,     (%rsi,%rcx,8)
        jnb    6f
        incq   %rcx
        jne    5b
        movq   _q_,     %rcx
        negq   %rcx
        jmp    5b
        ALIGN(8)
6:

        # c = 0 mod (BASE^q - 1) ?
        movq   _c_,     %rdi
        movq   (%rdi),  %rax            # rax <- c[0]
        testq  %rax,    %rax            # c[0] = 0 ?
        jz     1f
        incq   %rax                     # c[0] = BASE-1 ?
        jnz    L(c_non_nul)
        decq   %rax
1:
        movq   _q_,     %rcx
        decq   %rcx                     # rcx <- q-1
        leaq   8(%rdi), %rdi            # rdi <- &c[1]
        repe   scasq                    # compare les autres chiffres  c[0]
        jne    L(c_non_nul)
        testq  %rax,    %rax            # si c = 0, alors c <- BASE^q - 1
        jne    2f
        movq   _c_,     %rdi
        movq   _q_,     %rcx
        decq   %rax
        rep    stosq
2:
        movq   _b_,     %rsi
        movq   _p_,     %rcx
        addq   _q_,     %rcx            # rcx <- p+q
        jmp    L(inc_b)                 # c:b += 1
        ALIGN(8)

        # si c <> 0 mod (BASE^q - 1),
        # alors c:b <- b + (BASE^p - 1)*(c/(1-BASE^(2*k)) - 1) + BASE^q
L(c_non_nul):
        movq   _c_,     %rbx
        movq   _q_,     %rcx
	leaq   (,_k_,2), %rdx           # rdx <- 2k
        leaq   (_c_,%rdx,8), %rsi       # rsi <- &c[2k]
	movq   %rsi,    %rdi            # rdi <- &c[2k]
        subq   %rdx,    %rcx            # rcx <- q-2k
        call   .Lsn_fadd_1              # c <- c/(1-BASE^(2k))
        movq   _q_,     %rcx
        not    %rcx
1:
        incq   %rcx
        subq   $1,      (%rsi,%rcx,8)   # c <- c-1
        jb     1b
        movq   _b_,     %rsi
	movq   _b_,     %rdi
        movq   _c_,     %rbx
        movq   _q_,     %rcx
        call   .Lsn_fsub_1              # b[0..q-1] <- b[0..q-1] - c
        jb     3f                       # s il y a retenue, n ajoute pas BASE^q
# passage non test
#ifdef debug_sjoin
	TRACE("sjoin3, checkpoint 1\n")
#endif
        movq   _p_,     %rcx
L(inc_b):
        leaq   (%rsi,%rcx,8), %rsi      # rsi <- &c[q]
        negq   %rcx
2:
        incq   (%rsi,%rcx,8)            # c:b += BASE^q
        jne    3f
        incq   %rcx
        jne    2b
3:

        # c:b:a <- a + (BASE^n - 1)*(c:b)/(1-BASE^k)
        leaq   (,_q_,2), %rcx           # rcx <- 2q
        movq   _k_,     %rdx
        movq   _b_,     %rbx
        leaq   (_b_,_k_,8), %rsi        # rsi <- &b[k]
	movq   %rsi,    %rdi
        call   .Lsn_fadd_1              # b:c <- b:c/(1 - BASE^k)
        movq   _a_,     %rsi
        movq   _b_,     %rbx
        movq   _p_,     %rcx
        addq   _q_,     %rcx            # rcx <- p+q
	movq   _a_,     %rdi
        call   .Lsn_fsub_1              # a[0..p+q-1] -= b:c
        jnb    2f
1:
# passage non test
#ifdef debug_sjoin
	TRACE("sjoin3, checkpoint 2\n")
#endif
        subq   $1,      (%rsi)          # propage la retenue
        leaq   8(%rsi), %rsi
        jb     1b
2:
        RETURN_WITH_SP

#endif /* assembly_sn_sjoin3 */
