18 #include "runtime.hpp"
19 #include "integer.hpp"
20 #include "modarith.hpp"
29 #if defined(_M_X64) || defined(_M_IA64)
31 #pragma intrinsic(_umul128)
41 #ifdef SSE2_INTRINSICS_AVAILABLE
43 #include <xmmintrin.h>
44 #ifdef TAOCRYPT_MEMALIGN_AVAILABLE
50 #include <emmintrin.h>
52 #elif defined(_MSC_VER) && defined(_M_IX86)
53 #pragma message("You do not seem to have the Visual C++ Processor Pack ")
54 #pragma message("installed, so use of SSE2 intrinsics will be disabled.")
55 #elif defined(__GNUC__) && defined(__i386__)
65 #ifdef SSE2_INTRINSICS_AVAILABLE
68 CPP_TYPENAME AlignedAllocator<T>::pointer AlignedAllocator<T>::allocate(
69 size_type
n,
const void *)
78 #ifdef TAOCRYPT_MM_MALLOC_AVAILABLE
79 p = _mm_malloc(
sizeof(T)*n, 16);
80 #elif defined(TAOCRYPT_MEMALIGN_AVAILABLE)
81 p = memalign(16,
sizeof(T)*n);
82 #elif defined(TAOCRYPT_MALLOC_ALIGNMENT_IS_16)
83 p = malloc(
sizeof(T)*n);
85 p = (byte *)malloc(
sizeof(T)*n + 8);
89 #ifdef TAOCRYPT_NO_ALIGNED_ALLOC
91 if (!IsAlignedOn(p, 16))
104 void AlignedAllocator<T>::deallocate(
void* p, size_type n)
106 memset(p, 0, n*
sizeof(T));
109 #ifdef TAOCRYPT_MM_MALLOC_AVAILABLE
111 #elif defined(TAOCRYPT_NO_ALIGNED_ALLOC)
119 tcArrayDelete((T *)p);
135 #ifdef TAOCRYPT_NATIVE_DWORD_AVAILABLE
136 explicit DWord(word low)
141 explicit DWord(word low)
148 DWord(word low, word high)
154 static DWord Multiply(word a, word b)
158 #ifdef TAOCRYPT_NATIVE_DWORD_AVAILABLE
159 r.whole_ = (dword)a * b;
161 #elif defined(_M_X64) || defined(_M_IA64)
162 r.halfs_.low = _umul128(a, b, &r.halfs_.high);
164 #elif defined(__alpha__)
167 __asm__(
"umulh %1,%2,%0" :
"=r" (r.halfs_.high)
169 #elif defined(__DECCXX)
170 r.halfs_.high =
asm(
"umulh %a0, %a1, %v0", a, b);
172 #error unknown alpha compiler
175 #elif defined(__ia64__)
177 __asm__(
"xmpy.hu %0=%1,%2" :
"=f" (r.halfs_.high)
180 #elif defined(_ARCH_PPC64)
182 __asm__(
"mulhdu %0,%1,%2" :
"=r" (r.halfs_.high)
183 :
"r" (a),
"r" (b) :
"cc");
185 #elif defined(__x86_64__)
186 __asm__(
"mulq %3" :
"=d" (r.halfs_.high),
"=a" (r.halfs_.low) :
187 "a" (a),
"rm" (b) :
"cc");
189 #elif defined(__mips64)
190 __asm__(
"dmultu %2,%3" :
"=h" (r.halfs_.high),
"=l" (r.halfs_.low)
193 #elif defined(_M_IX86)
195 word64 t = (word64)a * b;
196 r.halfs_.high = ((word32 *)(&t))[1];
197 r.halfs_.low = (word32)t;
199 #error can not implement DWord
205 static DWord MultiplyAndAdd(word a, word b, word c)
207 DWord r = Multiply(a, b);
211 DWord & operator+=(word a)
213 #ifdef TAOCRYPT_NATIVE_DWORD_AVAILABLE
217 halfs_.high += (halfs_.low < a);
222 DWord operator+(word a)
225 #ifdef TAOCRYPT_NATIVE_DWORD_AVAILABLE
226 r.whole_ = whole_ + a;
228 r.halfs_.low = halfs_.low + a;
229 r.halfs_.high = halfs_.high + (r.halfs_.low < a);
237 #ifdef TAOCRYPT_NATIVE_DWORD_AVAILABLE
238 r.whole_ = whole_ - a.whole_;
240 r.halfs_.low = halfs_.low - a.halfs_.low;
241 r.halfs_.high = halfs_.high - a.halfs_.high -
242 (r.halfs_.low > halfs_.low);
247 DWord operator-(word a)
250 #ifdef TAOCRYPT_NATIVE_DWORD_AVAILABLE
251 r.whole_ = whole_ - a;
253 r.halfs_.low = halfs_.low - a;
254 r.halfs_.high = halfs_.high - (r.halfs_.low > halfs_.low);
260 word operator/(word divisor);
262 word operator%(word a);
264 bool operator!()
const
266 #ifdef TAOCRYPT_NATIVE_DWORD_AVAILABLE
269 return !halfs_.high && !halfs_.low;
273 word GetLowHalf()
const {
return halfs_.low;}
274 word GetHighHalf()
const {
return halfs_.high;}
275 word GetHighHalfAsBorrow()
const {
return 0-halfs_.high;}
280 #ifdef LITTLE_ENDIAN_ORDER
291 #ifdef TAOCRYPT_NATIVE_DWORD_AVAILABLE
294 struct dword_struct halfs_;
308 Word(hword low, hword high)
310 whole_ = low | (
word(high) << (WORD_BITS/2));
313 static Word Multiply(hword a, hword b)
316 r.whole_ = (
word)a * b;
323 r.whole_ = whole_ - a.whole_;
327 Word operator-(hword a)
330 r.whole_ = whole_ - a;
335 hword operator/(hword divisor)
337 return hword(whole_ / divisor);
340 bool operator!()
const
345 word GetWhole()
const {
return whole_;}
346 hword GetLowHalf()
const {
return hword(whole_);}
347 hword GetHighHalf()
const {
return hword(whole_>>(WORD_BITS/2));}
348 hword GetHighHalfAsBorrow()
const {
return 0-hword(whole_>>(WORD_BITS/2));}
357 template <
class S,
class D>
358 S DivideThreeWordsByTwo(S*
A, S
B0, S
B1, D* dummy_VC6_WorkAround = 0)
365 Q = D(A[1], A[2]) / S(B1+1);
368 D p = D::Multiply(B0, Q);
369 D u = (D) A[0] - p.GetLowHalf();
370 A[0] = u.GetLowHalf();
371 u = (D) A[1] - p.GetHighHalf() - u.GetHighHalfAsBorrow() -
373 A[1] = u.GetLowHalf();
374 A[2] += u.GetHighHalf();
377 while (A[2] || A[1] > B1 || (A[1]==B1 && A[0]>=B0))
380 A[0] = u.GetLowHalf();
381 u = (D) A[1] - B1 - u.GetHighHalfAsBorrow();
382 A[1] = u.GetLowHalf();
383 A[2] += u.GetHighHalf();
392 template <
class S,
class D>
393 inline D DivideFourWordsByTwo(S *T,
const D &Al,
const D &Ah,
const D &B)
396 return D(Ah.GetLowHalf(), Ah.GetHighHalf());
400 T[0] = Al.GetLowHalf();
401 T[1] = Al.GetHighHalf();
402 T[2] = Ah.GetLowHalf();
403 T[3] = Ah.GetHighHalf();
404 Q[1] = DivideThreeWordsByTwo<S, D>(T+1, B.GetLowHalf(),
406 Q[0] = DivideThreeWordsByTwo<S, D>(T, B.GetLowHalf(), B.GetHighHalf());
407 return D(Q[0], Q[1]);
413 inline word DWord::operator/(
word a)
415 #ifdef TAOCRYPT_NATIVE_DWORD_AVAILABLE
416 return word(whole_ / a);
419 return DivideFourWordsByTwo<hword, Word>(r, halfs_.low,
420 halfs_.high, a).GetWhole();
424 inline word DWord::operator%(
word a)
426 #ifdef TAOCRYPT_NATIVE_DWORD_AVAILABLE
427 return word(whole_ % a);
429 if (a < (
word(1) << (WORD_BITS/2)))
432 word r = halfs_.high % h;
433 r = ((halfs_.low >> (WORD_BITS/2)) + (r << (WORD_BITS/2))) % h;
434 return hword((hword(halfs_.low) + (r << (WORD_BITS/2))) % h);
439 DivideFourWordsByTwo<hword, Word>(r, halfs_.low, halfs_.high, a);
440 return Word(r[0], r[1]).GetWhole();
453 static const unsigned int RoundupSizeTable[] = {2, 2, 2, 4, 4, 8, 8, 8, 8};
455 static inline unsigned int RoundupSize(
unsigned int n)
458 return RoundupSizeTable[
n];
465 else return 1
U << BitPrecision(n-1);
469 static int Compare(
const word *A,
const word *B,
unsigned int N)
474 else if (A[N] < B[N])
480 static word Increment(
word *A,
unsigned int N,
word B=1)
486 for (
unsigned i=1;
i<N;
i++)
492 static word Decrement(
word *A,
unsigned int N,
word B=1)
498 for (
unsigned i=1;
i<N;
i++)
504 static void TwosComplement(
word *A,
unsigned int N)
507 for (
unsigned i=0;
i<N;
i++)
515 for(
unsigned i=0;
i<N;
i++)
517 DWord p = DWord::MultiplyAndAdd(A[
i], B, carry);
518 C[
i] = p.GetLowHalf();
519 carry = p.GetHighHalf();
525 static word AtomicInverseModPower2(
word A)
529 for (
unsigned i=3;
i<WORD_BITS;
i*=2)
541 static word TAOCRYPT_CDECL Add(word *C,
const word *A,
const word *B,
543 static word TAOCRYPT_CDECL Subtract(word *C,
const word *A,
const word*B,
545 static void TAOCRYPT_CDECL Multiply2(word *C,
const word *A,
const word *B);
546 static word TAOCRYPT_CDECL Multiply2Add(word *C,
547 const word *A,
const word *B);
548 static void TAOCRYPT_CDECL Multiply4(word *C,
const word *A,
const word *B);
549 static void TAOCRYPT_CDECL Multiply8(word *C,
const word *A,
const word *B);
550 static unsigned int TAOCRYPT_CDECL MultiplyRecursionLimit() {
return 8;}
552 static void TAOCRYPT_CDECL Multiply2Bottom(word *C,
const word *A,
554 static void TAOCRYPT_CDECL Multiply4Bottom(word *C,
const word *A,
556 static void TAOCRYPT_CDECL Multiply8Bottom(word *C,
const word *A,
558 static unsigned int TAOCRYPT_CDECL MultiplyBottomRecursionLimit(){
return 8;}
560 static void TAOCRYPT_CDECL Square2(word *R,
const word *A);
561 static void TAOCRYPT_CDECL Square4(word *R,
const word *A);
562 static unsigned int TAOCRYPT_CDECL SquareRecursionLimit() {
return 4;}
565 word Portable::Add(word *C,
const word *A,
const word *B,
unsigned int N)
568 for (
unsigned int i = 0;
i < N;
i+=2)
570 u =
DWord(A[
i]) + B[
i] + u.GetHighHalf();
571 C[
i] = u.GetLowHalf();
572 u =
DWord(A[
i+1]) + B[
i+1] + u.GetHighHalf();
573 C[
i+1] = u.GetLowHalf();
575 return u.GetHighHalf();
578 word Portable::Subtract(
word *C,
const word *A,
const word *B,
unsigned int N)
581 for (
unsigned int i = 0;
i < N;
i+=2)
583 u = (DWord) A[
i] - B[
i] - u.GetHighHalfAsBorrow();
584 C[
i] = u.GetLowHalf();
585 u = (DWord) A[
i+1] - B[
i+1] - u.GetHighHalfAsBorrow();
586 C[
i+1] = u.GetLowHalf();
588 return 0-u.GetHighHalf();
591 void Portable::Multiply2(
word *C,
const word *A,
const word *B)
621 word D[4] = {A[1]-A[0], A[0]-A[1], B[0]-B[1], B[1]-B[0]};
622 unsigned int ai = A[1] < A[0];
623 unsigned int bi = B[0] < B[1];
624 unsigned int di = ai & bi;
625 DWord d = DWord::Multiply(D[di], D[di+2]);
627 unsigned int si = ai + !bi;
630 DWord A0B0 = DWord::Multiply(A[0], B[0]);
631 C[0] = A0B0.GetLowHalf();
633 DWord A1B1 = DWord::Multiply(A[1], B[1]);
634 DWord t = (DWord) A0B0.GetHighHalf() + A0B0.GetLowHalf() + d.GetLowHalf()
636 C[1] = t.GetLowHalf();
638 t = A1B1 + t.GetHighHalf() + A0B0.GetHighHalf() + d.GetHighHalf()
639 + A1B1.GetHighHalf() - s;
640 C[2] = t.GetLowHalf();
641 C[3] = t.GetHighHalf();
644 void Portable::Multiply2Bottom(
word *C,
const word *A,
const word *B)
646 DWord t = DWord::Multiply(A[0], B[0]);
647 C[0] = t.GetLowHalf();
648 C[1] = t.GetHighHalf() + A[0]*B[1] + A[1]*B[0];
653 word D[4] = {A[1]-A[0], A[0]-A[1], B[0]-B[1], B[1]-B[0]};
654 unsigned int ai = A[1] < A[0];
655 unsigned int bi = B[0] < B[1];
656 unsigned int di = ai & bi;
657 DWord d = DWord::Multiply(D[di], D[di+2]);
659 unsigned int si = ai + !bi;
662 DWord A0B0 = DWord::Multiply(A[0], B[0]);
663 DWord t = A0B0 + C[0];
664 C[0] = t.GetLowHalf();
666 DWord A1B1 = DWord::Multiply(A[1], B[1]);
667 t = (DWord) t.GetHighHalf() + A0B0.GetLowHalf() + d.GetLowHalf() +
668 A1B1.GetLowHalf() + C[1];
669 C[1] = t.GetLowHalf();
671 t = (DWord) t.GetHighHalf() + A1B1.GetLowHalf() + A0B0.GetHighHalf() +
672 d.GetHighHalf() + A1B1.GetHighHalf() - s + C[2];
673 C[2] = t.GetLowHalf();
675 t = (DWord) t.GetHighHalf() + A1B1.GetHighHalf() + C[3];
676 C[3] = t.GetLowHalf();
677 return t.GetHighHalf();
681 #define MulAcc(x, y) \
682 p = DWord::MultiplyAndAdd(A[x], B[y], c); \
683 c = p.GetLowHalf(); \
684 p = (DWord) d + p.GetHighHalf(); \
685 d = p.GetLowHalf(); \
686 e += p.GetHighHalf();
688 #define SaveMulAcc(s, x, y) \
690 p = DWord::MultiplyAndAdd(A[x], B[y], d); \
691 c = p.GetLowHalf(); \
692 p = (DWord) e + p.GetHighHalf(); \
693 d = p.GetLowHalf(); \
696 #define SquAcc(x, y) \
697 q = DWord::Multiply(A[x], A[y]); \
699 c = p.GetLowHalf(); \
700 p = (DWord) d + p.GetHighHalf(); \
701 d = p.GetLowHalf(); \
702 e += p.GetHighHalf(); \
704 c = p.GetLowHalf(); \
705 p = (DWord) d + p.GetHighHalf(); \
706 d = p.GetLowHalf(); \
707 e += p.GetHighHalf();
709 #define SaveSquAcc(s, x, y) \
711 q = DWord::Multiply(A[x], A[y]); \
713 c = p.GetLowHalf(); \
714 p = (DWord) e + p.GetHighHalf(); \
715 d = p.GetLowHalf(); \
716 e = p.GetHighHalf(); \
718 c = p.GetLowHalf(); \
719 p = (DWord) d + p.GetHighHalf(); \
720 d = p.GetLowHalf(); \
721 e += p.GetHighHalf();
724 void Portable::Multiply4(
word *R,
const word *A,
const word *B)
729 p = DWord::Multiply(A[0], B[0]);
730 R[0] = p.GetLowHalf();
754 p = DWord::MultiplyAndAdd(A[3], B[3], d);
755 R[6] = p.GetLowHalf();
756 R[7] = e + p.GetHighHalf();
759 void Portable::Square2(
word *R,
const word *A)
764 p = DWord::Multiply(A[0], A[0]);
765 R[0] = p.GetLowHalf();
772 p = DWord::MultiplyAndAdd(A[1], A[1], d);
773 R[2] = p.GetLowHalf();
774 R[3] = e + p.GetHighHalf();
777 void Portable::Square4(
word *R,
const word *A)
790 p = DWord::Multiply(A[0], A[0]);
791 R[0] = p.GetLowHalf();
809 p = DWord::MultiplyAndAdd(A[3], A[3], d);
810 R[6] = p.GetLowHalf();
811 R[7] = e + p.GetHighHalf();
815 void Portable::Multiply8(
word *R,
const word *A,
const word *B)
820 p = DWord::Multiply(A[0], B[0]);
821 R[0] = p.GetLowHalf();
888 SaveMulAcc(10, 4, 7);
893 SaveMulAcc(11, 5, 7);
897 SaveMulAcc(12, 6, 7);
901 p = DWord::MultiplyAndAdd(A[7], B[7], d);
902 R[14] = p.GetLowHalf();
903 R[15] = e + p.GetHighHalf();
906 void Portable::Multiply4Bottom(
word *R,
const word *A,
const word *B)
911 p = DWord::Multiply(A[0], B[0]);
912 R[0] = p.GetLowHalf();
924 R[3] = d + A[0] * B[3] + A[1] * B[2] + A[2] * B[1] + A[3] * B[0];
927 void Portable::Multiply8Bottom(
word *R,
const word *A,
const word *B)
932 p = DWord::Multiply(A[0], B[0]);
933 R[0] = p.GetLowHalf();
971 R[7] = d + A[0] * B[7] + A[1] * B[6] + A[2] * B[5] + A[3] * B[4] +
972 A[4] * B[3] + A[5] * B[2] + A[6] * B[1] + A[7] * B[0];
983 #ifdef TAOCRYPT_X86ASM_AVAILABLE
988 #ifdef SSE2_INTRINSICS_AVAILABLE
991 static jmp_buf s_env;
992 static void SigIllHandler(
int)
998 static bool HasSSE2()
1005 if ((cpuid[3] & (1 << 26)) == 0)
1011 __asm xorpd xmm0, xmm0
1019 typedef void (*SigHandler)(int);
1021 SigHandler oldHandler = signal(SIGILL, SigIllHandler);
1022 if (oldHandler == SIG_ERR)
1029 __asm __volatile (
"xorpd %xmm0, %xmm0");
1031 signal(SIGILL, oldHandler);
1035 #endif // SSE2_INTRINSICS_AVAILABLE
1046 return ((cpuid[0] >> 8) & 0xf) == 0xf;
1051 class PentiumOptimized :
public Portable
1058 static void TAOCRYPT_CDECL Multiply4(
word *C,
const word *A,
1060 static void TAOCRYPT_CDECL Multiply8(
word *C,
const word *A,
1062 static void TAOCRYPT_CDECL Multiply8Bottom(
word *C,
const word *A,
1073 #ifdef SSE2_INTRINSICS_AVAILABLE
1074 static void TAOCRYPT_CDECL Multiply4(
word *C,
const word *A,
1076 static void TAOCRYPT_CDECL Multiply8(
word *C,
const word *A,
1078 static void TAOCRYPT_CDECL Multiply8Bottom(
word *C,
const word *A,
1083 typedef word (TAOCRYPT_CDECL * PAddSub)(
word *C,
const word *A,
const word *B,
1085 typedef void (TAOCRYPT_CDECL * PMul)(
word *C,
const word *A,
const word *B);
1087 static PAddSub s_pAdd, s_pSub;
1088 #ifdef SSE2_INTRINSICS_AVAILABLE
1089 static PMul s_pMul4, s_pMul8, s_pMul8B;
1092 static void SetPentiumFunctionPointers()
1096 s_pAdd = &Portable::Add;
1097 s_pSub = &Portable::Subtract;
1101 s_pAdd = &P4Optimized::Add;
1102 s_pSub = &P4Optimized::Subtract;
1106 s_pAdd = &PentiumOptimized::Add;
1107 s_pSub = &PentiumOptimized::Subtract;
1110 #ifdef SSE2_INTRINSICS_AVAILABLE
1113 s_pMul4 = &Portable::Multiply4;
1114 s_pMul8 = &Portable::Multiply8;
1115 s_pMul8B = &Portable::Multiply8Bottom;
1119 s_pMul4 = &P4Optimized::Multiply4;
1120 s_pMul8 = &P4Optimized::Multiply8;
1121 s_pMul8B = &P4Optimized::Multiply8Bottom;
1125 s_pMul4 = &PentiumOptimized::Multiply4;
1126 s_pMul8 = &PentiumOptimized::Multiply8;
1127 s_pMul8B = &PentiumOptimized::Multiply8Bottom;
1132 static const char s_RunAtStartupSetPentiumFunctionPointers =
1133 (SetPentiumFunctionPointers(), 0);
1136 class LowLevel :
public PentiumOptimized
1141 {
return s_pAdd(C, A, B, N);}
1144 {
return s_pSub(C, A, B, N);}
1145 inline static void Square4(
word *R,
const word *A)
1146 {Multiply4(R, A, A);}
1147 #ifdef SSE2_INTRINSICS_AVAILABLE
1148 inline static void Multiply4(
word *C,
const word *A,
const word *B)
1150 inline static void Multiply8(
word *C,
const word *A,
const word *B)
1152 inline static void Multiply8Bottom(
word *C,
const word *A,
const word *B)
1153 {s_pMul8B(C, A, B);}
1159 #define TAOCRYPT_NAKED __declspec(naked)
1160 #define AS1(x) __asm x
1161 #define AS2(x, y) __asm x, y
1162 #define AddPrologue \
1167 __asm mov ecx, [esp+20] \
1168 __asm mov edx, [esp+24] \
1169 __asm mov ebx, [esp+28] \
1170 __asm mov esi, [esp+32]
1171 #define AddEpilogue \
1177 #define MulPrologue \
1182 __asm mov ecx, [esp+28] \
1183 __asm mov esi, [esp+24] \
1185 #define MulEpilogue \
1193 #define TAOCRYPT_NAKED
1194 #define AS1(x) #x ";"
1195 #define AS2(x, y) #x ", " #y ";"
1196 #define AddPrologue \
1198 __asm__ __volatile__ \
1202 ".intel_syntax noprefix;" \
1204 #define AddEpilogue \
1206 ".att_syntax prefix;" \
1210 : "c" (C), "d" (A), "m" (B), "S" (N) \
1211 : "%edi", "memory", "cc" \
1215 #define MulPrologue \
1216 __asm__ __volatile__ \
1221 ".intel_syntax noprefix;"
1222 #define MulEpilogue \
1226 ".att_syntax prefix;" \
1228 : "rm" (Z), "S" (X), "c" (Y) \
1229 : "%eax", "%edx", "%edi", "memory", "cc" \
1233 TAOCRYPT_NAKED
word PentiumOptimized::Add(
word *C,
const word *A,
1234 const word *B,
unsigned int N)
1244 AS2( lea ebx, [ebx+4*esi])
1252 AS2( mov ebp,[edx+4])
1254 AS2( mov edi,[ebx+8*eax])
1255 AS2( lea edx,[edx+8])
1258 AS2( mov edi,[ebx+8*eax+4])
1263 AS2( mov [edx+ecx-8],esi)
1264 AS2( mov [edx+ecx-4],ebp)
1266 AS1( jnz loopstartAdd)
1274 TAOCRYPT_NAKED
word PentiumOptimized::Subtract(
word *C, const
word *A,
1275 const
word *B,
unsigned int N)
1285 AS2( lea ebx, [ebx+4*esi])
1293 AS2( mov ebp,[edx+4])
1295 AS2( mov edi,[ebx+8*eax])
1296 AS2( lea edx,[edx+8])
1299 AS2( mov edi,[ebx+8*eax+4])
1304 AS2( mov [edx+ecx-8],esi)
1305 AS2( mov [edx+ecx-4],ebp)
1307 AS1( jnz loopstartSub)
1317 TAOCRYPT_NAKED
word P4Optimized::Add(
word *C, const
word *A, const
word *B,
1325 AS1( jz loopendAddP4)
1327 AS2( mov edi, [edx])
1328 AS2( mov ebp, [ebx])
1329 AS1( jmp carry1AddP4)
1331 AS1(loopstartAddP4:)
1332 AS2( mov edi, [edx+8])
1335 AS2( mov ebp, [ebx])
1337 AS1( jc carry1AddP4)
1343 AS2( mov [ecx], edi)
1344 AS2( mov edi, [edx+4])
1345 AS2( cmovc eax, ebp)
1346 AS2( mov ebp, [ebx+4])
1349 AS1( jc carry2AddP4)
1355 AS2( cmovc eax, ebp)
1356 AS2( mov [ecx+4], edi)
1358 AS1( jnz loopstartAddP4)
1365 TAOCRYPT_NAKED
word P4Optimized::Subtract(
word *C, const
word *A,
1366 const
word *B,
unsigned int N)
1373 AS1( jz loopendSubP4)
1375 AS2( mov edi, [edx])
1376 AS2( mov ebp, [ebx])
1377 AS1( jmp carry1SubP4)
1379 AS1(loopstartSubP4:)
1380 AS2( mov edi, [edx+8])
1383 AS2( mov ebp, [ebx])
1385 AS1( jc carry1SubP4)
1391 AS2( mov [ecx], edi)
1392 AS2( mov edi, [edx+4])
1393 AS2( cmovc eax, ebp)
1394 AS2( mov ebp, [ebx+4])
1397 AS1( jc carry2SubP4)
1403 AS2( cmovc eax, ebp)
1404 AS2( mov [ecx+4], edi)
1406 AS1( jnz loopstartSubP4)
1415 #define MulStartup \
1420 #define MulShiftCarry \
1425 #define MulAccumulateBottom(i,j) \
1426 AS2(mov eax, [ecx+4*j]) \
1427 AS2(imul eax, dword ptr [esi+4*i]) \
1430 #define MulAccumulate(i,j) \
1431 AS2(mov eax, [ecx+4*j]) \
1432 AS1(mul dword ptr [esi+4*i]) \
1437 #define MulStoreDigit(i) \
1439 AS2(mov edi, [esp]) \
1440 AS2(mov [edi+4*i], ebp)
1442 #define MulLastDiagonal(digits) \
1443 AS2(mov eax, [ecx+4*(digits-1)]) \
1444 AS1(mul dword ptr [esi+4*(digits-1)]) \
1447 AS2(mov edi, [esp]) \
1448 AS2(mov [edi+4*(2*digits-2)], ebp) \
1449 AS2(mov [edi+4*(2*digits-1)], edx)
1451 TAOCRYPT_NAKED
void PentiumOptimized::Multiply4(
word* Z,
const word* X,
1494 TAOCRYPT_NAKED
void PentiumOptimized::Multiply8(
word* Z, const
word* X,
1609 TAOCRYPT_NAKED
void PentiumOptimized::Multiply8Bottom(
word* Z, const
word* X,
1664 MulAccumulateBottom(7,0)
1665 MulAccumulateBottom(6,1)
1666 MulAccumulateBottom(5,2)
1667 MulAccumulateBottom(4,3)
1668 MulAccumulateBottom(3,4)
1669 MulAccumulateBottom(2,5)
1670 MulAccumulateBottom(1,6)
1671 MulAccumulateBottom(0,7)
1679 #else // not x86 - no processor specific code at this layer
1681 typedef Portable LowLevel;
1685 #ifdef SSE2_INTRINSICS_AVAILABLE
1688 #define TAOCRYPT_FASTCALL
1690 #define TAOCRYPT_FASTCALL __fastcall
1693 static void TAOCRYPT_FASTCALL P4_Mul(__m128i *C,
const __m128i *A,
1696 __m128i a3210 = _mm_load_si128(A);
1697 __m128i b3210 = _mm_load_si128(B);
1701 __m128i z = _mm_setzero_si128();
1702 __m128i a2b2_a0b0 = _mm_mul_epu32(a3210, b3210);
1705 __m128i a3120 = _mm_shuffle_epi32(a3210, _MM_SHUFFLE(3, 1, 2, 0));
1706 __m128i b3021 = _mm_shuffle_epi32(b3210, _MM_SHUFFLE(3, 0, 2, 1));
1707 __m128i a1b0_a0b1 = _mm_mul_epu32(a3120, b3021);
1708 __m128i a1b0 = _mm_unpackhi_epi32(a1b0_a0b1, z);
1709 __m128i a0b1 = _mm_unpacklo_epi32(a1b0_a0b1, z);
1710 C[1] = _mm_add_epi64(a1b0, a0b1);
1712 __m128i a31 = _mm_srli_epi64(a3210, 32);
1713 __m128i b31 = _mm_srli_epi64(b3210, 32);
1714 __m128i a3b3_a1b1 = _mm_mul_epu32(a31, b31);
1717 __m128i a1b1 = _mm_unpacklo_epi32(a3b3_a1b1, z);
1718 __m128i b3012 = _mm_shuffle_epi32(b3210, _MM_SHUFFLE(3, 0, 1, 2));
1719 __m128i a2b0_a0b2 = _mm_mul_epu32(a3210, b3012);
1720 __m128i a0b2 = _mm_unpacklo_epi32(a2b0_a0b2, z);
1721 __m128i a2b0 = _mm_unpackhi_epi32(a2b0_a0b2, z);
1722 sum = _mm_add_epi64(a1b1, a0b2);
1723 C[2] = _mm_add_epi64(sum, a2b0);
1725 __m128i a2301 = _mm_shuffle_epi32(a3210, _MM_SHUFFLE(2, 3, 0, 1));
1726 __m128i b2103 = _mm_shuffle_epi32(b3210, _MM_SHUFFLE(2, 1, 0, 3));
1727 __m128i a3b0_a1b2 = _mm_mul_epu32(a2301, b3012);
1728 __m128i a2b1_a0b3 = _mm_mul_epu32(a3210, b2103);
1729 __m128i a3b0 = _mm_unpackhi_epi32(a3b0_a1b2, z);
1730 __m128i a1b2 = _mm_unpacklo_epi32(a3b0_a1b2, z);
1731 __m128i a2b1 = _mm_unpackhi_epi32(a2b1_a0b3, z);
1732 __m128i a0b3 = _mm_unpacklo_epi32(a2b1_a0b3, z);
1733 __m128i sum1 = _mm_add_epi64(a3b0, a1b2);
1734 sum = _mm_add_epi64(a2b1, a0b3);
1735 C[3] = _mm_add_epi64(sum, sum1);
1737 __m128i a3b1_a1b3 = _mm_mul_epu32(a2301, b2103);
1738 __m128i a2b2 = _mm_unpackhi_epi32(a2b2_a0b0, z);
1739 __m128i a3b1 = _mm_unpackhi_epi32(a3b1_a1b3, z);
1740 __m128i a1b3 = _mm_unpacklo_epi32(a3b1_a1b3, z);
1741 sum = _mm_add_epi64(a2b2, a3b1);
1742 C[4] = _mm_add_epi64(sum, a1b3);
1744 __m128i a1302 = _mm_shuffle_epi32(a3210, _MM_SHUFFLE(1, 3, 0, 2));
1745 __m128i b1203 = _mm_shuffle_epi32(b3210, _MM_SHUFFLE(1, 2, 0, 3));
1746 __m128i a3b2_a2b3 = _mm_mul_epu32(a1302, b1203);
1747 __m128i a3b2 = _mm_unpackhi_epi32(a3b2_a2b3, z);
1748 __m128i a2b3 = _mm_unpacklo_epi32(a3b2_a2b3, z);
1749 C[5] = _mm_add_epi64(a3b2, a2b3);
1752 void P4Optimized::Multiply4(
word *C,
const word *A,
const word *B)
1756 const __m64 *mw = (__m64 *)w;
1758 P4_Mul(temp, (__m128i *)A, (__m128i *)B);
1764 __m64 w1 = _mm_cvtsi32_si64(w[1]);
1775 __m64 w26 = _mm_cvtsi32_si64(w[26]);
1777 s1 = _mm_add_si64(w1, w4);
1778 C[1] = _mm_cvtsi64_si32(s1);
1779 s1 = _mm_srli_si64(s1, 32);
1781 s2 = _mm_add_si64(w6, w8);
1782 s1 = _mm_add_si64(s1, s2);
1783 C[2] = _mm_cvtsi64_si32(s1);
1784 s1 = _mm_srli_si64(s1, 32);
1786 s2 = _mm_add_si64(w10, w12);
1787 s1 = _mm_add_si64(s1, s2);
1788 C[3] = _mm_cvtsi64_si32(s1);
1789 s1 = _mm_srli_si64(s1, 32);
1791 s2 = _mm_add_si64(w14, w16);
1792 s1 = _mm_add_si64(s1, s2);
1793 C[4] = _mm_cvtsi64_si32(s1);
1794 s1 = _mm_srli_si64(s1, 32);
1796 s2 = _mm_add_si64(w18, w20);
1797 s1 = _mm_add_si64(s1, s2);
1798 C[5] = _mm_cvtsi64_si32(s1);
1799 s1 = _mm_srli_si64(s1, 32);
1801 s2 = _mm_add_si64(w22, w26);
1802 s1 = _mm_add_si64(s1, s2);
1803 C[6] = _mm_cvtsi64_si32(s1);
1804 s1 = _mm_srli_si64(s1, 32);
1806 C[7] = _mm_cvtsi64_si32(s1) + w[27];
1810 void P4Optimized::Multiply8(
word *C,
const word *A,
const word *B)
1814 const __m64 *mw = (__m64 *)w;
1816 const __m64 *mx = (__m64 *)x;
1817 const word *y = (
word *)temp+7*4*2;
1818 const __m64 *my = (__m64 *)y;
1819 const word *z = (
word *)temp+7*4*3;
1820 const __m64 *mz = (__m64 *)z;
1822 P4_Mul(temp, (__m128i *)A, (__m128i *)B);
1824 P4_Mul(temp+7, (__m128i *)A+1, (__m128i *)B);
1826 P4_Mul(temp+14, (__m128i *)A, (__m128i *)B+1);
1828 P4_Mul(temp+21, (__m128i *)A+1, (__m128i *)B+1);
1832 __m64 s1, s2, s3, s4;
1834 __m64 w1 = _mm_cvtsi32_si64(w[1]);
1845 __m64 w26 = _mm_cvtsi32_si64(w[26]);
1846 __m64 w27 = _mm_cvtsi32_si64(w[27]);
1848 __m64 x0 = _mm_cvtsi32_si64(x[0]);
1849 __m64 x1 = _mm_cvtsi32_si64(x[1]);
1860 __m64 x26 = _mm_cvtsi32_si64(x[26]);
1861 __m64 x27 = _mm_cvtsi32_si64(x[27]);
1863 __m64 y0 = _mm_cvtsi32_si64(y[0]);
1864 __m64 y1 = _mm_cvtsi32_si64(y[1]);
1875 __m64 y26 = _mm_cvtsi32_si64(y[26]);
1876 __m64 y27 = _mm_cvtsi32_si64(y[27]);
1878 __m64 z0 = _mm_cvtsi32_si64(z[0]);
1879 __m64 z1 = _mm_cvtsi32_si64(z[1]);
1890 __m64 z26 = _mm_cvtsi32_si64(z[26]);
1892 s1 = _mm_add_si64(w1, w4);
1893 C[1] = _mm_cvtsi64_si32(s1);
1894 s1 = _mm_srli_si64(s1, 32);
1896 s2 = _mm_add_si64(w6, w8);
1897 s1 = _mm_add_si64(s1, s2);
1898 C[2] = _mm_cvtsi64_si32(s1);
1899 s1 = _mm_srli_si64(s1, 32);
1901 s2 = _mm_add_si64(w10, w12);
1902 s1 = _mm_add_si64(s1, s2);
1903 C[3] = _mm_cvtsi64_si32(s1);
1904 s1 = _mm_srli_si64(s1, 32);
1906 s3 = _mm_add_si64(x0, y0);
1907 s2 = _mm_add_si64(w14, w16);
1908 s1 = _mm_add_si64(s1, s3);
1909 s1 = _mm_add_si64(s1, s2);
1910 C[4] = _mm_cvtsi64_si32(s1);
1911 s1 = _mm_srli_si64(s1, 32);
1913 s3 = _mm_add_si64(x1, y1);
1914 s4 = _mm_add_si64(x4, y4);
1915 s1 = _mm_add_si64(s1, w18);
1916 s3 = _mm_add_si64(s3, s4);
1917 s1 = _mm_add_si64(s1, w20);
1918 s1 = _mm_add_si64(s1, s3);
1919 C[5] = _mm_cvtsi64_si32(s1);
1920 s1 = _mm_srli_si64(s1, 32);
1922 s3 = _mm_add_si64(x6, y6);
1923 s4 = _mm_add_si64(x8, y8);
1924 s1 = _mm_add_si64(s1, w22);
1925 s3 = _mm_add_si64(s3, s4);
1926 s1 = _mm_add_si64(s1, w26);
1927 s1 = _mm_add_si64(s1, s3);
1928 C[6] = _mm_cvtsi64_si32(s1);
1929 s1 = _mm_srli_si64(s1, 32);
1931 s3 = _mm_add_si64(x10, y10);
1932 s4 = _mm_add_si64(x12, y12);
1933 s1 = _mm_add_si64(s1, w27);
1934 s3 = _mm_add_si64(s3, s4);
1935 s1 = _mm_add_si64(s1, s3);
1936 C[7] = _mm_cvtsi64_si32(s1);
1937 s1 = _mm_srli_si64(s1, 32);
1939 s3 = _mm_add_si64(x14, y14);
1940 s4 = _mm_add_si64(x16, y16);
1941 s1 = _mm_add_si64(s1, z0);
1942 s3 = _mm_add_si64(s3, s4);
1943 s1 = _mm_add_si64(s1, s3);
1944 C[8] = _mm_cvtsi64_si32(s1);
1945 s1 = _mm_srli_si64(s1, 32);
1947 s3 = _mm_add_si64(x18, y18);
1948 s4 = _mm_add_si64(x20, y20);
1949 s1 = _mm_add_si64(s1, z1);
1950 s3 = _mm_add_si64(s3, s4);
1951 s1 = _mm_add_si64(s1, z4);
1952 s1 = _mm_add_si64(s1, s3);
1953 C[9] = _mm_cvtsi64_si32(s1);
1954 s1 = _mm_srli_si64(s1, 32);
1956 s3 = _mm_add_si64(x22, y22);
1957 s4 = _mm_add_si64(x26, y26);
1958 s1 = _mm_add_si64(s1, z6);
1959 s3 = _mm_add_si64(s3, s4);
1960 s1 = _mm_add_si64(s1, z8);
1961 s1 = _mm_add_si64(s1, s3);
1962 C[10] = _mm_cvtsi64_si32(s1);
1963 s1 = _mm_srli_si64(s1, 32);
1965 s3 = _mm_add_si64(x27, y27);
1966 s1 = _mm_add_si64(s1, z10);
1967 s1 = _mm_add_si64(s1, z12);
1968 s1 = _mm_add_si64(s1, s3);
1969 C[11] = _mm_cvtsi64_si32(s1);
1970 s1 = _mm_srli_si64(s1, 32);
1972 s3 = _mm_add_si64(z14, z16);
1973 s1 = _mm_add_si64(s1, s3);
1974 C[12] = _mm_cvtsi64_si32(s1);
1975 s1 = _mm_srli_si64(s1, 32);
1977 s3 = _mm_add_si64(z18, z20);
1978 s1 = _mm_add_si64(s1, s3);
1979 C[13] = _mm_cvtsi64_si32(s1);
1980 s1 = _mm_srli_si64(s1, 32);
1982 s3 = _mm_add_si64(z22, z26);
1983 s1 = _mm_add_si64(s1, s3);
1984 C[14] = _mm_cvtsi64_si32(s1);
1985 s1 = _mm_srli_si64(s1, 32);
1987 C[15] = z[27] + _mm_cvtsi64_si32(s1);
1991 void P4Optimized::Multiply8Bottom(
word *C,
const word *A,
const word *B)
1995 const __m64 *mw = (__m64 *)w;
1997 const __m64 *mx = (__m64 *)x;
1998 const word *y = (
word *)temp+7*4*2;
1999 const __m64 *my = (__m64 *)y;
2001 P4_Mul(temp, (__m128i *)A, (__m128i *)B);
2003 P4_Mul(temp+7, (__m128i *)A+1, (__m128i *)B);
2005 P4_Mul(temp+14, (__m128i *)A, (__m128i *)B+1);
2009 __m64 s1, s2, s3, s4;
2011 __m64 w1 = _mm_cvtsi32_si64(w[1]);
2022 __m64 w26 = _mm_cvtsi32_si64(w[26]);
2024 __m64 x0 = _mm_cvtsi32_si64(x[0]);
2025 __m64 x1 = _mm_cvtsi32_si64(x[1]);
2030 __m64 y0 = _mm_cvtsi32_si64(y[0]);
2031 __m64 y1 = _mm_cvtsi32_si64(y[1]);
2036 s1 = _mm_add_si64(w1, w4);
2037 C[1] = _mm_cvtsi64_si32(s1);
2038 s1 = _mm_srli_si64(s1, 32);
2040 s2 = _mm_add_si64(w6, w8);
2041 s1 = _mm_add_si64(s1, s2);
2042 C[2] = _mm_cvtsi64_si32(s1);
2043 s1 = _mm_srli_si64(s1, 32);
2045 s2 = _mm_add_si64(w10, w12);
2046 s1 = _mm_add_si64(s1, s2);
2047 C[3] = _mm_cvtsi64_si32(s1);
2048 s1 = _mm_srli_si64(s1, 32);
2050 s3 = _mm_add_si64(x0, y0);
2051 s2 = _mm_add_si64(w14, w16);
2052 s1 = _mm_add_si64(s1, s3);
2053 s1 = _mm_add_si64(s1, s2);
2054 C[4] = _mm_cvtsi64_si32(s1);
2055 s1 = _mm_srli_si64(s1, 32);
2057 s3 = _mm_add_si64(x1, y1);
2058 s4 = _mm_add_si64(x4, y4);
2059 s1 = _mm_add_si64(s1, w18);
2060 s3 = _mm_add_si64(s3, s4);
2061 s1 = _mm_add_si64(s1, w20);
2062 s1 = _mm_add_si64(s1, s3);
2063 C[5] = _mm_cvtsi64_si32(s1);
2064 s1 = _mm_srli_si64(s1, 32);
2066 s3 = _mm_add_si64(x6, y6);
2067 s4 = _mm_add_si64(x8, y8);
2068 s1 = _mm_add_si64(s1, w22);
2069 s3 = _mm_add_si64(s3, s4);
2070 s1 = _mm_add_si64(s1, w26);
2071 s1 = _mm_add_si64(s1, s3);
2072 C[6] = _mm_cvtsi64_si32(s1);
2073 s1 = _mm_srli_si64(s1, 32);
2075 C[7] = _mm_cvtsi64_si32(s1) + w[27] + x[10] + y[10] + x[12] + y[12];
2079 #endif // #ifdef SSE2_INTRINSICS_AVAILABLE
2111 if (LowLevel::MultiplyRecursionLimit() >= 8 && N==8)
2112 LowLevel::Multiply8(R, A, B);
2113 else if (LowLevel::MultiplyRecursionLimit() >= 4 && N==4)
2114 LowLevel::Multiply4(R, A, B);
2116 LowLevel::Multiply2(R, A, B);
2119 const unsigned int N2 = N/2;
2122 int aComp = Compare(A0, A1, N2);
2123 int bComp = Compare(B0, B1, N2);
2125 switch (2*aComp + aComp + bComp)
2128 LowLevel::Subtract(R0, A1, A0, N2);
2129 LowLevel::Subtract(R1, B0, B1, N2);
2130 RecursiveMultiply(T0,
T2, R0, R1, N2);
2131 LowLevel::Subtract(
T1,
T1, R0, N2);
2135 LowLevel::Subtract(R0, A1, A0, N2);
2136 LowLevel::Subtract(R1, B0, B1, N2);
2137 RecursiveMultiply(T0,
T2, R0, R1, N2);
2141 LowLevel::Subtract(R0, A0, A1, N2);
2142 LowLevel::Subtract(R1, B1, B0, N2);
2143 RecursiveMultiply(T0,
T2, R0, R1, N2);
2147 LowLevel::Subtract(R0, A1, A0, N2);
2148 LowLevel::Subtract(R1, B0, B1, N2);
2149 RecursiveMultiply(T0,
T2, R0, R1, N2);
2150 LowLevel::Subtract(
T1,
T1, R1, N2);
2158 RecursiveMultiply(R0,
T2, A0, B0, N2);
2159 RecursiveMultiply(R2,
T2, A1, B1, N2);
2163 carry += LowLevel::Add(T0, T0, R0, N);
2164 carry += LowLevel::Add(T0, T0, R2, N);
2165 carry += LowLevel::Add(R1, R1, T0, N);
2167 Increment(R3, N2, carry);
2172 void RecursiveSquare(
word *R,
word *T,
const word *A,
unsigned int N)
2174 if (LowLevel::SquareRecursionLimit() >= 4 && N==4)
2175 LowLevel::Square4(R, A);
2177 LowLevel::Square2(R, A);
2180 const unsigned int N2 = N/2;
2182 RecursiveSquare(R0,
T2, A0, N2);
2183 RecursiveSquare(R2,
T2, A1, N2);
2184 RecursiveMultiply(T0,
T2, A0, A1, N2);
2186 word carry = LowLevel::Add(R1, R1, T0, N);
2187 carry += LowLevel::Add(R1, R1, T0, N);
2188 Increment(R3, N2, carry);
2199 void RecursiveMultiplyBottom(
word *R,
word *T,
const word *A,
const word *B,
2202 if (LowLevel::MultiplyBottomRecursionLimit() >= 8 && N==8)
2203 LowLevel::Multiply8Bottom(R, A, B);
2204 else if (LowLevel::MultiplyBottomRecursionLimit() >= 4 && N==4)
2205 LowLevel::Multiply4Bottom(R, A, B);
2207 LowLevel::Multiply2Bottom(R, A, B);
2210 const unsigned int N2 = N/2;
2212 RecursiveMultiply(R, T, A0, B0, N2);
2213 RecursiveMultiplyBottom(T0,
T1, A1, B0, N2);
2214 LowLevel::Add(R1, R1, T0, N2);
2215 RecursiveMultiplyBottom(T0,
T1, A0, B1, N2);
2216 LowLevel::Add(R1, R1, T0, N2);
2222 const word *B,
unsigned int N)
2226 LowLevel::Multiply4(T, A, B);
2227 memcpy(R, T+4, 4*WORD_SIZE);
2231 LowLevel::Multiply2(T, A, B);
2232 memcpy(R, T+2, 2*WORD_SIZE);
2236 const unsigned int N2 = N/2;
2239 int aComp = Compare(A0, A1, N2);
2240 int bComp = Compare(B0, B1, N2);
2242 switch (2*aComp + aComp + bComp)
2245 LowLevel::Subtract(R0, A1, A0, N2);
2246 LowLevel::Subtract(R1, B0, B1, N2);
2247 RecursiveMultiply(T0,
T2, R0, R1, N2);
2248 LowLevel::Subtract(
T1,
T1, R0, N2);
2252 LowLevel::Subtract(R0, A1, A0, N2);
2253 LowLevel::Subtract(R1, B0, B1, N2);
2254 RecursiveMultiply(T0,
T2, R0, R1, N2);
2258 LowLevel::Subtract(R0, A0, A1, N2);
2259 LowLevel::Subtract(R1, B1, B0, N2);
2260 RecursiveMultiply(T0,
T2, R0, R1, N2);
2264 LowLevel::Subtract(R0, A1, A0, N2);
2265 LowLevel::Subtract(R1, B0, B1, N2);
2266 RecursiveMultiply(T0,
T2, R0, R1, N2);
2267 LowLevel::Subtract(
T1,
T1, R1, N2);
2275 RecursiveMultiply(
T2, R0, A1, B1, N2);
2279 word c2 = LowLevel::Subtract(R0, L+N2, L, N2);
2280 c2 += LowLevel::Subtract(R0, R0, T0, N2);
2281 word t = (Compare(R0,
T2, N2) == -1);
2284 carry += Increment(R0, N2, c2+t);
2285 carry += LowLevel::Add(R0, R0,
T1, N2);
2286 carry += LowLevel::Add(R0, R0,
T3, N2);
2288 CopyWords(R1,
T3, N2);
2289 Increment(R1, N2, carry);
2296 return LowLevel::Add(C, A, B, N);
2299 inline word Subtract(
word *C,
const word *A,
const word *B,
unsigned int N)
2301 return LowLevel::Subtract(C, A, B, N);
2307 RecursiveMultiply(R, T, A, B, N);
2310 inline void Square(
word *R,
word *T,
const word *A,
unsigned int N)
2312 RecursiveSquare(R, T, A, N);
2316 void AsymmetricMultiply(
word *R,
word *T,
const word *A,
unsigned int NA,
2317 const word *B,
unsigned int NB)
2322 Square(R, T, A, NA);
2324 Multiply(R, T, A, B, NA);
2340 SetWords(R, 0, NB+2);
2343 CopyWords(R, B, NB);
2344 R[NB] = R[NB+1] = 0;
2347 R[NB] = LinearMultiply(R, B, A[0], NB);
2353 Multiply(R, T, A, B, NA);
2354 CopyWords(T+2*NA, R+NA, NA);
2358 for (i=2*NA; i<NB; i+=2*NA)
2359 Multiply(T+NA+i, T, A, B+i, NA);
2360 for (i=NA; i<NB; i+=2*NA)
2361 Multiply(R+i, T, A, B+i, NA);
2363 if (Add(R+NA, R+NA, T+2*NA, NB-NA))
2364 Increment(R+NB, NA);
2368 void PositiveMultiply(Integer& product,
const Integer& a,
const Integer& b)
2370 unsigned int aSize = RoundupSize(a.WordCount());
2371 unsigned int bSize = RoundupSize(b.WordCount());
2373 product.reg_.CleanNew(RoundupSize(aSize + bSize));
2374 product.sign_ = Integer::POSITIVE;
2376 AlignedWordBlock workspace(aSize + bSize);
2377 AsymmetricMultiply(product.reg_.get_buffer(), workspace.get_buffer(),
2378 a.reg_.get_buffer(), aSize, b.reg_.get_buffer(), bSize);
2381 void Multiply(Integer &product,
const Integer &a,
const Integer &b)
2383 PositiveMultiply(product, a, b);
2385 if (a.NotNegative() != b.NotNegative())
2390 static inline unsigned int EvenWordCount(
const word *X,
unsigned int N)
2392 while (N && X[N-2]==0 && X[N-1]==0)
2398 unsigned int AlmostInverse(
word *R,
word *T,
const word *A,
unsigned int NA,
2399 const word *M,
unsigned int N)
2405 unsigned int bcLen=2, fgLen=EvenWordCount(M, N);
2406 unsigned int k=0, s=0;
2408 SetWords(T, 0, 3*N);
2410 CopyWords(f, A, NA);
2418 if (EvenWordCount(f, fgLen)==0)
2424 ShiftWordsRightByWords(f, fgLen, 1);
2425 if (c[bcLen-1]) bcLen+=2;
2426 ShiftWordsLeftByWords(c, bcLen, 1);
2439 if (t==1 && f[1]==0 && EvenWordCount(f, fgLen)==2)
2444 Subtract(R, M, b, N);
2448 ShiftWordsRightByBits(f, fgLen, i);
2449 t=ShiftWordsLeftByBits(c, bcLen, i);
2456 if (f[fgLen-2]==0 && g[fgLen-2]==0 && f[fgLen-1]==0 && g[fgLen-1]==0)
2459 if (Compare(f, g, fgLen)==-1)
2466 Subtract(f, f, g, fgLen);
2468 if (Add(b, b, c, bcLen))
2480 void DivideByPower2Mod(
word *R,
const word *A,
unsigned int k,
const word *M,
2488 ShiftWordsRightByBits(R, N, 1);
2491 word carry = Add(R, R, M, N);
2492 ShiftWordsRightByBits(R, N, 1);
2493 R[N-1] += carry<<(WORD_BITS-1);
2502 void MultiplyByPower2Mod(
word *R,
const word *A,
unsigned int k,
const word *M,
2508 if (ShiftWordsLeftByBits(R, N, 1) || Compare(R, M, N)>=0)
2509 Subtract(R, R, M, N);
2517 : reg_(2), sign_(POSITIVE)
2519 reg_[0] = reg_[1] = 0;
2523 Integer::Integer(
const Integer& t)
2524 : reg_(RoundupSize(t.WordCount())), sign_(t.sign_)
2526 CopyWords(reg_.get_buffer(), t.reg_.get_buffer(), reg_.size());
2530 Integer::Integer(
signed long value)
2540 reg_[0] =
word(value);
2541 reg_[1] =
word(SafeRightShift<WORD_BITS, unsigned long>(value));
2545 Integer::Integer(Sign s,
word high,
word low)
2553 Integer::Integer(
word value,
unsigned int length)
2554 : reg_(RoundupSize(length)), sign_(POSITIVE)
2557 SetWords(reg_ + 1, 0, reg_.size() - 1);
2561 Integer::Integer(
const byte *encodedInteger,
unsigned int byteCount,
2564 Decode(encodedInteger, byteCount, s);
2570 Integer::Integer(
Source& source)
2571 : reg_(2), sign_(POSITIVE)
2576 void Integer::Decode(Source& source)
2578 byte b = source.next();
2580 source.SetError(INTEGER_E);
2584 word32 length = GetLength(source);
2585 if (length == 0 || source.GetError().What())
return;
2587 if ( (b = source.next()) == 0x00)
2592 if (source.IsLeft(length) ==
false)
return;
2594 unsigned int words = (length + WORD_SIZE - 1) / WORD_SIZE;
2595 words = RoundupSize(words);
2596 if (words > reg_.size()) reg_.CleanNew(words);
2598 for (
int j = length; j > 0; j--) {
2600 reg_ [(j-1) / WORD_SIZE] |= (
word)b << ((j-1) % WORD_SIZE) * 8;
2605 void Integer::Decode(const byte* input,
unsigned int inputLen, Signedness s)
2607 unsigned int idx(0);
2608 byte b = input[idx++];
2609 sign_ = ((s==SIGNED) && (b & 0x80)) ? NEGATIVE : POSITIVE;
2611 while (inputLen>0 && (sign_==POSITIVE ? b==0 : b==0xff))
2617 reg_.CleanNew(RoundupSize(BytesToWords(inputLen)));
2620 for (
unsigned int i=inputLen; i > 0; i--)
2623 reg_[(i-1)/WORD_SIZE] |= (
word)b << ((i-1)%WORD_SIZE)*8;
2626 if (sign_ == NEGATIVE)
2628 for (
unsigned i=inputLen; i<reg_.size()*WORD_SIZE; i++)
2629 reg_[i/WORD_SIZE] |= (
word)0xff << (i%WORD_SIZE)*8;
2630 TwosComplement(reg_.get_buffer(), reg_.size());
2635 unsigned int Integer::Encode(byte* output,
unsigned int outputLen,
2636 Signedness signedness)
const
2638 unsigned int idx(0);
2639 if (signedness == UNSIGNED || NotNegative())
2641 for (
unsigned int i=outputLen; i > 0; i--)
2642 output[idx++] = GetByte(i-1);
2647 Integer temp = Integer::Power2(8*max(ByteCount(), outputLen)) + *
this;
2648 for (
unsigned i=0; i<outputLen; i++)
2649 output[idx++] = temp.GetByte(outputLen-i-1);
2655 static Integer* zero = 0;
2657 const Integer &Integer::Zero()
2660 zero = NEW_TC Integer;
2665 static Integer* one = 0;
2667 const Integer &Integer::One()
2670 one = NEW_TC Integer(1,2);
2687 Integer::Integer(RandomNumberGenerator& rng,
const Integer& min,
2690 Randomize(rng, min, max);
2694 void Integer::Randomize(RandomNumberGenerator& rng,
unsigned int nbits)
2696 const unsigned int nbytes = nbits/8 + 1;
2697 ByteBlock
buf(nbytes);
2698 rng.GenerateBlock(
buf.get_buffer(), nbytes);
2700 buf[0] = (byte)Crop(
buf[0], nbits % 8);
2701 Decode(
buf.get_buffer(), nbytes, UNSIGNED);
2704 void Integer::Randomize(RandomNumberGenerator& rng,
const Integer& min,
2707 Integer range = max - min;
2708 const unsigned int nbits = range.BitCount();
2712 Randomize(rng, nbits);
2714 while (*
this > range);
2720 Integer Integer::Power2(
unsigned int e)
2722 Integer r((
word)0, BitsToWords(e + 1));
2728 void Integer::SetBit(
unsigned int n,
bool value)
2732 reg_.CleanGrow(RoundupSize(BitsToWords(n + 1)));
2733 reg_[n / WORD_BITS] |= (
word(1) << (n % WORD_BITS));
2737 if (n / WORD_BITS < reg_.size())
2738 reg_[n / WORD_BITS] &= ~(
word(1) << (n % WORD_BITS));
2743 void Integer::SetByte(
unsigned int n, byte value)
2745 reg_.CleanGrow(RoundupSize(BytesToWords(n+1)));
2746 reg_[n/WORD_SIZE] &= ~(
word(0xff) << 8*(n%WORD_SIZE));
2747 reg_[n/WORD_SIZE] |= (
word(value) << 8*(n%WORD_SIZE));
2751 void Integer::Negate()
2754 sign_ = Sign(1 - sign_);
2758 bool Integer::operator!()
const
2760 return IsNegative() ?
false : (reg_[0]==0 && WordCount()==0);
2764 Integer& Integer::operator=(
const Integer& t)
2768 reg_.New(RoundupSize(t.WordCount()));
2769 CopyWords(reg_.get_buffer(), t.reg_.get_buffer(), reg_.size());
2776 Integer& Integer::operator+=(
const Integer& t)
2778 reg_.CleanGrow(t.reg_.size());
2781 if (t.NotNegative())
2782 PositiveAdd(*
this, *
this, t);
2784 PositiveSubtract(*
this, *
this, t);
2788 if (t.NotNegative())
2789 PositiveSubtract(*
this, t, *
this);
2792 PositiveAdd(*
this, *
this, t);
2793 sign_ = Integer::NEGATIVE;
2800 Integer Integer::operator-()
const
2802 Integer result(*
this);
2808 Integer& Integer::operator-=(
const Integer& t)
2810 reg_.CleanGrow(t.reg_.size());
2813 if (t.NotNegative())
2814 PositiveSubtract(*
this, *
this, t);
2816 PositiveAdd(*
this, *
this, t);
2820 if (t.NotNegative())
2822 PositiveAdd(*
this, *
this, t);
2823 sign_ = Integer::NEGATIVE;
2826 PositiveSubtract(*
this, t, *
this);
2832 Integer& Integer::operator++()
2836 if (Increment(reg_.get_buffer(), reg_.size()))
2838 reg_.CleanGrow(2*reg_.size());
2839 reg_[reg_.size()/2]=1;
2844 word borrow = Decrement(reg_.get_buffer(), reg_.size());
2852 Integer& Integer::operator--()
2856 if (Increment(reg_.get_buffer(), reg_.size()))
2858 reg_.CleanGrow(2*reg_.size());
2859 reg_[reg_.size()/2]=1;
2864 if (Decrement(reg_.get_buffer(), reg_.size()))
2871 Integer& Integer::operator<<=(
unsigned int n)
2873 const unsigned int wordCount = WordCount();
2874 const unsigned int shiftWords = n / WORD_BITS;
2875 const unsigned int shiftBits = n % WORD_BITS;
2877 reg_.CleanGrow(RoundupSize(wordCount+BitsToWords(n)));
2878 ShiftWordsLeftByWords(reg_.get_buffer(), wordCount + shiftWords,
2880 ShiftWordsLeftByBits(reg_+shiftWords, wordCount+BitsToWords(shiftBits),
2885 Integer& Integer::operator>>=(
unsigned int n)
2887 const unsigned int wordCount = WordCount();
2888 const unsigned int shiftWords = n / WORD_BITS;
2889 const unsigned int shiftBits = n % WORD_BITS;
2891 ShiftWordsRightByWords(reg_.get_buffer(), wordCount, shiftWords);
2892 if (wordCount > shiftWords)
2893 ShiftWordsRightByBits(reg_.get_buffer(), wordCount-shiftWords,
2895 if (IsNegative() && WordCount()==0)
2901 void PositiveAdd(Integer& sum,
const Integer& a,
const Integer& b)
2904 if (a.reg_.size() == b.reg_.size())
2905 carry = Add(sum.reg_.get_buffer(), a.reg_.get_buffer(),
2906 b.reg_.get_buffer(), a.reg_.size());
2907 else if (a.reg_.size() > b.reg_.size())
2909 carry = Add(sum.reg_.get_buffer(), a.reg_.get_buffer(),
2910 b.reg_.get_buffer(), b.reg_.size());
2911 CopyWords(sum.reg_+b.reg_.size(), a.reg_+b.reg_.size(),
2912 a.reg_.size()-b.reg_.size());
2913 carry = Increment(sum.reg_+b.reg_.size(), a.reg_.size()-b.reg_.size(),
2918 carry = Add(sum.reg_.get_buffer(), a.reg_.get_buffer(),
2919 b.reg_.get_buffer(), a.reg_.size());
2920 CopyWords(sum.reg_+a.reg_.size(), b.reg_+a.reg_.size(),
2921 b.reg_.size()-a.reg_.size());
2922 carry = Increment(sum.reg_+a.reg_.size(), b.reg_.size()-a.reg_.size(),
2928 sum.reg_.CleanGrow(2*sum.reg_.size());
2929 sum.reg_[sum.reg_.size()/2] = 1;
2931 sum.sign_ = Integer::POSITIVE;
2934 void PositiveSubtract(Integer &diff,
const Integer &a,
const Integer& b)
2936 unsigned aSize = a.WordCount();
2938 unsigned bSize = b.WordCount();
2943 if (Compare(a.reg_.get_buffer(), b.reg_.get_buffer(), aSize) >= 0)
2945 Subtract(diff.reg_.get_buffer(), a.reg_.get_buffer(),
2946 b.reg_.get_buffer(), aSize);
2947 diff.sign_ = Integer::POSITIVE;
2951 Subtract(diff.reg_.get_buffer(), b.reg_.get_buffer(),
2952 a.reg_.get_buffer(), aSize);
2953 diff.sign_ = Integer::NEGATIVE;
2956 else if (aSize > bSize)
2958 word borrow = Subtract(diff.reg_.get_buffer(), a.reg_.get_buffer(),
2959 b.reg_.get_buffer(), bSize);
2960 CopyWords(diff.reg_+bSize, a.reg_+bSize, aSize-bSize);
2961 borrow = Decrement(diff.reg_+bSize, aSize-bSize, borrow);
2962 diff.sign_ = Integer::POSITIVE;
2966 word borrow = Subtract(diff.reg_.get_buffer(), b.reg_.get_buffer(),
2967 a.reg_.get_buffer(), aSize);
2968 CopyWords(diff.reg_+aSize, b.reg_+aSize, bSize-aSize);
2969 borrow = Decrement(diff.reg_+aSize, bSize-aSize, borrow);
2970 diff.sign_ = Integer::NEGATIVE;
2975 unsigned int Integer::MinEncodedSize(Signedness signedness)
const
2977 unsigned int outputLen = max(1
U, ByteCount());
2978 if (signedness == UNSIGNED)
2980 if (NotNegative() && (GetByte(outputLen-1) & 0x80))
2982 if (IsNegative() && *
this < -Power2(outputLen*8-1))
2988 int Integer::Compare(
const Integer& t)
const
2992 if (t.NotNegative())
2993 return PositiveCompare(t);
2999 if (t.NotNegative())
3002 return -PositiveCompare(t);
3007 int Integer::PositiveCompare(
const Integer& t)
const
3009 unsigned size = WordCount(), tSize = t.WordCount();
3012 return TaoCrypt::Compare(reg_.get_buffer(), t.reg_.get_buffer(),
size);
3014 return size > tSize ? 1 : -1;
3018 bool Integer::GetBit(
unsigned int n)
const
3020 if (n/WORD_BITS >= reg_.size())
3023 return bool((reg_[n/WORD_BITS] >> (n % WORD_BITS)) & 1);
3027 unsigned long Integer::GetBits(
unsigned int i,
unsigned int n)
const
3029 unsigned long v = 0;
3030 for (
unsigned int j=0; j<
n; j++)
3031 v |= GetBit(i+j) << j;
3036 byte Integer::GetByte(
unsigned int n)
const
3038 if (n/WORD_SIZE >= reg_.size())
3041 return byte(reg_[n/WORD_SIZE] >> ((n%WORD_SIZE)*8));
3045 unsigned int Integer::BitCount()
const
3047 unsigned wordCount = WordCount();
3049 return (wordCount-1)*WORD_BITS + BitPrecision(reg_[wordCount-1]);
3055 unsigned int Integer::ByteCount()
const
3057 unsigned wordCount = WordCount();
3059 return (wordCount-1)*WORD_SIZE + BytePrecision(reg_[wordCount-1]);
3065 unsigned int Integer::WordCount()
const
3067 return CountWords(reg_.get_buffer(), reg_.size());
3071 bool Integer::IsConvertableToLong()
const
3073 if (ByteCount() >
sizeof(
long))
3076 unsigned long value = reg_[0];
3077 value += SafeLeftShift<WORD_BITS, unsigned long>(reg_[1]);
3079 if (sign_ == POSITIVE)
3080 return (
signed long)value >= 0;
3082 return -(
signed long)value < 0;
3086 signed long Integer::ConvertToLong()
const
3088 unsigned long value = reg_[0];
3089 value += SafeLeftShift<WORD_BITS, unsigned long>(reg_[1]);
3090 return sign_ == POSITIVE ? value : -(
signed long)value;
3094 void Integer::Swap(Integer& a)
3097 STL::swap(sign_, a.sign_);
3101 Integer Integer::Plus(
const Integer& b)
const
3103 Integer sum((
word)0, max(reg_.size(), b.reg_.size()));
3106 if (b.NotNegative())
3107 PositiveAdd(sum, *
this, b);
3109 PositiveSubtract(sum, *
this, b);
3113 if (b.NotNegative())
3114 PositiveSubtract(sum, b, *
this);
3117 PositiveAdd(sum, *
this, b);
3118 sum.sign_ = Integer::NEGATIVE;
3125 Integer Integer::Minus(
const Integer& b)
const
3127 Integer diff((
word)0, max(reg_.size(), b.reg_.size()));
3130 if (b.NotNegative())
3131 PositiveSubtract(diff, *
this, b);
3133 PositiveAdd(diff, *
this, b);
3137 if (b.NotNegative())
3139 PositiveAdd(diff, *
this, b);
3140 diff.sign_ = Integer::NEGATIVE;
3143 PositiveSubtract(diff, b, *
this);
3149 Integer Integer::Times(
const Integer &b)
const
3152 Multiply(product, *
this, b);
3173 static inline void AtomicDivide(
word *Q,
const word *A,
const word *B)
3176 DWord q = DivideFourWordsByTwo<word, DWord>(T, DWord(A[0], A[1]),
3177 DWord(A[2], A[3]), DWord(B[0], B[1]));
3178 Q[0] = q.GetLowHalf();
3179 Q[1] = q.GetHighHalf();
3187 Portable::Multiply2(P, Q, B);
3195 static void CorrectQuotientEstimate(
word *R,
word *T,
word *Q,
const word *B,
3202 for (i=0; i<N; i+=4)
3203 LowLevel::Multiply2(T+i, Q, B+i);
3204 for (i=2; i<N; i+=4)
3205 if (LowLevel::Multiply2Add(T+i, Q, B+i))
3206 T[i+5] += (++T[i+4]==0);
3210 T[N] = LinearMultiply(T, B, Q[0], N);
3214 word borrow = Subtract(R, R, T, N+2);
3217 while (R[N] || Compare(R, B, N) >= 0)
3219 R[N] -= Subtract(R, R, B, N);
3220 Q[1] += (++Q[0]==0);
3232 const word* B,
unsigned int NB)
3236 word *
const TB=T+NA+2;
3237 word *
const TP=T+NA+2+NB;
3240 unsigned shiftWords = (B[NB-1]==0);
3241 TB[0] = TB[NB-1] = 0;
3242 CopyWords(TB+shiftWords, B, NB-shiftWords);
3243 unsigned shiftBits = WORD_BITS - BitPrecision(TB[NB-1]);
3244 ShiftWordsLeftByBits(TB, NB, shiftBits);
3247 TA[0] = TA[NA] = TA[NA+1] = 0;
3248 CopyWords(TA+shiftWords, A, NA);
3249 ShiftWordsLeftByBits(TA, NA+2, shiftBits);
3251 if (TA[NA+1]==0 && TA[NA] <= 1)
3253 Q[NA-NB+1] = Q[NA-NB] = 0;
3254 while (TA[NA] || Compare(TA+NA-NB, TB, NB) >= 0)
3256 TA[NA] -= Subtract(TA+NA-NB, TA+NA-NB, TB, NB);
3266 BT[0] = TB[NB-2] + 1;
3267 BT[1] = TB[NB-1] + (BT[0]==0);
3270 for (
unsigned i=NA-2; i>=NB; i-=2)
3272 AtomicDivide(Q+i-NB, TA+i-2, BT);
3273 CorrectQuotientEstimate(TA+i-NB, TP, Q+i-NB, TB, NB);
3277 CopyWords(R, TA+shiftWords, NB);
3278 ShiftWordsRightByBits(R, NB, shiftBits);
3282 void PositiveDivide(Integer& remainder, Integer& quotient,
3283 const Integer& a,
const Integer& b)
3285 unsigned aSize = a.WordCount();
3286 unsigned bSize = b.WordCount();
3288 if (a.PositiveCompare(b) == -1)
3291 remainder.sign_ = Integer::POSITIVE;
3292 quotient = Integer::Zero();
3299 remainder.reg_.CleanNew(RoundupSize(bSize));
3300 remainder.sign_ = Integer::POSITIVE;
3301 quotient.reg_.CleanNew(RoundupSize(aSize-bSize+2));
3302 quotient.sign_ = Integer::POSITIVE;
3304 AlignedWordBlock T(aSize+2*bSize+4);
3305 Divide(remainder.reg_.get_buffer(), quotient.reg_.get_buffer(),
3306 T.get_buffer(), a.reg_.get_buffer(), aSize, b.reg_.get_buffer(),
3310 void Integer::Divide(Integer &remainder, Integer "ient,
3311 const Integer ÷nd,
const Integer &divisor)
3313 PositiveDivide(remainder, quotient, dividend, divisor);
3315 if (dividend.IsNegative())
3318 if (remainder.NotZero())
3321 remainder = divisor.AbsoluteValue() - remainder;
3325 if (divisor.IsNegative())
3329 void Integer::DivideByPowerOf2(Integer &r, Integer &q,
const Integer &a,
3335 const unsigned int wordCount = BitsToWords(n);
3336 if (wordCount <= a.WordCount())
3338 r.reg_.resize(RoundupSize(wordCount));
3339 CopyWords(r.reg_.get_buffer(), a.reg_.get_buffer(), wordCount);
3340 SetWords(r.reg_+wordCount, 0, r.reg_.size()-wordCount);
3341 if (n % WORD_BITS != 0)
3342 r.reg_[wordCount-1] %= (
word(1) << (n % WORD_BITS));
3346 r.reg_.resize(RoundupSize(a.WordCount()));
3347 CopyWords(r.reg_.get_buffer(), a.reg_.get_buffer(), r.reg_.size());
3351 if (a.IsNegative() && r.NotZero())
3358 Integer Integer::DividedBy(
const Integer &b)
const
3360 Integer remainder, quotient;
3361 Integer::Divide(remainder, quotient, *
this, b);
3365 Integer Integer::Modulo(
const Integer &b)
const
3367 Integer remainder, quotient;
3368 Integer::Divide(remainder, quotient, *
this, b);
3372 void Integer::Divide(
word &remainder, Integer "ient,
3373 const Integer ÷nd,
word divisor)
3375 if ((divisor & (divisor-1)) == 0)
3377 quotient = dividend >> (BitPrecision(divisor)-1);
3378 remainder = dividend.reg_[0] & (divisor-1);
3382 unsigned int i = dividend.WordCount();
3383 quotient.reg_.CleanNew(RoundupSize(i));
3387 quotient.reg_[
i] = DWord(dividend.reg_[i], remainder) / divisor;
3388 remainder = DWord(dividend.reg_[i], remainder) % divisor;
3391 if (dividend.NotNegative())
3392 quotient.sign_ = POSITIVE;
3395 quotient.sign_ = NEGATIVE;
3399 remainder = divisor - remainder;
3404 Integer Integer::DividedBy(
word b)
const
3408 Integer::Divide(remainder, quotient, *
this, b);
3412 word Integer::Modulo(
word divisor)
const
3416 if ((divisor & (divisor-1)) == 0)
3417 remainder = reg_[0] & (divisor-1);
3420 unsigned int i = WordCount();
3427 remainder = sum % divisor;
3433 remainder = DWord(reg_[i], remainder) % divisor;
3437 if (IsNegative() && remainder)
3438 remainder = divisor - remainder;
3444 Integer Integer::AbsoluteValue()
const
3446 Integer result(*
this);
3447 result.sign_ = POSITIVE;
3452 Integer Integer::SquareRoot()
const
3458 Integer x, y = Power2((BitCount()+1)/2);
3463 y = (x + *
this/x) >> 1;
3469 bool Integer::IsSquare()
const
3471 Integer r = SquareRoot();
3472 return *
this == r.Squared();
3475 bool Integer::IsUnit()
const
3477 return (WordCount() == 1) && (reg_[0] == 1);
3480 Integer Integer::MultiplicativeInverse()
const
3482 return IsUnit() ? *
this : Zero();
3485 Integer a_times_b_mod_c(
const Integer &x,
const Integer& y,
const Integer& m)
3490 Integer a_exp_b_mod_c(
const Integer &x,
const Integer& e,
const Integer& m)
3492 ModularArithmetic mr(m);
3493 return mr.Exponentiate(x, e);
3496 Integer Integer::Gcd(
const Integer &a,
const Integer &b)
3498 return EuclideanDomainOf().Gcd(a, b);
3501 Integer Integer::InverseMod(
const Integer &m)
const
3503 if (IsNegative() || *
this>=m)
3504 return (*
this%m).InverseMod(m);
3513 Integer u = m.InverseMod(*
this);
3514 return !u ? Zero() : (m*(*this-u)+1)/(*this);
3517 AlignedWordBlock T(m.reg_.size() * 4);
3518 Integer r((
word)0, m.reg_.size());
3519 unsigned k = AlmostInverse(r.reg_.get_buffer(), T.get_buffer(),
3520 reg_.get_buffer(), reg_.size(),
3521 m.reg_.get_buffer(), m.reg_.size());
3522 DivideByPower2Mod(r.reg_.get_buffer(), r.reg_.get_buffer(), k,
3523 m.reg_.get_buffer(), m.reg_.size());
3527 word Integer::InverseMod(
const word mod)
const
3529 word g0 = mod, g1 = *
this % mod;
3530 word v0 = 0, v1 = 1;
3554 const Integer& ModularArithmetic::Half(
const Integer &a)
const
3556 if (a.reg_.size()==modulus.reg_.size())
3558 TaoCrypt::DivideByPower2Mod(result.reg_.begin(), a.reg_.begin(), 1,
3559 modulus.reg_.begin(), a.reg_.size());
3563 return result1 = (a.IsEven() ? (a >> 1) : ((a+modulus) >> 1));
3566 const Integer& ModularArithmetic::Add(
const Integer &a,
const Integer &b)
const
3568 if (a.reg_.size()==modulus.reg_.size() &&
3569 b.reg_.size()==modulus.reg_.size())
3571 if (TaoCrypt::Add(result.reg_.begin(), a.reg_.begin(), b.reg_.begin(),
3573 || Compare(result.reg_.get_buffer(), modulus.reg_.get_buffer(),
3574 a.reg_.size()) >= 0)
3576 TaoCrypt::Subtract(result.reg_.begin(), result.reg_.begin(),
3577 modulus.reg_.begin(), a.reg_.size());
3584 if (result1 >= modulus)
3590 Integer& ModularArithmetic::Accumulate(Integer &a,
const Integer &b)
const
3592 if (a.reg_.size()==modulus.reg_.size() &&
3593 b.reg_.size()==modulus.reg_.size())
3595 if (TaoCrypt::Add(a.reg_.get_buffer(), a.reg_.get_buffer(),
3596 b.reg_.get_buffer(), a.reg_.size())
3597 || Compare(a.reg_.get_buffer(), modulus.reg_.get_buffer(),
3598 a.reg_.size()) >= 0)
3600 TaoCrypt::Subtract(a.reg_.get_buffer(), a.reg_.get_buffer(),
3601 modulus.reg_.get_buffer(), a.reg_.size());
3614 const Integer& ModularArithmetic::Subtract(
const Integer &a,
3615 const Integer &b)
const
3617 if (a.reg_.size()==modulus.reg_.size() &&
3618 b.reg_.size()==modulus.reg_.size())
3620 if (TaoCrypt::Subtract(result.reg_.begin(), a.reg_.begin(),
3621 b.reg_.begin(), a.reg_.size()))
3622 TaoCrypt::Add(result.reg_.begin(), result.reg_.begin(),
3623 modulus.reg_.begin(), a.reg_.size());
3629 if (result1.IsNegative())
3635 Integer& ModularArithmetic::Reduce(Integer &a,
const Integer &b)
const
3637 if (a.reg_.size()==modulus.reg_.size() &&
3638 b.reg_.size()==modulus.reg_.size())
3640 if (TaoCrypt::Subtract(a.reg_.get_buffer(), a.reg_.get_buffer(),
3641 b.reg_.get_buffer(), a.reg_.size()))
3642 TaoCrypt::Add(a.reg_.get_buffer(), a.reg_.get_buffer(),
3643 modulus.reg_.get_buffer(), a.reg_.size());
3655 const Integer& ModularArithmetic::Inverse(
const Integer &a)
const
3660 CopyWords(result.reg_.begin(), modulus.reg_.begin(), modulus.reg_.size());
3661 if (TaoCrypt::Subtract(result.reg_.begin(), result.reg_.begin(),
3662 a.reg_.begin(), a.reg_.size()))
3663 Decrement(result.reg_.begin()+a.reg_.size(), 1,
3664 modulus.reg_.size()-a.reg_.size());
3669 Integer ModularArithmetic::CascadeExponentiate(
const Integer &x,
3670 const Integer &e1,
const Integer &y,
const Integer &e2)
const
3672 if (modulus.IsOdd())
3674 MontgomeryRepresentation dr(modulus);
3675 return dr.ConvertOut(dr.CascadeExponentiate(dr.ConvertIn(x), e1,
3676 dr.ConvertIn(y), e2));
3679 return AbstractRing::CascadeExponentiate(x, e1, y, e2);
3682 void ModularArithmetic::SimultaneousExponentiate(Integer *results,
3683 const Integer &base,
const Integer *exponents,
3684 unsigned int exponentsCount)
const
3686 if (modulus.IsOdd())
3688 MontgomeryRepresentation dr(modulus);
3689 dr.SimultaneousExponentiate(results, dr.ConvertIn(base), exponents,
3691 for (
unsigned int i=0; i<exponentsCount; i++)
3692 results[i] = dr.ConvertOut(results[i]);
3695 AbstractRing::SimultaneousExponentiate(results, base,
3696 exponents, exponentsCount);
3721 RecursiveMultiplyBottom(R, T, A, B, N);
3725 const word *B,
unsigned int N)
3727 RecursiveMultiplyTop(R, T, L, A, B, N);
3738 const word *
U,
unsigned int N)
3740 MultiplyBottom(R, T, X, U, N);
3741 MultiplyTop(T, T+N, X, R, M, N);
3742 word borrow = Subtract(T, X+N, T, N);
3744 word carry = Add(T+N, T, M, N);
3746 CopyWords(R, T + (borrow ? N : 0), N);
3753 void RecursiveInverseModPower2(
word *R,
word *T,
const word *A,
unsigned int N)
3757 T[0] = AtomicInverseModPower2(A[0]);
3759 LowLevel::Multiply2Bottom(T+2, T, A);
3760 TwosComplement(T+2, 2);
3761 Increment(T+2, 2, 2);
3762 LowLevel::Multiply2Bottom(R, T, T+2);
3766 const unsigned int N2 = N/2;
3767 RecursiveInverseModPower2(R0, T0, A0, N2);
3769 SetWords(T0+1, 0, N2-1);
3770 MultiplyTop(R1,
T1, T0, R0, A0, N2);
3771 MultiplyBottom(T0,
T1, R0, A1, N2);
3772 Add(T0, R1, T0, N2);
3773 TwosComplement(T0, N2);
3774 MultiplyBottom(R1,
T1, R0, T0, N2);
3796 MontgomeryRepresentation::MontgomeryRepresentation(
const Integer &m)
3797 : ModularArithmetic(m),
3798 u((
word)0, modulus.reg_.size()),
3799 workspace(5*modulus.reg_.size())
3801 RecursiveInverseModPower2(u.reg_.get_buffer(), workspace.get_buffer(),
3802 modulus.reg_.get_buffer(), modulus.reg_.size());
3805 const Integer& MontgomeryRepresentation::Multiply(
const Integer &a,
3806 const Integer &b)
const
3808 word *
const T = workspace.begin();
3809 word *
const R = result.reg_.begin();
3810 const unsigned int N = modulus.reg_.size();
3812 AsymmetricMultiply(T, T+2*N, a.reg_.get_buffer(), a.reg_.size(),
3813 b.reg_.get_buffer(), b.reg_.size());
3814 SetWords(T+a.reg_.size()+b.reg_.size(),0, 2*N-a.reg_.size()-b.reg_.size());
3815 MontgomeryReduce(R, T+2*N, T, modulus.reg_.get_buffer(),
3816 u.reg_.get_buffer(), N);
3820 const Integer& MontgomeryRepresentation::Square(
const Integer &a)
const
3822 word *
const T = workspace.begin();
3823 word *
const R = result.reg_.begin();
3824 const unsigned int N = modulus.reg_.size();
3826 TaoCrypt::Square(T, T+2*N, a.reg_.get_buffer(), a.reg_.size());
3827 SetWords(T+2*a.reg_.size(), 0, 2*N-2*a.reg_.size());
3828 MontgomeryReduce(R, T+2*N, T, modulus.reg_.get_buffer(),
3829 u.reg_.get_buffer(), N);
3833 Integer MontgomeryRepresentation::ConvertOut(
const Integer &a)
const
3835 word *
const T = workspace.begin();
3836 word *
const R = result.reg_.begin();
3837 const unsigned int N = modulus.reg_.size();
3839 CopyWords(T, a.reg_.get_buffer(), a.reg_.size());
3840 SetWords(T+a.reg_.size(), 0, 2*N-a.reg_.size());
3841 MontgomeryReduce(R, T+2*N, T, modulus.reg_.get_buffer(),
3842 u.reg_.get_buffer(), N);
3846 const Integer& MontgomeryRepresentation::MultiplicativeInverse(
3847 const Integer &a)
const
3851 word *
const T = workspace.begin();
3852 word *
const R = result.reg_.begin();
3853 const unsigned int N = modulus.reg_.size();
3855 CopyWords(T, a.reg_.get_buffer(), a.reg_.size());
3856 SetWords(T+a.reg_.size(), 0, 2*N-a.reg_.size());
3857 MontgomeryReduce(R, T+2*N, T, modulus.reg_.get_buffer(),
3858 u.reg_.get_buffer(), N);
3859 unsigned k = AlmostInverse(R, T, R, N, modulus.reg_.get_buffer(), N);
3864 DivideByPower2Mod(R, R, k-N*WORD_BITS, modulus.reg_.get_buffer(), N);
3866 MultiplyByPower2Mod(R, R, N*WORD_BITS-k, modulus.reg_.get_buffer(), N);
3873 Integer ModularRoot(
const Integer &a,
const Integer &dp,
const Integer &dq,
3874 const Integer &p,
const Integer &q,
const Integer &u)
3876 Integer p2 = ModularExponentiation((a % p), dp, p);
3877 Integer q2 = ModularExponentiation((a % q), dq, q);
3878 return CRT(p2, p, q2, q, u);
3881 Integer CRT(
const Integer &xp,
const Integer &p,
const Integer &xq,
3882 const Integer &q,
const Integer &u)
3885 return p * (u * (xq-xp) % q) + xp;
3889 #ifdef HAVE_EXPLICIT_TEMPLATE_INSTANTIATION
3890 #ifndef TAOCRYPT_NATIVE_DWORD_AVAILABLE
3891 template hword DivideThreeWordsByTwo<hword, Word>(hword*, hword, hword, Word*);
3893 template word DivideThreeWordsByTwo<word, DWord>(
word*,
word,
word, DWord*);
3894 #ifdef SSE2_INTRINSICS_AVAILABLE
3895 template class AlignedAllocator<word>;