Perform the AES-CBC XOR operations 4 bytes at a time, using the helper functions (load_32_n/store_32_n) for unaligned 32-bit loads and stores

author Ken Raeburn <raeburn@mit.edu>

Wed, 2 Dec 2009 23:09:33 +0000 (23:09 +0000)

committer Ken Raeburn <raeburn@mit.edu>

Wed, 2 Dec 2009 23:09:33 +0000 (23:09 +0000)
author Ken Raeburn <raeburn@mit.edu>
Wed, 2 Dec 2009 23:09:33 +0000 (23:09 +0000)
committer Ken Raeburn <raeburn@mit.edu>
Wed, 2 Dec 2009 23:09:33 +0000 (23:09 +0000)
diff --git a/src/lib/crypto/builtin/enc_provider/aes.c b/src/lib/crypto/builtin/enc_provider/aes.c

index 396f6537564d6beffbb0207eb46b6ea6d09bc14a..e635f517d2a70229bea1d3038cdd8f914584b87a 100644 (file)
--- a/src/lib/crypto/builtin/enc_provider/aes.c
+++ b/src/lib/crypto/builtin/enc_provider/aes.c
@@ -51,8 +51,24 @@ static void
  xorblock(unsigned char *out, const unsigned char *in)
  {
      int z;
-    for (z = 0; z < BLOCK_SIZE; z++)
-        out[z] ^= in[z];
+    for (z = 0; z < BLOCK_SIZE/4; z++) {
+        unsigned char *outptr = &out[z*4];
+        unsigned char *inptr = &in[z*4];
+        /* Use unaligned accesses.  On x86, this will probably still
+           be faster than multiple byte accesses for unaligned data,
+           and for aligned data should be far better.  (One test
+           indicated about 2.4% faster encryption for 1024-byte
+           messages.)
+
+           If some other CPU has really slow unaligned-word or byte
+           accesses, perhaps this function (or the load/store
+           helpers?) should test for alignment first.
+
+           If byte accesses are faster than unaligned words, we may
+           need to conditionalize on CPU type, as that may be hard to
+           determine automatically.  */
+        store_32_n (load_32_n(outptr) ^ load_32_n(inptr), outptr);
+    }
  }
  
  krb5_error_code
author	Ken Raeburn <raeburn@mit.edu>
	Wed, 2 Dec 2009 23:09:33 +0000 (23:09 +0000)
committer	Ken Raeburn <raeburn@mit.edu>
	Wed, 2 Dec 2009 23:09:33 +0000 (23:09 +0000)