Hello, I am running into a strange issue with CudaAccelerator.
I wrote a very simple (perhaps naive) implementation of DES encryption in ECB mode. With the CPUAccelerator it works as expected, but when I switch from the CPUAccelerator to the CudaAccelerator I get wrong results. Here is my full code:
using ILGPU;
using ILGPU.Runtime;
using ILGPU.Runtime.CPU;
using ILGPU.Runtime.Cuda;
using System;
using System.Linq;
using System.Diagnostics;
namespace VVatashi.Cryptography
{
public static class Program
{
/// <summary>
/// The sixteen 64-bit round keys built by <c>CreateRoundKeys</c>, one field per round.
/// </summary>
/// <remarks>
/// Kept as plain public fields; <c>Kernel</c> reads them as <c>Key0</c> through <c>Key15</c>.
/// </remarks>
private struct RoundKeys
{
    public ulong Key0, Key1, Key2, Key3;
    public ulong Key4, Key5, Key6, Key7;
    public ulong Key8, Key9, Key10, Key11;
    public ulong Key12, Key13, Key14, Key15;
}
/// <summary>
/// DES permuted choice 1 (PC-1): 56 zero-based source-bit indices into the 64-bit key.
/// Index 0 denotes the most significant bit, which is the convention <c>Permutate</c> uses.
/// The field is never reassigned, hence <c>readonly</c>.
/// </summary>
private static readonly byte[] PC1 = new byte[] {
    56, 48, 40, 32, 24, 16, 8, 0,
    57, 49, 41, 33, 25, 17, 9, 1,
    58, 50, 42, 34, 26, 18, 10, 2,
    59, 51, 43, 35, 62, 54, 46, 38,
    30, 22, 14, 6, 61, 53, 45, 37,
    29, 21, 13, 5, 60, 52, 44, 36,
    28, 20, 12, 4, 27, 19, 11, 3,
};
/// <summary>
/// DES permuted choice 2 (PC-2): 48 zero-based source-bit indices into the 56-bit
/// combined half-keys, with index 0 denoting the most significant bit.
/// The field is never reassigned, hence <c>readonly</c>.
/// </summary>
private static readonly byte[] PC2 = new byte[] {
    13, 16, 10, 23, 0, 4, 2, 27,
    14, 5, 20, 9, 22, 18, 11, 3,
    25, 7, 15, 6, 26, 19, 12, 1,
    40, 51, 30, 36, 46, 54, 29, 39,
    50, 44, 32, 47, 43, 48, 38, 55,
    33, 52, 45, 41, 49, 35, 28, 31,
};
/// <summary>
/// DES initial permutation (IP): 64 zero-based source-bit indices applied to each input
/// block, with index 0 denoting the most significant bit.
/// The field is never reassigned, hence <c>readonly</c>.
/// </summary>
private static readonly byte[] IP = new byte[] {
    57, 49, 41, 33, 25, 17, 9, 1,
    59, 51, 43, 35, 27, 19, 11, 3,
    61, 53, 45, 37, 29, 21, 13, 5,
    63, 55, 47, 39, 31, 23, 15, 7,
    56, 48, 40, 32, 24, 16, 8, 0,
    58, 50, 42, 34, 26, 18, 10, 2,
    60, 52, 44, 36, 28, 20, 12, 4,
    62, 54, 46, 38, 30, 22, 14, 6,
};
/// <summary>
/// Inverse of the DES initial permutation: 64 zero-based source-bit indices applied to
/// the block after the final round, with index 0 denoting the most significant bit.
/// The field is never reassigned, hence <c>readonly</c>.
/// </summary>
private static readonly byte[] IIP = new byte[] {
    39, 7, 47, 15, 55, 23, 63, 31,
    38, 6, 46, 14, 54, 22, 62, 30,
    37, 5, 45, 13, 53, 21, 61, 29,
    36, 4, 44, 12, 52, 20, 60, 28,
    35, 3, 43, 11, 51, 19, 59, 27,
    34, 2, 42, 10, 50, 18, 58, 26,
    33, 1, 41, 9, 49, 17, 57, 25,
    32, 0, 40, 8, 48, 16, 56, 24,
};
/// <summary>
/// DES expansion table (E): 48 zero-based source-bit indices that widen a 32-bit half
/// block to 48 bits, with index 0 denoting the most significant bit.
/// The field is never reassigned, hence <c>readonly</c>.
/// </summary>
private static readonly byte[] E = new byte[] {
    31, 0, 1, 2, 3, 4,
    3, 4, 5, 6, 7, 8,
    7, 8, 9, 10, 11, 12,
    11, 12, 13, 14, 15, 16,
    15, 16, 17, 18, 19, 20,
    19, 20, 21, 22, 23, 24,
    23, 24, 25, 26, 27, 28,
    27, 28, 29, 30, 31, 0,
};
/// <summary>
/// DES round permutation (P): 32 zero-based source-bit indices applied to the combined
/// S-box output, with index 0 denoting the most significant bit.
/// The field is never reassigned, hence <c>readonly</c>.
/// </summary>
private static readonly byte[] P = new byte[] {
    15, 6, 19, 20, 28, 11, 27, 16,
    0, 14, 22, 25, 4, 17, 30, 9,
    1, 7, 23, 13, 31, 26, 2, 8,
    18, 12, 29, 5, 21, 10, 3, 24,
};
/// <summary>
/// The eight DES substitution boxes, concatenated into one 512-entry table (64 entries each).
/// <c>Encrypt</c> looks up box <c>k</c> at offset <c>64 * k</c>, using the raw 6-bit chunk as the
/// index within the box, so each box is stored in direct 6-bit-input order.
/// The field is never reassigned, hence <c>readonly</c>.
/// </summary>
private static readonly byte[] S = new byte[] {
    // S0
    14, 0, 4, 15, 13, 7, 1, 4, 2, 14, 15, 2, 11, 13, 8, 1,
    3, 10, 10, 6, 6, 12, 12, 11, 5, 9, 9, 5, 0, 3, 7, 8,
    4, 15, 1, 12, 14, 8, 8, 2, 13, 4, 6, 9, 2, 1, 11, 7,
    15, 5, 12, 11, 9, 3, 7, 14, 3, 10, 10, 0, 5, 6, 0, 13,
    // S1
    15, 3, 1, 13, 8, 4, 14, 7, 6, 15, 11, 2, 3, 8, 4, 14,
    9, 12, 7, 0, 2, 1, 13, 10, 12, 6, 0, 9, 5, 11, 10, 5,
    0, 13, 14, 8, 7, 10, 11, 1, 10, 3, 4, 15, 13, 4, 1, 2,
    5, 11, 8, 6, 12, 7, 6, 12, 9, 0, 3, 5, 2, 14, 15, 9,
    // S2
    10, 13, 0, 7, 9, 0, 14, 9, 6, 3, 3, 4, 15, 6, 5, 10,
    1, 2, 13, 8, 12, 5, 7, 14, 11, 12, 4, 11, 2, 15, 8, 1,
    13, 1, 6, 10, 4, 13, 9, 0, 8, 6, 15, 9, 3, 8, 0, 7,
    11, 4, 1, 15, 2, 14, 12, 3, 5, 11, 10, 5, 14, 2, 7, 12,
    // S3
    7, 13, 13, 8, 14, 11, 3, 5, 0, 6, 6, 15, 9, 0, 10, 3,
    1, 4, 2, 7, 8, 2, 5, 12, 11, 1, 12, 10, 4, 14, 15, 9,
    10, 3, 6, 15, 9, 0, 0, 6, 12, 10, 11, 1, 7, 13, 13, 8,
    15, 9, 1, 4, 3, 5, 14, 11, 5, 12, 2, 7, 8, 2, 4, 14,
    // S4
    2, 14, 12, 11, 4, 2, 1, 12, 7, 4, 10, 7, 11, 13, 6, 1,
    8, 5, 5, 0, 3, 15, 15, 10, 13, 3, 0, 9, 14, 8, 9, 6,
    4, 11, 2, 8, 1, 12, 11, 7, 10, 1, 13, 14, 7, 2, 8, 13,
    15, 6, 9, 15, 12, 0, 5, 9, 6, 10, 3, 4, 0, 5, 14, 3,
    // S5
    12, 10, 1, 15, 10, 4, 15, 2, 9, 7, 2, 12, 6, 9, 8, 5,
    0, 6, 13, 1, 3, 13, 4, 14, 14, 0, 7, 11, 5, 3, 11, 8,
    9, 4, 14, 3, 15, 2, 5, 12, 2, 9, 8, 5, 12, 15, 3, 10,
    7, 11, 0, 14, 4, 1, 10, 7, 1, 6, 13, 0, 11, 8, 6, 13,
    // S6
    4, 13, 11, 0, 2, 11, 14, 7, 15, 4, 0, 9, 8, 1, 13, 10,
    3, 14, 12, 3, 9, 5, 7, 12, 5, 2, 10, 15, 6, 8, 1, 6,
    1, 6, 4, 11, 11, 13, 13, 8, 12, 1, 3, 4, 7, 10, 14, 7,
    10, 9, 15, 5, 6, 0, 8, 15, 0, 14, 5, 2, 9, 3, 2, 12,
    // S7
    13, 1, 2, 15, 8, 13, 4, 8, 6, 10, 15, 3, 11, 7, 1, 4,
    10, 12, 9, 5, 3, 6, 14, 11, 5, 0, 0, 14, 12, 9, 7, 2,
    7, 2, 11, 1, 4, 14, 1, 7, 9, 4, 12, 10, 14, 8, 2, 13,
    0, 15, 6, 12, 10, 9, 13, 0, 15, 3, 3, 5, 5, 6, 8, 11,
};
/// <summary>
/// Applies a bit-selection table to a 64-bit word.
/// </summary>
/// <param name="value">Source word; bit 0 is its most significant bit.</param>
/// <param name="table">
/// For each output position (starting at the most significant bit), the zero-based
/// index of the source bit to copy.
/// </param>
/// <returns>
/// A word whose leading <c>table.Length</c> bits are the selected source bits, in table
/// order; for tables shorter than 64 entries the remaining low bits are zero.
/// </returns>
private static ulong Permutate(ulong value, ArrayView<byte> table)
{
    const ulong TopBit = 0x8000000000000000UL;

    ulong permuted = 0UL;
    int position = table.Length;

    // Walk the table back to front: each step pushes the bits gathered so far one place
    // to the right, so the bit chosen by table[0] ends up in the top position.
    while (position-- > 0)
    {
        ulong sourceMask = TopBit >> table[position];
        permuted = (permuted >> 1) | ((value & sourceMask) != 0 ? TopBit : 0UL);
    }

    return permuted;
}
/// <summary>
/// Rotates a 28-bit half-key, stored in the top 28 bits of a 64-bit word, to the left.
/// </summary>
/// <param name="key">Half-key occupying the 28 most significant bits.</param>
/// <param name="count">Number of bit positions to rotate by.</param>
/// <returns>The rotated half-key, again confined to the top 28 bits.</returns>
private static ulong ShiftHalfKey(ulong key, byte count)
{
    const ulong HalfKeyMask = 0xFFFFFFF000000000UL;

    // Bits pushed out of the top re-enter at the low end of the 28-bit window.
    ulong wrappedAround = key >> (28 - count);
    ulong shiftedUp = key << count;

    return (shiftedUp | wrappedAround) & HalfKeyMask;
}
/// <summary>
/// Derives all sixteen round keys from a 64-bit key.
/// </summary>
/// <param name="key">The 64-bit key.</param>
/// <param name="pc1">Selection table applied to <paramref name="key"/> before it is split into halves.</param>
/// <param name="pc2">Selection table applied to each rotated half pair to produce a round key.</param>
/// <returns>The round keys, <c>Key0</c> for the first round through <c>Key15</c> for the last.</returns>
private static RoundKeys CreateRoundKeys(ulong key, ArrayView<byte> pc1, ArrayView<byte> pc2)
{
    const ulong HalfKeyMask = 0xFFFFFFF000000000UL;

    ulong selected = Permutate(key, pc1);
    ulong c0 = selected & HalfKeyMask;          // first 28 selected bits
    ulong d0 = (selected << 28) & HalfKeyMask;  // next 28 selected bits, moved to the top

    // The rotation amounts are cumulative totals measured from the initial halves.
    RoundKeys schedule = new RoundKeys();
    schedule.Key0 = DeriveRoundKey(c0, d0, 1, pc2);
    schedule.Key1 = DeriveRoundKey(c0, d0, 2, pc2);
    schedule.Key2 = DeriveRoundKey(c0, d0, 4, pc2);
    schedule.Key3 = DeriveRoundKey(c0, d0, 6, pc2);
    schedule.Key4 = DeriveRoundKey(c0, d0, 8, pc2);
    schedule.Key5 = DeriveRoundKey(c0, d0, 10, pc2);
    schedule.Key6 = DeriveRoundKey(c0, d0, 12, pc2);
    schedule.Key7 = DeriveRoundKey(c0, d0, 14, pc2);
    schedule.Key8 = DeriveRoundKey(c0, d0, 15, pc2);
    schedule.Key9 = DeriveRoundKey(c0, d0, 17, pc2);
    schedule.Key10 = DeriveRoundKey(c0, d0, 19, pc2);
    schedule.Key11 = DeriveRoundKey(c0, d0, 21, pc2);
    schedule.Key12 = DeriveRoundKey(c0, d0, 23, pc2);
    schedule.Key13 = DeriveRoundKey(c0, d0, 25, pc2);
    schedule.Key14 = DeriveRoundKey(c0, d0, 27, pc2);
    // The last round uses the halves unrotated.
    schedule.Key15 = Permutate(c0 | (d0 >> 28), pc2);
    return schedule;
}

/// <summary>
/// Rotates both half-keys by <paramref name="rotation"/> bits, joins them into one
/// 56-bit value and applies <paramref name="pc2"/>.
/// </summary>
private static ulong DeriveRoundKey(ulong c0, ulong d0, byte rotation, ArrayView<byte> pc2)
{
    ulong upperHalf = ShiftHalfKey(c0, rotation);
    ulong lowerHalf = ShiftHalfKey(d0, rotation);
    return Permutate(upperHalf | (lowerHalf >> 28), pc2);
}
/// <summary>
/// Round function: expands a half block, mixes in the round key, substitutes through
/// the eight S-boxes and permutes the result.
/// </summary>
/// <param name="key">Round key.</param>
/// <param name="value">Half block to transform.</param>
/// <param name="e">Expansion table applied to <paramref name="value"/>.</param>
/// <param name="s">Eight concatenated 64-entry substitution tables.</param>
/// <param name="p">Permutation table applied to the substituted word.</param>
/// <returns>The permuted S-box output.</returns>
private static ulong Encrypt(
    ulong key,
    ulong value,
    ArrayView<byte> e,
    ArrayView<byte> s,
    ArrayView<byte> p
)
{
    ulong mixed = Permutate(value, e) ^ key;
    ulong substituted = 0UL;

    for (int box = 0; box < 8; ++box)
    {
        // Take the box-th 6-bit group, counting from the most significant end.
        int chunk = (int)((mixed >> (58 - 6 * box)) & 0x3FUL);
        ulong nibble = s[chunk + 64 * box];

        // Shifting to the top first keeps only the low 4 bits of the table entry;
        // the shift back down then drops them into the box-th nibble slot.
        substituted |= (nibble << 60) >> (4 * box);
    }

    return Permutate(substituted, p);
}
/// <summary>
/// Kernel entry point: encrypts <c>blocks[index]</c> under <c>keys[index]</c> and stores
/// the result in <c>output[index]</c>.
/// </summary>
/// <param name="index">Element this invocation works on.</param>
/// <param name="pc1">Key selection table forwarded to <c>CreateRoundKeys</c>.</param>
/// <param name="pc2">Round-key selection table forwarded to <c>CreateRoundKeys</c>.</param>
/// <param name="ip">Permutation applied to the input block before the rounds.</param>
/// <param name="iip">Permutation applied to the recombined halves after the rounds.</param>
/// <param name="e">Expansion table forwarded to <c>Encrypt</c>.</param>
/// <param name="s">Substitution tables forwarded to <c>Encrypt</c>.</param>
/// <param name="p">Permutation table forwarded to <c>Encrypt</c>.</param>
/// <param name="keys">One 64-bit key per element.</param>
/// <param name="blocks">One 64-bit input block per element.</param>
/// <param name="output">Receives one 64-bit result per element.</param>
private static void Kernel(
    Index index,
    ArrayView<byte> pc1,
    ArrayView<byte> pc2,
    ArrayView<byte> ip,
    ArrayView<byte> iip,
    ArrayView<byte> e,
    ArrayView<byte> s,
    ArrayView<byte> p,
    ArrayView<ulong> keys,
    ArrayView<ulong> blocks,
    ArrayView<ulong> output
)
{
    const ulong UpperHalf = 0xFFFFFFFF00000000UL;

    RoundKeys schedule = CreateRoundKeys(keys[index], pc1, pc2);
    ulong permuted = Permutate(blocks[index], ip);

    // Both halves live in the upper 32 bits of their own word.
    ulong halfA = permuted & UpperHalf;
    ulong halfB = (permuted << 32) & UpperHalf;

    // Sixteen rounds. Each round XORs the round function of one half into the other,
    // so the two words take turns: halfA changes on even-numbered keys, halfB on odd.
    halfA ^= Encrypt(schedule.Key0, halfB, e, s, p);
    halfB ^= Encrypt(schedule.Key1, halfA, e, s, p);
    halfA ^= Encrypt(schedule.Key2, halfB, e, s, p);
    halfB ^= Encrypt(schedule.Key3, halfA, e, s, p);
    halfA ^= Encrypt(schedule.Key4, halfB, e, s, p);
    halfB ^= Encrypt(schedule.Key5, halfA, e, s, p);
    halfA ^= Encrypt(schedule.Key6, halfB, e, s, p);
    halfB ^= Encrypt(schedule.Key7, halfA, e, s, p);
    halfA ^= Encrypt(schedule.Key8, halfB, e, s, p);
    halfB ^= Encrypt(schedule.Key9, halfA, e, s, p);
    halfA ^= Encrypt(schedule.Key10, halfB, e, s, p);
    halfB ^= Encrypt(schedule.Key11, halfA, e, s, p);
    halfA ^= Encrypt(schedule.Key12, halfB, e, s, p);
    halfB ^= Encrypt(schedule.Key13, halfA, e, s, p);
    halfA ^= Encrypt(schedule.Key14, halfB, e, s, p);
    halfB ^= Encrypt(schedule.Key15, halfA, e, s, p);

    // The half updated last goes on top, the other one below it.
    output[index] = Permutate(halfB | (halfA >> 32), iip);
}
/// <summary>
/// Test driver: uploads the lookup tables plus <c>count</c> keys and blocks to the
/// accelerator, runs <c>Kernel</c> once, and prints the elapsed time followed by one
/// "key block output" line per element.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
private static void Main(string[] args)
{
    using (var context = new Context())
    using (var accelerator = new CPUAccelerator(context))
    {
        // Number of key/block pairs; sizes every data buffer and the output loop below.
        const int count = 10;

        var kernel = accelerator.LoadAutoGroupedStreamKernel<
            Index,
            ArrayView<byte>,
            ArrayView<byte>,
            ArrayView<byte>,
            ArrayView<byte>,
            ArrayView<byte>,
            ArrayView<byte>,
            ArrayView<byte>,
            ArrayView<ulong>,
            ArrayView<ulong>,
            ArrayView<ulong>
        >(Kernel);

        using (var pc1Buffer = accelerator.Allocate<byte>(PC1.Length))
        using (var pc2Buffer = accelerator.Allocate<byte>(PC2.Length))
        using (var ipBuffer = accelerator.Allocate<byte>(IP.Length))
        using (var iipBuffer = accelerator.Allocate<byte>(IIP.Length))
        using (var eBuffer = accelerator.Allocate<byte>(E.Length))
        using (var sBuffer = accelerator.Allocate<byte>(S.Length))
        using (var pBuffer = accelerator.Allocate<byte>(P.Length))
        using (var keysBuffer = accelerator.Allocate<ulong>(count))
        using (var blocksBuffer = accelerator.Allocate<ulong>(count))
        using (var outputBuffer = accelerator.Allocate<ulong>(count))
        {
            // Upload the lookup tables.
            pc1Buffer.CopyFrom(PC1, 0, 0, PC1.Length);
            pc2Buffer.CopyFrom(PC2, 0, 0, PC2.Length);
            ipBuffer.CopyFrom(IP, 0, 0, IP.Length);
            iipBuffer.CopyFrom(IIP, 0, 0, IIP.Length);
            eBuffer.CopyFrom(E, 0, 0, E.Length);
            sBuffer.CopyFrom(S, 0, 0, S.Length);
            pBuffer.CopyFrom(P, 0, 0, P.Length);

            // Just for testing: keys and blocks are both 0, 1, ..., count - 1.
            ulong[] keys = Enumerable.Range(0, count).Select(x => (ulong)x).ToArray();
            keysBuffer.CopyFrom(keys, 0, 0, keys.Length);
            ulong[] blocks = Enumerable.Range(0, count).Select(x => (ulong)x).ToArray();
            blocksBuffer.CopyFrom(blocks, 0, 0, blocks.Length);

            // The timed region covers the launch and the wait for completion.
            var sw = Stopwatch.StartNew();
            kernel(
                count,
                pc1Buffer.View,
                pc2Buffer.View,
                ipBuffer.View,
                iipBuffer.View,
                eBuffer.View,
                sBuffer.View,
                pBuffer.View,
                keysBuffer.View,
                blocksBuffer.View,
                outputBuffer.View
            );
            accelerator.Synchronize();
            sw.Stop();
            Console.WriteLine("Kernel executed in {0} ms", sw.ElapsedMilliseconds);

            ulong[] output = outputBuffer.GetAsArray();
            // Bounded by count rather than a literal so the printout tracks the buffer size.
            for (int i = 0; i < count; ++i)
            {
                Console.WriteLine("{0} {1} {2}", keys[i], blocks[i], output[i]);
            }
        }
    }
}
}
}
After a lot of debugging, I discovered that the results first go wrong on the line `ulong b1 = Encrypt(roundKeys.Key0, b0, e, s, p) ^ a0;`.
Strangely, `Encrypt(roundKeys.Key0, b0, e, s, p)` and `a0` are each calculated correctly on their own, but `b1` always evaluates to 0.
In case it matters: I am running this on Arch Linux 5.0.10 with .NET Core 2.2.105, an NVIDIA GeForce GTX 650 GPU with the 418.56 driver, and CUDA 10.1.