;
; Copyright © L. van Oldeneel, 2012
; e-mail:
;
; This program is free software: you can redistribute it
; and/or modify it under the terms of the GNU General Public
; License as published by the Free Software Foundation.
;
; It is distributed without any warranty of correctness nor
; fitness for any particular purpose. See the GNU General
; Public License for more details.
;
; .
;
; Description: Skein-512-256 hash function, version 1.3.
; Sskein_512_256.asm - Version 1 - August 2012.
;
;
;  ----------------------------------------------------
; | User interface:                                    |
; |----------------------------------------------------|
; |(1) Data to hash must be in SRAM with the first byte|
; |    at the location pointed by SRAM_DATA. The data  |
; |    has to be updated before each update or final   |
; |    call and the number of bytes needed in SRAM by  |
; |    each update call is equal to DATA_NUM_BYTE      |
; |                                                    |
; |    If the last block called by the update routine  |
; |    is exactly the message block, H must be set.    |
; |    H must be clear otherwise.                      |
; |                                                    |
; |    If final routine is called, r24 must contain    |
; |    the number of bytes of data passed to the       |
; |    function.                                       |
; |----------------------------------------------------|
; |(2) Call init, update or final routine              |
; |----------------------------------------------------|
; |(3) After each call, the intermediate (or final)    |
; |    hash function state is at location pointed by   |
; |    SRAM_STATE. Length of hash intermediate states  |
; |    is given by STATE_NUM_BYTE constant.            |
; |    Length of final hash value is given by          |
; |    HASH_NUM_BYTE.                                  |
;  ----------------------------------------------------
;
; The three test vectors in the Skein 1.3 document, Appendix C.1, p73, have been tested successfully.
;
;.include "tn45def.inc"

; Constants
;
.EQU    DATA_NUM_BYTE    = 64           ;Number of bytes processed by one update call
.EQU    STATE_NUM_BYTE   = 64           ;Size of the intermediate hash state (in bytes)
.EQU    ADD_MEM_NUM_BYTE = 96           ;Additional memory for internal computation (in bytes):
                                        ; 64 for the 3-fish key K0..K7
                                        ; + 8 for the 3-fish K8
                                        ; + 24 for the tweak T0..T2
.EQU    HASH_NUM_BYTE    = 32           ;Size of the output hash (in bytes)

; SRAM layout
.DSEG
SRAM_DATA:      .BYTE DATA_NUM_BYTE     ;message block supplied by the caller
SRAM_STATE:     .BYTE STATE_NUM_BYTE    ;intermediate / final hash state
SRAM_ADD_MEM:   .BYTE ADD_MEM_NUM_BYTE  ;must be following and contiguous to SRAM_STATE (like here)

; Addresses of the eight 64-bit state words. ST8 is the address just past the
; last word; it is only used as the end pointer for a pre-decrement store.
.EQU    ST0 = SRAM_STATE
.EQU    ST1 = SRAM_STATE + 1*8
.EQU    ST2 = SRAM_STATE + 2*8
.EQU    ST3 = SRAM_STATE + 3*8
.EQU    ST4 = SRAM_STATE + 4*8
.EQU    ST5 = SRAM_STATE + 5*8
.EQU    ST6 = SRAM_STATE + 6*8
.EQU    ST7 = SRAM_STATE + 7*8
.EQU    ST8 = SRAM_STATE + 8*8

.CSEG

;
; Registers declaration
;
.def    round2 = R0                     ;scratch: 2 * round (table index in 'addkey')
;.def   sub_round = R1
;.def   w22 = R2
;.def   w23 = R3
;.def   w24 = R4
;.def   w25 = R5
;.def   w26 = R6
;.def   w27 = R7

; 64-bit working word W1 (w10 = least significant byte)
.def    w10 = R8
.def    w11 = R9
.def    w12 = R10
.def    w13 = R11
.def    w14 = R12
.def    w15 = R13
.def    w16 = R14
.def    w17 = R15

; 64-bit working word W0 (w00 = least significant byte)
.def    w00 = R16
.def    w01 = R17
.def    w02 = R18
.def    w03 = R19
.def    w04 = R20
.def    w05 = R21
.def    w06 = R22
.def    w07 = R23

.def    temp  = R24                     ;general scratch; also the byte count passed to 'final'
.def    round = R25                     ;3-fish round counter (rounds / 4); also a loop counter
;[R26 to R31] are defined as the X, Y and Z pointers (all used here)

; Initialisation
; save the initial config message in SRAM_STATE, as a key for the first 3-fish to be performed.
; save the initial tweak T0|T1. It will be updated afterwards in the UPDATE function.
; Tweak location inside SRAM_ADD_MEM: the 16 bytes T0|T1 start right after
; K0..K8 (9 * 8 = 72 bytes); the most significant tweak byte (flags/type) is
; the last of those 16 bytes.
.EQU    SKEIN_TWEAK_ADDR     = SRAM_ADD_MEM + 72
.EQU    SKEIN_TWEAK_MSB_ADDR = SRAM_ADD_MEM + 87

;-----------------------------------------------------------------------
; init
;   Copies the 64-byte C_init table from flash into SRAM_STATE and writes
;   the initial 16-byte tweak T0|T1: fifteen 0x00 bytes followed by 0x70
;   (type Tmsg = 48 with the 'First' bit set).
;   Clobbers: temp, round, Y, Z, flags.
;-----------------------------------------------------------------------
init:
        ldi     ZL, low(C_init<<1)              ; Z = flash byte address of C_init
        ldi     ZH, high(C_init<<1)
        ldi     YL, low(SRAM_STATE)             ; Y = destination in SRAM
        ldi     YH, high(SRAM_STATE)
        ldi     round, STATE_NUM_BYTE           ; 'round' is a plain byte counter here
init_copy_config:
        lpm     temp, Z+
        st      Y+, temp
        dec     round
        brne    init_copy_config

        ; tweak bytes 0..14 = 0 (position field and reserved bits)
        ldi     YL, low(SKEIN_TWEAK_ADDR)
        ldi     YH, high(SKEIN_TWEAK_ADDR)
        ldi     temp, 0
        ldi     round, 15
init_clear_tweak:
        st      Y+, temp
        dec     round
        brne    init_clear_tweak

        ; tweak byte 15 = 0x70 : b01110000 = 'First' (0x40) | type Tmsg (48)
        ldi     temp, 0x70
        st      Y+, temp
        ret

;-----------------------------------------------------------------------
; update
;   Processes one full block of DATA_NUM_BYTE bytes located at SRAM_DATA.
;   CAUTION: the half-carry flag H must be set by the caller when this is
;   the last (full) block of the message, i.e. when the message length is
;   a multiple of the block size; it must be clear otherwise.
;   Steps: advance the tweak position by DATA_NUM_BYTE (tweak_upd), run one
;   3-FISH call (threef), then rewrite the tweak flag byte to the bare
;   type Tmsg (48), which clears the 'First' flag.
;   Note: 'ldi' leaves SREG untouched, so H reaches tweak_upd unchanged.
;-----------------------------------------------------------------------
update:
        ldi     temp, DATA_NUM_BYTE
        rcall   tweak_upd
update_2:
        rcall   threef
        ldi     temp, 48                        ; type Tmsg, 'First' and 'Final' clear
        ldi     YL, low(SKEIN_TWEAK_MSB_ADDR)
        ldi     YH, high(SKEIN_TWEAK_MSB_ADDR)
        st      Y, temp
        ret
;end of update

; Final
; r24 contains the number of bytes passed to the final function.
; if r24 = 0, branch to the last output UBI
; otherwise, pad the message, process it as the last block and then perform the output UBI
;-----------------------------------------------------------------------
; final
;   In:  r24 (= temp) = number of message bytes present at SRAM_DATA
;        (0..DATA_NUM_BYTE).
;   Out: hash state at SRAM_STATE after the output transform.
;   r24 = 0 : only the output transform is run.
;   r24 > 0 : the block is zero-padded up to DATA_NUM_BYTE, the tweak is
;             advanced by r24 with 'Final' set, one 3-FISH call is made,
;             then the output transform is run.
;   A full block (r24 = DATA_NUM_BYTE) needs no padding; the padding loop
;   is skipped in that case, because a zero loop count would otherwise make
;   the dec/brne loop run 256 times and overwrite the memory that follows
;   SRAM_DATA. Values above DATA_NUM_BYTE are clamped to DATA_NUM_BYTE for
;   the same reason.
;   NOTE(review): with r24 = 0 no message block is processed here, so a
;   message that never went through 'update' (empty message) gets no
;   message stage at all - confirm callers do not rely on that case.
;   Clobbers: temp, round, X, Y, Z, flags, plus everything threef and
;   tweak_upd clobber.
;-----------------------------------------------------------------------
final:
        cpi     r24, 0                          ; r24 = temp
        breq    final_output
        cpi     r24, DATA_NUM_BYTE + 1
        brlo    final_len_ok                    ; r24 <= DATA_NUM_BYTE
        ldi     r24, DATA_NUM_BYTE              ; clamp an over-long length
final_len_ok:
        ;updating the tweak (temp already has the value to add to position)
        bset    5                               ; set H (SREG bit 5): this is the last message block
        rcall   tweak_upd
        ;padding the end of the message in SRAM_DATA
        ldi     XH, high(SRAM_DATA)
        ldi     XL, low(SRAM_DATA)
        add     XL, r24                         ; X = first byte after the message
        ldi     round, 0                        ; round is here a temporary zero for the carry
        adc     XH, round
        ldi     round, DATA_NUM_BYTE
        sub     round, r24                      ; round = number of padding bytes
        breq    final_last_block                ; full block: nothing to pad
        ldi     temp, 0
final_padding:
        st      X+, temp
        dec     round
        brne    final_padding
final_last_block:
        rcall   threef

final_output:
        ;preparing the message (64 zeros) in SRAM_DATA
        ldi     XH, high(SRAM_DATA)
        ldi     XL, low(SRAM_DATA)
        ldi     round, DATA_NUM_BYTE            ;'round' is used here as a temporary counter
        ldi     temp, 0
output_message:
        st      X+, temp
        dec     round
        brne    output_message
        ;preparing the tweak: copy the 16 bytes of final_tweak (flash)
        ;over T0|T1, located in the SRAM_ADD_MEM.
        ldi     YH, high(SRAM_ADD_MEM + 72)
        ldi     YL, low(SRAM_ADD_MEM + 72)
        ldi     ZH, high(final_tweak<<1)
        ldi     ZL, low(final_tweak<<1)
        ldi     round, 16                       ;'round' is used here as a temporary counter
final_tweak_loop:
        lpm     temp, Z+
        st      Y+, temp
        dec     round
        brne    final_tweak_loop
        ;call 3-fish
        rcall   threef
        ;done, congratulations
        ret

;*******************************************
;***Threefish Block Cipher Implementation***
;*******************************************
;******* Function THREEF - BEG *******
;PRE
;the data is supposed to be present :
;- plain in SRAM_DATA
;- state in SRAM_STATE
;- key (incl. K8) then tweak (incl.
;  T2) in SRAM_ADD_MEM,
;POST
;at the end the cipher xored with the plain message is in the state space
;
; Steps performed by threef:
;   1. copy the 64 bytes of SRAM_STATE to SRAM_ADD_MEM (key K0..K7)
;   2. k8_t2 : compute K8 and T2
;   3. copy the 64 bytes of SRAM_DATA to SRAM_STATE (plaintext)
;   4. 18 times: addkey, then 4 x (4 mix + perm)   (= 72 rounds)
;   5. one last addkey (round = 18)
;   6. SRAM_STATE ^= SRAM_DATA
; Clobbers: W0, W1, temp, round, round2, X, Y, Z, flags.
threef:
        ;3-FISH KEY LOADING
        ldi     XH, high(SRAM_STATE)
        ldi     XL, low(SRAM_STATE)
        ldi     YH, high(SRAM_ADD_MEM)
        ldi     YL, low(SRAM_ADD_MEM)
        ldi     round, 64                       ;round and w00 are temporaries here
key_loop:
        ld      w00, X+
        st      Y+, w00
        dec     round
        brne    key_loop

        ;3-FISH KEYSCHEDULE COMPUTATION
        rcall   k8_t2

        ;3-FISH STATE LOADING
        ;load the message in the state place
        ldi     XH, high(SRAM_DATA)
        ldi     XL, low(SRAM_DATA)
        ;...to be placed in the state, pointed by Y
        ldi     YH, high(SRAM_STATE)
        ldi     YL, low(SRAM_STATE)
        ldi     round, 64                       ;'round' is used here as a temporary counter
plain_loop:
        ld      temp, X+
        st      Y+, temp
        dec     round
        brne    plain_loop

        ;3-FISH INITIALISATION
        clr     round                           ;subkey counter = 0
        ldi     ZL, 0x00                        ;index into rdj_table for the 'mix' function

        ;3-FISH MAIN LOOP : addkey + 16 mix + 4 perm = 4 3-fish rounds / loop
main_loop:
        rcall   addkey
        ;reload the rdj_table high address in ZH (used in 'mix', overwritten by 'addkey');
        ;ZL is pushed/popped inside addkey
        ldi     ZH, high(rdj_table<<1)
        ldi     YL, 4                           ;YL is a temporary counter: not used in 'mix' nor in 'perm'
main_subloop:
        ldi     XH, high(SRAM_STATE)
        ldi     XL, low(SRAM_STATE)
        rcall   mix                             ;words 0,1
        rcall   mix                             ;words 2,3
        rcall   mix                             ;words 4,5
        rcall   mix                             ;words 6,7
        rcall   perm
        dec     YL
        brne    main_subloop
        inc     round                           ;round is in fact 3-fish-round/4
        ;check if round = 18 (=72/4), exit if yes
        cpi     round, 18
        brne    main_loop

        ;last addkey after the 72 rounds
        rcall   addkey

        ;UBI-XOR
        ;xor the message into the result held in SRAM_STATE
        ldi     XH, high(SRAM_DATA)
        ldi     XL, low(SRAM_DATA)
        ldi     YH, high(SRAM_STATE)
        ldi     YL, low(SRAM_STATE)
        ldi     round, 64                       ;round, w00 and w01 are temporary values here
end_xor_loop:
        ld      w00, X+
        ld      w01, Y
        eor     w01, w00
        st      Y+, w01
        dec     round
        brne    end_xor_loop
        ret
;******* Function THREEF - END *******

;******* Function PERM - BEG *******
; Word permutation of the 8 state words, done with loads into W0/W1 and
; backward stores (save0/save1 pre-decrement X, so X is set to the address
; just past the destination word):
;   new 0 = old 2, new 2 = old 4, new 4 = old 6, new 6 = old 0,
;   new 3 = old 7, new 7 = old 3; words 1 and 5 are not touched.
; Clobbers: W0, W1, X.
perm:
        ;save word 6 in W1
        ldi     XH, high(ST6)
        ldi     XL, low(ST6)
        rcall   load1
        ;move 0 -> 6
        ldi     XH, high(ST0)
        ldi     XL, low(ST0)
        rcall   load0
        ldi     XH, high(ST7)
        ldi     XL, low(ST7)
        rcall   save0
        ;move 2 -> 0
        ldi     XH, high(ST2)
        ldi     XL, low(ST2)
        rcall   load0
        ldi     XH, high(ST1)
        ldi     XL, low(ST1)
        rcall   save0
        ;move 4 -> 2
        ldi     XH, high(ST4)
        ldi     XL, low(ST4)
        rcall   load0
        ldi     XH, high(ST3)
        ldi     XL, low(ST3)
        rcall   save0
        ;move 6 -> 4 (old word 6 is in W1)
        ldi     XH, high(ST5)
        ldi     XL, low(ST5)
        rcall   save1
        ;save word 3 in W1
        ldi     XH, high(ST3)
        ldi     XL, low(ST3)
        rcall   load1
        ;move 7 -> 3
        ldi     XH, high(ST7)
        ldi     XL, low(ST7)
        rcall   load0
        ldi     XH, high(ST4)
        ldi     XL, low(ST4)
        rcall   save0
        ;move 3 -> 7 (old word 3 is in W1)
        ldi     XH, high(ST8)
        ldi     XL, low(ST8)
        rcall   save1
        ;done
        ret
;******* Function PERM - END *******

;******* Function ADDKEY - BEG *******
; Adds subkey number 'round' to the state (all additions modulo 2^64):
;   word i  += key word at mod9 entry (round + i), i = 0..7
;   word 5  += tweak word at mod3 entry (round)
;   word 6  += tweak word at mod3 entry (round + 1)
;   word 7  += round
; The mod9 / mod3 flash tables hold the SRAM addresses (high byte, low byte)
; of the key / tweak words; they are indexed through ZL alone, with ZH set
; to the high byte of the table's byte address.
; In:       round = subkey number.
; Preserve: ZL (pushed/popped), round.
; Clobbers: W0, W1, temp, round2, X, Y, ZH, flags.
addkey:
        push    ZL                              ;save ZL for the 'mix' function
        ;adding naked round-keys
        ldi     XH, high(SRAM_STATE)            ;pointing at the beginning of the state
        ldi     XL, low(SRAM_STATE)
        ldi     ZH, high(mod9<<1)
        mov     round2, round
        lsl     round2                          ;round2 = round * 2 (two table bytes per entry)
        mov     ZL, round2
        ldi     temp, 8
addkey_loop:
        lpm     YH, Z+
        lpm     YL, Z+                          ;Y points on ksmod9
        rcall   load0                           ;W0 = state word, X += 8
        rcall   load1y                          ;W1 = key word
        rcall   add01
        rcall   save0                           ;store back, X -= 8
        adiw    XH:XL, 8                        ;update X for next iteration
        dec     temp
        brne    addkey_loop

        ;adding round-tweaks
        ldi     XH, high(SRAM_STATE + 40)       ;pointing at state word 5
        ldi     XL, low(SRAM_STATE + 40)
        ldi     ZH, high(mod3<<1)
        mov     ZL, round2
        ldi     temp, 2
addtweak_loop:
        lpm     YH, Z+
        lpm     YL, Z+                          ;Y points on tsmod3
        rcall   load0
        rcall   load1y
        rcall   add01
        rcall   save0
        adiw    XH:XL, 8                        ;update X for next iteration
        dec     temp
        brne    addtweak_loop

        ;adding s = round to state word 7; X already points at SRAM_STATE + 56.
        ;The carry is propagated through all 8 bytes: adding to the two low
        ;bytes only would drop a carry out of byte 1 and give a wrong
        ;64-bit sum whenever the low 16 bits overflow.
        clr     temp                            ;temp = 0, used to add the carry only
        rcall   load0                           ;W0 = word 7, X += 8
        add     w00, round
        adc     w01, temp
        adc     w02, temp
        adc     w03, temp
        adc     w04, temp
        adc     w05, temp
        adc     w06, temp
        adc     w07, temp
        rcall   save0                           ;store back, X -= 8
        ;done
        pop     ZL
        ret
;******* Function ADDKEY - END *******

;******* Function MIX - BEG ********** cfr. p11
;performs the MIX function on two words, placed sequentially in the data space.
; X points on the first one.
;
; In:   X  = address of the first word of the pair
;       ZH = high byte of the rdj_table byte address, ZL = current index
; Out:  first word  = W0 + W1
;       second word = (W1 rotated left by the table value) XOR (W0 + W1)
;       X advanced by 16 (next pair), ZL advanced by 1
; The index is masked with 0x1F before use, so it cycles through the 32
; table entries.
; Clobbers: W0, W1, temp, flags.
mix:
        ;fetch both operands (X ends 16 bytes further)
        rcall   load0
        rcall   load1
mix_2:
        ;W0 = W0 + W1 (modulo 2^64)
        rcall   add01
        ;rotate W1 left by the number of bits read from the table
        andi    ZL, 0x1F
        lpm     temp, Z+
mix_rotate:
        rcall   rol64
        dec     temp
        brne    mix_rotate
        ;W1 = W1 XOR W0
        eor     w17, w07
        eor     w16, w06
        eor     w15, w05
        eor     w14, w04
        eor     w13, w03
        eor     w12, w02
        eor     w11, w01
        eor     w10, w00
        ;write back: second word first (stores run backward from X), then the first
        rcall   save1
        rcall   save0
        ;X -> next pair of words
        adiw    XH:XL, 16
        ret
;******* Function MIX - END ***********

;******* k8 and t2 comput.- BEG *******
; Computes the extended key word and the extended tweak word.
;PRE :
;- key K0..K7 in the first 64 bytes of SRAM_ADD_MEM
;- tweak T0|T1 in SRAM_ADD_MEM + 72 .. + 87
;POST:
;- K8 = C240 xor K0 xor ... xor K7, stored at SRAM_ADD_MEM + 64
;- T2 = T0 xor T1, stored at SRAM_ADD_MEM + 88
;MODIFIED :
;- W0
;- temp, round
;- Y
k8_t2:
        ;W0 = C240 (0x 1B D1 1B DA A9 FC 1A 22), w00 = least significant byte
        ldi     w00, 0x22
        ldi     w01, 0x1A
        ldi     w02, 0xFC
        ldi     w03, 0xA9
        ldi     w04, 0xDA
        ldi     w05, 0x1B
        ldi     w06, 0xD1
        ldi     w07, 0x1B
        ;Y = address of K0
        ldi     YL, low(SRAM_ADD_MEM)
        ldi     YH, high(SRAM_ADD_MEM)
        ;xor k0...k7 into W0 ('round' counts the 8 key words)
        ldi     round, 0x08
k8_t2_1:
        ld      temp, Y+
        eor     w00, temp
        ld      temp, Y+
        eor     w01, temp
        ld      temp, Y+
        eor     w02, temp
        ld      temp, Y+
        eor     w03, temp
        ld      temp, Y+
        eor     w04, temp
        ld      temp, Y+
        eor     w05, temp
        ld      temp, Y+
        eor     w06, temp
        ld      temp, Y+
        eor     w07, temp
        dec     round
        brne    k8_t2_1
        ;store k8 after k7 (already pointed by Y)
k8_t2_2:
        st      Y+, w00
        st      Y+, w01
        st      Y+, w02
        st      Y+, w03
        st      Y+, w04
        st      Y+, w05
        st      Y+, w06
        st      Y+, w07
        ;compute t2: Y now points on t0, load it in W0
        ld      w00, Y+
        ld      w01, Y+
        ld      w02, Y+
        ld      w03, Y+
        ld      w04, Y+
        ld      w05, Y+
        ld      w06, Y+
        ld      w07, Y+
        ;xor t1
        ld      temp, Y+
        eor     w00, temp
        ld      temp, Y+
        eor     w01, temp
        ld      temp, Y+
        eor     w02, temp
        ld      temp, Y+
        eor     w03, temp
        ld      temp, Y+
        eor     w04, temp
        ld      temp, Y+
        eor     w05, temp
        ld      temp, Y+
        eor     w06, temp
        ld      temp, Y+
        eor     w07, temp
        ;store t2 after t1
        st      Y+, w00
        st      Y+, w01
        st      Y+, w02
        st      Y+, w03
        st      Y+, w04
        st      Y+, w05
        st      Y+, w06
        st      Y+, w07
        ret
;******* k8 and t2 comput.- END *******

;******** Function ROL64 - BEG *******
;makes the 64-bit word 'w1' 1-bit-rotate-left.
; Note that the execution time is constant (not useful here)
; Clobbers: W1, flags.
rol64:
        lsl     w10
        rol     w11
        rol     w12
        rol     w13
        rol     w14
        rol     w15
        rol     w16
        rol     w17
        brcc    rol64_0
        ;if C = 1 : the bit shifted out of w17 re-enters at bit 0 of w10
        inc     w10
        ret
        ;if C = 0
rol64_0:
        ret
;******** Function ROL64 - END *******

;******* Function ADD01 - BEG **********
;performs W0 = W0 + W1 (mod 2^64)
add01:
        add     w00, w10
        adc     w01, w11
        adc     w02, w12
        adc     w03, w13
        adc     w04, w14
        adc     w05, w15
        adc     w06, w16
        adc     w07, w17
        ret

;******* Function LOAD0 - BEG **********
;loads the 8 bytes pointed by X in W0 (X is left 8 bytes further)
load0:
        ld      w00, X+
        ld      w01, X+
        ld      w02, X+
        ld      w03, X+
        ld      w04, X+
        ld      w05, X+
        ld      w06, X+
        ld      w07, X+
        ret

;******* Function LOAD1 - BEG **********
;loads the 8 bytes pointed by X in W1 (X is left 8 bytes further)
load1:
        ld      w10, X+
        ld      w11, X+
        ld      w12, X+
        ld      w13, X+
        ld      w14, X+
        ld      w15, X+
        ld      w16, X+
        ld      w17, X+
        ret

;******* Function LOAD1Y - BEG **********
;loads the 8 bytes pointed by Y in W1 (Y is left 8 bytes further)
load1y:
        ld      w10, Y+
        ld      w11, Y+
        ld      w12, Y+
        ld      w13, Y+
        ld      w14, Y+
        ld      w15, Y+
        ld      w16, Y+
        ld      w17, Y+
        ret

;******* Function SAVE0 - BEG **********
;saves the 8 bytes of W0 backward: X must point just past the destination
;word and is left on its first byte
save0:
        st      -X, w07
        st      -X, w06
        st      -X, w05
        st      -X, w04
        st      -X, w03
        st      -X, w02
        st      -X, w01
        st      -X, w00
        ret

;******* Function SAVE1 - BEG **********
;saves the 8 bytes of W1 backward: X must point just past the destination
;word and is left on its first byte
save1:
        st      -X, w17
        st      -X, w16
        st      -X, w15
        st      -X, w14
        st      -X, w13
        st      -X, w12
        st      -X, w11
        st      -X, w10
        ret

;******* Function TWEAK_UPD - BEG ******
; Updates the tweak T0|T1 stored at SRAM_ADD_MEM + 72.
; In:   temp = number of bytes to add to the 96-bit position field
;       H flag set -> the 'Final' bit (msb of the last tweak byte) is set too
; Out:  Y points on the 13th tweak byte; temp is unchanged
; Clobbers: round, W0, w10..w13, Y, flags.
tweak_upd:
        ;update the 'Final' flag if H is set. 'First' might be cleared or set,
        ;so an or-instruction is needed.
        brhc    tweak_upd_2
        ldi     YH, high(SRAM_ADD_MEM + 87)
        ldi     YL, low(SRAM_ADD_MEM + 87)
        ld      round, Y                        ;round is here a temporary register
        ori     round, 0x80                     ;b10000000 : set the msb ('Final')
        st      Y, round
tweak_upd_2:
        ;update the tweak position field, adding the temp value
        ;load the tweak lower 96-bit (12 bytes), located in the SRAM_ADD_MEM.
        ldi     YH, high(SRAM_ADD_MEM + 72)
        ldi     YL, low(SRAM_ADD_MEM + 72)
        ld      w00, Y+
        ld      w01, Y+
        ld      w02, Y+
        ld      w03, Y+
        ld      w04, Y+
        ld      w05, Y+
        ld      w06, Y+
        ld      w07, Y+
        ld      w10, Y+
        ld      w11, Y+
        ld      w12, Y+
        ld      w13, Y+
        ;add the block size
        add     w00, temp
        ldi     round, 0                        ;zero in 'round', so that temp = r24 keeps its value
        adc     w01, round
        adc     w02, round
        adc     w03, round
        adc     w04, round
        adc     w05, round
        adc     w06, round
        adc     w07, round
        adc     w10, round
        adc     w11, round
        adc     w12, round
        adc     w13, round
        ;save the position back
        sbiw    YH:YL, 12                       ;same register-pair form as the adiw XH:XL used in this file
        st      Y+, w00
        st      Y+, w01
        st      Y+, w02
        st      Y+, w03
        st      Y+, w04
        st      Y+, w05
        st      Y+, w06
        st      Y+, w07
        st      Y+, w10
        st      Y+, w11
        st      Y+, w12
        st      Y+, w13
        ; Y points here to the 13th tweak byte
        ret
;******* Function TWEAK_UPD - END ******

; Flash tables.
; rdj_table, mod9 and mod3 are indexed through ZL only (ZH is loaded once
; with the high byte of the table's byte address), so each of them is
; moved to the next 256-word boundary unless the location counter is
; already on one: the low byte of (label<<1) is then 0.
; Every .db row below holds an even number of bytes.

.CSEG
rdj_table_align:
.org ((low(rdj_table_align) & 0xFF) == 0) ? (rdj_table_align) : ((rdj_table_align & 0xFF00) + 256)
rdj_table:
        ;rotation counts read by 'mix': 8 rounds x 4 mix calls
        ;new constants (Skein 1.2 and newer)
        .db     46, 36, 19, 37
        .db     33, 27, 14, 42
        .db     17, 49, 36, 39
        .db     44,  9, 54, 56
        .db     39, 30, 34, 24
        .db     13, 50, 10, 17
        .db     25, 29, 39, 43
        .db      8, 35, 56, 22

.CSEG
mod9_align:
.org ((low(mod9_align) & 0xFF) == 0) ? (mod9_align) : ((mod9_align & 0xFF00) + 256)
mod9:
        ;SRAM addresses (high, low) of the key words: K0..K8, K0..K8, K0..K7.
        ;Entry n is the address of K(n mod 9); 'addkey' reads entries
        ;round .. round+7.
        .db     high(SRAM_ADD_MEM),low(SRAM_ADD_MEM)
        .db     high(SRAM_ADD_MEM+8),low(SRAM_ADD_MEM+8)
        .db     high(SRAM_ADD_MEM+16),low(SRAM_ADD_MEM+16)
        .db     high(SRAM_ADD_MEM+24),low(SRAM_ADD_MEM+24)
        .db     high(SRAM_ADD_MEM+32),low(SRAM_ADD_MEM+32)
        .db     high(SRAM_ADD_MEM+40),low(SRAM_ADD_MEM+40)
        .db     high(SRAM_ADD_MEM+48),low(SRAM_ADD_MEM+48)
        .db     high(SRAM_ADD_MEM+56),low(SRAM_ADD_MEM+56)
        .db     high(SRAM_ADD_MEM+64),low(SRAM_ADD_MEM+64)

        .db     high(SRAM_ADD_MEM),low(SRAM_ADD_MEM)
        .db     high(SRAM_ADD_MEM+8),low(SRAM_ADD_MEM+8)
        .db     high(SRAM_ADD_MEM+16),low(SRAM_ADD_MEM+16)
        .db     high(SRAM_ADD_MEM+24),low(SRAM_ADD_MEM+24)
        .db     high(SRAM_ADD_MEM+32),low(SRAM_ADD_MEM+32)
        .db     high(SRAM_ADD_MEM+40),low(SRAM_ADD_MEM+40)
        .db     high(SRAM_ADD_MEM+48),low(SRAM_ADD_MEM+48)
        .db     high(SRAM_ADD_MEM+56),low(SRAM_ADD_MEM+56)
        .db     high(SRAM_ADD_MEM+64),low(SRAM_ADD_MEM+64)

        .db     high(SRAM_ADD_MEM),low(SRAM_ADD_MEM)
        .db     high(SRAM_ADD_MEM+8),low(SRAM_ADD_MEM+8)
        .db     high(SRAM_ADD_MEM+16),low(SRAM_ADD_MEM+16)
        .db     high(SRAM_ADD_MEM+24),low(SRAM_ADD_MEM+24)
        .db     high(SRAM_ADD_MEM+32),low(SRAM_ADD_MEM+32)
        .db     high(SRAM_ADD_MEM+40),low(SRAM_ADD_MEM+40)
        .db     high(SRAM_ADD_MEM+48),low(SRAM_ADD_MEM+48)
        .db     high(SRAM_ADD_MEM+56),low(SRAM_ADD_MEM+56)

.CSEG
mod3_align:
.org ((low(mod3_align) & 0xFF) == 0) ? (mod3_align) : ((mod3_align & 0xFF00) + 256)
mod3:
        ;SRAM addresses (high, low) of the tweak words: (T0,T1,T2) x 6, T0, T1.
        ;Entry n is the address of T(n mod 3); 'addkey' reads entries
        ;round and round+1.
        .db     high(SRAM_ADD_MEM+72),low(SRAM_ADD_MEM+72),high(SRAM_ADD_MEM+80),low(SRAM_ADD_MEM+80),high(SRAM_ADD_MEM+88),low(SRAM_ADD_MEM+88)
        .db     high(SRAM_ADD_MEM+72),low(SRAM_ADD_MEM+72),high(SRAM_ADD_MEM+80),low(SRAM_ADD_MEM+80),high(SRAM_ADD_MEM+88),low(SRAM_ADD_MEM+88)
        .db     high(SRAM_ADD_MEM+72),low(SRAM_ADD_MEM+72),high(SRAM_ADD_MEM+80),low(SRAM_ADD_MEM+80),high(SRAM_ADD_MEM+88),low(SRAM_ADD_MEM+88)
        .db     high(SRAM_ADD_MEM+72),low(SRAM_ADD_MEM+72),high(SRAM_ADD_MEM+80),low(SRAM_ADD_MEM+80),high(SRAM_ADD_MEM+88),low(SRAM_ADD_MEM+88)
        .db     high(SRAM_ADD_MEM+72),low(SRAM_ADD_MEM+72),high(SRAM_ADD_MEM+80),low(SRAM_ADD_MEM+80),high(SRAM_ADD_MEM+88),low(SRAM_ADD_MEM+88)
        .db     high(SRAM_ADD_MEM+72),low(SRAM_ADD_MEM+72),high(SRAM_ADD_MEM+80),low(SRAM_ADD_MEM+80),high(SRAM_ADD_MEM+88),low(SRAM_ADD_MEM+88)
        .db     high(SRAM_ADD_MEM+72),low(SRAM_ADD_MEM+72),high(SRAM_ADD_MEM+80),low(SRAM_ADD_MEM+80)

.CSEG
;no need to place it at a special address
C_init:
        ;initial state copied to SRAM_STATE by 'init'; each 64-bit word is
        ;stored least significant byte first:
        ;0xCC D0 44 A1 2F DB 3E 13, 0xE8 35 90 30 1A 79 A9 EB, 0x55 AE A0 61 4F 81 6E 6F, 0x2A 27 67 A4 AE 9B 94 DB
        ;0xEC 06 02 5E 74 DD 76 83, 0xE7 A4 36 CD C4 74 62 51, 0xC3 6F BA F9 39 3A D1 85, 0x3E ED BA 18 33 ED FC 13
        .db     0x13, 0x3E, 0xDB, 0x2F, 0xA1, 0x44, 0xD0, 0xCC
        .db     0xEB, 0xA9, 0x79, 0x1A, 0x30, 0x90, 0x35, 0xE8
        .db     0x6F, 0x6E, 0x81, 0x4F, 0x61, 0xA0, 0xAE, 0x55
        .db     0xDB, 0x94, 0x9B, 0xAE, 0xA4, 0x67, 0x27, 0x2A
        .db     0x83, 0x76, 0xDD, 0x74, 0x5E, 0x02, 0x06, 0xEC
        .db     0x51, 0x62, 0x74, 0xC4, 0xCD, 0x36, 0xA4, 0xE7
        .db     0x85, 0xD1, 0x3A, 0x39, 0xF9, 0xBA, 0x6F, 0xC3
        .db     0x13, 0xFC, 0xED, 0x33, 0x18, 0xBA, 0xED, 0x3E

final_tweak:
        ;tweak T0|T1 for the output transform
        ;(position = 8 ; type = output (63) ; final = 1 ; first = 1)
        .db     0x08,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xff

;*******************************************
;***Threefish END Implementation***
;*******************************************