Anyone interested in retromining?
Several years ago I implemented SHA-256 on my TI-83. I also implemented AES-128-CFB. I gave up on RSA and DSA because of memory constraints, so I never did create a complete strong-crypto suite for TI-83/z80. With all the interest in bitcoin these days, maybe someone will find my SHA-256 code interesting (or I'll revisit it---although I'd personally find asymmetric crypto or a stronger symmetric block mode more interesting).
If the ~800 bytes/sec figure from my comments is reliable, the old z80 should manage roughly 7 hashes/sec, right?
Calculation code:
; sha256up.z80 by timewave0
; 1twzU46whuMER6hhBPCGmdaLw8atyv9c4
;
; this file contains pure, portable z80 code,
; no TI-8x ROM calls
; this subroutine updates the sha256 hash
; in H0 to H7 given a complete block copied
; to the beginning of WBUF
;
; words are kept as 4 bytes, most significant byte first
; (add32 is always handed the address of byte 3 as the LSB);
; a, bc, de, hl and ix are all overwritten along the way
sha256update:
; second part of message schedule (step 1)
; each pass of the step1 loop builds one 4-byte word W(t)
; at ix out of earlier words; the first pass is t = 16
ld ix,WBUF+64-4 ; one word short of W(16); the loop steps ix first
ld c,16-1 ; one short of the first t, for the same reason
step1: inc ix
inc ix ; ix tracks W(t)
inc ix
inc ix
inc c ; counter++
; lowercase sigma_1 of the word at ix-8, i.e. W(t-2):
; (x ror 17) xor (x ror 19) xor (x shr 10)
; the three terms are parked in sigma_temp, byte k of
; term j at sigma_temp+3*k+j, so that the bytes which
; get xored together end up consecutive
ld h,(ix-8+2) ; rotate 16 bits
ld l,(ix-8+3) ; as two bytes
ld d,(ix-8+0) ; 0123 -> dehl
ld e,(ix-8+1)
ld b,1
call rotrb ; one bit right: 17 in total
ld a,h
ld (sigma_temp+0+0),a ; term 0, byte 0
ld a,l
ld (sigma_temp+3+0),a ; term 0, byte 1
ld a,d
ld (sigma_temp+6+0),a ; term 0, byte 2
ld a,e
ld (sigma_temp+9+0),a ; term 0, byte 3
ld b,2
call rotrb ; two bits right: 19 in total
ld a,h
ld (sigma_temp+0+1),a ; term 1, byte 0
ld a,l
ld (sigma_temp+3+1),a ; term 1, byte 1
ld a,d
ld (sigma_temp+6+1),a ; term 1, byte 2
ld a,e
ld (sigma_temp+9+1),a ; term 1, byte 3
ld h,0 ; shift 8 bits
ld l,(ix-8+0) ; as a byte
ld d,(ix-8+1)
ld e,(ix-8+2) ; byte 3 of the word is dropped
ld b,2
call sharb ; two more bits: shift of 10 in total
ld a,h
ld (sigma_temp+0+2),a ; term 2, byte 0
ld a,l
ld (sigma_temp+3+2),a ; term 2, byte 1
ld a,d
ld (sigma_temp+6+2),a ; term 2, byte 2
ld a,e
ld (sigma_temp+9+2),a ; term 2, byte 3
; fold the 12 parked bytes three at a time; each result
; byte is written straight into W(t), which starts out
; holding sigma_1 only
ld hl,sigma_temp
ld a,(hl)
inc hl
xor (hl)
inc hl
xor (hl)
ld (ix+0),a ; W(t) byte 0
inc hl
ld a,(hl)
inc hl
xor (hl)
inc hl
xor (hl)
ld (ix+1),a ; W(t) byte 1
inc hl
ld a,(hl)
inc hl
xor (hl)
inc hl
xor (hl)
ld (ix+2),a ; W(t) byte 2
inc hl
ld a,(hl)
inc hl
xor (hl)
inc hl
xor (hl)
ld (ix+3),a ; W(t) byte 3
; done with lowercase sigma_1
; lowercase sigma_0 of the word at ix-60, i.e. W(t-15):
; (x ror 7) xor (x ror 18) xor (x shr 3)
; each term is parked in sigma_temp (byte k of term j at
; sigma_temp+3*k+j), folded, and the result added into W(t)
ld h,(ix-60+3) ; rotate 8 bits
ld l,(ix-60+0) ; as one byte
ld d,(ix-60+1)
ld e,(ix-60+2)
ld b,1
call slcb ; one bit left: net 7 bits right
ld a,h
ld (sigma_temp+0+0),a ; term 0, byte 0
ld a,l
ld (sigma_temp+3+0),a ; term 0, byte 1
ld a,d
ld (sigma_temp+6+0),a ; term 0, byte 2
ld a,e
ld (sigma_temp+9+0),a ; term 0, byte 3
ld a,e ; a already holds e; reloaded anyway
ld e,d
ld d,l
ld l,h
ld h,a ; hlde -> ehld
; the byte shuffle is 8 more bits right: 15 so far
ld b,3
call rotrb ; three more bits: 18 in total
ld a,h
ld (sigma_temp+0+1),a ; term 1, byte 0
ld a,l
ld (sigma_temp+3+1),a ; term 1, byte 1
ld a,d
ld (sigma_temp+6+1),a ; term 1, byte 2
ld a,e
ld (sigma_temp+9+1),a ; term 1, byte 3
ld h,(ix-60+0) ; reload the word unrotated, 0123 -> hlde
ld l,(ix-60+1)
ld d,(ix-60+2)
ld e,(ix-60+3)
ld b,3
call sharb ; plain shift right by three bits
ld a,h
ld (sigma_temp+0+2),a ; term 2, byte 0
ld a,l
ld (sigma_temp+3+2),a ; term 2, byte 1
ld a,d
ld (sigma_temp+6+2),a ; term 2, byte 2
ld a,e
ld (sigma_temp+9+2),a ; term 2, byte 3
; fold the parked bytes three at a time; the result is
; collected in d (byte 0), e, b and a (byte 3) because
; W(t) already holds sigma_1 and must be added to
ld hl,sigma_temp
ld a,(hl)
inc hl
xor (hl)
inc hl
xor (hl)
ld d,a ; sigma_0 byte 0
inc hl
ld a,(hl)
inc hl
xor (hl)
inc hl
xor (hl)
ld e,a ; sigma_0 byte 1
inc hl
ld a,(hl)
inc hl
xor (hl)
inc hl
xor (hl)
ld b,a ; sigma_0 byte 2
inc hl
ld a,(hl)
inc hl
xor (hl)
inc hl
xor (hl)
; a = sigma_0 byte 3; 32-bit add into W(t), LSB first;
; the ld instructions in between leave the carry alone
add a,(ix+3)
ld (ix+3),a
ld a,(ix+2)
adc a,b
ld (ix+2),a
ld a,(ix+1)
adc a,e
ld (ix+1),a
ld a,(ix+0)
adc a,d
ld (ix+0),a ; W(t) = sigma_1 + sigma_0 so far
; finish W(t) by adding the two plain terms W(t-7) and
; W(t-16), then decide whether another word is needed
push ix ; kept so ix can be restored after the adds
push ix
pop hl
inc hl
inc hl
inc hl ; hl -> LSB of W(t)
ld de,3-28
add ix,de ; ix -> LSB of W(t-7)
call add32 ; += W_t-7
inc hl
inc hl
inc hl ; add32 left hl on the MSB; back to the LSB
ld de,28-64
add ix,de ; ix -> LSB of W(t-16)
call add32 ; += W_t-16
pop ix
; c holds the t that was just built and W(63) is the last
; word of the schedule, so loop again only while c <= 62.
; (comparing against 63 ran one pass too many: it built a
; W(64) that nothing reads, on top of sigma_temp)
ld a,62
cp c ; repeat?
jp nc,step1 ; no borrow: c <= 62
; done with message schedule
; step 2: load the working variables a..h (ABUF..HBUF)
; from the current hash H0..H7; both groups are eight
; contiguous words, so a single block move does it
ld bc,32 ; eight 4-byte words
ld de,ABUF
ld hl,H0
ldir
; end of step 2
; step 3 is the hardest part of the algorithm
; it runs 64 rounds over the working variables ABUF..HBUF;
; the round number lives in memory (step3_t) because
; every register gets reused inside a round
xor a ; start with t = 0
ld (step3_t),a ; counter is step3_t
step_3:
; uppercase sigma_1 of e:
; (e ror 6) xor (e ror 11) xor (e ror 25)
; each term is parked in sigma_temp (byte k of term j at
; sigma_temp+3*k+j) and the fold is written to TEMP1
ld ix,EBUF
ld h,(ix+3) ; load rotated right by one byte (8 bits)
ld l,(ix+0)
ld d,(ix+1)
ld e,(ix+2)
ld b,2
call slcb ; net 6 bits right
ld a,h
ld (sigma_temp+0+0),a ; term 0, byte 0
ld a,l
ld (sigma_temp+3+0),a ; term 0, byte 1
ld a,d
ld (sigma_temp+6+0),a ; term 0, byte 2
ld a,e
ld (sigma_temp+9+0),a ; term 0, byte 3
ld a,e ; hlde -> ehld: 8 more bits right
ld e,d
ld d,l
ld l,h
ld h,a
ld b,3
call slcb ; 6+8-3 bits right
ld a,h
ld (sigma_temp+0+1),a ; term 1, byte 0
ld a,l
ld (sigma_temp+3+1),a ; term 1, byte 1
ld a,d
ld (sigma_temp+6+1),a ; term 1, byte 2
ld a,e
ld (sigma_temp+9+1),a ; term 1, byte 3
ex de,hl ; swap the halves: 16 more bits
ld b,2
call slcb ; 6+8-3+16-2
ld a,h
ld (sigma_temp+0+2),a ; term 2, byte 0
ld a,l
ld (sigma_temp+3+2),a ; term 2, byte 1
ld a,d
ld (sigma_temp+6+2),a ; term 2, byte 2
ld a,e
ld (sigma_temp+9+2),a ; term 2, byte 3
; fold three parked bytes into each byte of TEMP1
ld hl,sigma_temp
ld a,(hl)
inc hl
xor (hl)
inc hl
xor (hl)
ld (TEMP1+0),a
inc hl
ld a,(hl)
inc hl
xor (hl)
inc hl
xor (hl)
ld (TEMP1+1),a
inc hl
ld a,(hl)
inc hl
xor (hl)
inc hl
xor (hl)
ld (TEMP1+2),a
inc hl
ld a,(hl)
inc hl
xor (hl)
inc hl
xor (hl)
ld (TEMP1+3),a
; done with uppercase sigma_1 of e
; accumulate three more terms into TEMP1 (T_1):
; the working variable h, the schedule word W_t and the
; round constant K_t; add32 wants both pointers on the LSB
ld hl,TEMP1+3
ld ix,HBUF+3
call add32 ; T_1 += HBUF
; bc = 4*t, the byte offset of word t in both tables
ld a,(step3_t)
ld l,a
ld h,0
add hl,hl ; *= 2
add hl,hl ; *= 2
ld b,h
ld c,l
ld ix,WBUF+3
add ix,bc
ld hl,TEMP1+3
call add32 ; T_1 += W_t
ld ix,Karray+3
add ix,bc ; add32 leaves bc alone, so the offset is reused
ld hl,TEMP1+3
call add32 ; T_1 += K_t
; use g xor (e and (f xor g)) for Ch
; worked one byte at a time with ix on EBUF, so FBUF is
; at ix+4 and GBUF at ix+8; the result is collected in
; d (byte 0), e, b and a (byte 3)
ld ix,EBUF
ld a,(ix+8+0) ; GBUF
xor (ix+4+0) ; xor FBUF
and (ix+0+0) ; and EBUF
xor (ix+8+0) ; xor GBUF
ld d,a ; Ch byte 0
ld a,(ix+8+1) ; GBUF
xor (ix+4+1) ; xor FBUF
and (ix+0+1) ; and EBUF
xor (ix+8+1) ; xor GBUF
ld e,a ; Ch byte 1
ld a,(ix+8+2) ; GBUF
xor (ix+4+2) ; xor FBUF
and (ix+0+2) ; and EBUF
xor (ix+8+2) ; xor GBUF
ld b,a ; Ch byte 2
ld a,(ix+8+3) ; GBUF
xor (ix+4+3) ; xor FBUF
and (ix+0+3) ; and EBUF
xor (ix+8+3) ; xor GBUF
; a = Ch byte 3; TEMP1 sits 20 bytes below EBUF, which is
; why these variables must keep their relative positions
add a,(ix-20+3) ; LSB first, carry rippling upward
ld (ix-20+3),a
ld a,(ix-20+2)
adc a,b
ld (ix-20+2),a
ld a,(ix-20+1)
adc a,e
ld (ix-20+1),a
ld a,(ix-20+0)
adc a,d
ld (ix-20+0),a ; TEMP1 += Ch
; uppercase sigma_0 of a:
; (a ror 2) xor (a ror 13) xor (a ror 22)
; each term is parked in sigma_temp (byte k of term j at
; sigma_temp+3*k+j) and the fold is written to TEMP2
ld ix,ABUF
ld h,(ix+0)
ld l,(ix+1)
ld d,(ix+2) ; 0123 -> hlde
ld e,(ix+3)
ld b,2
call rotrb ; two bits right
ld a,h
ld (sigma_temp+0+0),a ; term 0, byte 0
ld a,l
ld (sigma_temp+3+0),a ; term 0, byte 1
ld a,d
ld (sigma_temp+6+0),a ; term 0, byte 2
ld a,e
ld (sigma_temp+9+0),a ; term 0, byte 3
ld a,e ; hlde -> ehld: 8 more bits right
ld e,d
ld d,l
ld l,h
ld h,a
ld b,3
call rotrb ; 2+8+3
ld a,h
ld (sigma_temp+0+1),a ; term 1, byte 0
ld a,l
ld (sigma_temp+3+1),a ; term 1, byte 1
ld a,d
ld (sigma_temp+6+1),a ; term 1, byte 2
ld a,e
ld (sigma_temp+9+1),a ; term 1, byte 3
ld a,e ; hlde -> ehld again: 8 more bits right
ld e,d
ld d,l
ld l,h
ld h,a
ld b,1
call rotrb ; 2+8+3+8+1
ld a,h
ld (sigma_temp+0+2),a ; term 2, byte 0
ld a,l
ld (sigma_temp+3+2),a ; term 2, byte 1
ld a,d
ld (sigma_temp+6+2),a ; term 2, byte 2
ld a,e
ld (sigma_temp+9+2),a ; term 2, byte 3
; fold three parked bytes into each byte of TEMP2
ld hl,sigma_temp
ld a,(hl)
inc hl
xor (hl)
inc hl
xor (hl)
ld (TEMP2+0),a
inc hl
ld a,(hl)
inc hl
xor (hl)
inc hl
xor (hl)
ld (TEMP2+1),a
inc hl
ld a,(hl)
inc hl
xor (hl)
inc hl
xor (hl)
ld (TEMP2+2),a
inc hl
ld a,(hl)
inc hl
xor (hl)
inc hl
xor (hl)
ld (TEMP2+3),a
; done with uppercase sigma_0 of a
; use (a and b) or (c and (a or b)) for Maj
; ix still points to ABUF
; so BBUF is at ix+4 and CBUF at ix+8; each byte is built
; in two halves, the first half waiting in the register
; that ends up holding the finished byte
ld a,(ix+0+0) ; ABUF
or (ix+4+0) ; or BBUF
and (ix+8+0) ; and CBUF
ld d,a ; c and (a or b)
ld a,(ix+0+0) ; ABUF
and (ix+4+0) ; and BBUF
or d
ld d,a ; Maj byte 0
ld a,(ix+0+1) ; ABUF
or (ix+4+1) ; or BBUF
and (ix+8+1) ; and CBUF
ld e,a
ld a,(ix+0+1) ; ABUF
and (ix+4+1) ; and BBUF
or e
ld e,a ; Maj byte 1
ld a,(ix+0+2) ; ABUF
or (ix+4+2) ; or BBUF
and (ix+8+2) ; and CBUF
ld b,a
ld a,(ix+0+2) ; ABUF
and (ix+4+2) ; and BBUF
or b
ld b,a ; Maj byte 2
ld a,(ix+0+3) ; ABUF
or (ix+4+3) ; or BBUF
and (ix+8+3) ; and CBUF
ld c,a
ld a,(ix+0+3) ; ABUF
and (ix+4+3) ; and BBUF
or c
; a = Maj byte 3; TEMP2 sits 32 bytes above ABUF, which is
; why these variables must keep their relative positions
add a,(ix+32+3) ; LSB first, carry rippling upward
ld (ix+32+3),a
ld a,(ix+32+2)
adc a,b
ld (ix+32+2),a
ld a,(ix+32+1)
adc a,e
ld (ix+32+1),a
ld a,(ix+32+0)
adc a,d
ld (ix+32+0),a ; TEMP2 += Maj
; slide TEMP1..GBUF up by one word onto ABUF..HBUF; the
; move runs from the top down so nothing is overwritten
; before it has been read
ld bc,32 ; B=A, A=T1
ld hl,GBUF+3 ; E=D, D=C, C=B,
ld de,HBUF+3 ; H=G, G=F, F=E,
lddr
; patch up the two slots that are more than a plain move
ld ix,TEMP1+3
ld hl,EBUF+3
call add32 ; E += T1
ld ix,TEMP2+3
ld hl,ABUF+3
call add32 ; A += T2
; next round
ld hl,step3_t
inc (hl)
ld a,(hl)
cp 64 ; loop 64 times
jp c,step_3
; part 4: add the working variables a..h into the hash
; words H0..H7, one 32-bit addition per word; add32 gets
; the LSB address of the source in ix and of the
; destination in hl
ld ix,HBUF+3
ld hl,H7+3
call add32 ; H7 += h
ld ix,GBUF+3
ld hl,H6+3
call add32 ; H6 += g
ld ix,FBUF+3
ld hl,H5+3
call add32 ; H5 += f
ld ix,EBUF+3
ld hl,H4+3
call add32 ; H4 += e
ld ix,DBUF+3
ld hl,H3+3
call add32 ; H3 += d
ld ix,CBUF+3
ld hl,H2+3
call add32 ; H2 += c
ld ix,BBUF+3
ld hl,H1+3
call add32 ; H1 += b
ld ix,ABUF+3
ld hl,H0+3
call add32 ; H0 += a
; end of part 4
ret
; end of sha256update subroutine
; rotrb: rotate the 32-bit value in hlde right, b times
; in:  hlde = value (h is the most significant byte)
;      b = number of single-bit steps
; out: hlde = rotated value, b = 0
rotrb:
rotrb_bit: srl h ; a zero enters at the top for now
rr l
rr d
rr e ; carry = the bit that fell off the bottom
jr nc,rotrb_next
set 7,h ; put that bit back in at the top
rotrb_next: djnz rotrb_bit
ret
; a is not touched
; sharb: shift the 32-bit value in hlde right, b times,
; filling with zeros from the top
; in:  hlde = value (h is the most significant byte)
;      b = number of single-bit steps
; out: hlde = shifted value, b = 0
sharb:
sharb_bit: srl h ; a zero enters at the top
rr l
rr d
rr e ; the lowest bit is discarded
djnz sharb_bit
ret
; a is not touched
; slcb: rotate the 32-bit value in hlde left, b times
; in:  hlde = value (h is the most significant byte)
;      b = number of single-bit steps
; out: hlde = rotated value, b = 0
slcb:
slcb_bit: sla e ; a zero enters at the bottom for now
rl d
rl l
rl h ; carry = the bit that fell off the top
jr nc,slcb_next
inc e ; bit 0 is clear here, so this just sets it
slcb_next: djnz slcb_bit
ret
; a is not touched
; add32: 32-bit addition in memory, destination += source
; in:  hl -> least significant byte of the destination
;      ix -> least significant byte of the source
;      (more significant bytes sit at lower addresses)
; out: the destination holds the sum, hl has moved down
;      by 3 onto the most significant byte, a is clobbered
add32: ld a,(ix+0) ; LSB
add a,(hl)
ld (hl),a
dec hl ; 16-bit dec: the carry survives
ld a,(ix-1)
adc a,(hl) ; add with carry
ld (hl),a
dec hl
ld a,(ix-2)
adc a,(hl)
ld (hl),a
dec hl
ld a,(ix-3) ; MSB
adc a,(hl)
ld (hl),a
ret
; bc, de and ix are left alone
; three bytes copied into OP1 to name the variable that is
; looked up; the first matches strngobj, the rest are
; presumably the TI-83 name bytes for Str1 -- TODO confirm
Str1: .db $04,$AA,$00
; initial values
; eight 32-bit words, most significant byte first, copied
; to H0..H7 before hashing starts
H0init: .db $6a, $09, $e6, $67
H1init: .db $bb, $67, $ae, $85
H2init: .db $3c, $6e, $f3, $72
H3init: .db $a5, $4f, $f5, $3a
H4init: .db $51, $0e, $52, $7f
H5init: .db $9b, $05, $68, $8c
H6init: .db $1f, $83, $d9, $ab
H7init: .db $5b, $e0, $cd, $19
; constants
; the 64 round constants K_0..K_63, one 32-bit word each,
; most significant byte first, two words per line; step 3
; indexes this table with 4*t
Karray: .db $42,$8a,$2f,$98,$71,$37,$44,$91,
.db $b5,$c0,$fb,$cf,$e9,$b5,$db,$a5,
.db $39,$56,$c2,$5b,$59,$f1,$11,$f1,
.db $92,$3f,$82,$a4,$ab,$1c,$5e,$d5,
.db $d8,$07,$aa,$98,$12,$83,$5b,$01,
.db $24,$31,$85,$be,$55,$0c,$7d,$c3,
.db $72,$be,$5d,$74,$80,$de,$b1,$fe,
.db $9b,$dc,$06,$a7,$c1,$9b,$f1,$74,
.db $e4,$9b,$69,$c1,$ef,$be,$47,$86,
.db $0f,$c1,$9d,$c6,$24,$0c,$a1,$cc,
.db $2d,$e9,$2c,$6f,$4a,$74,$84,$aa,
.db $5c,$b0,$a9,$dc,$76,$f9,$88,$da,
.db $98,$3e,$51,$52,$a8,$31,$c6,$6d,
.db $b0,$03,$27,$c8,$bf,$59,$7f,$c7,
.db $c6,$e0,$0b,$f3,$d5,$a7,$91,$47,
.db $06,$ca,$63,$51,$14,$29,$29,$67,
.db $27,$b7,$0a,$85,$2e,$1b,$21,$38,
.db $4d,$2c,$6d,$fc,$53,$38,$0d,$13,
.db $65,$0a,$73,$54,$76,$6a,$0a,$bb,
.db $81,$c2,$c9,$2e,$92,$72,$2c,$85,
.db $a2,$bf,$e8,$a1,$a8,$1a,$66,$4b,
.db $c2,$4b,$8b,$70,$c7,$6c,$51,$a3,
.db $d1,$92,$e8,$19,$d6,$99,$06,$24,
.db $f4,$0e,$35,$85,$10,$6a,$a0,$70,
.db $19,$a4,$c1,$16,$1e,$37,$6c,$08,
.db $27,$48,$77,$4c,$34,$b0,$bc,$b5,
.db $39,$1c,$0c,$b3,$4e,$d8,$aa,$4a,
.db $5b,$9c,$ca,$4f,$68,$2e,$6f,$f3,
.db $74,$8f,$82,$ee,$78,$a5,$63,$6f,
.db $84,$c8,$78,$14,$8c,$c7,$02,$08,
.db $90,$be,$ff,$fa,$a4,$50,$6c,$eb,
.db $be,$f9,$a3,$f7,$c6,$71,$78,$f2
Interface code:
; sha256.z80 by timewave0
; 1twzU46whuMER6hhBPCGmdaLw8atyv9c4
;
; see FIPS PUB 180-2
;
; ~800 bytes/sec for very large
; programs, as measured on Vti
;
; verified for random byte patterns of lengths:
; 0-10, 50-70, 255-257, 16383-16385, 19000
.LIST
; ROM entry points used by this file (reached with call,
; or with jp for the two _err* handlers); the addresses
; are presumably those of the TI-83 ROM -- TODO confirm
; against the system include file
_chkfindsym .equ $442A
_findsym .equ $442E
_zeroop1 .equ $428E
_errundefined .equ $467B
_errsyntax .equ $466C
_createstrng .equ $4472
_delvar .equ $44AA
; OP1 receives the variable name before each lookup
OP1 .equ $8039
; type codes: the type returned by _findsym is compared
; with strngobj, and progobj is written in front of the
; program name
progobj .equ $05
strngobj .equ $04
; working storage, laid out back to back from block up
; to size; clearvars wipes it as one run
block .equ $8265 ; magic number (base address of the storage)
extra_bytes .equ block+2 ; block is a 16-bit count of whole blocks
H0 .equ extra_bytes+1 ; extra_bytes is a single byte
H1 .equ H0+4
H2 .equ H1+4
H3 .equ H2+4
H4 .equ H3+4
H5 .equ H4+4
H6 .equ H5+4
H7 .equ H6+4
; start of variables that can't be moved in memory
; (the hash code reaches TEMP1 as EBUF-20 and TEMP2 as
; ABUF+32, and moves TEMP1..GBUF onto ABUF..HBUF in one
; block move, so their relative order is fixed)
TEMP1 .equ H7+4
ABUF .equ TEMP1+4
BBUF .equ ABUF+4
CBUF .equ BBUF+4
DBUF .equ CBUF+4
EBUF .equ DBUF+4
FBUF .equ EBUF+4
GBUF .equ FBUF+4
HBUF .equ GBUF+4
TEMP2 .equ HBUF+4
; end of variables that can't be moved in memory
WBUF .equ TEMP2+4 ; message block, then the expanded schedule
sigma_temp .equ WBUF+(4*64) ; WBUF is 64 words of 4 bytes
dataptr .equ sigma_temp+(3*4) ; sigma_temp is 12 bytes: 3 terms of 4
step3_t .equ dataptr+2 ; dataptr is a 16-bit pointer
size .equ step3_t+1 ; step3_t is one byte; size is 16 bits
.org $9327 ; magic number
; program entry: Str1 is expected to hold the NAME of the
; program to hash; look up Str1, copy that name into OP1
; as a program name, then look the program up
call clearvars ; clear variables
; based on code from squish by Pat Milheron
call _zeroop1
ld hl,Str1
ld de,OP1
ld bc,3
ldir
call _findsym ; lookup Str1
jp c,_errundefined
and $1F
cp strngobj ; is it a string?
jp nz,_errsyntax
ld hl,OP1
ld (hl),progobj
inc hl
; (de) is the 16-bit length of the string; it is used as
; the ldir count below, so reject anything that cannot be
; a program name before copying
ld a,(de) ; low byte of the length
ld c,a
inc de
ld a,(de) ; high byte of the length
inc de ; (de) is name ptr
or a
jp nz,_errsyntax ; 256 or more characters
ld a,c
or a
jp z,_errsyntax ; empty: ldir with bc = 0 would copy 64K
cp 8+1
jp nc,_errsyntax ; more than 8 characters would overrun OP1
ld b,0
ex de,hl
ldir ; name to op1
call _chkfindsym ; size ptr -> de
jp c,_errundefined
; end of squish-based code
; de points at the program's 16-bit size, with the data
; right behind it; remember both, split the size into
; whole 64-byte blocks plus a remainder, and load the
; initial hash value
ex de,hl
ld e,(hl) ; LSB -> e
inc hl
ld d,(hl) ; MSB -> d
inc hl
ld (size),de
ld (dataptr),hl ; first data byte
ld b,6
xor a
div64: srl d
rr e
rr a ; save remainder
djnz div64
; de = size / 64; the six bits shifted out sit in the top
; of a, so slide them down to get size mod 64
srl a
srl a
ld (block),de ; # of _whole_ blocks
ld (extra_bytes),a
ld hl,H0init
ld de,H0
ld bc,8*4 ; H0init..H7init is eight 4-byte words, no more
ldir
do_hash: ; main loop: one pass per whole 64-byte block
ld hl,(block)
ld a,h
or l
jr z,do_padding ; no whole blocks left
dec hl
ld (block),hl ; block--
ld bc,64
ld de,WBUF
ld hl,(dataptr)
ldir ; copy block
ld (dataptr),hl ; dataptr += 64
call sha256update
jr do_hash
do_padding:
; build the final block(s) in WBUF: the leftover message
; bytes, a $80 marker byte, zero fill, and the message
; length in bits in the last bytes of the block
; register a is threaded through this whole section, so
; the instruction order matters
ld a,(extra_bytes) ; leftover byte count, 0..63
ld b,0
ld c,a
ld hl,(dataptr)
ld de,WBUF
cp 0
jr z,no_cpy ; bc = 0 must not reach ldir
ldir
no_cpy: ex de,hl ; hl -> first free byte of WBUF
ld (hl),$80
inc hl
cp 63 ; a is still the leftover count
jr z,need_another_block ; the marker filled the block
load_0s:
; in: a = bytes already used before the marker (0 when
; entered from need_another_block), hl -> where to fill
neg
add a,63 ; 64-1-extra_bytes
ld b,a
zero_fill:
ld (hl),0
inc hl
djnz zero_fill
; a still holds the number of zero bytes just written
cp 8 ; room for length?
jr c,need_another_block
; since the message length can't possibly be more than
; 16 bits, I'm safe reusing the 32-bit circular shift
; from the hash code to multiply by 8
ld de,(size)
ld hl,$0000 ; ensure 0s shift in
ld b,3
call slcb ; hlde = size * 8
ld a,e
ld (WBUF+63),a ; lowest byte of the bit length
ld a,d
ld (WBUF+62),a
ld a,l
ld (WBUF+61),a ; h is not stored; the bytes below stay zero
call sha256update
jr done
need_another_block:
; hash what has been built so far, then start a fresh
; block: a = 0 makes load_0s clear WBUF+0..WBUF+62, and
; WBUF+63 is overwritten by the length afterwards
call sha256update
xor a
ld hl,WBUF
jr load_0s ; do another block
; done: replace Str1 with the hash as 64 hex characters,
; then wipe the working storage and return
done: call _zeroop1
ld hl,Str1
ld de,OP1
ld bc,3
ldir
call _chkfindsym
call _delvar ; drop the old Str1 (it held the name)
ld hl,2*256/8 ; two characters per hash byte
call _createstrng ; recreate Str1
inc de ; store_hex steps de before every write
ld hl,H0
ld b,256/8 ; hash bytes to convert
store: ld a,(hl)
rrca
rrca
rrca
rrca
and $0F
call store_hex ; high nibble
ld a,(hl) ; store_hex leaves hl alone
and $0F
call store_hex ; low nibble
inc hl
djnz store
call clearvars
ret
; end of done subroutine
.include "sha256up.z80" ; the fun stuff
; subroutine to clear all variables
; zeroes every byte from block up to and including the
; second byte of size by writing one zero and letting
; ldir smear it forward through the overlapping range
; clobbers a, bc, de, hl
clearvars:
ld hl,block
xor a
ld (hl),a ; seed: first byte = 0
ld de,block+1
; bytes still to clear after the seed; size is a 16-bit
; variable, so the run has to reach size+1
ld bc,size+1-block
ldir
ret
; end of clearvars subroutine
; store_hex: append one hex digit to the output string
; in:  a = value 0..15, de -> the last byte written
; out: de advanced by one, (de) = '0'..'9' or 'A'..'F',
;      a = the character stored
store_hex:
inc de
cp $A
jr c,sh_digit
add a,'A'-$A ; 10..15 -> 'A'..'F'
jr sh_emit
sh_digit: add a,'0' ; 0..9 -> '0'..'9'
sh_emit: ld (de),a
ret
; end of hex character store subroutine
.end