-
Notifications
You must be signed in to change notification settings - Fork 13
Expand file tree
/
Copy pathxfma.z80
More file actions
241 lines (223 loc) · 3.42 KB
/
Copy pathxfma.z80
File metadata and controls
241 lines (223 loc) · 3.42 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
#ifndef included_xfma
#define included_xfma
#include "pushpop.z80"
#include "mov.z80"
#include "routines/add64.z80"
#include "routines/sub64.z80"
#include "routines/sla64.z80"
#include "routines/rr64.z80"
#include "routines/srl64.z80"
#include "routines/srl64_x4.z80"
#include "routines/swapbuf.z80"
#include "xmul.z80"
xfma:
;Fused Multiply-Add
;Performs x*y+t -> z
;Inputs:
;  HL points to x
;  DE points to y
;  BC points to z (receives the result)
;  IX points to t
;Working storage: var_z-19 .. var_z+17 is read and written by this routine
;and by fma_add.
;NOTE(review): pushpop lives in pushpop.z80; presumably it arranges for the
;caller's registers to be restored when this routine returns -- confirm.
call pushpop
push bc ;save the output location
push ix ;save the location of what to add
;First multiply x and y, but keep full 128-bits precision
;xmul is given var_z+8 as its output pointer. fma_add afterwards treats
;var_z..var_z+15 as the product mantissa and var_z+16 as its exponent/sign
;word, so xmul presumably leaves the low half of the product at var_z -- confirm.
ld bc,var_z+8
call xmul
;Now we need to perform a high-precision addition
;First we move the float to scrap
;HL = pointer to t (the IX value pushed above); destination is var_z-10.
;NOTE(review): mov10 presumably copies 10 bytes from (HL) to (DE) -- confirm.
pop hl
ld de,var_z-10
call mov10
;Now do a special add
call fma_add
;Now return the result
;DE = caller's z pointer (the BC value pushed above). The tail jump lets
;mov10's own ret serve as the return from xfma.
pop de
ld hl,var_z+8
jp mov10
fma_add:
;Wide add used by xfma.
;On entry:
;  var_z-10..var_z-3  : 8-byte mantissa of the addend (copied there by xfma)
;  var_z-2 ..var_z-1  : addend exponent word, bit 7 of the high byte = sign
;  var_z   ..var_z+15 : 16-byte mantissa of the product
;  var_z+16..var_z+17 : product exponent word, bit 7 of the high byte = sign
;On exit the sum is at var_z (mantissa) / var_z+16 (exponent word).
;Also writes var_z-19..var_z-3 as scratch.
;Clobbers A, BC, DE, HL and flags, plus whatever the called helpers clobber.

; Zero out the bottom 8 bytes of the addend (var_z-18..var_z-11), widening
; its mantissa to the same 16 bytes as the product.
  ld hl,0
  ld (var_z-12),hl
  ld (var_z-14),hl
  ld (var_z-16),hl
  ld (var_z-18),hl
; Check for special cases: an exponent whose low 15 bits are all zero.
  ld de,(var_z-2)         ; DE = addend exponent word
  ld hl,(var_z+16)        ; HL = product exponent word
  res 7,h                 ; strip the sign bits
  res 7,d
  ld a,h
  or l
  jp z,caseadd_fma        ; product is special
  ld a,d
  or e
  jp z,caseadd1_fma       ; addend is special
; Now make sure var_z has the bigger exponent
; (carry is clear here thanks to the `or e` above, so sbc is a plain subtract)
  sbc hl,de
  jr nc,+_
; HL is negative: replace it with its absolute value (16-bit negate)
  xor a
  sub l
  ld l,a
  sbc a,a
  sub h
  ld h,a
  push hl
; We need to swap the two 18-byte records (mantissa + exponent word).
  ld de,var_z-18
  ld hl,var_z
  ld bc,18
  call swapbuf
  pop hl
_:
; HL = exponent difference. If it is 130 or more, the smaller operand
; cannot affect the 128-bit mantissa, so the larger one (in var_z) is the result.
  ld a,h
  or a
  ret nz
  ld a,l
  cp 130
  ret nc
;Now we need to shift the value at var_z-18 down by A bits.
  or a
  jr z,add_shifted_fma
; Bits 0..2 of A select a 1-, 2- and 4-bit shift. The helpers preserve A but
; not carry, so the rra's may rotate junk into the top bits of A; the
; `and $1F` below discards it and leaves A = whole bytes to shift (0..16).
  rra \ call c,srl_var_z_m_18
  rra \ call c,srl2_var_z_m_18
  rra \ call c,srl4_var_z_m_18
  and $1F
; HL = var_z-19+A, built so that B stays 0 for the ldir byte count.
  ld l,a
  ld bc,255&(var_z-19)
; The high byte must come from the same address as the low byte held in C.
; Using the high byte of var_z-18 here would make HL 256 too large whenever
; var_z-18 sits on a page boundary.
  ld h,(var_z-19)>>8
  add hl,bc
; A = 17-A: number of bytes to move down (BC, since B is 0)
  sub 18
  cpl
  ld c,a
  ld de,var_z-19
  ldir
; A = 17-(bytes moved) = the byte-shift count again; zero that many bytes
; starting at DE, which now points just past the moved data.
  ld c,a
  ld a,17
  sub c
  jr z,add_shifted_fma
  ld b,a
  xor a
  ld (de),a \ inc de \ djnz $-2
add_shifted_fma:
;If the signs match, then just add
;If they differ, then subtract
  ld hl,var_z-1
  ld a,(var_z+17)
  xor (hl)
  jp p,xfma_add
; Subtract the mantissas, low 8 bytes first and then the high 8 with borrow.
; NOTE(review): assumes sub64/sbc64 compute (DE) - (HL) into (DE) and leave
; HL/DE on the last byte processed -- confirm against the routines.
  ld hl,var_z-18
  ld de,var_z
  call sub64
  inc hl
  inc de
  call sbc64
  jr nc,+_
;Borrow out: negate the mantissa, invert the sign
; Invert the sign (DE+2 is taken to be var_z+17, the sign byte)
  inc de
  inc de
  ld a,(de)
  xor 80h
  ld (de),a
; Negate the mantissa: 16 iterations (B) of 0 (C) minus byte minus borrow.
; `xor 80h` left carry clear, and neither inc hl nor djnz touches flags, so
; the borrow chains correctly from byte to byte.
  ld hl,var_z
  ld bc,$1000
  ld a,c \ sbc a,(hl) \ ld (hl),a \ inc hl
  djnz $-4
_:
; Sign flag set means the top mantissa bit is 1, i.e. already normalized.
; On the negate path S comes from the last `sbc a,(hl)`; on the other path
; it is whatever sbc64 left (presumably from the top byte -- confirm).
  ret m
;need to shift up until top bit is 1. Should be at most 1 shift, I think
  ld de,(var_z+16)
; Make sure that the mantissa isn't zero: OR all 16 bytes together
  ld hl,var_z
  ld b,15
  ld a,(hl)
_:
  inc hl \ or (hl) \ jr nz,+_
  djnz -_
; Mantissa is all zero (A = 0, HL = var_z+15): clear the exponent word too
  inc hl
  ld (hl),a
  inc hl
  ld (hl),a
  ret
_:
; Normalization loop: decrement the exponent, then shift the mantissa left.
  dec de
  ld a,d
  and $7F
  or e
  jp z,add_zero_fma       ; exponent bits reached zero
  ld hl,var_z
  call sla64
  inc hl
  call rl64
; Loop while the top bit is still 0.
; NOTE(review): relies on rl64 leaving S = top bit of the mantissa -- confirm.
  jp p,-_
  ld (var_z+16),de
  ret
xfma_add:
;Same-sign path of fma_add: adds the 16-byte mantissa at var_z-18 into the
;one at var_z, then fixes up the exponent if the add carried out.
;add the mantissas
;(low 8 bytes with add64, then the high 8 bytes with adc64)
ld hl,var_z-18
ld de,var_z
call add64
inc hl
inc de
call adc64
;No carry out of the top byte: nothing more to do.
ret nc
;Carry out: increment the exponent and rotate the carry back into the mantissa.
;NOTE(review): assumes adc64 leaves DE on var_z+15, so that HL = var_z+16
;(exponent low byte) after the next two instructions -- confirm.
ex de,hl
inc hl
;inc (hl) does not touch the carry flag, so carry is still set at +_.
inc (hl) \ jr nz,+_
;Low exponent byte wrapped to 0: carry into the high byte (bit 7 is the sign).
inc hl
inc (hl)
ld a,(hl)
dec hl
;If the 15 exponent bits are now all zero, the exponent overflowed.
and $7F
jr z,add_inf_fma
;`and` cleared carry; set it again so a 1 is rotated into the top bit.
scf
_:
dec hl
;Tail call: rr64's ret returns to fma_add's caller.
;NOTE(review): a single rr64 is issued with HL = var_z+15, so presumably only
;the upper 8 mantissa bytes are shifted -- confirm against rr64.
jp rr64
srl4_var_z_m_18:
;Helper for fma_add, called when bit 2 of the shift count is set.
;Calls srl64_4 with HL = var_z-3 and then rrd8 with HL decremented once.
;NOTE(review): judging by the names and the call site this shifts the
;16-byte value at var_z-18..var_z-3 right by 4 bits -- confirm against
;srl64_4 and rrd8.
;A is parked in B across the calls and restored before returning.
;NOTE(review): this assumes neither helper modifies B -- confirm.
;Flags on return are whatever rrd8 leaves.
ld hl,var_z-3
ld b,a
call srl64_4
dec hl
call rrd8
ld a,b
ret
srl2_var_z_m_18:
;Helper for fma_add, called when bit 1 of the shift count is set.
;Runs srl_var_z_m_18 twice: once through the call below, and once by
;falling through into it.
call srl_var_z_m_18
srl_var_z_m_18:
;Helper for fma_add, called when bit 0 of the shift count is set.
;Calls srl64 with HL = var_z-3 and then rr64 with HL decremented once.
;NOTE(review): judging by the names and the call site this shifts the
;16-byte value at var_z-18..var_z-3 right by 1 bit, with rr64 carrying the
;bit shifted out of the upper half into the lower half -- confirm.
;A is parked in B across the calls and restored before returning.
;NOTE(review): this assumes neither helper modifies B -- confirm.
;Flags on return are whatever rr64 leaves.
ld hl,var_z-3
ld b,a
call srl64
dec hl
call rr64
ld a,b
ret
caseadd_fma:
;Reached from fma_add when the low 15 bits of the exponent word at var_z+16
;are all zero, which fma_add treats as a special (non-ordinary) value.
;Intended results:
;zero+x => x for all x
;NaN +x => NaN for all x
;inf-inf=> NaN
;inf +x => inf, x != inf
;NOTE(review): this is currently a bare ret, so var_z is returned unchanged
;and none of the cases above are distinguished. In particular "zero+x => x"
;is not honoured: the value staged at var_z-10 is never copied to var_z+8.
ret
caseadd1_fma:
;Reached from fma_add when the value at var_z is ordinary but the low 15 bits
;of the addend's exponent word (var_z-2) are all zero.
;Intended results:
;x+zero => x
;x+inf => inf
;x+NaN => NaN
;NOTE(review): this is currently a bare ret, so var_z is returned unchanged.
;That matches "x+zero => x" only; the inf and NaN cases are not implemented.
ret
add_zero_fma:
;Store the result used when normalization runs the exponent down to zero:
;the exponent/sign word at var_z+16 and the top mantissa byte at var_z+15
;are both cleared.
;Exit: A = 0, HL = 0, flags as left by `xor a`.
  ld hl,0
  ld (var_z+16),hl        ; exponent word = 0
  xor a
  ld (var_z+15),a         ; top mantissa byte = 0
  ret
add_inf_fma:
;Store the result used when the exponent overflows in xfma_add:
;top mantissa byte (var_z+15) = 0, exponent/sign word (var_z+16) = $FFFF.
;Exit: A = $FF, HL = $FFFF, flags as left by `dec a` (carry clear from `xor a`).
  xor a
  ld (var_z+15),a         ; top mantissa byte = 0
  ld hl,$FFFF
  ld (var_z+16),hl        ; exponent word = all ones
  dec a                   ; A = $FF, same exit state as before
  ret
#endif