; z80float/extended/xfma.z80 (Zeda/z80float, master)
#ifndef included_xfma
#define included_xfma
#include "pushpop.z80"
#include "mov.z80"
#include "routines/add64.z80"
#include "routines/sub64.z80"
#include "routines/sla64.z80"
#include "routines/rr64.z80"
#include "routines/srl64.z80"
#include "routines/srl64_x4.z80"
#include "routines/swapbuf.z80"
#include "xmul.z80"


xfma:
;Fused Multiply-Add:  z = x*y + t
;Inputs:
;   HL -> x
;   DE -> y
;   IX -> t
;   BC -> z (destination for the result)
;pushpop is called before anything else; presumably it arranges for the
;caller's registers to be preserved (see pushpop.z80).
  call pushpop
  push bc             ;destination pointer, needed for the final copy
  push ix             ;pointer to t, needed once the multiply is done

;x*y is written to var_z+8. The code that follows treats var_z..var_z+7 as
;the low half of a 128-bit product mantissa -- TODO confirm that xmul
;leaves it there.
  ld bc,var_z+8
  call xmul

;Place t at var_z-10, i.e. in the 10 bytes directly below var_z.
;(mov10 is presumably a 10-byte copy from HL to DE.)
  ld de,var_z-10
  pop hl              ;HL -> t
  call mov10

;Wide add of the addend and the product; the sum is left in var_z.
  call fma_add

;Copy the top 10 bytes of the sum to the caller's z and return through mov10.
  ld hl,var_z+8
  pop de              ;DE -> z
  jp mov10

fma_add:
;Adds the addend to the product and leaves the sum in var_z.
;Two 18-byte operands are used, each laid out as:
;   bytes 0..15  : mantissa, least-significant byte first
;   bytes 16..17 : exponent word; bit 7 of byte 17 is the sign
;Inputs:
;   var_z-10..var_z-1 : the addend (8-byte mantissa + exponent word)
;   var_z..var_z+17   : the product (presumably left there by xmul)
;Output:
;   var_z..var_z+17   : the sum; the caller reads it from var_z+8
;The low mantissa bytes of the addend (var_z-18..var_z-11) are cleared here.
;No rounding is performed; bits shifted out of the smaller operand are dropped.
;NOTE(review): the byte shift below also writes var_z-19, one byte below the
;addend buffer -- confirm that byte is free scratch.

; Zero out the bottom 8 bytes of the addend
  ld hl,0
  ld (var_z-12),hl
  ld (var_z-14),hl
  ld (var_z-16),hl
  ld (var_z-18),hl


; Check for special cases: an exponent field of 0 (sign bit masked off)
; is routed to the case handlers.
  ld de,(var_z-2)     ;DE = addend exponent word
  ld hl,(var_z+16)    ;HL = product exponent word
  res 7,h
  res 7,d

  ld a,h
  or l
  jp z,caseadd_fma
  ld a,d
  or e                ;also leaves carry clear for the sbc below
  jp z,caseadd1_fma

; Now make sure var_z has the bigger exponent
  sbc hl,de           ;HL = product exponent - addend exponent
  jr nc,+_
;The addend has the larger exponent: HL = -HL, then swap the two operands.
  xor a
  sub l
  ld l,a
  sbc a,a
  sub h
  ld h,a
  push hl
; We need to swap.
  ld de,var_z-18
  ld hl,var_z
  ld bc,18
  call swapbuf
  pop hl
_:
;HL = exponent difference. At 130 or more the operand at var_z-18 is
;ignored and var_z is returned as-is.
  ld a,h
  or a
  ret nz
  ld a,l
  cp 130
  ret nc
;Now we need to shift down by A bits.
  or a                ;carry is clear going into the first rra
  jr z,add_shifted_fma
;Bits 0..2 of the count select the 1-, 2- and 4-bit shifts. The helpers
;return with A restored; whatever carry they leave is rotated into the
;top bits of A and masked off below.
  rra \ call c,srl_var_z_m_18
  rra \ call c,srl2_var_z_m_18
  rra \ call c,srl4_var_z_m_18
  and $1F             ;A = number of whole bytes to shift (0..16)
;Source = var_z-19+A. This is formed with a full 16-bit add so that it is
;correct even when var_z-19 and var_z-18 have different high bytes.
  ld c,a
  ld b,0
  ld hl,var_z-19
  add hl,bc
;Copy 17-A bytes down to var_z-19. Starting one byte below the mantissa
;keeps the count at 1 or more (LDIR with BC=0 would copy 64K).
  sub 18
  cpl                 ;A = 17 - (byte count)
  ld c,a
  ld de,var_z-19
  ldir
;Zero the bytes vacated at the top of the mantissa.
  ld c,a
  ld a,17
  sub c               ;A = number of bytes to clear
  jr z,add_shifted_fma
  ld b,a
  xor a
  ld (de),a \ inc de \ djnz $-2
add_shifted_fma:
;If the signs match, then just add
;If they differ, then subtract
  ld hl,var_z-1
  ld a,(var_z+17)
  xor (hl)
  jp p,xfma_add

; Subtract the mantissas: var_z -= (var_z-18), low 8 bytes then high 8 bytes.
; (The inc hl / inc de between the calls suggests sub64 leaves both pointers
; on the last byte it processed -- confirm in routines/sub64.z80.)
  ld hl,var_z-18
  ld de,var_z
  call sub64
  inc hl
  inc de
  call sbc64
  jr nc,+_
;Negate the mantissa, invert the sign
; Invert the sign
  inc de
  inc de              ;presumably DE = var_z+17 here, the sign byte
  ld a,(de)
  xor 80h             ;also clears carry, so the negate starts with no borrow
  ld (de),a

; Negate the mantissa: 0 - mantissa across all 16 bytes (B=16, C=0)
  ld hl,var_z
  ld bc,$1000
  ld a,c \ sbc a,(hl) \ ld (hl),a \ inc hl
  djnz $-4
_:
;The sign flag here comes from the last sbc, i.e. from the top mantissa
;byte (this relies on sbc64 ending on that sbc when the branch above was
;taken). If its top bit is set the result is already normalized.
  ret m

;Need to shift up until the top bit is 1. Cancellation between the operands
;can leave several leading zero bits, so this loops.
  ld de,(var_z+16)

; Make sure that the mantissa isn't zero
  ld hl,var_z
  ld b,15
  ld a,(hl)
_:
  inc hl \ or (hl) \ jr nz,+_
  djnz -_
;All 16 mantissa bytes are zero (A=0): clear the exponent word too.
  inc hl
  ld (hl),a
  inc hl
  ld (hl),a
  ret

_:
;One normalization step: exponent-1, mantissa<<1.
  dec de
  ld a,d
  and $7F
  or e
  jp z,add_zero_fma   ;exponent field reached 0: return zero

  ld hl,var_z
  call sla64
  inc hl
  call rl64
  jp p,-_             ;repeat while the top bit is still clear
  ld (var_z+16),de
  ret

xfma_add:
;Same-sign case of fma_add: adds the 16-byte mantissa at var_z-18 into the
;one at var_z. On a carry out of the top byte the exponent word at
;var_z+16 is incremented and the carry is rotated back in through rr64.
  ld hl,var_z-18
  ld de,var_z
  call add64          ;low 8 bytes
  inc hl
  inc de
  call adc64          ;high 8 bytes
  ret nc              ;no carry out: nothing more to do

;Carry out. Carry stays set through ex/inc, which do not modify it.
  ex de,hl
  inc hl              ;presumably HL = var_z+16, the exponent low byte
  inc (hl)
  jr nz,xfma_add_rotate
;Low byte wrapped: carry into the high byte and check the 15-bit exponent.
  inc hl
  inc (hl)
  ld a,(hl)
  dec hl
  and $7F
  jr z,add_inf_fma    ;exponent field wrapped to 0
  scf                 ;the and cleared carry; set it again for the rotate
xfma_add_rotate:
;Rotate the carry into the mantissa starting at its top byte. rr64 is
;entered once, so presumably only the upper 8 bytes are rotated.
  dec hl
  jp rr64
srl4_var_z_m_18:
;Used by fma_add for bit 2 of the shift count: shifts the 16-byte mantissa
;whose top byte is var_z-3 by one nibble, via srl64_4 on the upper 8 bytes
;and rrd8 on the lower 8 bytes.
;A is saved in B across the calls and restored before returning
;(assumes the shift routines leave B alone -- TODO confirm).
  ld b,a
  ld hl,var_z-3
  call srl64_4
  dec hl              ;step to the next lower byte before continuing
  call rrd8
  ld a,b
  ret
srl2_var_z_m_18:
;Two-bit shift: run the one-bit shift once via call, then fall through
;into it for the second pass.
  call srl_var_z_m_18
srl_var_z_m_18:
;Used by fma_add for bit 0 of the shift count: shifts the 16-byte mantissa
;whose top byte is var_z-3 right by one bit, via srl64 on the upper 8 bytes
;and rr64 on the lower 8 bytes.
;A is saved in B across the calls and restored before returning
;(assumes the shift routines leave B alone -- TODO confirm).
  ld b,a
  ld hl,var_z-3
  call srl64
  dec hl              ;step to the next lower byte before continuing
  call rr64
  ld a,b
  ret
caseadd_fma:
;Reached from fma_add when the product's exponent field (the word at
;var_z+16 with the sign bit masked off) is zero.
;Intended results:
;zero+x => x for all x
;NaN +x => NaN for all x
;inf-inf=> NaN
;inf +x => inf, x != inf
;NOTE(review): none of the cases above are implemented. The routine just
;returns, so var_z is left untouched and the caller gets the product as-is,
;including the "zero+x" case where the addend would be expected.
  ret
caseadd1_fma:
;Reached from fma_add when the addend's exponent field (the word at
;var_z-2 with the sign bit masked off) is zero and the product's is not.
;Intended results:
;x+zero => x
;x+inf  => inf
;x+NaN  => NaN
;NOTE(review): the routine just returns, so var_z (the product) is left
;untouched. That matches "x+zero => x" only; the inf and NaN rows are not
;implemented here.
  ret
add_zero_fma:
;Writes 0 to the exponent/sign word at var_z+16 and 0 to the top mantissa
;byte at var_z+15. fma_add jumps here when normalization runs the exponent
;field down to 0.
;Returns with A = 0 and HL = 0.
  ld hl,0
  ld (var_z+16),hl
  xor a
  ld (var_z+15),a
  ret
add_inf_fma:
;Writes $FFFF to the exponent/sign word at var_z+16 and 0 to the top
;mantissa byte at var_z+15. xfma_add jumps here when incrementing the
;exponent wraps its 15-bit field to 0.
;Returns with A = $FF and HL = $FFFF.
;NOTE(review): confirm that this byte pattern is the library's encoding of
;infinity, and that forcing the sign bit to 1 is intended.
  ld hl,$FFFF
  ld (var_z+16),hl
  xor a
  ld (var_z+15),a
  dec a
  ret
#endif