From: RWilson@acorn.co.uk
Subject: Dithering in general
Date: 30 Jan 92 12:26:11 GMT
As well as the deathless 'How to find the closest colour out of the RISC OS
256' (yes, *anyone* can use this code (especially Find256)  we'd like an
acknowledgement, please!), here are some notes on dithering:
(a) There are some text books that get the Floyd Steinberg error weights WRONG
 the books say 2/8 3/8 and 3/8, but this (although cheaper to calculate)
is not FS, where the weights are 7/16, 5/16, 3/16 and 1/16.
(b) ALL dithering algorithms look 'better' (less prone to regular artifact
patterning) if the display is scanned serpentine (lr one raster line,
then rl).
(c) There are more expensive (and slightly better than FS) schemes: see
"Digital Halftoning" by Robert Ulichney published by the MIT Press, ISBN
0262210096.
(d) ChangeFSI gets right. :)
(e) For god's sake, allow people access to the undithered data.
Roger
From: RWilson@acorn.co.uk
Subject: How to find the closest colour out of the RISC OS 256
Date: 30 Jan 92 12:20:23 GMT
; MACRO ColErr
;
; Calculates an error value in $error corresponding to the difference
; between the two colours $col1 and $col2. Requires two temporary
; registers. All registers must be different. This routine requires
; 16 instructions, three of which are multiples plus for instructions
; for the error loading calculations. To speed up the multiplies
; the error value for each component is made positive.
;
; The error calculation is controlled by the following ``loading'' values.
; The basic component error calculation, given components c1 and c2 from
; $col1 and $col2, is:
;
; e := ((c1c2) ^ 2) * load
;
; Thus a higher ``load'' makes errors in the component more significant when
; the component value itself is small, however has little effect when the
; component value is large. Each loading must not exceed 65536/3 or the
; error calculation may overflow. If a loading exceeds 32768/3 the result may
; have the top bit set. The loadings are determined algorithmically by the
; following macros  this avoids having to do a real multiplication. The
; macros have the form:
;
; XWeight $err, $sqr
;
; and calculate:
;
; $err = $sqr * XLoad (for BLoad)
; $err += $sqr * XLoad (for GLoad, RLoad)
;
; $sqr may be overwritten. They are called in the order BLoad, GLoad
; RLoad, and are only called if the corresponding loading value is not
; 1.
;
BLoad EQU 1
MACRO
$label BWeight $error, $sqr
ASSERT 0
MEND
GLoad EQU 10
MACRO
$label GWeight $error, $sqr
$label ADD $sqr, $sqr, $sqr, LSL #2
ADD $error, $error, $sqr, LSL #1
MEND
RLoad EQU 3
MACRO
$label RWeight $error, $sqr
$label ADD $sqr, $sqr, $sqr, LSL #1
ADD $error, $error, $sqr
MEND
MACRO
$label ColErr $error, $col1, $col2, $temp1, $temp2
$label MOV $temp1, $col1, LSR #24
SUBS $temp2, $temp1, $col2, LSR #24
RSBLT $temp2, $temp2, #0
[ BLoad /= 1
MUL $temp1, $temp2, $temp2
BWeight $error, $temp1

MUL $error, $temp2, $temp2
]
MOV $temp2, #255
AND $temp1, $temp2, $col1, LSR #16
AND $temp2, $temp2, $col2, LSR #16
SUBS $temp2, $temp1, $temp2
RSBLT $temp2, $temp2, #0
[ GLoad /= 1
MUL $temp1, $temp2, $temp2
GWeight $error, $temp1

MLA $error, $temp2, $temp2, $error
]
MOV $temp2, #255
AND $temp1, $temp2, $col1, LSR #8
AND $temp2, $temp2, $col2, LSR #8
SUBS $temp2, $temp1, $temp2
RSBLT $temp2, $temp2, #0
[ RLoad /= 1
MUL $temp1, $temp2, $temp2
RWeight $error, $temp1

MLA $error, $temp2, $temp2, $error
]
MEND
;
; MACRO CompErr
;
; This macro also calculates an error value, however the second colour
; is specified as three separate r, g, b values. The registers containing
; these values can be the same, if desired. The registers should not be
; the same as any of the other registers. The calculation needs 16
; instructions, including three multiplies.
;
MACRO
$label CompErr $error, $col, $red, $green, $blue, $temp1, $temp2
$label SUBS $temp2, $blue, $col, LSR #24
RSBLT $temp2, $temp2, #0
[ BLoad /= 1
MUL $temp1, $temp2, $temp2
BWeight $error, $temp1

MUL $error, $temp2, $temp2
]
AND $temp1, $col, #&FF0000 ; green component, still shifted
SUBS $temp2, $green, $temp1, LSR #16
RSBLT $temp2, $temp2, #0 ; green error
[ GLoad /= 1
MUL $temp1, $temp2, $temp2
GWeight $error, $temp1

MLA $error, $temp2, $temp2, $error
]
AND $temp1, $col, #&FF00 ; red component, still shifted
SUBS $temp2, $red, $temp1, LSR #8
RSBLT $temp2, $temp2, #0 ; red error
[ RLoad /= 1
MUL $temp1, $temp2, $temp2
RWeight $error, $temp1

MLA $error, $temp2, $temp2, $error
]
MEND
;
; MACRO FindCol
;
; This macro finds the closest colour to the given r, g, b triple from
; an array of (word sized) RGB values (encoded BBGGRR00). The macro
; preserves the $red, $green and $blue values, exits with $error set
; to that of the found colour and $index set to the index of the entry. Again,
; all arguments must be different registers.
;
MACRO
$label FindCol $list, $listend, $red, $green, $blue, $error, $index, $ptr, $col, $temp1, $temp2, $temp3
$label MOV $error, #&FFFFFFFF ; maximum error
SUB $ptr, $list, #4 ; points to entry to last entry tried
0: LDR $col, [$ptr, #4]!
CompErr $temp3, $col, $red, $green, $blue, $temp1, $temp2
CMP $temp3, $error
MOVLS $error, $temp3
SUBLS $index, $list, $ptr ; index * 4
CMP $ptr, $listend
BLO 0b
MOV $index, $index, LSR #2
MEND
;
; MACRO Find256
;
; This macro finds the best matching colour for the *standard* ARM 256 entry palette  the
; one with the R/G/B/T (tint) bits. The algorithm returns a palette value encoded in the
; standard way (BGGRBRTT) in $col and the error in $error.
; All arguments must be different registers. The loop is about 44 instructions, including
; the normal three multiplies. The code goes round it four times, there is a further 12
; instruction overhead.
;
; The registers must all be different except for $red, $green and $blue, which can be
; the same if desired.
;
; The ARM palette entries are assumed to expand a 4 bit component to an 8 bit component
; using c<<4c  this has been determined experimentally to give good results.
;
MACRO
$label Find256 $red, $green, $blue, $error, $col, $tint, $temp1, $temp2, $temp3, $pixel
$label MOV $error, #&FFFFFFFF
MOV $tint, #&30:SHL:23 ; tint bits unexpanded
0: RSB $temp1, $tint, #&20:SHL:23 ; overflow is not possible here
SUB $temp2, $blue, $blue, LSR #4 ; effectively multiplication by 16/17
ADDS $temp1, $temp1, $temp2, LSL #23
;
; At this point the top bits of $temp1 hold the best blue bit values given
; the current $tint tint bits, however the desired value may be >11tt or <00tt,
; in either case the top bit (bit 31) of $temp1 will be set, hence the N flag
; will be set in the PSR. We must distinguish overflow (>11tt) from a simple
; negative result (<00tt) and truncate both to the appropriate end of the
; scale. We have calculated (bluetint+&22)<<23. The overflow (V) flag will
; ONLY be set for >11tt; the other possible results (in the range &FF<<23 to
; &17<<23 are representable without overflow), so:
;
MOVVSS $temp1, #&7F000000 ; clears the N flag!
MOVMI $temp1, #0
;
; Now extract the blue bits and reconstruct the real (expanded) blue value.
;
AND $temp1, $temp1, #&60000000 ; two blue bits
ADD $temp1, $temp1, $tint ; plus tint
ADD $pixel, $temp1, $temp1, LSR #4 ; expand component bits  8 bit blue value
;
; Calculate the error as above.
;
SUBS $temp2, $blue, $pixel, LSR #23
RSBLT $temp2, $temp2, #0 ; speeds up multiplication
[ BLoad /= 1
MUL $temp1, $temp2, $temp2
BWeight $temp3, $temp1

MUL $temp3, $temp2, $temp2
]
;
; Repeat this for the green component, accumulating the error
;
RSB $temp1, $tint, #&20:SHL:23
SUB $temp2, $green, $green, LSR #4
ADDS $temp1, $temp1, $temp2, LSL #23
MOVVSS $temp1, #&7F000000
MOVMI $temp1, #0
;
AND $temp1, $temp1, #&60000000 ; two green bits
ADD $temp1, $tint, $temp1 ; 4 bit green value
ADD $temp1, $temp1, $temp1, LSR #4 ; expand component bits
ORR $pixel, $pixel, $temp1, LSR #8 ; Accumulate bits in $pixel
;
SUBS $temp2, $green, $temp1, LSR #23
RSBLT $temp2, $temp2, #0
[ GLoad /= 1
MUL $temp1, $temp2, $temp2
GWeight $temp3, $temp1

MLA $temp3, $temp2, $temp2, $temp3
]
;
; And the red component:
;
RSB $temp1, $tint, #&20:SHL:23
SUB $temp2, $red, $red, LSR #4
ADDS $temp1, $temp1, $temp2, LSL #23
MOVVSS $temp1, #&7F000000
MOVMI $temp1, #0
;
AND $temp1, $temp1, #&60000000 ; two red bits
ADD $temp1, $tint, $temp1 ; 4 bit red value
ADD $temp1, $temp1, $temp1, LSR #4 ; expand component bits
ORR $pixel, $pixel, $temp1, LSR #16 ; Accumulate bits in $pixel
;
SUBS $temp2, $red, $temp1, LSR #23
RSBLT $temp2, $temp2, #0
[ RLoad /= 1
MUL $temp1, $temp2, $temp2
RWeight $temp3, $temp1

MLA $temp3, $temp2, $temp2, $temp3
]
;
; $temp3 contains the error for the ARM value in $pixel (actually this value
; is shifted right by 1 bit because of the LSL 23 above). Check the error
; and see if this is a better pixel.
;
CMP $temp3, $error
MOVLS $error, $temp3 ; LS, so selects lower tint in preference
MOVLS $col, $pixel ; $col holds best match
;
; Try the next tint
;
SUBS $tint, $tint, #&10:SHL:23
BGE 0b
;
; $error is the error, and is directly comparable with the $error
; value from the other macros. $col to the ARM palette entry using
; the appropriate bit manipulations. The value is currently:
;
; 0BBTT011 1GGTT011 1RRTT011 10000000
;
; We need to convert this to the form:
;
; 76543210
; BGGRBRTT
;
AND $temp1, $col, #&40000000 ; B (needs >> 23)
MOV $temp2, $temp1, LSR #23
AND $temp1, $col, #&600000 ; GG (needs >> 16)
ORR $temp2, $temp2, $temp1, LSR #16
AND $temp1, $col, #&4000 ; R (needs >> 10)
ORR $temp2, $temp2, $temp1, LSR #10
AND $temp1, $col, #&20000000 ; B (needs >> 26)
ORR $temp2, $temp2, $temp1, LSR #26
AND $temp1, $col, #&3800 ; RTT (needs >> 11)
ORR $col, $temp2, $temp1, LSR #11
MEND