@@ -232,6 +232,10 @@ impl RenderEngine {
232232 // Bitmap with 4 bits per pixel
233233 self . draw_next_line_chunky4 ( scan_line_buffer, current_line_num) ;
234234 }
235+ neotron_common_bios:: video:: Format :: Chunky8 => {
236+ // Bitmap with 8 bits per pixel
237+ self . draw_next_line_chunky8 ( scan_line_buffer, current_line_num) ;
238+ }
235239 _ => {
236240 // Draw nothing
237241 }
@@ -253,9 +257,13 @@ impl RenderEngine {
253257 let line_start = unsafe { base_ptr. add ( offset) } ;
254258 // Get a pointer into our scan-line buffer
255259 let mut scan_line_buffer_ptr = scan_line_buffer. pixel_ptr ( ) ;
256- let black_pixel = RGBColour ( VIDEO_PALETTE [ 0 ] . load ( Ordering :: Relaxed ) ) ;
257- let white_pixel = RGBColour ( VIDEO_PALETTE [ 1 ] . load ( Ordering :: Relaxed ) ) ;
258260 if is_double {
261+ let white_pixel = RGBColour (
262+ VIDEO_PALETTE [ TextForegroundColour :: White as usize ] . load ( Ordering :: Relaxed ) ,
263+ ) ;
264+ let black_pixel = RGBColour (
265+ VIDEO_PALETTE [ TextForegroundColour :: Black as usize ] . load ( Ordering :: Relaxed ) ,
266+ ) ;
259267 // double-width mode.
260268 // sixteen RGB pixels (eight pairs) per byte
261269 let white_pair = RGBPair :: from_pixels ( white_pixel, white_pixel) ;
@@ -465,11 +473,114 @@ impl RenderEngine {
465473 }
466474 }
467475 } else {
468- for col in 0 ..line_len_bytes {
476+ // // This code optimises poorly, leaving a load from the literal pool in the middle of the for loop.
477+ //
478+ // for col in 0..line_len_bytes {
479+ // unsafe {
480+ // let pixel_pair = line_start_bytes.add(col).read();
481+ // let pair = CHUNKY4_COLOUR_LOOKUP.lookup(pixel_pair);
482+ // scan_line_buffer_ptr.write(pair);
483+ // scan_line_buffer_ptr = scan_line_buffer_ptr.add(1);
484+ // }
485+ // }
486+
487+ // So I wrote it by hand in assembly instead, saving two clock cycles per loop
488+ // We have 640x4 (320x8) input and must produce 320x32 output
489+ unsafe {
490+ core:: arch:: asm!(
491+ "0:" ,
492+ // load a byte from line_start_bytes
493+ "ldrb {tmp}, [{lsb}]" ,
494+ // multiply it by sizeof(u32)
495+ "lsls {tmp}, {tmp}, #0x2" ,
496+ // load a 32-bit RGB pair from CHUNKY4_COLOUR_LOOKUP
497+ "ldr {tmp}, [{chunky}, {tmp}]" ,
498+ // store the 32-bit RGB pair to the scanline buffer, and increment
499+ "stm {slbp}!, {{ {tmp} }}" ,
500+ // increment the pointer to the start of the line
501+ "adds {lsb}, {lsb}, #0x1" ,
502+ // loop until we're done
503+ "cmp {lsb}, {lsb_max}" ,
504+ "bne 0b" ,
505+ lsb = in( reg) line_start_bytes,
506+ lsb_max = in( reg) line_start_bytes. add( line_len_bytes) ,
507+ chunky = in( reg) core:: ptr:: addr_of!( CHUNKY4_COLOUR_LOOKUP ) ,
508+ tmp = in( reg) 0 ,
509+ slbp = in( reg) scan_line_buffer_ptr,
510+ ) ;
511+ }
512+ }
513+ }
514+
515+ /// Draw a line of 8-bpp bitmap as pixels.
516+ ///
517+ /// Writes into the relevant pixel buffer (either [`PIXEL_DATA_BUFFER_ODD`]
518+ /// or [`PIXEL_DATA_BUFFER_EVEN`]) assuming the framebuffer is a bitmap.
519+ ///
520+ /// The `current_line_num` goes from `0..NUM_LINES`.
521+ #[ link_section = ".data" ]
522+ pub fn draw_next_line_chunky8 ( & mut self , scan_line_buffer : & LineBuffer , current_line_num : u16 ) {
523+ let is_double = self . current_video_mode . is_horiz_2x ( ) ;
524+ let base_ptr = self . current_video_ptr as * const u8 ;
525+ let line_len_bytes = self . current_video_mode . line_size_bytes ( ) ;
526+ let line_start_offset_bytes = usize:: from ( current_line_num) * line_len_bytes;
527+ let line_start_bytes = unsafe { base_ptr. add ( line_start_offset_bytes) } ;
528+ // Get a pointer into our scan-line buffer
529+ let mut scan_line_buffer_ptr = scan_line_buffer. pixel_ptr ( ) ;
530+ let palette_ptr = VIDEO_PALETTE . as_ptr ( ) as * const RGBColour ;
531+ if is_double {
532+ // Double-width mode.
533+ // two RGB pixels (one pair) per byte
534+
535+ // This code optimises poorly
536+ // for col in 0..line_len_bytes {
537+ // unsafe {
538+ // let chunky_pixel = line_start_bytes.add(col).read() as usize;
539+ // let rgb = palette_ptr.add(chunky_pixel).read();
540+ // scan_line_buffer_ptr.write(RGBPair::from_pixels(rgb, rgb));
541+ // scan_line_buffer_ptr = scan_line_buffer_ptr.add(1);
542+ // }
543+ // }
544+
545+ // So I wrote it by hand in assembly instead, saving two clock cycles per loop
546+ // We have 320x8 input and must produce 320x32 output
547+ unsafe {
548+ core:: arch:: asm!(
549+ "0:" ,
550+ // load a byte from line_start_bytes
551+ "ldrb {tmp}, [{lsb}]" ,
552+ // multiply it by sizeof(u16)
553+ "lsls {tmp}, {tmp}, #0x1" ,
554+ // load a single 16-bit RGB value from the palette
555+ "ldrh {tmp}, [{palette}, {tmp}]" ,
556+ // double it up to make a 32-bit RGB pair containing two identical pixels
557+ "lsls {tmp2}, {tmp}, #16" ,
558+ "adds {tmp}, {tmp}, {tmp2}" ,
559+ // store the 32-bit RGB pair to the scanline buffer, and increment
560+ "stm {slbp}!, {{ {tmp} }}" ,
561+ // increment the pointer to the start of the line
562+ "adds {lsb}, {lsb}, #0x1" ,
563+ // loop until we're done
564+ "cmp {lsb}, {lsb_max}" ,
565+ "bne 0b" ,
566+ lsb = in( reg) line_start_bytes,
567+ lsb_max = in( reg) line_start_bytes. add( line_len_bytes) ,
568+ palette = in( reg) core:: ptr:: addr_of!( VIDEO_PALETTE ) ,
569+ tmp = in( reg) 0 ,
570+ tmp2 = in( reg) 1 ,
571+ slbp = in( reg) scan_line_buffer_ptr,
572+ ) ;
573+ }
574+ } else {
575+ // Single-width mode. This won't run fast enough on an RP2040, but no supported mode uses it.
576+ // one RGB pixel per byte
577+ for col in 0 ..line_len_bytes / 2 {
469578 unsafe {
470- let pixel_pair = line_start_bytes. add ( col) . read ( ) ;
471- let pair = CHUNKY4_COLOUR_LOOKUP . lookup ( pixel_pair) ;
472- scan_line_buffer_ptr. write ( pair) ;
579+ let chunky_pixel_left = line_start_bytes. add ( col * 2 ) . read ( ) as usize ;
580+ let rgb_left = palette_ptr. add ( chunky_pixel_left) . read ( ) ;
581+ let chunky_pixel_right = line_start_bytes. add ( ( col * 2 ) + 1 ) . read ( ) as usize ;
582+ let rgb_right = palette_ptr. add ( chunky_pixel_right) . read ( ) ;
583+ scan_line_buffer_ptr. write ( RGBPair :: from_pixels ( rgb_left, rgb_right) ) ;
473584 scan_line_buffer_ptr = scan_line_buffer_ptr. add ( 1 ) ;
474585 }
475586 }
@@ -1057,14 +1168,6 @@ impl Chunky4ColourLookup {
10571168 }
10581169 }
10591170 }
1060-
1061- /// Turn a pair of chunky4 pixels (in a `u8`), into a pair of RGB pixels.
1062- #[ inline]
1063- fn lookup ( & self , pixel_pair : u8 ) -> RGBPair {
1064- let index = usize:: from ( pixel_pair) ;
1065- let raw = self . entries [ index] . load ( Ordering :: Relaxed ) ;
1066- RGBPair ( raw)
1067- }
10681171}
10691172
10701173// -----------------------------------------------------------------------------
@@ -1983,6 +2086,12 @@ pub fn test_video_mode(mode: neotron_common_bios::video::Mode) -> bool {
19832086 | neotron_common_bios:: video:: Format :: Chunky4 ,
19842087 true ,
19852088 false ,
2089+ ) | (
2090+ neotron_common_bios:: video:: Timing :: T640x480
2091+ | neotron_common_bios:: video:: Timing :: T640x400 ,
2092+ neotron_common_bios:: video:: Format :: Chunky8 ,
2093+ true ,
2094+ false
19862095 )
19872096 )
19882097}
0 commit comments