Skip to content

Commit 62d07d1

Browse files
authored
Merge pull request #117 from Neotron-Compute/asm-video-out
Hand-roll assembly for chunky4 output.
2 parents e7e72c9 + b6e627e commit 62d07d1

File tree

2 files changed

+126
-17
lines changed

2 files changed

+126
-17
lines changed

memory.x

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -25,18 +25,18 @@ MEMORY {
2525
/*
2626
* This is the bottom of the four striped banks of SRAM in the RP2040.
2727
*/
28-
RAM_OS : ORIGIN = 0x20000000, LENGTH = 0x42000 - 0x9630
28+
RAM_OS : ORIGIN = 0x20000000, LENGTH = 0x42000 - 0x9690
2929
/*
3030
* This is the top of the four striped banks of SRAM in the RP2040, plus
3131
* SRAM_BANK4 and SRAM_BANK5.
3232
*
3333
* This is carefully calculated to give us 8 KiB of stack space and ensure
3434
* the defmt buffer doesn't span across SRAM_BANK3 and SRAM_BANK4.
3535
*
36-
* 0x9630 should be the (size of .data + size of .bss + size of .uninit +
36+
* 0x9690 should be the (size of .data + size of .bss + size of .uninit +
3737
* 0x2000 for the stack).
3838
*/
39-
RAM : ORIGIN = 0x20042000 - 0x9630, LENGTH = 0x9630
39+
RAM : ORIGIN = 0x20042000 - 0x9690, LENGTH = 0x9690
4040
}
4141

4242
/*

src/vga/mod.rs

Lines changed: 123 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -232,6 +232,10 @@ impl RenderEngine {
232232
// Bitmap with 4 bits per pixel
233233
self.draw_next_line_chunky4(scan_line_buffer, current_line_num);
234234
}
235+
neotron_common_bios::video::Format::Chunky8 => {
236+
// Bitmap with 8 bits per pixel
237+
self.draw_next_line_chunky8(scan_line_buffer, current_line_num);
238+
}
235239
_ => {
236240
// Draw nothing
237241
}
@@ -253,9 +257,13 @@ impl RenderEngine {
253257
let line_start = unsafe { base_ptr.add(offset) };
254258
// Get a pointer into our scan-line buffer
255259
let mut scan_line_buffer_ptr = scan_line_buffer.pixel_ptr();
256-
let black_pixel = RGBColour(VIDEO_PALETTE[0].load(Ordering::Relaxed));
257-
let white_pixel = RGBColour(VIDEO_PALETTE[1].load(Ordering::Relaxed));
258260
if is_double {
261+
let white_pixel = RGBColour(
262+
VIDEO_PALETTE[TextForegroundColour::White as usize].load(Ordering::Relaxed),
263+
);
264+
let black_pixel = RGBColour(
265+
VIDEO_PALETTE[TextForegroundColour::Black as usize].load(Ordering::Relaxed),
266+
);
259267
// double-width mode.
260268
// sixteen RGB pixels (eight pairs) per byte
261269
let white_pair = RGBPair::from_pixels(white_pixel, white_pixel);
@@ -465,11 +473,114 @@ impl RenderEngine {
465473
}
466474
}
467475
} else {
468-
for col in 0..line_len_bytes {
476+
// This code optimises poorly, leaving a load from the literal pool in the middle of the for loop:
477+
//
478+
// for col in 0..line_len_bytes {
479+
// unsafe {
480+
// let pixel_pair = line_start_bytes.add(col).read();
481+
// let pair = CHUNKY4_COLOUR_LOOKUP.lookup(pixel_pair);
482+
// scan_line_buffer_ptr.write(pair);
483+
// scan_line_buffer_ptr = scan_line_buffer_ptr.add(1);
484+
// }
485+
// }
486+
487+
// So I wrote it by hand in assembly instead, saving two clock cycles per loop
488+
// We have 640 pixels at 4 bits each (320 bytes) of input and must produce 320 32-bit RGB pairs of output
489+
unsafe {
    // Converts one line of 4-bpp pixels into RGB pairs: each input byte
    // (two 4-bit pixels) is used as an index into CHUNKY4_COLOUR_LOOKUP and
    // the resulting 32-bit RGB pair is appended to the scan-line buffer.
    //
    // The loop is bottom-tested (`cmp`/`bne` after the body), so it always
    // runs at least once: `line_len_bytes` must be non-zero.
    //
    // SAFETY (assumptions to confirm against the rest of the file): the
    // framebuffer line holds `line_len_bytes` readable bytes, the scan-line
    // buffer has room for one 32-bit word per input byte, and
    // `CHUNKY4_COLOUR_LOOKUP` begins with 256 consecutive 32-bit entries.
    core::arch::asm!(
        "0:",
        // load a byte (two 4-bit pixels) from the current input position
        "ldrb {tmp}, [{lsb}]",
        // multiply it by sizeof(u32) to get a byte offset into the table
        "lsls {tmp}, {tmp}, #0x2",
        // load a 32-bit RGB pair from CHUNKY4_COLOUR_LOOKUP
        "ldr {tmp}, [{chunky}, {tmp}]",
        // store the 32-bit RGB pair to the scan-line buffer, and increment
        "stm {slbp}!, {{ {tmp} }}",
        // advance the input pointer to the next byte
        "adds {lsb}, {lsb}, #0x1",
        // loop until we reach the end of the line
        "cmp {lsb}, {lsb_max}",
        "bne 0b",
        // `lsb` and `slbp` are advanced by the loop, so they must be `inout`
        // (final value discarded); an `in` register has to be left unchanged.
        lsb = inout(reg) line_start_bytes => _,
        lsb_max = in(reg) line_start_bytes.add(line_len_bytes),
        chunky = in(reg) core::ptr::addr_of!(CHUNKY4_COLOUR_LOOKUP),
        // pure scratch register: written before it is read, so no input value
        tmp = out(reg) _,
        slbp = inout(reg) scan_line_buffer_ptr => _,
    );
}
512+
}
513+
}
514+
515+
/// Draw a line of 8-bpp bitmap as pixels.
516+
///
517+
/// Writes into the relevant pixel buffer (either [`PIXEL_DATA_BUFFER_ODD`]
518+
/// or [`PIXEL_DATA_BUFFER_EVEN`]) assuming the framebuffer is a bitmap.
519+
///
520+
/// The `current_line_num` goes from `0..NUM_LINES`.
521+
#[link_section = ".data"]
522+
pub fn draw_next_line_chunky8(&mut self, scan_line_buffer: &LineBuffer, current_line_num: u16) {
523+
let is_double = self.current_video_mode.is_horiz_2x();
524+
let base_ptr = self.current_video_ptr as *const u8;
525+
let line_len_bytes = self.current_video_mode.line_size_bytes();
526+
let line_start_offset_bytes = usize::from(current_line_num) * line_len_bytes;
527+
let line_start_bytes = unsafe { base_ptr.add(line_start_offset_bytes) };
528+
// Get a pointer into our scan-line buffer
529+
let mut scan_line_buffer_ptr = scan_line_buffer.pixel_ptr();
530+
let palette_ptr = VIDEO_PALETTE.as_ptr() as *const RGBColour;
531+
if is_double {
532+
// Double-width mode.
533+
// two RGB pixels (one pair) per byte
534+
535+
// This code optimises poorly
536+
// for col in 0..line_len_bytes {
537+
// unsafe {
538+
// let chunky_pixel = line_start_bytes.add(col).read() as usize;
539+
// let rgb = palette_ptr.add(chunky_pixel).read();
540+
// scan_line_buffer_ptr.write(RGBPair::from_pixels(rgb, rgb));
541+
// scan_line_buffer_ptr = scan_line_buffer_ptr.add(1);
542+
// }
543+
// }
544+
545+
// So I wrote it by hand in assembly instead, saving two clock cycles per loop
546+
// We have 320 pixels at 8 bits each (320 bytes) of input and must produce 320 32-bit RGB pairs of output
547+
unsafe {
    // Converts one line of 8-bpp pixels in double-width mode: each input
    // byte is used as an index into the palette, and the 16-bit colour found
    // there is duplicated into both halves of a 32-bit RGB pair, which is
    // appended to the scan-line buffer.
    //
    // The loop is bottom-tested (`cmp`/`bne` after the body), so it always
    // runs at least once: `line_len_bytes` must be non-zero.
    //
    // SAFETY (assumptions to confirm against the rest of the file): the
    // framebuffer line holds `line_len_bytes` readable bytes, the scan-line
    // buffer has room for one 32-bit word per input byte, and
    // `VIDEO_PALETTE` begins with 256 consecutive 16-bit entries.
    core::arch::asm!(
        "0:",
        // load a byte (one 8-bit pixel) from the current input position
        "ldrb {tmp}, [{lsb}]",
        // multiply it by sizeof(u16) to get a byte offset into the palette
        "lsls {tmp}, {tmp}, #0x1",
        // load a single 16-bit RGB value from the palette
        "ldrh {tmp}, [{palette}, {tmp}]",
        // double it up to make a 32-bit RGB pair containing two identical pixels
        "lsls {tmp2}, {tmp}, #16",
        "adds {tmp}, {tmp}, {tmp2}",
        // store the 32-bit RGB pair to the scan-line buffer, and increment
        "stm {slbp}!, {{ {tmp} }}",
        // advance the input pointer to the next byte
        "adds {lsb}, {lsb}, #0x1",
        // loop until we reach the end of the line
        "cmp {lsb}, {lsb_max}",
        "bne 0b",
        // `lsb` and `slbp` are advanced by the loop, so they must be `inout`
        // (final value discarded); an `in` register has to be left unchanged.
        lsb = inout(reg) line_start_bytes => _,
        lsb_max = in(reg) line_start_bytes.add(line_len_bytes),
        palette = in(reg) core::ptr::addr_of!(VIDEO_PALETTE),
        // pure scratch registers: written before they are read, so no input value
        tmp = out(reg) _,
        tmp2 = out(reg) _,
        slbp = inout(reg) scan_line_buffer_ptr => _,
    );
}
574+
} else {
575+
// Single-width mode. This won't run fast enough on an RP2040, but no supported mode uses it.
576+
// one RGB pixel per byte
577+
for col in 0..line_len_bytes / 2 {
469578
unsafe {
470-
let pixel_pair = line_start_bytes.add(col).read();
471-
let pair = CHUNKY4_COLOUR_LOOKUP.lookup(pixel_pair);
472-
scan_line_buffer_ptr.write(pair);
579+
let chunky_pixel_left = line_start_bytes.add(col * 2).read() as usize;
580+
let rgb_left = palette_ptr.add(chunky_pixel_left).read();
581+
let chunky_pixel_right = line_start_bytes.add((col * 2) + 1).read() as usize;
582+
let rgb_right = palette_ptr.add(chunky_pixel_right).read();
583+
scan_line_buffer_ptr.write(RGBPair::from_pixels(rgb_left, rgb_right));
473584
scan_line_buffer_ptr = scan_line_buffer_ptr.add(1);
474585
}
475586
}
@@ -1057,14 +1168,6 @@ impl Chunky4ColourLookup {
10571168
}
10581169
}
10591170
}
1060-
1061-
/// Turn a pair of chunky4 pixels (in a `u8`), into a pair of RGB pixels.
1062-
#[inline]
1063-
fn lookup(&self, pixel_pair: u8) -> RGBPair {
1064-
let index = usize::from(pixel_pair);
1065-
let raw = self.entries[index].load(Ordering::Relaxed);
1066-
RGBPair(raw)
1067-
}
10681171
}
10691172

10701173
// -----------------------------------------------------------------------------
@@ -1983,6 +2086,12 @@ pub fn test_video_mode(mode: neotron_common_bios::video::Mode) -> bool {
19832086
| neotron_common_bios::video::Format::Chunky4,
19842087
true,
19852088
false,
2089+
) | (
2090+
neotron_common_bios::video::Timing::T640x480
2091+
| neotron_common_bios::video::Timing::T640x400,
2092+
neotron_common_bios::video::Format::Chunky8,
2093+
true,
2094+
false
19862095
)
19872096
)
19882097
}

0 commit comments

Comments
 (0)