From 5c95cdb01835e7825d8d862e3154708ef9285f42 Mon Sep 17 00:00:00 2001 From: prescientmoon Date: Sun, 11 Aug 2024 18:32:17 +0200 Subject: [PATCH] Migrate more stuff towards my own ocr Signed-off-by: prescientmoon --- src/commands/score.rs | 78 ++++++++++++++-------- src/context.rs | 6 +- src/main.rs | 1 + src/recognition/hyperglass.rs | 85 +++++++++++++++--------- src/recognition/recognize.rs | 120 ++++++++++++++++------------------ src/time.rs | 11 ++++ 6 files changed, 176 insertions(+), 125 deletions(-) create mode 100644 src/time.rs diff --git a/src/commands/score.rs b/src/commands/score.rs index 9d3e899..e5007de 100644 --- a/src/commands/score.rs +++ b/src/commands/score.rs @@ -1,9 +1,12 @@ +use std::time::Instant; + use crate::arcaea::play::{CreatePlay, Play}; use crate::arcaea::score::Score; use crate::context::{Context, Error}; use crate::recognition::recognize::{ImageAnalyzer, ScoreKind}; use crate::user::{discord_it_to_discord_user, User}; -use crate::{edit_reply, get_user}; +use crate::{edit_reply, get_user, timed}; +use image::DynamicImage; use poise::serenity_prelude::CreateMessage; use poise::{serenity_prelude as serenity, CreateReply}; use sqlx::query; @@ -41,52 +44,61 @@ pub async fn magic( let mut analyzer = ImageAnalyzer::default(); for (i, file) in files.iter().enumerate() { + let start = Instant::now(); if let Some(_) = file.dimensions() { - let bytes = file.download().await?; - let mut image = image::load_from_memory(&bytes)?; + let bytes = timed!("file download", { file.download().await? }); + let mut image = timed!("decode image", { image::load_from_memory(&bytes)? }); + let mut grayscale_image = timed!("grayscale image", { + DynamicImage::ImageLuma8(image.to_luma8()) + }); // image = image.resize(1024, 1024, FilterType::Nearest); let result: Result<(), Error> = try { // {{{ Detection - // This makes OCR more likely to work - let mut ocr_image = image.grayscale().blur(1.); - edit_reply!(ctx, handle, "Image {}: reading kind", i + 1).await?; - let kind = analyzer.read_score_kind(ctx.data(), &ocr_image)?; + // edit_reply!(ctx, handle, "Image {}: reading kind", i + 1).await?; + let kind = timed!("read_score_kind", { + analyzer.read_score_kind(ctx.data(), &grayscale_image)? + }); - edit_reply!(ctx, handle, "Image {}: reading difficulty", i + 1).await?; + // edit_reply!(ctx, handle, "Image {}: reading difficulty", i + 1).await?; // Do not use `ocr_image` because this reads the colors - let difficulty = analyzer.read_difficulty(ctx.data(), &image, kind)?; + let difficulty = timed!("read_difficulty", { + analyzer.read_difficulty(ctx.data(), &image, kind)? + }); - edit_reply!(ctx, handle, "Image {}: reading jacket", i + 1).await?; - let (song, chart) = analyzer - .read_jacket(ctx.data(), &mut image, kind, difficulty) - .await?; - - ocr_image.invert(); + // edit_reply!(ctx, handle, "Image {}: reading jacket", i + 1).await?; + let (song, chart) = timed!("read_jacket", { + analyzer.read_jacket(ctx.data(), &mut image, kind, difficulty)? + }); let (note_distribution, max_recall) = match kind { ScoreKind::ScoreScreen => { edit_reply!(ctx, handle, "Image {}: reading distribution", i + 1) .await?; let note_distribution = - Some(analyzer.read_distribution(ctx.data(), &image)?); + Some(analyzer.read_distribution(ctx.data(), &grayscale_image)?); edit_reply!(ctx, handle, "Image {}: reading max recall", i + 1).await?; - let max_recall = Some(analyzer.read_max_recall(ctx.data(), &image)?); + let max_recall = + Some(analyzer.read_max_recall(ctx.data(), &grayscale_image)?); (note_distribution, max_recall) } ScoreKind::SongSelect => (None, None), }; - edit_reply!(ctx, handle, "Image {}: reading score", i + 1).await?; - let score = analyzer.read_score( - ctx.data(), - Some(chart.note_count), - &ocr_image, - kind, - )?; + grayscale_image.invert(); + + // edit_reply!(ctx, handle, "Image {}: reading score", i + 1).await?; + let score = timed!("read_score", { + analyzer.read_score( + ctx.data(), + Some(chart.note_count), + &grayscale_image, + kind, + )? + }); // {{{ Build play let maybe_fars = Score::resolve_distibution_ambiguities( @@ -104,9 +116,11 @@ pub async fn magic( // }}} // }}} // {{{ Deliver embed - let (embed, attachment) = play - .to_embed(&ctx.data().db, &user, &song, &chart, i, None) - .await?; + + let (embed, attachment) = timed!("to embed", { + play.to_embed(&ctx.data().db, &user, &song, &chart, i, None) + .await? + }); embeds.push(embed); attachments.extend(attachment); @@ -123,8 +137,16 @@ pub async fn magic( .await?; continue; } + let took = start.elapsed(); - edit_reply!(ctx, handle, "Processed {}/{} scores", i + 1, files.len()).await?; + edit_reply!( + ctx, + handle, + "Processed {}/{} scores. Last score took {took:?} to process.", + i + 1, + files.len() + ) + .await?; } handle.delete(ctx).await?; diff --git a/src/context.rs b/src/context.rs index db4f35c..64a48df 100644 --- a/src/context.rs +++ b/src/context.rs @@ -35,10 +35,12 @@ impl UserContext { let mut song_cache = SongCache::new(&db).await?; let jacket_cache = JacketCache::new(&data_dir, &mut song_cache)?; let ui_measurements = UIMeasurements::read(&data_dir)?; + let geosans_measurements = GEOSANS_FONT .with_borrow_mut(|font| CharMeasurements::from_text(font, "0123456789'", None))?; - let exo_measurements = EXO_FONT - .with_borrow_mut(|font| CharMeasurements::from_text(font, "0123456789'", Some(700)))?; + let exo_measurements = EXO_FONT.with_borrow_mut(|font| { + CharMeasurements::from_text(font, "0123456789'abcdefghijklmnopqrstuvwxyz", Some(700)) + })?; println!("Created user context"); diff --git a/src/main.rs b/src/main.rs index 3dd90fb..9e6acaa 100644 --- a/src/main.rs +++ b/src/main.rs @@ -13,6 +13,7 @@ mod context; mod levenshtein; mod logs; mod recognition; +mod time; mod transform; mod user; diff --git a/src/recognition/hyperglass.rs b/src/recognition/hyperglass.rs index 120e448..ab6d02a 100644 --- a/src/recognition/hyperglass.rs +++ b/src/recognition/hyperglass.rs @@ -12,7 +12,7 @@ //! 5. Compute the largest width & height of the connected components. //! 5. Split each component (more precisely, start at its top-left corner and //! split an area equal to the aforementioned width & height) into a grid of -//! N^2 chunks (N=5 at the moment), and use that to generate a vector who's +//! N^2 chunks (N=5 at the moment), and use that to generate a vector whose //! elements represent the percentage of black pixels in each chunk which //! belong to the connected component at hand. //! 6. Normalise the vectors to remain font-weight independent. @@ -33,6 +33,7 @@ use crate::{ bitmap::{Align, BitmapCanvas, Color, TextStyle}, context::Error, logs::{debug_image_buffer_log, debug_image_log}, + timed, }; // {{{ ConponentVec @@ -47,7 +48,11 @@ struct ComponentVec { impl ComponentVec { // {{{ (Component => vector) encoding - fn from_component(components: &ComponentsWithBounds, component: u32) -> Result { + fn from_component( + components: &ComponentsWithBounds, + area: (u32, u32), + component: u32, + ) -> Result { let mut chunks = [0.0; IMAGE_VEC_DIM]; let bounds = components .bounds @@ -58,10 +63,10 @@ impl ComponentVec { for i in 0..(SPLIT_FACTOR * SPLIT_FACTOR) { let (iy, ix) = i.div_rem_euclid(&SPLIT_FACTOR); - let x_start = bounds.x_min + ix * components.max_width / SPLIT_FACTOR; - let x_end = bounds.x_min + (ix + 1) * components.max_width / SPLIT_FACTOR; - let y_start = bounds.y_min + iy * components.max_height / SPLIT_FACTOR; - let y_end = bounds.y_min + (iy + 1) * components.max_height / SPLIT_FACTOR; + let x_start = bounds.x_min + ix * area.0 / SPLIT_FACTOR; + let x_end = bounds.x_min + (ix + 1) * area.0 / SPLIT_FACTOR; + let y_start = bounds.y_min + iy * area.1 / SPLIT_FACTOR; + let y_end = bounds.y_min + (iy + 1) * area.1 / SPLIT_FACTOR; let mut count = 0; for x in x_start..x_end { @@ -148,9 +153,6 @@ struct ComponentsWithBounds { // but we don't want to waste a place in this vector. bounds: Vec>, - max_width: u32, - max_height: u32, - /// Stores the indices of `self.bounds` sorted based on their min position. bounds_by_position: Vec, } @@ -202,20 +204,6 @@ impl ComponentsWithBounds { } } // }}} - // {{{ Compute max width/height - let max_width = bounds - .iter() - .filter_map(|o| o.as_ref()) - .map(|b| b.x_max - b.x_min) - .max() - .ok_or_else(|| "No connected components found")?; - let max_height = bounds - .iter() - .filter_map(|o| o.as_ref()) - .map(|b| b.y_max - b.y_min) - .max() - .ok_or_else(|| "No connected components found")?; - // }}} let mut bounds_by_position: Vec = (0..(bounds.len())) .filter(|i| bounds[*i].is_some()) @@ -225,8 +213,6 @@ impl ComponentsWithBounds { Ok(Self { components, bounds, - max_width, - max_height, bounds_by_position, }) } @@ -235,11 +221,14 @@ impl ComponentsWithBounds { // {{{ Char measurements pub struct CharMeasurements { chars: Vec<(char, ComponentVec)>, + + max_width: u32, + max_height: u32, } impl CharMeasurements { // {{{ Creation - pub fn from_text(face: &mut Face, string: &str, _weight: Option) -> Result { + pub fn from_text(face: &mut Face, string: &str, weight: Option) -> Result { // These are bad estimates lol let char_w = 35; let char_h = 60; @@ -255,7 +244,7 @@ impl CharMeasurements { size: char_h, color: Color::BLACK, // TODO: do we want to use the weight hint for resilience? - weight: None, + weight, }, &string, )?; @@ -267,30 +256,64 @@ impl CharMeasurements { let components = ComponentsWithBounds::from_image(&image)?; + // {{{ Compute max width/height + let max_width = components + .bounds + .iter() + .filter_map(|o| o.as_ref()) + .map(|b| b.x_max - b.x_min) + .max() + .ok_or_else(|| "No connected components found")?; + let max_height = components + .bounds + .iter() + .filter_map(|o| o.as_ref()) + .map(|b| b.y_max - b.y_min) + .max() + .ok_or_else(|| "No connected components found")?; + // }}} + let mut chars = Vec::with_capacity(string.len()); for (i, char) in string.chars().enumerate() { chars.push(( char, ComponentVec::from_component( &components, + (max_width, max_height), components.bounds_by_position[i] as u32 + 1, )?, )) } - Ok(Self { chars }) + Ok(Self { + chars, + max_width, + max_height, + }) } // }}} // {{{ Recognition - pub fn recognise(&self, image: &DynamicImage) -> Result { - let components = ComponentsWithBounds::from_image(image)?; + pub fn recognise(&self, image: &DynamicImage, whitelist: &str) -> Result { + let components = timed!("from_image", { ComponentsWithBounds::from_image(image)? }); let mut result = String::new(); + + let max_height = components + .bounds + .iter() + .filter_map(|o| o.as_ref()) + .map(|b| b.y_max - b.y_min) + .max() + .ok_or_else(|| "No connected components found")?; + let max_width = self.max_width * max_height / self.max_height; + for i in &components.bounds_by_position { - let vec = ComponentVec::from_component(&components, *i as u32 + 1)?; + let vec = + ComponentVec::from_component(&components, (max_width, max_height), *i as u32 + 1)?; let best_match = self .chars .iter() + .filter(|(c, _)| whitelist.contains(*c)) .map(|(i, v)| (*i, v, v.distance_squared_to(&vec))) .min_by(|(_, _, d1), (_, _, d2)| { d1.partial_cmp(d2).expect("NaN distance encountered") diff --git a/src/recognition/recognize.rs b/src/recognition/recognize.rs index d4b3410..b00835f 100644 --- a/src/recognition/recognize.rs +++ b/src/recognition/recognize.rs @@ -4,7 +4,6 @@ use std::str::FromStr; use hypertesseract::{PageSegMode, Tesseract}; use image::imageops::FilterType; use image::{DynamicImage, GenericImageView}; -use image::{ImageBuffer, Rgba}; use num::integer::Roots; use poise::serenity_prelude::{CreateAttachment, CreateEmbed, CreateMessage}; @@ -14,11 +13,12 @@ use crate::arcaea::score::Score; use crate::bitmap::{Color, Rect}; use crate::context::{Context, Error, UserContext}; use crate::levenshtein::edit_distance; -use crate::logs::debug_image_buffer_log; +use crate::logs::debug_image_log; use crate::recognition::fuzzy_song_name::guess_chart_name; use crate::recognition::ui::{ ScoreScreenRect, SongSelectRect, UIMeasurementRect, UIMeasurementRect::*, }; +use crate::timed; use crate::transform::rotate; #[derive(Debug, Clone, Copy, PartialEq, Eq)] @@ -47,10 +47,8 @@ impl ImageAnalyzer { // {{{ Crop #[inline] - pub fn crop(&mut self, image: &DynamicImage, rect: Rect) -> ImageBuffer, Vec> { - image - .crop_imm(rect.x as u32, rect.y as u32, rect.width, rect.height) - .to_rgba8() + pub fn crop(&mut self, image: &DynamicImage, rect: Rect) -> DynamicImage { + image.crop_imm(rect.x as u32, rect.y as u32, rect.width, rect.height) } #[inline] @@ -59,12 +57,12 @@ impl ImageAnalyzer { ctx: &UserContext, image: &DynamicImage, ui_rect: UIMeasurementRect, - ) -> Result, Vec>, Error> { + ) -> Result { let rect = ctx.ui_measurements.interpolate(ui_rect, image)?; self.last_rect = Some((ui_rect, rect)); let result = self.crop(image, rect); - debug_image_buffer_log(&result)?; + debug_image_log(&result)?; Ok(result) } @@ -76,16 +74,14 @@ impl ImageAnalyzer { image: &DynamicImage, ui_rect: UIMeasurementRect, size: (u32, u32), - ) -> Result, Vec>, Error> { + ) -> Result { let rect = ctx.ui_measurements.interpolate(ui_rect, image)?; self.last_rect = Some((ui_rect, rect)); let result = self.crop(image, rect); - let result = DynamicImage::ImageRgba8(result) - .resize(size.0, size.1, FilterType::Nearest) - .into_rgba8(); + let result = result.resize(size.0, size.1, FilterType::Nearest); - debug_image_buffer_log(&result)?; + debug_image_log(&result)?; Ok(result) } @@ -138,32 +134,35 @@ impl ImageAnalyzer { image: &DynamicImage, kind: ScoreKind, ) -> Result { - let image = self.interp_crop_resize( - ctx, - image, - match kind { - ScoreKind::SongSelect => SongSelect(SongSelectRect::Score), - ScoreKind::ScoreScreen => ScoreScreen(ScoreScreenRect::Score), - }, - (u32::MAX, 100), - )?; + let image = timed!("interp_crop_resize", { + self.interp_crop_resize( + ctx, + image, + match kind { + ScoreKind::SongSelect => SongSelect(SongSelectRect::Score), + ScoreKind::ScoreScreen => ScoreScreen(ScoreScreenRect::Score), + }, + (u32::MAX, 100), + )? + }); let measurements = match kind { ScoreKind::SongSelect => &ctx.exo_measurements, ScoreKind::ScoreScreen => &ctx.geosans_measurements, }; - let result = Score( - measurements - .recognise(&DynamicImage::ImageRgba8(image))? - .chars() - .filter(|c| *c != '\'') - .collect::() - .parse()?, - ); + let result = timed!("full recognition", { + Score( + measurements + .recognise(&image, "0123456789'")? + .chars() + .filter(|c| *c != '\'') + .collect::() + .parse()?, + ) + }); - // {{{ Return score if consensus exists - // 1. Discard scores that are known to be impossible + // Discard scores if it's impossible if result.0 <= 10_010_000 && note_count.map_or(true, |note_count| { let (zeta, shinies, score_units) = result.analyse(note_count); @@ -222,11 +221,11 @@ impl ImageAnalyzer { .language(hypertesseract::Language::English) .page_seg_mode(PageSegMode::RawLine) .build()? - .recognize_text_cloned_with_conf(&self.interp_crop( - ctx, - image, - ScoreScreen(ScoreScreenRect::Difficulty), - )?)?; + .recognize_text_cloned_with_conf( + &self + .interp_crop(ctx, image, ScoreScreen(ScoreScreenRect::Difficulty))? + .into_rgba8(), + )?; let text = text.trim().to_lowercase(); @@ -254,21 +253,10 @@ impl ImageAnalyzer { ctx: &UserContext, image: &DynamicImage, ) -> Result { - let (text, conf) = Tesseract::builder() - .language(hypertesseract::Language::English) - .page_seg_mode(PageSegMode::RawLine) - .build()? - .recognize_text_cloned_with_conf(&self.interp_crop(ctx, image, PlayKind)?)?; - - let text = text.trim().to_string(); - - if conf < 10 && conf != 0 { - return Err(format!( - "Score kind text is not readable (confidence = {}, text = {}).", - conf, text - ) - .into()); - } + let image = self.interp_crop(ctx, image, PlayKind)?; + let text = ctx + .exo_measurements + .recognise(&image, "resultselectasong")?; let result = if edit_distance(&text, "Result") < edit_distance(&text, "Select a song") { ScoreKind::ScoreScreen @@ -291,11 +279,11 @@ impl ImageAnalyzer { .page_seg_mode(PageSegMode::SingleLine) .whitelist_str("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789,.()- ")? .build()? - .recognize_text_cloned_with_conf(&self.interp_crop( - ctx, - image, - ScoreScreen(ScoreScreenRect::Title), - )?)?; + .recognize_text_cloned_with_conf( + &self + .interp_crop(ctx, image, ScoreScreen(ScoreScreenRect::Title))? + .into_rgba8(), + )?; if conf < 20 && conf != 0 { return Err(format!( @@ -310,7 +298,7 @@ impl ImageAnalyzer { } // }}} // {{{ Read jacket - pub async fn read_jacket<'a>( + pub fn read_jacket<'a>( &mut self, ctx: &'a UserContext, image: &mut DynamicImage, @@ -374,7 +362,11 @@ impl ImageAnalyzer { .whitelist_str("0123456789")? .assume_numeric_input() .build()? - .recognize_text_cloned(&self.interp_crop(ctx, image, ScoreScreen(KINDS[i]))?)?; + .recognize_text_cloned( + &self + .interp_crop(ctx, image, ScoreScreen(KINDS[i]))? + .into_rgba8(), + )?; println!("Raw '{}'", text.trim()); out[i] = u32::from_str(&text.trim()).unwrap_or(0); @@ -396,11 +388,11 @@ impl ImageAnalyzer { .whitelist_str("0123456789")? .assume_numeric_input() .build()? - .recognize_text_cloned_with_conf(&self.interp_crop( - ctx, - image, - ScoreScreen(ScoreScreenRect::MaxRecall), - )?)?; + .recognize_text_cloned_with_conf( + &self + .interp_crop(ctx, image, ScoreScreen(ScoreScreenRect::MaxRecall))? + .into_rgba8(), + )?; let max_recall = u32::from_str_radix(text.trim(), 10)?; diff --git a/src/time.rs b/src/time.rs new file mode 100644 index 0000000..f9d692a --- /dev/null +++ b/src/time.rs @@ -0,0 +1,11 @@ +#[macro_export] +macro_rules! timed { + ($label:expr, $code:block) => {{ + use std::time::Instant; + let start = Instant::now(); + let result = { $code }; // Execute the code block + let duration = start.elapsed(); + println!("{}: {:?}", $label, duration); + result + }}; +}