1
Fork 0

No longer use tesseract, I guess?

Signed-off-by: prescientmoon <git@moonythm.dev>
This commit is contained in:
prescientmoon 2024-08-11 19:49:46 +02:00
parent 5c95cdb018
commit 48c1f74f93
Signed by: prescientmoon
SSH key fingerprint: SHA256:UUF9JT2s8Xfyv76b8ZuVL7XrmimH4o49p4b+iexbVH4
8 changed files with 177 additions and 183 deletions

1
Cargo.lock generated
View file

@ -2605,6 +2605,7 @@ dependencies = [
"num",
"plotters",
"poise",
"rand",
"sqlx",
"tokio",
]

View file

@ -14,6 +14,7 @@ sqlx = { version = "0.8.0", features = ["sqlite", "runtime-tokio", "chrono"] }
hypertesseract = { features=["image"], git="https://github.com/BlueGhostGH/hypertesseract.git", rev="4e05063" }
tokio = {version="1.38.0", features=["rt-multi-thread"]}
imageproc = "0.25.0"
rand = "0.8.5"
[profile.dev.package."*"]
opt-level = 3

View file

@ -21,7 +21,7 @@ impl Difficulty {
pub const DIFFICULTY_SHORTHANDS: [&'static str; 5] = ["PST", "PRS", "FTR", "ETR", "BYD"];
pub const DIFFICULTY_STRINGS: [&'static str; 5] =
["past", "present", "future", "eternal", "beyond"];
["PAST", "PRESENT", "FUTURE", "ETERNAL", "BEYOND"];
#[inline]
pub fn to_index(self) -> usize {

View file

@ -7,6 +7,7 @@ use crate::recognition::recognize::{ImageAnalyzer, ScoreKind};
use crate::user::{discord_it_to_discord_user, User};
use crate::{edit_reply, get_user, timed};
use image::DynamicImage;
use poise::serenity_prelude::futures::future::join_all;
use poise::serenity_prelude::CreateMessage;
use poise::{serenity_prelude as serenity, CreateReply};
use sqlx::query;
@ -34,7 +35,9 @@ pub async fn magic(
if files.len() == 0 {
ctx.reply("No images found attached to message").await?;
} else {
return Ok(());
}
let mut embeds = Vec::with_capacity(files.len());
let mut attachments = Vec::with_capacity(files.len());
let handle = ctx
@ -43,15 +46,31 @@ pub async fn magic(
let mut analyzer = ImageAnalyzer::default();
for (i, file) in files.iter().enumerate() {
// {{{ Download files
let download_tasks = files
.iter()
.filter(|file| file.dimensions().is_some())
.map(|file| async move { (file, file.download().await) });
let downloaded = timed!("dowload_files", { join_all(download_tasks).await });
if downloaded.len() < files.len() {
ctx.reply("One or more of the attached files are not images!")
.await?;
}
// }}}
for (i, (file, bytes)) in downloaded.into_iter().enumerate() {
let bytes = bytes?;
let start = Instant::now();
if let Some(_) = file.dimensions() {
let bytes = timed!("file download", { file.download().await? });
// {{{ Prepare image
let mut image = timed!("decode image", { image::load_from_memory(&bytes)? });
let mut grayscale_image = timed!("grayscale image", {
DynamicImage::ImageLuma8(image.to_luma8())
});
// image = image.resize(1024, 1024, FilterType::Nearest);
// }}}
let result: Result<(), Error> = try {
// {{{ Detection
@ -64,7 +83,7 @@ pub async fn magic(
// edit_reply!(ctx, handle, "Image {}: reading difficulty", i + 1).await?;
// Do not use `ocr_image` because this reads the colors
let difficulty = timed!("read_difficulty", {
analyzer.read_difficulty(ctx.data(), &image, kind)?
analyzer.read_difficulty(ctx.data(), &image, &grayscale_image, kind)?
});
// edit_reply!(ctx, handle, "Image {}: reading jacket", i + 1).await?;
@ -72,40 +91,31 @@ pub async fn magic(
analyzer.read_jacket(ctx.data(), &mut image, kind, difficulty)?
});
let (note_distribution, max_recall) = match kind {
let max_recall = match kind {
ScoreKind::ScoreScreen => {
edit_reply!(ctx, handle, "Image {}: reading distribution", i + 1)
.await?;
let note_distribution =
Some(analyzer.read_distribution(ctx.data(), &grayscale_image)?);
edit_reply!(ctx, handle, "Image {}: reading max recall", i + 1).await?;
let max_recall =
Some(analyzer.read_max_recall(ctx.data(), &grayscale_image)?);
(note_distribution, max_recall)
// edit_reply!(ctx, handle, "Image {}: reading max recall", i + 1).await?;
Some(analyzer.read_max_recall(ctx.data(), &grayscale_image)?)
}
ScoreKind::SongSelect => (None, None),
ScoreKind::SongSelect => None,
};
grayscale_image.invert();
let note_distribution = match kind {
ScoreKind::ScoreScreen => {
// edit_reply!(ctx, handle, "Image {}: reading distribution", i + 1).await?;
Some(analyzer.read_distribution(ctx.data(), &grayscale_image)?)
}
ScoreKind::SongSelect => None,
};
// edit_reply!(ctx, handle, "Image {}: reading score", i + 1).await?;
let score = timed!("read_score", {
analyzer.read_score(
ctx.data(),
Some(chart.note_count),
&grayscale_image,
kind,
)?
analyzer.read_score(ctx.data(), Some(chart.note_count), &grayscale_image, kind)?
});
// {{{ Build play
let maybe_fars = Score::resolve_distibution_ambiguities(
score,
note_distribution,
chart.note_count,
);
let maybe_fars =
Score::resolve_distibution_ambiguities(score, note_distribution, chart.note_count);
let play = CreatePlay::new(score, &chart, &user)
.with_attachment(file)
@ -132,11 +142,7 @@ pub async fn magic(
.send_discord_error(ctx, &image, &file.filename, err)
.await?;
}
} else {
ctx.reply("One of the attached files is not an image!")
.await?;
continue;
}
let took = start.elapsed();
edit_reply!(
@ -156,7 +162,6 @@ pub async fn magic(
.send_files(ctx.http(), attachments, CreateMessage::new().embeds(embeds))
.await?;
}
}
Ok(())
}

View file

@ -39,7 +39,11 @@ impl UserContext {
let geosans_measurements = GEOSANS_FONT
.with_borrow_mut(|font| CharMeasurements::from_text(font, "0123456789'", None))?;
let exo_measurements = EXO_FONT.with_borrow_mut(|font| {
CharMeasurements::from_text(font, "0123456789'abcdefghijklmnopqrstuvwxyz", Some(700))
CharMeasurements::from_text(
font,
"0123456789'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ",
Some(700),
)
})?;
println!("Created user context");

View file

@ -6,10 +6,9 @@
//! allows for a convenient way to throw images into a `logs` directory with
//! a simple env var.
use std::{env, ops::Deref};
use std::{env, ops::Deref, sync::OnceLock, time::Instant};
use image::{DynamicImage, EncodableLayout, ImageBuffer, PixelWithColorType};
use poise::serenity_prelude::Timestamp;
use crate::context::Error;
@ -20,10 +19,19 @@ fn should_save_debug_images() -> bool {
.unwrap_or(false)
}
/// Returns the reference instant used as "startup time".
///
/// The instant is captured lazily: the first call stores `Instant::now()` in a
/// function-local `OnceLock`, and every later call returns a copy of that same
/// stored value. It is therefore the time of the first call, not necessarily
/// the moment the process started.
#[inline]
fn get_startup_time() -> Instant {
    static STARTED_AT: OnceLock<Instant> = OnceLock::new();

    // `Instant` is `Copy`, so hand back the cached value by dereferencing.
    let started_at = STARTED_AT.get_or_init(Instant::now);
    *started_at
}
#[inline]
pub fn debug_image_log(image: &DynamicImage) -> Result<(), Error> {
if should_save_debug_images() {
image.save(format!("./logs/{}.png", Timestamp::now()))?;
image.save(format!(
"./logs/{:0>15}.png",
get_startup_time().elapsed().as_nanos()
))?;
}
Ok(())
@ -37,7 +45,10 @@ where
C: Deref<Target = [P::Subpixel]>,
{
if should_save_debug_images() {
image.save(format!("./logs/{}.png", Timestamp::now()))?;
image.save(format!(
"./logs/{:0>15}.png",
get_startup_time().elapsed().as_nanos()
))?;
}
Ok(())

View file

@ -1,4 +1,4 @@
//! Hyperglass my own specialized OCR system, created as a result of my
//! Hyperglass is my own specialized OCR system, created as a result of my
//! annoyance with how unreliable tesseract is. Assuming we know the font,
//! OCR should be almost perfect, even when faced with strange kerning. This is
//! what this module achieves!
@ -158,8 +158,12 @@ struct ComponentsWithBounds {
}
impl ComponentsWithBounds {
fn from_image(image: &DynamicImage) -> Result<Self, Error> {
let image = threshold(&image.to_luma8(), 100, ThresholdType::Binary);
fn from_image(image: &DynamicImage, binarisation_threshold: u8) -> Result<Self, Error> {
let image = threshold(
&image.to_luma8(),
binarisation_threshold,
ThresholdType::Binary,
);
debug_image_buffer_log(&image)?;
let background = Luma([u8::MAX]);
@ -168,7 +172,7 @@ impl ComponentsWithBounds {
let mut bounds: Vec<Option<ComponentBounds>> = Vec::new();
for x in 0..components.width() {
for y in 0..components.height() {
// {{{ Retrieve pixel if it's not backround
// {{{ Retrieve pixel if it's not background
let component = components[(x, y)].0[0];
if component == 0 {
continue;
@ -254,7 +258,7 @@ impl CharMeasurements {
debug_image_log(&image)?;
let components = ComponentsWithBounds::from_image(&image)?;
let components = ComponentsWithBounds::from_image(&image, 100)?;
// {{{ Compute max width/height
let max_width = components
@ -293,9 +297,16 @@ impl CharMeasurements {
}
// }}}
// {{{ Recognition
pub fn recognise(&self, image: &DynamicImage, whitelist: &str) -> Result<String, Error> {
let components = timed!("from_image", { ComponentsWithBounds::from_image(image)? });
let mut result = String::new();
pub fn recognise(
&self,
image: &DynamicImage,
whitelist: &str,
binarisation_threshold: Option<u8>,
) -> Result<String, Error> {
let components = timed!("from_image", {
ComponentsWithBounds::from_image(image, binarisation_threshold.unwrap_or(100))?
});
let mut result = String::with_capacity(components.bounds.len());
let max_height = components
.bounds

View file

@ -1,5 +1,4 @@
use std::fmt::Display;
use std::str::FromStr;
use hypertesseract::{PageSegMode, Tesseract};
use image::imageops::FilterType;
@ -154,7 +153,7 @@ impl ImageAnalyzer {
let result = timed!("full recognition", {
Score(
measurements
.recognise(&image, "0123456789'")?
.recognise(&image, "0123456789'", None)?
.chars()
.filter(|c| *c != '\'')
.collect::<String>()
@ -182,6 +181,7 @@ impl ImageAnalyzer {
&mut self,
ctx: &UserContext,
image: &DynamicImage,
grayscale_image: &DynamicImage,
kind: ScoreKind,
) -> Result<Difficulty, Error> {
if kind == ScoreKind::SongSelect {
@ -202,10 +202,6 @@ impl ImageAnalyzer {
)
.unwrap();
// rect.width = 100;
// rect.height = 100;
// self.crop_image_to_bytes(image, rect).unwrap();
let image_color = image.get_pixel(rect.x as u32, rect.y as u32);
let image_color = Color::from_bytes(image_color.0);
@ -217,25 +213,15 @@ impl ImageAnalyzer {
return Ok(min.1);
}
let (text, conf) = Tesseract::builder()
.language(hypertesseract::Language::English)
.page_seg_mode(PageSegMode::RawLine)
.build()?
.recognize_text_cloned_with_conf(
&self
.interp_crop(ctx, image, ScoreScreen(ScoreScreenRect::Difficulty))?
.into_rgba8(),
let image = self.interp_crop(
ctx,
grayscale_image,
ScoreScreen(ScoreScreenRect::Difficulty),
)?;
let text = text.trim().to_lowercase();
if conf < 10 && conf != 0 {
return Err(format!(
"Difficulty text is not readable (confidence = {}, text = {}).",
conf, text
)
.into());
}
let text =
ctx.exo_measurements
.recognise(&image, "PASTPRESENTFUTUREETERNALBEYOND", None)?;
let difficulty = Difficulty::DIFFICULTIES
.iter()
@ -256,7 +242,7 @@ impl ImageAnalyzer {
let image = self.interp_crop(ctx, image, PlayKind)?;
let text = ctx
.exo_measurements
.recognise(&image, "resultselectasong")?;
.recognise(&image, "resultselectasong", None)?;
let result = if edit_distance(&text, "Result") < edit_distance(&text, "Select a song") {
ScoreKind::ScoreScreen
@ -356,21 +342,13 @@ impl ImageAnalyzer {
static KINDS: [ScoreScreenRect; 3] = [Pure, Far, Lost];
for i in 0..3 {
let text = Tesseract::builder()
.language(hypertesseract::Language::English)
.page_seg_mode(PageSegMode::SparseText)
.whitelist_str("0123456789")?
.assume_numeric_input()
.build()?
.recognize_text_cloned(
&self
.interp_crop(ctx, image, ScoreScreen(KINDS[i]))?
.into_rgba8(),
)?;
println!("Raw '{}'", text.trim());
out[i] = u32::from_str(&text.trim()).unwrap_or(0);
let image = self.interp_crop(ctx, image, ScoreScreen(KINDS[i]))?;
out[i] = ctx
.exo_measurements
.recognise(&image, "0123456789", Some(30))?
.parse()?;
}
println!("Ditribution {out:?}");
Ok((out[0], out[1], out[2]))
@ -382,28 +360,11 @@ impl ImageAnalyzer {
ctx: &'a UserContext,
image: &DynamicImage,
) -> Result<u32, Error> {
let (text, conf) = Tesseract::builder()
.language(hypertesseract::Language::English)
.page_seg_mode(PageSegMode::SingleLine)
.whitelist_str("0123456789")?
.assume_numeric_input()
.build()?
.recognize_text_cloned_with_conf(
&self
.interp_crop(ctx, image, ScoreScreen(ScoreScreenRect::MaxRecall))?
.into_rgba8(),
)?;
let max_recall = u32::from_str_radix(text.trim(), 10)?;
if conf < 20 && conf != 0 {
return Err(format!(
"Title text is not readable (confidence = {}, text = {}).",
conf,
text.trim()
)
.into());
}
let image = self.interp_crop(ctx, image, ScoreScreen(ScoreScreenRect::MaxRecall))?;
let max_recall = ctx
.exo_measurements
.recognise(&image, "0123456789", None)?
.parse()?;
Ok(max_recall)
}