From d448b3ba42b501bffd8c3cc61c0ec528374f8692 Mon Sep 17 00:00:00 2001 From: NA Date: Fri, 26 Jan 2024 18:55:14 +0000 Subject: [PATCH] Epub generation looks finished. Thinking of making it use CSS files from RoyalRoad. --- src/html.rs | 61 ++++++++++++++++++++++++++++++++++++++- src/http.rs | 30 ++++++++++++++++++- src/library.rs | 78 ++++++++++++++++++++++++++++++++++++++++++++------ 3 files changed, 158 insertions(+), 11 deletions(-) diff --git a/src/html.rs b/src/html.rs index 2d32be6..cb0fc08 100644 --- a/src/html.rs +++ b/src/html.rs @@ -1,7 +1,8 @@ -use std::{collections::HashMap, process::exit}; +use std::{collections::HashMap, io::Write, process::{exit, Command, Stdio}}; use regex::Regex; use scraper::{Html, Selector}; +use tempdir::TempDir; use url::Url; use crate::misc::HashMapExt; @@ -188,4 +189,62 @@ pub fn extract_urls_and_img_tag(chapter_html: &Html) -> HashMap } return chapter_image_urls; +} + +/// Replace the image tag with new one that contains the new src attribute. +pub fn replace_img_src(img_tag: String, new_src: String) -> String { + let img_tag = string_to_html_fragment(&img_tag); + + let selector = Selector::parse("img").unwrap(); + let element = img_tag.select(&selector).next().unwrap(); + + + if element.attr("src").is_some() { + let image_tag = element.html(); + + let src_match_regex = Regex::new(r#"(src=["'].*["'])"#).unwrap(); + let src_attr = src_match_regex.captures(&image_tag).unwrap().get(0).map(|m| m.as_str()).unwrap(); + + return image_tag.replace(src_attr, &format!(r#"src="{new_src}""#)); + } + else { + return element.html(); + } +} + +/// Convert a given html dom into xhtml. +pub fn html_to_xhtml(html: Html, html2xhtml_dir: &TempDir) -> String { + #[cfg(target_os = "windows")] + const HTML2XHTML_ENTRY: &str = "html2xhtml.exe"; + + #[cfg(target_os = "linux")] + const HTML2XHTML_ENTRY: &str = "html2xhtml"; + + #[cfg(target_os = "macos")] + const HTML2XHTML_ENTRY: &str = "html2xhtml"; + + // Remove nbsp, They can cause certain e-readers to crash. + let html = html.html().replace(" ", " "); + + // Start html2xhtml. + let mut html2xhtml = match Command::new(html2xhtml_dir.path().join(HTML2XHTML_ENTRY)) + .stdin(Stdio::piped()) + .stdout(Stdio::piped()) + .spawn() + { + Ok(child) => child, + Err(error) => { + eprintln!("Error! Unable to start html2xhtml: {error}"); + exit(1); + }, + }; + + // Write the html to the stdin, then wait for xhtml to be outputted to the stdout. + html2xhtml.stdin.as_mut().unwrap().write_all(html.as_bytes()).unwrap(); + let html2xhtml_output = html2xhtml.wait_with_output().unwrap(); + + // Generate a lossy string from the stdout. + let xhtml = String::from_utf8_lossy(&html2xhtml_output.stdout).to_string(); + + return xhtml; } \ No newline at end of file diff --git a/src/http.rs b/src/http.rs index e01aa87..317424d 100644 --- a/src/http.rs +++ b/src/http.rs @@ -1,4 +1,4 @@ -use std::process::exit; +use std::{collections::HashMap, process::exit}; use reqwest::{blocking::Response, header::HeaderMap}; use url::Url; @@ -36,6 +36,34 @@ impl HttpResponse { } } } + + /// Attempt to get the content(mime)-type and file extension from the http-header. + /// + /// If the content-type header value can not be found it will warn the use and return empty strings. + pub fn get_content_type_and_file_extension(&self) -> (String, String) { + // A hashmap to convert mime-types to file extensions. + let mime_to_file_extension: HashMap<&str, &str> = HashMap::from([ + ("image/png", "png"), + ("image/webp", "webp"), + ("image/jpeg", "jpeg"), + ("image/jpg", "jpg"), + ]); + + let content_type = match self.get_headers()["content-type"].to_str() { + Ok(content_type) => content_type, + Err(warning) => { + eprintln!("Warning! Unable to get content type from the http-header: {warning}"); + return (String::with_capacity(0), String::with_capacity(0)); + } + }; + + if mime_to_file_extension.contains_key(content_type) { + return (content_type.to_string(), mime_to_file_extension[content_type].to_string()); + } + else { + return (content_type.to_string(), String::with_capacity(0)); + } + } } /// Get an http response for a given url. Exits the program if it fails. diff --git a/src/library.rs b/src/library.rs index 7d3328e..eea17ea 100644 --- a/src/library.rs +++ b/src/library.rs @@ -1,9 +1,12 @@ -use std::{fs::OpenOptions, io::Write, path::PathBuf, process::exit}; +use std::{collections::HashMap, fs::OpenOptions, io::Write, path::PathBuf, process::exit}; +use bytes::Buf; use chrono::prelude::Local; use clap::Args; use epub_builder::{EpubBuilder, EpubContent, ReferenceType, ZipLibrary}; use file_system_crap::convert_path_to_os_specific; +use html::{html_to_xhtml, remove_image_tags, string_to_html_fragment}; +use indicatif::{ProgressBar, ProgressStyle}; use url::Url; mod book; @@ -80,18 +83,23 @@ pub fn generate_epub(epub_args: EpubArgs, book_url: Url, output_directory: PathB .expect("Unable to add title metadata"); // Download the cover image & add it to the epub. - let cover_image = http::get_response(book.cover_image_url).get_bytes().to_vec(); - epub_builder.add_cover_image("cover.jpeg", cover_image.as_slice(), "image/jpeg").expect("Unable to add cover image."); + let cover_image = http::get_response(book.cover_image_url); + let (cover_mime_type, cover_file_extension) = cover_image.get_content_type_and_file_extension(); + epub_builder.add_cover_image( + format!("cover.{cover_file_extension}"), + cover_image.get_bytes().to_vec().as_slice(), + cover_mime_type).expect("Error! Unable to add cover image."); // Generate the cover xhtml. let cover_xhtml = format!( r#"

{1}

- -

by: {2}

-

Archived on: {3}

"#, + +

by: {3}

+

Archived on: {4}

"#, book.book_url, book.title, + cover_file_extension, book.author, chrono::Local::now().to_rfc3339_opts(chrono::SecondsFormat::Secs, false) ); @@ -102,7 +110,7 @@ pub fn generate_epub(epub_args: EpubArgs, book_url: Url, output_directory: PathB EpubContent::new("title.xhtml", cover_xhtml.as_bytes()) .title("Cover") .reftype(ReferenceType::Cover), - ).expect("Unable to add cover"); + ).expect("Error! Unable to add cover"); // Add a table of contents after the cover page. epub_builder.inline_toc(); @@ -110,8 +118,60 @@ pub fn generate_epub(epub_args: EpubArgs, book_url: Url, output_directory: PathB // Setup html2xhtml on the operating system. let html2xhtml_dir = file_system_crap::setup_html2xhtml(); - // TODO! Generate the epub body, deal with images etc etc. You know pickup from last night etc etc. - // Finish setup_html2xhtml() first though dummy. + let mut old_tags_new_tags: HashMap = HashMap::new(); + + if !epub_args.no_images { + // Download the images and add em to the epub. + + println!("\nDownloading and processing images:"); + // Spawn a progress bar showing how many images have been downloaded & processed. + let progress_bar = ProgressBar::new(book.image_urls_and_tags.keys().len().try_into().unwrap()); + progress_bar.set_style( + ProgressStyle::with_template("[{elapsed_precise}] [{wide_bar:.cyan/blue}] {percent}% ") + .unwrap() + .progress_chars("#>-"), + ); + + let mut i: usize = 0; + for image_url in book.image_urls_and_tags.keys() { + let image = http::get_response(image_url.clone()); + let (image_mime_type, image_file_extension) = image.get_content_type_and_file_extension(); + epub_builder.add_resource( + format!("image_{i}.{image_file_extension}"), + image.get_bytes().to_vec().reader(), + image_mime_type).expect("Error! Unable to add content image"); + + for image_tag in book.image_urls_and_tags[image_url].clone() { + old_tags_new_tags.insert(image_tag.clone(), html::replace_img_src(image_tag, format!("image_{i}.{image_file_extension}"))); + } + + i+=1; + progress_bar.inc(1); + } + + progress_bar.finish(); + } + + // Convert the html to xhtml and add the xhtml to the epub for each chapter. + for (i, chapter) in book.chapters.iter().enumerate() { + + let xhtml: String; + if epub_args.no_images { + xhtml = html_to_xhtml(string_to_html_fragment(&remove_image_tags(&chapter.isolated_chapter_html)), &html2xhtml_dir) + } + else { + let mut replaced_html = chapter.isolated_chapter_html.html(); + for old_img_tag in old_tags_new_tags.keys() { + replaced_html = replaced_html.replace(&old_img_tag.clone(), &old_tags_new_tags[old_img_tag]); + } + + xhtml = html_to_xhtml(string_to_html_fragment(&replaced_html), &html2xhtml_dir); + } + + epub_builder.add_content(EpubContent::new(format!("chapter_{}.xhtml", i+1), xhtml.as_bytes()) + .title(chapter.chapter_name.clone()) + .reftype(ReferenceType::Text)).expect("Error! Unable to add chapter"); + } // Generate the finished epub data as a byte vector. let mut finished_epub: Vec = vec![];