royal_road_archiver/src/book.rs

use std::collections::HashMap;

use indicatif::{ProgressBar, ProgressStyle};
use crate::misc::HashMapExt;
use scraper::Html;
use url::Url;

use crate::{file_system_crap::remove_illegal_chars, html, http};

/// A book scraped from RoyalRoad, together with everything needed to generate output from it.
///
/// Instances are built by `Book::new`, which downloads the index page and every chapter
/// listed on it.
pub struct Book {
    /// The url that was passed to `Book::new`; the index page is downloaded from it.
    pub book_url: Url,

    /// The book's title, as extracted from the index page.
    pub title: String,

    /// The title in a form intended for use in file names.
    /// Produced by passing `title` through `file_system_crap::remove_illegal_chars`.
    pub file_name_title: String,

    /// The book's author, as extracted from the index page.
    pub author: String,
    
    /// Url of the book's cover image, parsed from the cover url string found on the index page.
    pub cover_image_url: Url,

    /// The parsed html document of the index page downloaded from `book_url`.
    index_html: Html,

    /// The book's chapters, in the order the index page lists them.
    pub chapters: Vec<Chapter>,

    /// Image urls found in the isolated chapter html, each mapped to its img html tags.
    /// Accumulated across all chapters while the book is being built.
    pub image_urls_and_tags: HashMap<Url, Vec<String>>,
}

impl Book {
    /// Build a [`Book`] by downloading the index page at `book_url` and then every chapter
    /// that the index lists.
    ///
    /// A heading is printed to stdout and a progress bar advances once per processed chapter.
    /// Images referenced by each chapter's isolated html are collected along the way.
    ///
    /// # Panics
    ///
    /// Panics if the chapter count cannot be converted to the progress bar's length type, or
    /// if the progress bar template is rejected (both results are `unwrap`ped). Each index
    /// entry is also indexed at positions `0` (name) and `1` (url).
    pub fn new(book_url: Url) -> Book {
        let index_html = html::string_to_html_document(
            &http::get_response(book_url.clone()).get_text(),
        );

        let chapter_names_and_urls = html::get_chapter_names_and_urls_from_index(&index_html);
        let chapter_count = chapter_names_and_urls.len();

        let mut chapters: Vec<Chapter> = Vec::with_capacity(chapter_count);
        let mut image_urls_and_tags: HashMap<Url, Vec<String>> = HashMap::new();

        println!("\nDownloading and processing chapters:");

        // One tick of the bar corresponds to one chapter downloaded & processed.
        let bar_style =
            ProgressStyle::with_template("[{elapsed_precise}] [{wide_bar:.cyan/blue}] {percent}%  ")
                .unwrap()
                .progress_chars("#>-");
        let progress_bar = ProgressBar::new(chapter_count.try_into().unwrap());
        progress_bar.set_style(bar_style);

        // Each index entry holds the chapter name at [0] and the chapter url at [1].
        for name_and_url in chapter_names_and_urls.iter() {
            let chapter = Chapter::new(&name_and_url[0], &name_and_url[1]);

            // Fold this chapter's image urls & tags into the book-wide map.
            let chapter_images = html::extract_urls_and_img_tag(&chapter.isolated_chapter_html);
            image_urls_and_tags = image_urls_and_tags.join(chapter_images);

            chapters.push(chapter);
            progress_bar.inc(1);
        }

        progress_bar.finish();

        // Remaining metadata all comes from the index page.
        let title = html::get_title_from_index(&index_html);
        let file_name_title = remove_illegal_chars(title.clone());
        let author = html::get_author_from_index(&index_html);
        let cover_image_url =
            http::string_to_url(&html::get_cover_image_url_from_index(&index_html));

        Book {
            book_url,
            title,
            file_name_title,
            author,
            cover_image_url,
            index_html,
            chapters,
            image_urls_and_tags,
        }
    }

    /// Report the number of paragraphs in the book.
    ///
    /// Not implemented yet: currently always returns `0`.
    pub fn count_paragraphs(&self) -> u128 {
        // TODO!
        0
    }
}

/// A single chapter of a book, as downloaded by `Chapter::new`.
pub struct Chapter {
    /// The url the chapter page was downloaded from.
    chapter_url: Url,
    
    /// The name of the chapter, copied from the name given to `Chapter::new`.
    pub chapter_name: String,
    
    /// The parsed html document of the whole chapter page.
    raw_chapter_html: Html,

    /// The chapter content isolated from `raw_chapter_html`
    /// via `html::isolate_chapter_content`.
    pub isolated_chapter_html: Html,
}

impl Chapter {
    /// Download the chapter page at `chapter_url` and build a [`Chapter`] named `chapter_name`.
    ///
    /// Both the full page and the isolated chapter content are kept on the returned value.
    fn new(chapter_name: &str, chapter_url: &str) -> Self {
        let url = http::string_to_url(chapter_url);

        // Fetch the page and parse it into a document.
        let raw_chapter_html = html::string_to_html_document(
            &http::get_response(url.clone()).get_text(),
        );

        // Computed up front so the raw document can then be moved into the struct.
        let isolated_chapter_html = html::isolate_chapter_content(&raw_chapter_html);

        Self {
            chapter_url: url,
            chapter_name: chapter_name.to_owned(),
            raw_chapter_html,
            isolated_chapter_html,
        }
    }
}
Removed rustls from reqwest since it breaks windows support 2024-01-25 11:16:03 -06:00			`use std::collections::HashMap;`

Markdown generation is complete. Time for the hard shit. 2024-01-25 10:13:09 -06:00			`use indicatif::{ProgressBar, ProgressStyle};`
Working on epub generation. Got to work on setup_html2xhtml() next. 2024-01-25 21:17:50 -06:00			`use crate::misc::HashMapExt;`
Bundled both windows and linux builds of html2xhtml and just generally made a bunch of work 2024-01-25 08:49:55 -06:00			`use scraper::Html;`
			`use url::Url;`

Working on epub generation. Got to work on setup_html2xhtml() next. 2024-01-25 21:17:50 -06:00			`use crate::{file_system_crap::remove_illegal_chars, html, http};`
Bundled both windows and linux builds of html2xhtml and just generally made a bunch of work 2024-01-25 08:49:55 -06:00
			`/// A struct representing a book & all the needed data to generate one.`
			`pub struct Book {`
			`/// The RoyalRoad Url for the book.`
Working on epub generation. Got to work on setup_html2xhtml() next. 2024-01-25 21:17:50 -06:00			`pub book_url: Url,`
Bundled both windows and linux builds of html2xhtml and just generally made a bunch of work 2024-01-25 08:49:55 -06:00
			`/// The book's title.`
Markdown generation is complete. Time for the hard shit. 2024-01-25 10:13:09 -06:00			`pub title: String,`
Bundled both windows and linux builds of html2xhtml and just generally made a bunch of work 2024-01-25 08:49:55 -06:00
Working on epub generation. Got to work on setup_html2xhtml() next. 2024-01-25 21:17:50 -06:00			`/// Book title used for the filename.`
			`/// Should have illegal chars expunged via file_system_crap::remove_illegal_chars.`
			`pub file_name_title: String,`

Bundled both windows and linux builds of html2xhtml and just generally made a bunch of work 2024-01-25 08:49:55 -06:00			`/// The book's author.`
Markdown generation is complete. Time for the hard shit. 2024-01-25 10:13:09 -06:00			`pub author: String,`
Bundled both windows and linux builds of html2xhtml and just generally made a bunch of work 2024-01-25 08:49:55 -06:00
			`/// A Url to the book's cover image.`
Working on epub generation. Got to work on setup_html2xhtml() next. 2024-01-25 21:17:50 -06:00			`pub cover_image_url: Url,`
Bundled both windows and linux builds of html2xhtml and just generally made a bunch of work 2024-01-25 08:49:55 -06:00
			`/// The raw html data of the RoyalRoad index page.`
			`index_html: Html,`

			`/// A vector of the book's chapters.`
Markdown generation is complete. Time for the hard shit. 2024-01-25 10:13:09 -06:00			`pub chapters: Vec<Chapter>,`
Removed rustls from reqwest since it breaks windows support 2024-01-25 11:16:03 -06:00
			`/// A hashmap representing the book image urls and their corresponding img html tags.`
Working on epub generation. Got to work on setup_html2xhtml() next. 2024-01-25 21:17:50 -06:00			`pub image_urls_and_tags: HashMap<Url, Vec<String>>,`
Bundled both windows and linux builds of html2xhtml and just generally made a bunch of work 2024-01-25 08:49:55 -06:00			`}`

			`impl Book {`
			`/// Generate a new book instance with all the needed data from a given url.`
			`pub fn new(book_url: Url) -> Book {`
			`let index_html = html::string_to_html_document(&http::get_response(book_url.clone()).get_text());`

			`let chapter_names_and_urls = html::get_chapter_names_and_urls_from_index(&index_html);`
			`let mut chapters: Vec<Chapter> = Vec::with_capacity(chapter_names_and_urls.len());`

Working on epub generation. Got to work on setup_html2xhtml() next. 2024-01-25 21:17:50 -06:00			`let mut image_urls_and_tags: HashMap<Url, Vec<String>> = HashMap::new();`
Removed rustls from reqwest since it breaks windows support 2024-01-25 11:16:03 -06:00
Markdown generation is complete. Time for the hard shit. 2024-01-25 10:13:09 -06:00			`println!("\nDownloading and processing chapters:");`
			`// Spawn a progress bar showing how many chapters have been downloaded & processed.`
			`let progress_bar = ProgressBar::new(chapter_names_and_urls.len().try_into().unwrap());`
			`progress_bar.set_style(`
			`ProgressStyle::with_template("[{elapsed_precise}] [{wide_bar:.cyan/blue}] {percent}% ")`
			`.unwrap()`
			`.progress_chars("#>-"),`
			`);`

			`// Generate the chapters and add em to the book.`
Bundled both windows and linux builds of html2xhtml and just generally made a bunch of work 2024-01-25 08:49:55 -06:00			`for i in 0..chapter_names_and_urls.len() {`
			`let chapter = Chapter::new(&chapter_names_and_urls[i][0], &chapter_names_and_urls[i][1]);`
Removed rustls from reqwest since it breaks windows support 2024-01-25 11:16:03 -06:00
Working on epub generation. Got to work on setup_html2xhtml() next. 2024-01-25 21:17:50 -06:00			`// extract the image urls and add em to the image_urls_and_tags hashmap.`
			`image_urls_and_tags = image_urls_and_tags.join(html::extract_urls_and_img_tag(&chapter.isolated_chapter_html));`
Removed rustls from reqwest since it breaks windows support 2024-01-25 11:16:03 -06:00
Bundled both windows and linux builds of html2xhtml and just generally made a bunch of work 2024-01-25 08:49:55 -06:00			`chapters.push(chapter);`
Markdown generation is complete. Time for the hard shit. 2024-01-25 10:13:09 -06:00
			`progress_bar.inc(1);`
Bundled both windows and linux builds of html2xhtml and just generally made a bunch of work 2024-01-25 08:49:55 -06:00			`}`

Markdown generation is complete. Time for the hard shit. 2024-01-25 10:13:09 -06:00			`progress_bar.finish();`

Working on epub generation. Got to work on setup_html2xhtml() next. 2024-01-25 21:17:50 -06:00			`let title = html::get_title_from_index(&index_html);`

Bundled both windows and linux builds of html2xhtml and just generally made a bunch of work 2024-01-25 08:49:55 -06:00			`Book {`
			`book_url: book_url,`
Working on epub generation. Got to work on setup_html2xhtml() next. 2024-01-25 21:17:50 -06:00			`title: title.clone(),`
			`file_name_title: remove_illegal_chars(title),`
Bundled both windows and linux builds of html2xhtml and just generally made a bunch of work 2024-01-25 08:49:55 -06:00			`author: html::get_author_from_index(&index_html),`
			`cover_image_url: http::string_to_url(&html::get_cover_image_url_from_index(&index_html)),`
			`index_html: index_html,`
			`chapters: chapters,`
Working on epub generation. Got to work on setup_html2xhtml() next. 2024-01-25 21:17:50 -06:00			`image_urls_and_tags,`
Bundled both windows and linux builds of html2xhtml and just generally made a bunch of work 2024-01-25 08:49:55 -06:00			`}`
			`}`

			`/// Count how many paragraphs are in the book.`
			`pub fn count_paragraphs(&self) -> u128 {`
			`// TODO!`
			`0`
			`}`
			`}`

			`/// A struct representing a chapter.`
Markdown generation is complete. Time for the hard shit. 2024-01-25 10:13:09 -06:00			`pub struct Chapter {`
Bundled both windows and linux builds of html2xhtml and just generally made a bunch of work 2024-01-25 08:49:55 -06:00			`/// The Url of the chapter.`
			`chapter_url: Url,`

			`/// The name of the chapter.`
Markdown generation is complete. Time for the hard shit. 2024-01-25 10:13:09 -06:00			`pub chapter_name: String,`
Bundled both windows and linux builds of html2xhtml and just generally made a bunch of work 2024-01-25 08:49:55 -06:00
Working on epub generation. Got to work on setup_html2xhtml() next. 2024-01-25 21:17:50 -06:00			`/// The raw html data of the chapter page.`
Bundled both windows and linux builds of html2xhtml and just generally made a bunch of work 2024-01-25 08:49:55 -06:00			`raw_chapter_html: Html,`

			`/// The isolated chapter html.`
Markdown generation is complete. Time for the hard shit. 2024-01-25 10:13:09 -06:00			`pub isolated_chapter_html: Html,`
Bundled both windows and linux builds of html2xhtml and just generally made a bunch of work 2024-01-25 08:49:55 -06:00			`}`

			`impl Chapter {`
			`fn new(chapter_name: &str, chapter_url: &str) -> Self {`
			`let chapter_url = http::string_to_url(&chapter_url);`
			`let raw_chapter_html = html::string_to_html_document(&http::get_response(chapter_url.clone()).get_text());`

Markdown generation is complete. Time for the hard shit. 2024-01-25 10:13:09 -06:00			`Chapter {`
Bundled both windows and linux builds of html2xhtml and just generally made a bunch of work 2024-01-25 08:49:55 -06:00			`chapter_url: chapter_url,`
			`chapter_name: chapter_name.to_string(),`
Working on epub generation. Got to work on setup_html2xhtml() next. 2024-01-25 21:17:50 -06:00			`isolated_chapter_html: html::isolate_chapter_content(&raw_chapter_html),`
			`raw_chapter_html: raw_chapter_html,`
Bundled both windows and linux builds of html2xhtml and just generally made a bunch of work 2024-01-25 08:49:55 -06:00			`}`
			`}`
			`}`