diff --git a/Cargo.lock b/Cargo.lock index 59a7e4e..9801447 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,10 +2,172 @@ # It is not intended for manual editing. version = 3 +[[package]] +name = "ahash" +version = "0.8.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e89da841a80418a9b391ebaea17f5c112ffaaa96f621d2c285b5174da76b9011" +dependencies = [ + "cfg-if", + "once_cell", + "version_check", + "zerocopy", +] + +[[package]] +name = "arbitrary" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7d5a26814d8dcb93b0e5a0ff3c6d80a8843bafb21b39e8e18a6f05471870e110" + +[[package]] +name = "bumpalo" +version = "3.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c" + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "cranelift-bforest" +version = "0.111.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b80c3a50b9c4c7e5b5f73c0ed746687774fc9e36ef652b110da8daebf0c6e0e6" +dependencies = [ + "cranelift-entity", +] + +[[package]] +name = "cranelift-bitset" +version = "0.111.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38778758c2ca918b05acb2199134e0c561fb577c50574259b26190b6c2d95ded" + +[[package]] +name = "cranelift-codegen" +version = "0.111.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "58258667ad10e468bfc13a8d620f50dfcd4bb35d668123e97defa2549b9ad397" +dependencies = [ + "bumpalo", + "cranelift-bforest", + "cranelift-bitset", + "cranelift-codegen-meta", + "cranelift-codegen-shared", + "cranelift-control", + "cranelift-entity", + "cranelift-isle", + "gimli", + "hashbrown 0.14.5", + "log", + "regalloc2 0.9.3", + "rustc-hash 1.1.0", + "smallvec", + "target-lexicon", +] + +[[package]] +name = "cranelift-codegen-meta" +version = "0.111.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "043f0b702e529dcb07ff92bd7d40e7d5317b5493595172c5eb0983343751ee06" +dependencies = [ + "cranelift-codegen-shared", +] + +[[package]] +name = "cranelift-codegen-shared" +version = "0.111.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7763578888ab53eca5ce7da141953f828e82c2bfadcffc106d10d1866094ffbb" + +[[package]] +name = "cranelift-control" +version = "0.111.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32db15f08c05df570f11e8ab33cb1ec449a64b37c8a3498377b77650bef33d8b" +dependencies = [ + "arbitrary", +] + +[[package]] +name = "cranelift-entity" +version = "0.111.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5289cdb399381a27e7bbfa1b42185916007c3d49aeef70b1d01cb4caa8010130" +dependencies = [ + "cranelift-bitset", +] + +[[package]] +name = "cranelift-isle" +version = "0.111.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b72a3c5c166a70426dcb209bdd0bb71a787c1ea76023dc0974fbabca770e8f9" + +[[package]] +name = "equivalent" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" + +[[package]] +name = "fallible-iterator" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"2acce4a10f12dc2fb14a218589d4f1f62ef011b2d0cc4b3cb1bba8e94da14649" + +[[package]] +name = "gimli" +version = "0.29.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "40ecd4077b5ae9fd2e9e169b102c6c330d0605168eb0e8bf79952b256dbefffd" +dependencies = [ + "fallible-iterator", + "indexmap", + "stable_deref_trait", +] + +[[package]] +name = "hashbrown" +version = "0.13.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43a3c133739dddd0d2990f9a4bdf8eb4b21ef50e4851ca85ab661199821d510e" +dependencies = [ + "ahash", +] + +[[package]] +name = "hashbrown" +version = "0.14.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" +dependencies = [ + "ahash", +] + [[package]] name = "hbbytecode" version = "0.1.0" +[[package]] +name = "hbcb" +version = "0.1.0" +dependencies = [ + "cranelift-codegen", + "cranelift-codegen-meta", + "cranelift-control", + "cranelift-isle", + "log", + "regalloc2 0.10.2", + "smallvec", + "target-lexicon", +] + [[package]] name = "hbjit" version = "0.1.0" @@ -32,12 +194,28 @@ dependencies = [ "memmap2", ] +[[package]] +name = "indexmap" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68b900aa2f7301e21c36462b170ee99994de34dff39a4a6a528e80e7376d07e5" +dependencies = [ + "equivalent", + "hashbrown 0.14.5", +] + [[package]] name = "libc" version = "0.2.158" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d8adc4bb1803a324070e64a98ae98f38934d91957a99cfb3a43dcbc01bc56439" +[[package]] +name = "log" +version = "0.4.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24" + [[package]] name = "memmap2" version = "0.9.5" @@ -47,6 +225,135 @@ dependencies = [ "libc", ] +[[package]] +name = "once_cell" +version = "1.19.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" + +[[package]] +name = "proc-macro2" +version = "1.0.86" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e719e8df665df0d1c8fbfd238015744736151d4445ec0836b8e628aae103b77" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.37" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "regalloc2" +version = "0.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ad156d539c879b7a24a363a2016d77961786e71f48f2e2fc8302a92abd2429a6" +dependencies = [ + "hashbrown 0.13.2", + "log", + "rustc-hash 1.1.0", + "slice-group-by", + "smallvec", +] + +[[package]] +name = "regalloc2" +version = "0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "12908dbeb234370af84d0579b9f68258a0f67e201412dd9a2814e6f45b2fc0f0" +dependencies = [ + "hashbrown 0.14.5", + "log", + "rustc-hash 2.0.0", + "slice-group-by", + "smallvec", +] + +[[package]] +name = "rustc-hash" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" + +[[package]] +name = "rustc-hash" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"583034fd73374156e66797ed8e5b0d5690409c9226b22d87cb7f19821c05d152" + +[[package]] +name = "slice-group-by" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "826167069c09b99d56f31e9ae5c99049e932a98c9dc2dac47645b08dbbf76ba7" + +[[package]] +name = "smallvec" +version = "1.13.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67" + +[[package]] +name = "stable_deref_trait" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" + +[[package]] +name = "syn" +version = "2.0.77" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9f35bcdf61fd8e7be6caf75f429fdca8beb3ed76584befb503b1569faee373ed" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "target-lexicon" +version = "0.12.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "61c41af27dd6d1e27b1b16b489db798443478cef1f06a660c96db617ba5de3b1" + +[[package]] +name = "unicode-ident" +version = "1.0.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e91b56cd4cadaeb79bbf1a5645f6b4f8dc5bde8834ad5894a8db35fda9efa1fe" + +[[package]] +name = "version_check" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" + [[package]] name = "xtask" version = "0.1.0" + +[[package]] +name = "zerocopy" +version = "0.7.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0" +dependencies = [ + "zerocopy-derive", +] + +[[package]] +name = "zerocopy-derive" +version = "0.7.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] diff --git a/Cargo.toml b/Cargo.toml index acb9024..fbc1d6d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [workspace] resolver = "2" -members = ["hbbytecode", "hbvm", "hbxrt", "xtask", "hblang", "hbjit"] +members = ["hbbytecode", "hbvm", "hbxrt", "xtask", "hblang", "hbjit", "hbcb"] [profile.release] strip = true diff --git a/hbcb/Cargo.toml b/hbcb/Cargo.toml new file mode 100644 index 0000000..799a0a9 --- /dev/null +++ b/hbcb/Cargo.toml @@ -0,0 +1,22 @@ +[package] +name = "hbcb" +version = "0.1.0" +edition = "2021" + +[dependencies] +cranelift-codegen = "0.111.0" +cranelift-control = "0.111.0" +log = "0.4.22" +regalloc2 = "0.10.2" +smallvec = "1.13.2" +target-lexicon = "0.12.16" + +[features] +default = ["isle-errors"] +unwind = [] +isle-in-source-tree = [] +isle-errors = [] + +[build-dependencies] +cranelift-codegen-meta = "0.111.0" +cranelift-isle = "0.111.0" diff --git a/hbcb/build.rs b/hbcb/build.rs new file mode 100644 index 0000000..67c63fb --- /dev/null +++ b/hbcb/build.rs @@ -0,0 +1,310 @@ +// Build script. +// +// This program is run by Cargo when building cranelift-codegen. It is used to generate Rust code from +// the language definitions in the cranelift-codegen/meta directory. +// +// Environment: +// +// OUT_DIR +// Directory where generated files should be placed. +// +// TARGET +// Target triple provided by Cargo. 
+// +// The build script expects to be run from the directory where this build.rs file lives. The +// current directory is used to find the sources. + +use { + cranelift_codegen_meta::{self as meta, isle::IsleCompilations}, + cranelift_isle::error::Errors, + meta::isle::IsleCompilation, + std::{env, io::Read, process, time::Instant}, +}; + +fn main() { + let start_time = Instant::now(); + + let out_dir = env::var("OUT_DIR").expect("The OUT_DIR environment variable must be set"); + let out_dir = std::path::Path::new(&out_dir); + //let target_triple = env::var("TARGET").expect("The TARGET environment variable must be set"); + + //let all_arch = env::var("CARGO_FEATURE_ALL_ARCH").is_ok(); + //let all_native_arch = env::var("CARGO_FEATURE_ALL_NATIVE_ARCH").is_ok(); + + let isas = &[meta::isa::Isa::Riscv64]; + + // let mut isas = meta::isa::Isa::all() + // .iter() + // .cloned() + // .filter(|isa| { + // let env_key = format!("CARGO_FEATURE_{}", isa.to_string().to_uppercase()); + // all_arch || env::var(env_key).is_ok() + // }) + // .collect::>(); + + // Don't require host isa if under 'all-arch' feature. + //let host_isa = env::var("CARGO_FEATURE_HOST_ARCH").is_ok() && !all_native_arch; + + //if isas.is_empty() || host_isa { + // // Try to match native target. + // let target_name = target_triple.split('-').next().unwrap(); + // let isa = meta::isa_from_arch(target_name).expect("error when identifying target"); + // println!("cargo:rustc-cfg=feature=\"{isa}\""); + // isas.push(isa); + //} + + let cur_dir = env::current_dir().expect("Can't access current working directory"); + let crate_dir = cur_dir.as_path(); + + println!("cargo:rerun-if-changed=build.rs"); + + let explicit_isle_dir = &crate_dir.join("isle_generated_code"); + #[cfg(feature = "isle-in-source-tree")] + let isle_dir = explicit_isle_dir; + #[cfg(not(feature = "isle-in-source-tree"))] + let isle_dir = &out_dir; + + #[cfg(feature = "isle-in-source-tree")] + { + std::fs::create_dir_all(isle_dir).expect("Could not create ISLE source directory"); + } + #[cfg(not(feature = "isle-in-source-tree"))] + { + if explicit_isle_dir.is_dir() { + eprintln!(concat!( + "Error: directory isle_generated_code/ exists but is only used when\n", + "`--feature isle-in-source-tree` is specified. To prevent confusion,\n", + "this build script requires the directory to be removed when reverting\n", + "to the usual generated code in target/. 
Please delete the directory and\n", + "re-run this build.\n", + )); + std::process::exit(1); + } + } + + if let Err(err) = meta::generate(isas, out_dir, isle_dir) { + eprintln!("Error: {err}"); + process::exit(1); + } + + if &std::env::var("SKIP_ISLE").unwrap_or("0".to_string()) != "1" { + if let Err(err) = build_isle(crate_dir, isle_dir) { + eprintln!("Error: {err}"); + process::exit(1); + } + } + + if env::var("CRANELIFT_VERBOSE").is_ok() { + for isa in isas { + println!("cargo:warning=Includes support for {} ISA", isa); + } + println!("cargo:warning=Build step took {:?}.", Instant::now() - start_time); + println!("cargo:warning=Generated files are in {}", out_dir.display()); + } + + let pkg_version = env::var("CARGO_PKG_VERSION").unwrap(); + let mut cmd = std::process::Command::new("git"); + cmd.arg("rev-parse") + .arg("HEAD") + .stdout(std::process::Stdio::piped()) + .current_dir(env::var("CARGO_MANIFEST_DIR").unwrap()); + let version = if let Ok(mut child) = cmd.spawn() { + let mut git_rev = String::new(); + child.stdout.as_mut().unwrap().read_to_string(&mut git_rev).unwrap(); + let status = child.wait().unwrap(); + if status.success() { + let git_rev = git_rev.trim().chars().take(9).collect::(); + format!("{pkg_version}-{git_rev}") + } else { + // not a git repo + pkg_version + } + } else { + // git not available + pkg_version + }; + std::fs::write( + std::path::Path::new(&out_dir).join("version.rs"), + format!( + "/// Version number of this crate. \n\ + pub const VERSION: &str = \"{version}\";" + ), + ) + .unwrap(); +} + +/// Strip the current directory from the file paths, because `islec` +/// includes them in the generated source, and this helps us maintain +/// deterministic builds that don't include those local file paths. +fn make_isle_source_path_relative( + cur_dir: &std::path::Path, + filename: &std::path::Path, +) -> std::path::PathBuf { + if let Ok(suffix) = filename.strip_prefix(cur_dir) { + suffix.to_path_buf() + } else { + filename.to_path_buf() + } +} + +fn build_isle( + crate_dir: &std::path::Path, + isle_dir: &std::path::Path, +) -> Result<(), Box> { + let cur_dir = std::env::current_dir()?; + let codegen_crate_dir = &make_isle_source_path_relative(&cur_dir, crate_dir); + let gen_dir = &make_isle_source_path_relative(&cur_dir, isle_dir); + + // Preludes. + let clif_lower_isle = gen_dir.join("clif_lower.isle"); + //let clif_opt_isle = gen_dir.join("clif_opt.isle"); + let prelude_isle = codegen_crate_dir.join("src").join("prelude.isle"); + //let prelude_opt_isle = codegen_crate_dir.join("src").join("prelude_opt.isle"); + let prelude_lower_isle = codegen_crate_dir.join("src").join("prelude_lower.isle"); + + // Directory for mid-end optimizations. + //let src_opts = codegen_crate_dir.join("src").join("opts"); + + let src_isa_risc_v = codegen_crate_dir.join("src"); + + // This is a set of ISLE compilation units. + // + // The format of each entry is: + // + // (output Rust code file, input ISLE source files) + // + // There should be one entry for each backend that uses ISLE for lowering, + // and if/when we replace our peephole optimization passes with ISLE, there + // should be an entry for each of those as well. + // + // N.B.: add any new compilation outputs to + // `scripts/force-rebuild-isle.sh` if they do not fit the pattern + // `cranelift/codegen/src/isa/*/lower/isle/generated_code.rs`! + let isle_compilations = IsleCompilations { + items: vec![ + // // The mid-end optimization rules. 
+ // IsleCompilation { + // output: gen_dir.join("isle_opt.rs"), + // inputs: vec![ + // prelude_isle.clone(), + // prelude_opt_isle, + // src_opts.join("arithmetic.isle"), + // src_opts.join("bitops.isle"), + // src_opts.join("cprop.isle"), + // src_opts.join("extends.isle"), + // src_opts.join("icmp.isle"), + // src_opts.join("remat.isle"), + // src_opts.join("selects.isle"), + // src_opts.join("shifts.isle"), + // src_opts.join("spaceship.isle"), + // src_opts.join("spectre.isle"), + // src_opts.join("vector.isle"), + // ], + // untracked_inputs: vec![clif_opt_isle], + // }, + // The risc-v instruction selector. + IsleCompilation { + output: gen_dir.join("isle_riscv64.rs"), + inputs: vec![ + prelude_isle.clone(), + prelude_lower_isle.clone(), + src_isa_risc_v.join("inst.isle"), + src_isa_risc_v.join("inst_vector.isle"), + src_isa_risc_v.join("lower.isle"), + ], + untracked_inputs: vec![clif_lower_isle.clone()], + }, + ], + }; + + let mut had_error = false; + for compilation in &isle_compilations.items { + for file in &compilation.inputs { + println!("cargo:rerun-if-changed={}", file.display()); + } + + if let Err(e) = run_compilation(compilation) { + had_error = true; + eprintln!("Error building ISLE files:"); + eprintln!("{e:?}"); + #[cfg(not(feature = "isle-errors"))] + { + eprintln!("To see a more detailed error report, run: "); + eprintln!(); + eprintln!(" $ cargo check -p cranelift-codegen --features isle-errors"); + eprintln!(); + } + } + } + + if had_error { + std::process::exit(1); + } + + println!("cargo:rustc-env=ISLE_DIR={}", isle_dir.to_str().unwrap()); + + Ok(()) +} + +/// Build ISLE DSL source text into generated Rust code. +/// +/// NB: This must happen *after* the `cranelift-codegen-meta` functions, since +/// it consumes files generated by them. +fn run_compilation(compilation: &IsleCompilation) -> Result<(), Errors> { + use cranelift_isle as isle; + + eprintln!("Rebuilding {}", compilation.output.display()); + + let code = { + let file_paths = compilation.inputs.iter().chain(compilation.untracked_inputs.iter()); + + let options = isle::codegen::CodegenOptions { + // Because we include!() the generated ISLE source, we cannot + // put the global pragmas (`#![allow(...)]`) in the ISLE + // source itself; we have to put them in the source that + // include!()s it. (See + // https://github.com/rust-lang/rust/issues/47995.) + exclude_global_allow_pragmas: true, + }; + + isle::compile::from_files(file_paths, &options)? 
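+        // `from_files` runs the ISLE compiler over the listed prelude and backend rule
+        // files and returns the generated Rust source as a `String`; the `?` propagates
+        // any `Errors` to the caller.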
+ }; + + let code = rustfmt(&code).unwrap_or_else(|e| { + println!("cargo:warning=Failed to run `rustfmt` on ISLE-generated code: {e:?}"); + code + }); + + eprintln!("Writing ISLE-generated Rust code to {}", compilation.output.display()); + std::fs::write(&compilation.output, code) + .map_err(|e| Errors::from_io(e, "failed writing output"))?; + + Ok(()) +} + +fn rustfmt(code: &str) -> std::io::Result { + use std::io::Write; + + let mut rustfmt = std::process::Command::new("rustfmt") + .stdin(std::process::Stdio::piped()) + .stdout(std::process::Stdio::piped()) + .spawn()?; + + let mut stdin = rustfmt.stdin.take().unwrap(); + stdin.write_all(code.as_bytes())?; + drop(stdin); + + let mut stdout = rustfmt.stdout.take().unwrap(); + let mut data = vec![]; + stdout.read_to_end(&mut data)?; + + let status = rustfmt.wait()?; + if !status.success() { + return Err(std::io::Error::new( + std::io::ErrorKind::Other, + format!("`rustfmt` exited with status {status}"), + )); + } + + Ok(String::from_utf8(data).expect("rustfmt always writs utf-8 to stdout")) +} diff --git a/hbcb/src/abi.rs b/hbcb/src/abi.rs new file mode 100644 index 0000000..fb8fc26 --- /dev/null +++ b/hbcb/src/abi.rs @@ -0,0 +1,900 @@ +//! Implementation of a standard Riscv64 ABI. + +use { + alloc::{boxed::Box, vec::Vec}, + cranelift_codegen::{ + inst::*, + ir::{self, types::*, LibCall, Signature}, + isa::{self, unwind::UnwindInst, CallConv}, + machinst::*, + settings::{self, Flags as RiscvFlags}, + CodegenError, CodegenResult, + }, + regalloc2::{MachineEnv, PReg, PRegSet}, + smallvec::{smallvec, SmallVec}, + std::sync::OnceLock, +}; + +/// Support for the Riscv64 ABI from the callee side (within a function body). +pub(crate) type Riscv64Callee = Callee; + +/// Support for the Riscv64 ABI from the caller side (at a callsite). +pub(crate) type Riscv64ABICallSite = CallSite; + +/// This is the limit for the size of argument and return-value areas on the +/// stack. We place a reasonable limit here to avoid integer overflow issues +/// with 32-bit arithmetic: for now, 128 MB. +static STACK_ARG_RET_SIZE_LIMIT: u32 = 128 * 1024 * 1024; + +/// Riscv64-specific ABI behavior. This struct just serves as an implementation +/// point for the trait; it is never actually instantiated. +pub struct Riscv64MachineDeps; + +impl IsaFlags for RiscvFlags {} + +impl RiscvFlags { + pub(crate) fn min_vec_reg_size(&self) -> u64 { + let entries = [ + (self.has_zvl65536b(), 65536), + (self.has_zvl32768b(), 32768), + (self.has_zvl16384b(), 16384), + (self.has_zvl8192b(), 8192), + (self.has_zvl4096b(), 4096), + (self.has_zvl2048b(), 2048), + (self.has_zvl1024b(), 1024), + (self.has_zvl512b(), 512), + (self.has_zvl256b(), 256), + // In order to claim the Application Profile V extension, a minimum + // register size of 128 is required. i.e. V implies Zvl128b. + (self.has_v(), 128), + (self.has_zvl128b(), 128), + (self.has_zvl64b(), 64), + (self.has_zvl32b(), 32), + ]; + + for (has_flag, size) in entries.into_iter() { + if !has_flag { + continue; + } + + // Due to a limitation in regalloc2, we can't support types + // larger than 1024 bytes. So limit that here. + return std::cmp::min(size, 1024); + } + + return 0; + } +} + +impl ABIMachineSpec for Riscv64MachineDeps { + type F = RiscvFlags; + type I = Inst; + + fn word_bits() -> u32 { + 64 + } + + /// Return required stack alignment in bytes. 
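+    ///
+    /// The standard RISC-V psABI keeps the stack pointer 16-byte aligned at
+    /// procedure-call boundaries, so 16 is returned regardless of calling convention.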
+ fn stack_align(_call_conv: isa::CallConv) -> u32 { + 16 + } + + fn compute_arg_locs( + call_conv: isa::CallConv, + _flags: &settings::Flags, + params: &[ir::AbiParam], + args_or_rets: ArgsOrRets, + add_ret_area_ptr: bool, + mut args: ArgsAccumulator, + ) -> CodegenResult<(u32, Option)> { + assert_ne!( + call_conv, + isa::CallConv::Winch, + "riscv64 does not support the 'winch' calling convention yet" + ); + + // All registers that can be used as parameters or rets. + // both start and end are included. + let (x_start, x_end, f_start, f_end) = match args_or_rets { + ArgsOrRets::Args => (10, 17, 10, 17), + ArgsOrRets::Rets => (10, 11, 10, 11), + }; + let mut next_x_reg = x_start; + let mut next_f_reg = f_start; + // Stack space. + let mut next_stack: u32 = 0; + + for param in params { + if let ir::ArgumentPurpose::StructArgument(_) = param.purpose { + panic!( + "StructArgument parameters are not supported on riscv64. \ + Use regular pointer arguments instead." + ); + } + + // Find regclass(es) of the register(s) used to store a value of this type. + let (rcs, reg_tys) = Inst::rc_for_type(param.value_type)?; + let mut slots = ABIArgSlotVec::new(); + for (rc, reg_ty) in rcs.iter().zip(reg_tys.iter()) { + let next_reg = if (next_x_reg <= x_end) && *rc == RegClass::Int { + let x = Some(x_reg(next_x_reg)); + next_x_reg += 1; + x + } else if (next_f_reg <= f_end) && *rc == RegClass::Float { + let x = Some(f_reg(next_f_reg)); + next_f_reg += 1; + x + } else { + None + }; + if let Some(reg) = next_reg { + slots.push(ABIArgSlot::Reg { + reg: reg.to_real_reg().unwrap(), + ty: *reg_ty, + extension: param.extension, + }); + } else { + // Compute size and 16-byte stack alignment happens + // separately after all args. + let size = reg_ty.bits() / 8; + let size = std::cmp::max(size, 8); + // Align. + debug_assert!(size.is_power_of_two()); + next_stack = align_to(next_stack, size); + slots.push(ABIArgSlot::Stack { + offset: next_stack as i64, + ty: *reg_ty, + extension: param.extension, + }); + next_stack += size; + } + } + args.push(ABIArg::Slots { slots, purpose: param.purpose }); + } + let pos: Option = if add_ret_area_ptr { + assert!(ArgsOrRets::Args == args_or_rets); + if next_x_reg <= x_end { + let arg = ABIArg::reg( + x_reg(next_x_reg).to_real_reg().unwrap(), + I64, + ir::ArgumentExtension::None, + ir::ArgumentPurpose::Normal, + ); + args.push_non_formal(arg); + } else { + let arg = ABIArg::stack( + next_stack as i64, + I64, + ir::ArgumentExtension::None, + ir::ArgumentPurpose::Normal, + ); + args.push_non_formal(arg); + next_stack += 8; + } + Some(args.args().len() - 1) + } else { + None + }; + + next_stack = align_to(next_stack, Self::stack_align(call_conv)); + + // To avoid overflow issues, limit the arg/return size to something + // reasonable -- here, 128 MB. 
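+        // (`STACK_ARG_RET_SIZE_LIMIT` is the 128 MB constant defined near the top of
+        // this file; larger areas would risk overflowing 32-bit offset arithmetic.)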
+ if next_stack > STACK_ARG_RET_SIZE_LIMIT { + return Err(CodegenError::ImplLimitExceeded); + } + + Ok((next_stack, pos)) + } + + fn gen_load_stack(mem: StackAMode, into_reg: Writable, ty: Type) -> Inst { + Inst::gen_load(into_reg, mem.into(), ty, MemFlags::trusted()) + } + + fn gen_store_stack(mem: StackAMode, from_reg: Reg, ty: Type) -> Inst { + Inst::gen_store(mem.into(), from_reg, ty, MemFlags::trusted()) + } + + fn gen_move(to_reg: Writable, from_reg: Reg, ty: Type) -> Inst { + Inst::gen_move(to_reg, from_reg, ty) + } + + fn gen_extend( + to_reg: Writable, + from_reg: Reg, + signed: bool, + from_bits: u8, + to_bits: u8, + ) -> Inst { + assert!(from_bits < to_bits); + Inst::Extend { rd: to_reg, rn: from_reg, signed, from_bits, to_bits } + } + + fn get_ext_mode( + _call_conv: isa::CallConv, + specified: ir::ArgumentExtension, + ) -> ir::ArgumentExtension { + specified + } + + fn gen_args(args: Vec) -> Inst { + Inst::Args { args } + } + + fn gen_rets(rets: Vec) -> Inst { + Inst::Rets { rets } + } + + fn get_stacklimit_reg(_call_conv: isa::CallConv) -> Reg { + spilltmp_reg() + } + + fn gen_add_imm( + _call_conv: isa::CallConv, + into_reg: Writable, + from_reg: Reg, + imm: u32, + ) -> SmallInstVec { + let mut insts = SmallInstVec::new(); + if let Some(imm12) = Imm12::maybe_from_u64(imm as u64) { + insts.push(Inst::AluRRImm12 { + alu_op: AluOPRRI::Addi, + rd: into_reg, + rs: from_reg, + imm12, + }); + } else { + insts.extend(Inst::load_constant_u32(writable_spilltmp_reg2(), imm as u64)); + insts.push(Inst::AluRRR { + alu_op: AluOPRRR::Add, + rd: into_reg, + rs1: spilltmp_reg2(), + rs2: from_reg, + }); + } + insts + } + + fn gen_stack_lower_bound_trap(limit_reg: Reg) -> SmallInstVec { + let mut insts = SmallVec::new(); + insts.push(Inst::TrapIf { + cc: IntCC::UnsignedLessThan, + rs1: stack_reg(), + rs2: limit_reg, + trap_code: ir::TrapCode::StackOverflow, + }); + insts + } + + fn gen_get_stack_addr(mem: StackAMode, into_reg: Writable) -> Inst { + Inst::LoadAddr { rd: into_reg, mem: mem.into() } + } + + fn gen_load_base_offset(into_reg: Writable, base: Reg, offset: i32, ty: Type) -> Inst { + let mem = AMode::RegOffset(base, offset as i64); + Inst::gen_load(into_reg, mem, ty, MemFlags::trusted()) + } + + fn gen_store_base_offset(base: Reg, offset: i32, from_reg: Reg, ty: Type) -> Inst { + let mem = AMode::RegOffset(base, offset as i64); + Inst::gen_store(mem, from_reg, ty, MemFlags::trusted()) + } + + fn gen_sp_reg_adjust(amount: i32) -> SmallInstVec { + let mut insts = SmallVec::new(); + + if amount == 0 { + return insts; + } + + if let Some(imm) = Imm12::maybe_from_i64(amount as i64) { + insts.push(Inst::AluRRImm12 { + alu_op: AluOPRRI::Addi, + rd: writable_stack_reg(), + rs: stack_reg(), + imm12: imm, + }) + } else { + let tmp = writable_spilltmp_reg(); + insts.extend(Inst::load_constant_u64(tmp, amount as i64 as u64)); + insts.push(Inst::AluRRR { + alu_op: AluOPRRR::Add, + rd: writable_stack_reg(), + rs1: stack_reg(), + rs2: tmp.to_reg(), + }); + } + + insts + } + + fn gen_prologue_frame_setup( + _call_conv: isa::CallConv, + flags: &settings::Flags, + _isa_flags: &RiscvFlags, + frame_layout: &FrameLayout, + ) -> SmallInstVec { + let mut insts = SmallVec::new(); + + if frame_layout.setup_area_size > 0 { + // add sp,sp,-16 ;; alloc stack space for fp. + // sd ra,8(sp) ;; save ra. + // sd fp,0(sp) ;; store old fp. + // mv fp,sp ;; set fp to sp. 
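+            //
+            // Since -16 fits in a 12-bit immediate, `gen_sp_reg_adjust(-16)` below
+            // expands to a single `addi sp, sp, -16`, matching the sketch above.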
+ insts.extend(Self::gen_sp_reg_adjust(-16)); + insts.push(Inst::gen_store(AMode::SPOffset(8), link_reg(), I64, MemFlags::trusted())); + insts.push(Inst::gen_store(AMode::SPOffset(0), fp_reg(), I64, MemFlags::trusted())); + + if flags.unwind_info() { + insts.push(Inst::Unwind { + inst: UnwindInst::PushFrameRegs { + offset_upward_to_caller_sp: frame_layout.setup_area_size, + }, + }); + } + insts.push(Inst::Mov { rd: writable_fp_reg(), rm: stack_reg(), ty: I64 }); + } + + insts + } + + /// reverse of gen_prologue_frame_setup. + fn gen_epilogue_frame_restore( + call_conv: isa::CallConv, + _flags: &settings::Flags, + _isa_flags: &RiscvFlags, + frame_layout: &FrameLayout, + ) -> SmallInstVec { + let mut insts = SmallVec::new(); + + if frame_layout.setup_area_size > 0 { + insts.push(Inst::gen_load( + writable_link_reg(), + AMode::SPOffset(8), + I64, + MemFlags::trusted(), + )); + insts.push(Inst::gen_load( + writable_fp_reg(), + AMode::SPOffset(0), + I64, + MemFlags::trusted(), + )); + insts.extend(Self::gen_sp_reg_adjust(16)); + } + + if call_conv == isa::CallConv::Tail && frame_layout.tail_args_size > 0 { + insts.extend(Self::gen_sp_reg_adjust(frame_layout.tail_args_size.try_into().unwrap())); + } + + insts + } + + fn gen_return( + _call_conv: isa::CallConv, + _isa_flags: &RiscvFlags, + _frame_layout: &FrameLayout, + ) -> SmallInstVec { + smallvec![Inst::Ret {}] + } + + fn gen_probestack(insts: &mut SmallInstVec, frame_size: u32) { + insts.extend(Inst::load_constant_u32(writable_a0(), frame_size as u64)); + let mut info = + CallInfo::empty(ExternalName::LibCall(LibCall::Probestack), CallConv::SystemV); + info.uses.push(CallArgPair { vreg: a0(), preg: a0() }); + insts.push(Inst::Call { info: Box::new(info) }); + } + + fn gen_clobber_save( + _call_conv: isa::CallConv, + flags: &settings::Flags, + frame_layout: &FrameLayout, + ) -> SmallVec<[Inst; 16]> { + let mut insts = SmallVec::new(); + let setup_frame = frame_layout.setup_area_size > 0; + + let incoming_args_diff = frame_layout.tail_args_size - frame_layout.incoming_args_size; + if incoming_args_diff > 0 { + // Decrement SP by the amount of additional incoming argument space we need + insts.extend(Self::gen_sp_reg_adjust(-(incoming_args_diff as i32))); + + if setup_frame { + // Write the lr position on the stack again, as it hasn't changed since it was + // pushed in `gen_prologue_frame_setup` + insts.push(Inst::gen_store( + AMode::SPOffset(8), + link_reg(), + I64, + MemFlags::trusted(), + )); + insts.push(Inst::gen_load( + writable_fp_reg(), + AMode::SPOffset(i64::from(incoming_args_diff)), + I64, + MemFlags::trusted(), + )); + insts.push(Inst::gen_store(AMode::SPOffset(0), fp_reg(), I64, MemFlags::trusted())); + + // Finally, sync the frame pointer with SP + insts.push(Inst::gen_move(writable_fp_reg(), stack_reg(), I64)); + } + } + + if flags.unwind_info() && setup_frame { + // The *unwind* frame (but not the actual frame) starts at the + // clobbers, just below the saved FP/LR pair. + insts.push(Inst::Unwind { + inst: UnwindInst::DefineNewFrame { + offset_downward_to_clobbers: frame_layout.clobber_size, + offset_upward_to_caller_sp: frame_layout.setup_area_size, + }, + }); + } + + // Adjust the stack pointer downward for clobbers, the function fixed + // frame (spillslots and storage slots), and outgoing arguments. 
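+        //
+        // Rough picture of the region allocated here, from higher to lower addresses
+        // (the FP/RA pair saved by the prologue sits just above it):
+        //
+        //   clobbered callee-saves   (clobber_size)
+        //   spill/storage slots      (fixed_frame_storage_size)
+        //   outgoing arguments       (outgoing_args_size)   <- SP after the adjustment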
+ let stack_size = frame_layout.clobber_size + + frame_layout.fixed_frame_storage_size + + frame_layout.outgoing_args_size; + + // Store each clobbered register in order at offsets from SP, + // placing them above the fixed frame slots. + if stack_size > 0 { + insts.extend(Self::gen_sp_reg_adjust(-(stack_size as i32))); + + let mut cur_offset = 8; + for reg in &frame_layout.clobbered_callee_saves { + let r_reg = reg.to_reg(); + let ty = match r_reg.class() { + RegClass::Int => I64, + RegClass::Float => F64, + RegClass::Vector => unimplemented!("Vector Clobber Saves"), + }; + insts.push(Inst::gen_store( + AMode::SPOffset((stack_size - cur_offset) as i64), + Reg::from(reg.to_reg()), + ty, + MemFlags::trusted(), + )); + + if flags.unwind_info() { + insts.push(Inst::Unwind { + inst: UnwindInst::SaveReg { + clobber_offset: frame_layout.clobber_size - cur_offset, + reg: r_reg, + }, + }); + } + + cur_offset += 8 + } + } + insts + } + + fn gen_clobber_restore( + _call_conv: isa::CallConv, + _flags: &settings::Flags, + frame_layout: &FrameLayout, + ) -> SmallVec<[Inst; 16]> { + let mut insts = SmallVec::new(); + + let stack_size = frame_layout.clobber_size + + frame_layout.fixed_frame_storage_size + + frame_layout.outgoing_args_size; + + let mut cur_offset = 8; + for reg in &frame_layout.clobbered_callee_saves { + let rreg = reg.to_reg(); + let ty = match rreg.class() { + RegClass::Int => I64, + RegClass::Float => F64, + RegClass::Vector => unimplemented!("Vector Clobber Restores"), + }; + insts.push(Inst::gen_load( + reg.map(Reg::from), + AMode::SPOffset(i64::from(stack_size - cur_offset)), + ty, + MemFlags::trusted(), + )); + cur_offset += 8 + } + + if stack_size > 0 { + insts.extend(Self::gen_sp_reg_adjust(stack_size as i32)); + } + + insts + } + + fn gen_call(dest: &CallDest, tmp: Writable, info: CallInfo<()>) -> SmallVec<[Self::I; 2]> { + let mut insts = SmallVec::new(); + match &dest { + &CallDest::ExtName(ref name, RelocDistance::Near) => { + let info = Box::new(info.map(|()| name.clone())); + insts.push(Inst::Call { info }) + } + &CallDest::ExtName(ref name, RelocDistance::Far) => { + insts.push(Inst::LoadExtName { rd: tmp, name: Box::new(name.clone()), offset: 0 }); + let info = Box::new(info.map(|()| tmp.to_reg())); + insts.push(Inst::CallInd { info }); + } + &CallDest::Reg(reg) => { + let info = Box::new(info.map(|()| *reg)); + insts.push(Inst::CallInd { info }); + } + } + insts + } + + fn gen_memcpy Writable>( + call_conv: isa::CallConv, + dst: Reg, + src: Reg, + size: usize, + mut alloc_tmp: F, + ) -> SmallVec<[Self::I; 8]> { + let mut insts = SmallVec::new(); + let arg0 = Writable::from_reg(x_reg(10)); + let arg1 = Writable::from_reg(x_reg(11)); + let arg2 = Writable::from_reg(x_reg(12)); + let tmp = alloc_tmp(Self::word_type()); + insts.extend(Inst::load_constant_u64(tmp, size as u64).into_iter()); + insts.push(Inst::Call { + info: Box::new(CallInfo { + dest: ExternalName::LibCall(LibCall::Memcpy), + uses: smallvec![ + CallArgPair { vreg: dst, preg: arg0.to_reg() }, + CallArgPair { vreg: src, preg: arg1.to_reg() }, + CallArgPair { vreg: tmp.to_reg(), preg: arg2.to_reg() } + ], + defs: smallvec![], + clobbers: Self::get_regs_clobbered_by_call(call_conv), + caller_conv: call_conv, + callee_conv: call_conv, + callee_pop_size: 0, + }), + }); + insts + } + + fn get_number_of_spillslots_for_value( + rc: RegClass, + _target_vector_bytes: u32, + isa_flags: &RiscvFlags, + ) -> u32 { + // We allocate in terms of 8-byte slots. 
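+        // Int and Float values each fit in a single 8-byte slot; vector spill space
+        // scales with the configured minimum vector register width (see
+        // `min_vec_reg_size` above).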
+ match rc { + RegClass::Int => 1, + RegClass::Float => 1, + RegClass::Vector => (isa_flags.min_vec_reg_size() / 8) as u32, + } + } + + fn get_machine_env(_flags: &settings::Flags, _call_conv: isa::CallConv) -> &MachineEnv { + static MACHINE_ENV: OnceLock = OnceLock::new(); + MACHINE_ENV.get_or_init(create_reg_enviroment) + } + + fn get_regs_clobbered_by_call(_call_conv_of_callee: isa::CallConv) -> PRegSet { + DEFAULT_CLOBBERS + } + + fn compute_frame_layout( + _call_conv: isa::CallConv, + flags: &settings::Flags, + _sig: &Signature, + regs: &[Writable], + is_leaf: bool, + incoming_args_size: u32, + tail_args_size: u32, + fixed_frame_storage_size: u32, + outgoing_args_size: u32, + ) -> FrameLayout { + let mut regs: Vec> = regs + .iter() + .cloned() + .filter(|r| DEFAULT_CALLEE_SAVES.contains(r.to_reg().into())) + .collect(); + + regs.sort_unstable(); + + // Compute clobber size. + let clobber_size = compute_clobber_size(®s); + + // Compute linkage frame size. + let setup_area_size = if flags.preserve_frame_pointers() + || !is_leaf + // The function arguments that are passed on the stack are addressed + // relative to the Frame Pointer. + || incoming_args_size > 0 + || clobber_size > 0 + || fixed_frame_storage_size > 0 + { + 16 // FP, LR + } else { + 0 + }; + + // Return FrameLayout structure. + FrameLayout { + incoming_args_size, + tail_args_size, + setup_area_size, + clobber_size, + fixed_frame_storage_size, + outgoing_args_size, + clobbered_callee_saves: regs, + } + } + + fn gen_inline_probestack( + insts: &mut SmallInstVec, + _call_conv: isa::CallConv, + frame_size: u32, + guard_size: u32, + ) { + // Unroll at most n consecutive probes, before falling back to using a loop + const PROBE_MAX_UNROLL: u32 = 3; + // Number of probes that we need to perform + let probe_count = align_to(frame_size, guard_size) / guard_size; + + // Must be a caller-saved register that is not an argument. + let tmp = Writable::from_reg(x_reg(28)); // t3 + + if probe_count <= PROBE_MAX_UNROLL { + Self::gen_probestack_unroll(insts, tmp, guard_size, probe_count) + } else { + insts.push(Inst::StackProbeLoop { guard_size, probe_count, tmp }); + } + } +} + +impl Riscv64ABICallSite { + pub fn emit_return_call(mut self, ctx: &mut Lower, args: isle::ValueSlice) { + let new_stack_arg_size = + u32::try_from(self.sig(ctx.sigs()).sized_stack_arg_space()).unwrap(); + + ctx.abi_mut().accumulate_tail_args_size(new_stack_arg_size); + + // Put all arguments in registers and stack slots (within that newly + // allocated stack space). + self.emit_args(ctx, args); + self.emit_stack_ret_arg_for_tail_call(ctx); + + let dest = self.dest().clone(); + let uses = self.take_uses(); + + match dest { + CallDest::ExtName(name, RelocDistance::Near) => { + let info = Box::new(ReturnCallInfo { dest: name, uses, new_stack_arg_size }); + ctx.emit(Inst::ReturnCall { info }); + } + CallDest::ExtName(name, RelocDistance::Far) => { + let callee = ctx.alloc_tmp(ir::types::I64).only_reg().unwrap(); + ctx.emit(Inst::LoadExtName { rd: callee, name: Box::new(name), offset: 0 }); + let info = + Box::new(ReturnCallInfo { dest: callee.to_reg(), uses, new_stack_arg_size }); + ctx.emit(Inst::ReturnCallInd { info }); + } + CallDest::Reg(callee) => { + let info = Box::new(ReturnCallInfo { dest: callee, uses, new_stack_arg_size }); + ctx.emit(Inst::ReturnCallInd { info }); + } + } + } +} + +// NOTE: no V regs are callee save. 
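+//
+// (`px_reg(n)` / `pf_reg(n)` name physical register n of the given class, so the X
+// entries below are sp (x2), fp/s0 (x8), s1 (x9) and s2-s11 (x18-x27).)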
+const DEFAULT_CALLEE_SAVES: PRegSet = PRegSet::empty() + // X Regs + .with(px_reg(2)) + .with(px_reg(8)) + .with(px_reg(9)) + .with(px_reg(18)) + .with(px_reg(19)) + .with(px_reg(20)) + .with(px_reg(21)) + .with(px_reg(22)) + .with(px_reg(23)) + .with(px_reg(24)) + .with(px_reg(25)) + .with(px_reg(26)) + .with(px_reg(27)) + // F Regs + .with(pf_reg(8)) + .with(pf_reg(18)) + .with(pf_reg(19)) + .with(pf_reg(20)) + .with(pf_reg(21)) + .with(pf_reg(22)) + .with(pf_reg(23)) + .with(pf_reg(24)) + .with(pf_reg(25)) + .with(pf_reg(26)) + .with(pf_reg(27)); + +fn compute_clobber_size(clobbers: &[Writable]) -> u32 { + let mut clobbered_size = 0; + for reg in clobbers { + match reg.to_reg().class() { + RegClass::Int => { + clobbered_size += 8; + } + RegClass::Float => { + clobbered_size += 8; + } + RegClass::Vector => unimplemented!("Vector Size Clobbered"), + } + } + align_to(clobbered_size, 16) +} + +const DEFAULT_CLOBBERS: PRegSet = PRegSet::empty() + .with(px_reg(1)) + .with(px_reg(5)) + .with(px_reg(6)) + .with(px_reg(7)) + .with(px_reg(10)) + .with(px_reg(11)) + .with(px_reg(12)) + .with(px_reg(13)) + .with(px_reg(14)) + .with(px_reg(15)) + .with(px_reg(16)) + .with(px_reg(17)) + .with(px_reg(28)) + .with(px_reg(29)) + .with(px_reg(30)) + .with(px_reg(31)) + // F Regs + .with(pf_reg(0)) + .with(pf_reg(1)) + .with(pf_reg(2)) + .with(pf_reg(3)) + .with(pf_reg(4)) + .with(pf_reg(5)) + .with(pf_reg(6)) + .with(pf_reg(7)) + .with(pf_reg(9)) + .with(pf_reg(10)) + .with(pf_reg(11)) + .with(pf_reg(12)) + .with(pf_reg(13)) + .with(pf_reg(14)) + .with(pf_reg(15)) + .with(pf_reg(16)) + .with(pf_reg(17)) + .with(pf_reg(28)) + .with(pf_reg(29)) + .with(pf_reg(30)) + .with(pf_reg(31)) + // V Regs - All vector regs get clobbered + .with(pv_reg(0)) + .with(pv_reg(1)) + .with(pv_reg(2)) + .with(pv_reg(3)) + .with(pv_reg(4)) + .with(pv_reg(5)) + .with(pv_reg(6)) + .with(pv_reg(7)) + .with(pv_reg(8)) + .with(pv_reg(9)) + .with(pv_reg(10)) + .with(pv_reg(11)) + .with(pv_reg(12)) + .with(pv_reg(13)) + .with(pv_reg(14)) + .with(pv_reg(15)) + .with(pv_reg(16)) + .with(pv_reg(17)) + .with(pv_reg(18)) + .with(pv_reg(19)) + .with(pv_reg(20)) + .with(pv_reg(21)) + .with(pv_reg(22)) + .with(pv_reg(23)) + .with(pv_reg(24)) + .with(pv_reg(25)) + .with(pv_reg(26)) + .with(pv_reg(27)) + .with(pv_reg(28)) + .with(pv_reg(29)) + .with(pv_reg(30)) + .with(pv_reg(31)); + +fn create_reg_enviroment() -> MachineEnv { + // Some C Extension instructions can only use a subset of the registers. + // x8 - x15, f8 - f15, v8 - v15 so we should prefer to use those since + // they allow us to emit C instructions more often. + // + // In general the order of preference is: + // 1. Compressible Caller Saved registers. + // 2. Non-Compressible Caller Saved registers. + // 3. Compressible Callee Saved registers. + // 4. Non-Compressible Callee Saved registers. + + let preferred_regs_by_class: [Vec; 3] = { + let x_registers: Vec = (10..=15).map(px_reg).collect(); + let f_registers: Vec = (10..=15).map(pf_reg).collect(); + let v_registers: Vec = (8..=15).map(pv_reg).collect(); + + [x_registers, f_registers, v_registers] + }; + + let non_preferred_regs_by_class: [Vec; 3] = { + // x0 - x4 are special registers, so we don't want to use them. + // Omit x30 and x31 since they are the spilltmp registers. + + // Start with the Non-Compressible Caller Saved registers. + let x_registers: Vec = (5..=7) + .chain(16..=17) + .chain(28..=29) + // The first Callee Saved register is x9 since its Compressible + // Omit x8 since it's the frame pointer. 
+ .chain(9..=9) + // The rest of the Callee Saved registers are Non-Compressible + .chain(18..=27) + .map(px_reg) + .collect(); + + // Prefer Caller Saved registers. + let f_registers: Vec = (0..=7) + .chain(16..=17) + .chain(28..=31) + // Once those are exhausted, we should prefer f8 and f9 since they are + // callee saved, but compressible. + .chain(8..=9) + .chain(18..=27) + .map(pf_reg) + .collect(); + + let v_registers = (0..=7).chain(16..=31).map(pv_reg).collect(); + + [x_registers, f_registers, v_registers] + }; + + MachineEnv { + preferred_regs_by_class, + non_preferred_regs_by_class, + fixed_stack_slots: vec![], + scratch_by_class: [None, None, None], + } +} + +impl Riscv64MachineDeps { + fn gen_probestack_unroll( + insts: &mut SmallInstVec, + tmp: Writable, + guard_size: u32, + probe_count: u32, + ) { + // When manually unrolling adjust the stack pointer and then write a zero + // to the stack at that offset. + // + // We do this because valgrind expects us to never write beyond the stack + // pointer and associated redzone. + // See: https://github.com/bytecodealliance/wasmtime/issues/7454 + + // Store the adjust amount in a register upfront, so we don't have to + // reload it for each probe. It's worth loading this as a negative and + // using an `add` instruction since we have compressed versions of `add` + // but not the `sub` instruction. + insts.extend(Inst::load_constant_u64(tmp, (-(guard_size as i64)) as u64)); + + for _ in 0..probe_count { + insts.push(Inst::AluRRR { + alu_op: AluOPRRR::Add, + rd: writable_stack_reg(), + rs1: stack_reg(), + rs2: tmp.to_reg(), + }); + + insts.push(Inst::gen_store(AMode::SPOffset(0), zero_reg(), I32, MemFlags::trusted())); + } + + // Restore the stack pointer to its original value + insts.extend(Self::gen_sp_reg_adjust((guard_size * probe_count) as i32)); + } +} diff --git a/hbcb/src/inst.isle b/hbcb/src/inst.isle new file mode 100644 index 0000000..f6e4570 --- /dev/null +++ b/hbcb/src/inst.isle @@ -0,0 +1,3128 @@ +;; Instruction formats. +(type MInst + (enum + ;; A no-op of zero size. + (Nop0) + (Nop4) + + ;; load immediate + (Lui + (rd WritableReg) + (imm Imm20)) + + (LoadInlineConst + (rd WritableReg) + (ty Type) + (imm u64)) + + (Auipc + (rd WritableReg) + (imm Imm20)) + + (Fli + (ty Type) + (imm FliConstant) + (rd WritableReg)) + + ;; An ALU operation with one register sources and a register destination. + (FpuRR + (alu_op FpuOPRR) + (width FpuOPWidth) + (frm FRM) + (rd WritableReg) + (rs Reg)) + + + ;; An ALU operation with two register sources and a register destination. + (AluRRR + (alu_op AluOPRRR) + (rd WritableReg) + (rs1 Reg) + (rs2 Reg)) + + ;; An ALU operation with two register sources and a register destination. + (FpuRRR + (alu_op FpuOPRRR) + (width FpuOPWidth) + (frm FRM) + (rd WritableReg) + (rs1 Reg) + (rs2 Reg)) + + ;; An ALU operation with three register sources and a register destination. + (FpuRRRR + (alu_op FpuOPRRRR) + (width FpuOPWidth) + (frm FRM) + (rd WritableReg) + (rs1 Reg) + (rs2 Reg) + (rs3 Reg)) + + ;; An ALU operation with a register source and an immediate-12 source, and a register + ;; destination. + (AluRRImm12 + (alu_op AluOPRRI) + (rd WritableReg) + (rs Reg) + (imm12 Imm12)) + + ;; A CSR Reading or Writing instruction with a register source and a register destination. + (CsrReg + (op CsrRegOP) + (rd WritableReg) + (rs Reg) + (csr CSR)) + + ;; A CSR Writing instruction with an immediate source and a register destination. 
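+    ;; For example, with `op = CsrRWI` and `csr = Frm` this corresponds to a
+    ;; `csrrwi rd, frm, imm`, writing the 5-bit immediate to the rounding-mode CSR
+    ;; and reading the old value into `rd`.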
+ (CsrImm + (op CsrImmOP) + (rd WritableReg) + (imm UImm5) + (csr CSR)) + + ;; An load + (Load + (rd WritableReg) + (op LoadOP) + (flags MemFlags) + (from AMode)) + ;; An Store + (Store + (to AMode) + (op StoreOP) + (flags MemFlags) + (src Reg)) + + ;; A pseudo-instruction that captures register arguments in vregs. + (Args + (args VecArgPair)) + + ;; A pseudo-instruction that moves vregs to return registers. + (Rets + (rets VecRetPair)) + + (Ret) + + (Extend + (rd WritableReg) + (rn Reg) + (signed bool) + (from_bits u8) + (to_bits u8)) + + (Call (info BoxCallInfo)) + + ;; A machine indirect-call instruction. + (CallInd (info BoxCallIndInfo)) + + ;; A direct return-call macro instruction. + (ReturnCall (info BoxReturnCallInfo)) + + ;; An indirect return-call macro instruction. + (ReturnCallInd (info BoxReturnCallIndInfo)) + + ;; Emits a trap with the given trap code if the comparison succeeds + (TrapIf + (rs1 Reg) + (rs2 Reg) + (cc IntCC) + (trap_code TrapCode)) + + (Jal + ;; (rd WritableReg) don't use + (label MachLabel)) + + (CondBr + (taken CondBrTarget) + (not_taken CondBrTarget) + (kind IntegerCompare)) + + ;; Load an inline symbol reference. + (LoadExtName + (rd WritableReg) + (name BoxExternalName) + (offset i64)) + + ;; Load a TLS symbol address + (ElfTlsGetAddr + (rd WritableReg) + (name BoxExternalName)) + + ;; Load address referenced by `mem` into `rd`. + (LoadAddr + (rd WritableReg) + (mem AMode)) + + ;; A MOV instruction. These are encoded as OrR's (AluRRR form) but we + ;; keep them separate at the `Inst` level for better pretty-printing + ;; and faster `is_move()` logic. + (Mov + (rd WritableReg) + (rm Reg) + (ty Type)) + + ;; A MOV instruction, but where the source register is a non-allocatable + ;; PReg. It's important that the register be non-allocatable, as regalloc2 + ;; will not see it as used. + (MovFromPReg + (rd WritableReg) + (rm PReg)) + + (Fence + (pred FenceReq) + (succ FenceReq)) + + (EBreak) + + ;; An instruction guaranteed to always be undefined and to trigger an illegal instruction at + ;; runtime. + (Udf + (trap_code TrapCode)) + ;; a jump and link register operation + (Jalr + ;;Plain unconditional jumps (assembler pseudo-op J) are encoded as a JAL with rd=x0. + (rd WritableReg) + (base Reg) + (offset Imm12)) + + ;; atomic operations. + (Atomic + (op AtomicOP) + (rd WritableReg) + (addr Reg) + (src Reg) + (amo AMO)) + ;; an atomic store + (AtomicStore + (src Reg) + (ty Type) + (p Reg)) + ;; an atomic load. + (AtomicLoad + (rd WritableReg) + (ty Type) + (p Reg)) + + ;; an atomic nand need using loop to implement. + (AtomicRmwLoop + (offset Reg) + (op AtomicRmwOp) + (dst WritableReg) + (ty Type) + (p Reg) + (x Reg) + (t0 WritableReg)) + + ;; select x or y base on condition + (Select + (dst WritableValueRegs) + (condition IntegerCompare) + (x ValueRegs) + (y ValueRegs)) + + (BrTable + (index Reg) + (tmp1 WritableReg) + (tmp2 WritableReg) + (targets VecMachLabel)) + + ;; atomic compare and set operation + (AtomicCas + (offset Reg) + (t0 WritableReg) + (dst WritableReg) + (e Reg) + (addr Reg) + (v Reg) + (ty Type)) + + (RawData (data VecU8)) + + ;; An unwind pseudo-instruction. + (Unwind + (inst UnwindInst)) + + ;; A dummy use, useful to keep a value alive. + (DummyUse + (reg Reg)) + + ;; popcnt if target doesn't support extension B + ;; use iteration to implement. + (Popcnt + (sum WritableReg) + (step WritableReg) + (tmp WritableReg) + (rs Reg) + (ty Type)) + + ;;; counting leading or trailing zeros. + (Cltz + ;; leading or trailing. 
+ (leading bool) + (sum WritableReg) + (step WritableReg) + (tmp WritableReg) + (rs Reg) + (ty Type)) + + (Brev8 + (rs Reg) + (ty Type) + (step WritableReg) + (tmp WritableReg) + (tmp2 WritableReg) + (rd WritableReg)) + (StackProbeLoop + (guard_size u32) + (probe_count u32) + (tmp WritableReg)) + + (VecAluRRRR + (op VecAluOpRRRR) + (vd WritableReg) + (vd_src Reg) + (vs2 Reg) + (vs1 Reg) + (mask VecOpMasking) + (vstate VState)) + + (VecAluRRRImm5 + (op VecAluOpRRRImm5) + (vd WritableReg) + (vd_src Reg) + (vs2 Reg) + (imm Imm5) + (mask VecOpMasking) + (vstate VState)) + + (VecAluRRR + (op VecAluOpRRR) + (vd WritableReg) + (vs2 Reg) + (vs1 Reg) + (mask VecOpMasking) + (vstate VState)) + + (VecAluRRImm5 + (op VecAluOpRRImm5) + (vd WritableReg) + (vs2 Reg) + (imm Imm5) + (mask VecOpMasking) + (vstate VState)) + + (VecAluRR + (op VecAluOpRR) + (vd WritableReg) + (vs Reg) + (mask VecOpMasking) + (vstate VState)) + + (VecAluRImm5 + (op VecAluOpRImm5) + (vd WritableReg) + (imm Imm5) + (mask VecOpMasking) + (vstate VState)) + + (VecSetState + (rd WritableReg) + (vstate VState)) + + (VecLoad + (eew VecElementWidth) + (to WritableReg) + (from VecAMode) + (flags MemFlags) + (mask VecOpMasking) + (vstate VState)) + + (VecStore + (eew VecElementWidth) + (to VecAMode) + (from Reg) + (flags MemFlags) + (mask VecOpMasking) + (vstate VState)) +)) + +(type AtomicOP (enum + (LrW) + (ScW) + (AmoswapW) + (AmoaddW) + (AmoxorW) + (AmoandW) + (AmoorW) + (AmominW) + (AmomaxW) + (AmominuW) + (AmomaxuW) + (LrD) + (ScD) + (AmoswapD) + (AmoaddD) + (AmoxorD) + (AmoandD) + (AmoorD) + (AmominD) + (AmomaxD) + (AmominuD) + (AmomaxuD) +)) + +(type FpuOPRRRR (enum + (Fmadd) + (Fmsub) + (Fnmsub) + (Fnmadd) +)) + +(type FClassResult (enum + ;;0 rs1 is −∞. + (NegInfinite) + ;; 1 rs1 is a negative normal number. + (NegNormal) + ;; 2 rs1 is a negative subnormal number. + (NegSubNormal) + ;; 3 rs1 is −0. + (NegZero) + ;; 4 rs1 is +0. + (PosZero) + ;; 5 rs1 is a positive subnormal number. + (PosSubNormal) + ;; 6 rs1 is a positive normal number. + (PosNormal) + ;; 7 rs1 is +∞. + (PosInfinite) + ;; 8 rs1 is a signaling NaN. + (SNaN) + ;; 9 rs1 is a quiet NaN. 
+ (QNaN) +)) + +(type FliConstant (primitive FliConstant)) + +(type FpuOPWidth (enum + (S) + (D) + (H) + (Q) +)) + +(decl pure fpu_op_width_from_ty (Type) FpuOPWidth) +(extern constructor fpu_op_width_from_ty fpu_op_width_from_ty) +(convert Type FpuOPWidth fpu_op_width_from_ty) + +(type FpuOPRR (enum + (Fsqrt) ;; fsqrt.{fmt} + (Fclass) ;; fclass.{fmt} + (FcvtWFmt) ;; fcvt.w.{fmt} + (FcvtWuFmt) ;; fcvt.wu.{fmt} + (FcvtLFmt) ;; fcvt.l.{fmt} + (FcvtLuFmt) ;; fcvt.lu.{fmt} + (FcvtFmtW) ;; fcvt.{fmt}.w + (FcvtFmtWu) ;; fcvt.{fmt}.wu + (FcvtFmtL) ;; fcvt.{fmt}.l + (FcvtFmtLu) ;; fcvt.{fmt}.lu + (FmvXFmt) ;; fmv.x.{fmt} + (FmvFmtX) ;; fmv.{fmt}.x + (FcvtSD) ;; fcvt.s.d + (FcvtDS) ;; fcvt.d.s + + ;; Zfa Extension + (Fround) ;; fround.{fmt} +)) + +(type LoadOP (enum + (Lb) + (Lh) + (Lw) + (Lbu) + (Lhu) + (Lwu) + (Ld) + (Flh) + (Flw) + (Fld) +)) + +(type StoreOP (enum + (Sb) + (Sh) + (Sw) + (Sd) + (Fsh) + (Fsw) + (Fsd) +)) + +(type AluOPRRR (enum + ;; base set + (Add) + (Sub) + (Sll) + (Slt) + (SltU) + (Sgt) + (Sgtu) + (Xor) + (Srl) + (Sra) + (Or) + (And) + + ;; RV64I Base Instruction Set (in addition to RV32I) + (Addw) + (Subw) + (Sllw) + (Srlw) + (Sraw) + + + ;;RV32M Standard Extension + (Mul) + (Mulh) + (Mulhsu) + (Mulhu) + (Div) + (DivU) + (Rem) + (RemU) + + ;; RV64M Standard Extension (in addition to RV32M) + (Mulw) + (Divw) + (Divuw) + (Remw) + (Remuw) + + ;; Zba: Address Generation Instructions + (Adduw) + (Sh1add) + (Sh1adduw) + (Sh2add) + (Sh2adduw) + (Sh3add) + (Sh3adduw) + + ;; Zbb: Bit Manipulation Instructions + (Andn) + (Orn) + (Xnor) + (Max) + (Maxu) + (Min) + (Minu) + (Rol) + (Rolw) + (Ror) + (Rorw) + + ;; Zbs: Single-bit instructions + (Bclr) + (Bext) + (Binv) + (Bset) + + ;; Zbc: Carry-less multiplication + (Clmul) + (Clmulh) + (Clmulr) + + ;; Zbkb: Bit-manipulation for Cryptography + (Pack) + (Packw) + (Packh) + + ;; ZiCond: Integer Conditional Operations + (CzeroEqz) + (CzeroNez) +)) + + +(type FpuOPRRR (enum + (Fadd) + (Fsub) + (Fmul) + (Fdiv) + (Fsgnj) + (Fsgnjn) + (Fsgnjx) + (Fmin) + (Fmax) + (Feq) + (Flt) + (Fle) + + ;; Zfa Extension + (Fminm) + (Fmaxm) +)) + + + +(type AluOPRRI (enum + ;; Base ISA + (Addi) + (Slti) + (SltiU) + (Xori) + (Ori) + (Andi) + (Slli) + (Srli) + (Srai) + (Addiw) + (Slliw) + (SrliW) + (Sraiw) + + ;; Zba: Address Generation Instructions + (SlliUw) + + ;; Zbb: Bit Manipulation Instructions + (Clz) + (Clzw) + (Ctz) + (Ctzw) + (Cpop) + (Cpopw) + (Sextb) + (Sexth) + (Zexth) + (Rori) + (Roriw) + (Rev8) + (Brev8) + (Orcb) + + ;; Zbs: Single-bit instructions + (Bclri) + (Bexti) + (Binvi) + (Bseti) +)) + +(type COpcodeSpace (enum + (C0) + (C1) + (C2) +)) + +;; Opcodes for the CR compressed instruction format +(type CrOp (enum + (CMv) + (CAdd) + (CJr) + (CJalr) + ;; c.ebreak technically isn't a CR format instruction, but it's encoding + ;; lines up with this format. 
+ (CEbreak) +)) + +;; Opcodes for the CA compressed instruction format +(type CaOp (enum + (CAnd) + (COr) + (CXor) + (CSub) + (CAddw) + (CSubw) + (CMul) +)) + +;; Opcodes for the CJ compressed instruction format +(type CjOp (enum + (CJ) +)) + +;; Opcodes for the CI compressed instruction format +(type CiOp (enum + (CAddi) + (CAddiw) + (CAddi16sp) + (CSlli) + (CLi) + (CLui) + (CLwsp) + (CLdsp) + (CFldsp) +)) + +;; Opcodes for the CIW compressed instruction format +(type CiwOp (enum + (CAddi4spn) +)) + +;; Opcodes for the CB compressed instruction format +(type CbOp (enum + (CSrli) + (CSrai) + (CAndi) +)) + +;; Opcodes for the CSS compressed instruction format +(type CssOp (enum + (CSwsp) + (CSdsp) + (CFsdsp) +)) + +;; Opcodes for the CS compressed instruction format +(type CsOp (enum + (CSw) + (CSd) + (CFsd) +)) + +;; Opcodes for the CL compressed instruction format +(type ClOp (enum + (CLw) + (CLd) + (CFld) +)) + +;; Opcodes for the CSZN compressed instruction format +(type CsznOp (enum + (CNot) + (CZextb) + (CZexth) + (CZextw) + (CSextb) + (CSexth) +)) + +;; This is a mix of all Zcb memory addressing instructions +;; +;; Technically they are split across 4 different formats. +;; But they are all very similar, so we just group them all together. +(type ZcbMemOp (enum + (CLbu) + (CLhu) + (CLh) + (CSb) + (CSh) +)) + + +(type CsrRegOP (enum + ;; Atomic Read/Write CSR + (CsrRW) + ;; Atomic Read and Set Bits in CSR + (CsrRS) + ;; Atomic Read and Clear Bits in CSR + (CsrRC) +)) + +(type CsrImmOP (enum + ;; Atomic Read/Write CSR (Immediate Source) + (CsrRWI) + ;; Atomic Read and Set Bits in CSR (Immediate Source) + (CsrRSI) + ;; Atomic Read and Clear Bits in CSR (Immediate Source) + (CsrRCI) +)) + +;; Enum of the known CSR registers +(type CSR (enum + ;; Floating-Point Dynamic Rounding Mode + (Frm) +)) + + +(type FRM (enum + ;; Round to Nearest, ties to Even + (RNE) + ;; Round towards Zero + (RTZ) + ;; Round Down (towards −∞) + (RDN) + ;; Round Up (towards +∞) + (RUP) + ;; Round to Nearest, ties to Max Magnitude + (RMM) + ;; In instruction’s rm field, selects dynamic rounding mode; + ;;In Rounding Mode register, Invalid. + (Fcsr) +)) + +(decl pure frm_bits (FRM) UImm5) +(extern constructor frm_bits frm_bits) +(convert FRM UImm5 frm_bits) + +(type FFlagsException (enum + ;; Invalid Operation + (NV) + ;; Divide by Zero + (DZ) + ;; Overflow + (OF) + ;; Underflow + (UF) + ;; Inexact + (NX) +)) + +;;;; input output read write +;;;; SI SO SR SW +;;;; PI PO PR PW +;;;; lowest four bit are used. 
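+;;;;
+;;;; For example, assuming the bits mirror the hardware fence encoding (I/O/R/W from
+;;;; high to low), pred = succ = 0b0011 would correspond to `fence rw, rw`.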
+(type FenceReq (primitive u8)) + +(type BoxCallInfo (primitive BoxCallInfo)) +(type BoxCallIndInfo (primitive BoxCallIndInfo)) +(type BoxReturnCallInfo (primitive BoxReturnCallInfo)) +(type BoxReturnCallIndInfo (primitive BoxReturnCallIndInfo)) +(type IntegerCompare (primitive IntegerCompare)) +(type AMode (primitive AMode)) +(type OptionReg (primitive OptionReg)) +(type OptionImm12 (primitive OptionImm12)) +(type OptionUimm5 (primitive OptionUimm5)) +(type Imm12 (primitive Imm12)) +(type UImm5 (primitive UImm5)) +(type Imm5 (primitive Imm5)) +(type Imm20 (primitive Imm20)) +(type Imm3 (primitive Imm3)) +(type CondBrTarget (primitive CondBrTarget)) +(type VecU8 (primitive VecU8)) +(type AMO (primitive AMO)) +(type VecMachLabel extern (enum)) + + +;;;; Newtypes for Different Register Classes ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(type XReg (primitive XReg)) +(type WritableXReg (primitive WritableXReg)) +(type FReg (primitive FReg)) +(type WritableFReg (primitive WritableFReg)) +(type VReg (primitive VReg)) +(type WritableVReg (primitive WritableVReg)) + +;; Construct a new `XReg` from a `Reg`. +;; +;; Asserts that the register has a Integer RegClass. +(decl xreg_new (Reg) XReg) +(extern constructor xreg_new xreg_new) +(convert Reg XReg xreg_new) + +;; Construct a new `WritableXReg` from a `WritableReg`. +;; +;; Asserts that the register has a Integer RegClass. +(decl writable_xreg_new (WritableReg) WritableXReg) +(extern constructor writable_xreg_new writable_xreg_new) +(convert WritableReg WritableXReg writable_xreg_new) + +;; Put a value into a XReg. +;; +;; Asserts that the value goes into a XReg. +(decl put_in_xreg (Value) XReg) +(rule (put_in_xreg val) (xreg_new (put_in_reg val))) +(convert Value XReg put_in_xreg) + +;; Construct an `InstOutput` out of a single XReg register. +(decl output_xreg (XReg) InstOutput) +(rule (output_xreg x) (output_reg x)) +(convert XReg InstOutput output_xreg) + +;; Convert a `WritableXReg` to an `XReg`. +(decl pure writable_xreg_to_xreg (WritableXReg) XReg) +(extern constructor writable_xreg_to_xreg writable_xreg_to_xreg) +(convert WritableXReg XReg writable_xreg_to_xreg) + +;; Convert a `WritableXReg` to an `WritableReg`. +(decl pure writable_xreg_to_writable_reg (WritableXReg) WritableReg) +(extern constructor writable_xreg_to_writable_reg writable_xreg_to_writable_reg) +(convert WritableXReg WritableReg writable_xreg_to_writable_reg) + +;; Convert a `WritableXReg` to an `Reg`. +(decl pure writable_xreg_to_reg (WritableXReg) Reg) +(rule (writable_xreg_to_reg x) (writable_xreg_to_writable_reg x)) +(convert WritableXReg Reg writable_xreg_to_reg) + +;; Convert an `XReg` to a `Reg`. +(decl pure xreg_to_reg (XReg) Reg) +(extern constructor xreg_to_reg xreg_to_reg) +(convert XReg Reg xreg_to_reg) + +;; Convert a `XReg` to a `ValueRegs`. +(decl xreg_to_value_regs (XReg) ValueRegs) +(rule (xreg_to_value_regs x) (value_reg x)) +(convert XReg ValueRegs xreg_to_reg) + +;; Convert a `WritableXReg` to a `ValueRegs`. +(decl writable_xreg_to_value_regs (WritableXReg) ValueRegs) +(rule (writable_xreg_to_value_regs x) (value_reg x)) +(convert WritableXReg ValueRegs writable_xreg_to_value_regs) + +;; Allocates a new `WritableXReg`. +(decl temp_writable_xreg () WritableXReg) +(rule (temp_writable_xreg) (temp_writable_reg $I64)) + + +;; Construct a new `FReg` from a `Reg`. +;; +;; Asserts that the register has a Float RegClass. 
+(decl freg_new (Reg) FReg)
+(extern constructor freg_new freg_new)
+(convert Reg FReg freg_new)
+
+;; Construct a new `WritableFReg` from a `WritableReg`.
+;;
+;; Asserts that the register has a Float RegClass.
+(decl writable_freg_new (WritableReg) WritableFReg)
+(extern constructor writable_freg_new writable_freg_new)
+(convert WritableReg WritableFReg writable_freg_new)
+
+;; Put a value into an FReg.
+;;
+;; Asserts that the value goes into an FReg.
+(decl put_in_freg (Value) FReg)
+(rule (put_in_freg val) (freg_new (put_in_reg val)))
+(convert Value FReg put_in_freg)
+
+;; Construct an `InstOutput` out of a single FReg register.
+(decl output_freg (FReg) InstOutput)
+(rule (output_freg x) (output_reg x))
+(convert FReg InstOutput output_freg)
+
+;; Convert a `WritableFReg` to an `FReg`.
+(decl pure writable_freg_to_freg (WritableFReg) FReg)
+(extern constructor writable_freg_to_freg writable_freg_to_freg)
+(convert WritableFReg FReg writable_freg_to_freg)
+
+;; Convert a `WritableFReg` to a `WritableReg`.
+(decl pure writable_freg_to_writable_reg (WritableFReg) WritableReg)
+(extern constructor writable_freg_to_writable_reg writable_freg_to_writable_reg)
+(convert WritableFReg WritableReg writable_freg_to_writable_reg)
+
+;; Convert a `WritableFReg` to a `Reg`.
+(decl pure writable_freg_to_reg (WritableFReg) Reg)
+(rule (writable_freg_to_reg x) (writable_freg_to_writable_reg x))
+(convert WritableFReg Reg writable_freg_to_reg)
+
+;; Convert an `FReg` to a `Reg`.
+(decl pure freg_to_reg (FReg) Reg)
+(extern constructor freg_to_reg freg_to_reg)
+(convert FReg Reg freg_to_reg)
+
+;; Convert an `FReg` to a `ValueRegs`.
+(decl freg_to_value_regs (FReg) ValueRegs)
+(rule (freg_to_value_regs x) (value_reg x))
+(convert FReg ValueRegs freg_to_value_regs)
+
+;; Convert a `WritableFReg` to a `ValueRegs`.
+(decl writable_freg_to_value_regs (WritableFReg) ValueRegs)
+(rule (writable_freg_to_value_regs x) (value_reg x))
+(convert WritableFReg ValueRegs writable_freg_to_value_regs)
+
+;; Allocates a new `WritableFReg`.
+(decl temp_writable_freg () WritableFReg)
+(rule (temp_writable_freg) (temp_writable_reg $F64))
+
+
+
+;; Construct a new `VReg` from a `Reg`.
+;;
+;; Asserts that the register has a Vector RegClass.
+(decl vreg_new (Reg) VReg)
+(extern constructor vreg_new vreg_new)
+(convert Reg VReg vreg_new)
+
+;; Construct a new `WritableVReg` from a `WritableReg`.
+;;
+;; Asserts that the register has a Vector RegClass.
+(decl writable_vreg_new (WritableReg) WritableVReg)
+(extern constructor writable_vreg_new writable_vreg_new)
+(convert WritableReg WritableVReg writable_vreg_new)
+
+;; Put a value into a VReg.
+;;
+;; Asserts that the value goes into a VReg.
+(decl put_in_vreg (Value) VReg)
+(rule (put_in_vreg val) (vreg_new (put_in_reg val)))
+(convert Value VReg put_in_vreg)
+
+;; Construct an `InstOutput` out of a single VReg register.
+(decl output_vreg (VReg) InstOutput)
+(rule (output_vreg x) (output_reg x))
+(convert VReg InstOutput output_vreg)
+
+;; Convert a `WritableVReg` to a `VReg`.
+(decl pure writable_vreg_to_vreg (WritableVReg) VReg)
+(extern constructor writable_vreg_to_vreg writable_vreg_to_vreg)
+(convert WritableVReg VReg writable_vreg_to_vreg)
+
+;; Convert a `WritableVReg` to a `WritableReg`.
+(decl pure writable_vreg_to_writable_reg (WritableVReg) WritableReg)
+(extern constructor writable_vreg_to_writable_reg writable_vreg_to_writable_reg)
+(convert WritableVReg WritableReg writable_vreg_to_writable_reg)
+
+;; Convert a `WritableVReg` to a `Reg`.
+(decl pure writable_vreg_to_reg (WritableVReg) Reg)
+(rule (writable_vreg_to_reg x) (writable_vreg_to_writable_reg x))
+(convert WritableVReg Reg writable_vreg_to_reg)
+
+;; Convert a `VReg` to a `Reg`.
+(decl pure vreg_to_reg (VReg) Reg)
+(extern constructor vreg_to_reg vreg_to_reg)
+(convert VReg Reg vreg_to_reg)
+
+;; Convert a `VReg` to a `ValueRegs`.
+(decl vreg_to_value_regs (VReg) ValueRegs)
+(rule (vreg_to_value_regs x) (value_reg x))
+(convert VReg ValueRegs vreg_to_value_regs)
+
+;; Convert a `WritableVReg` to a `ValueRegs`.
+(decl writable_vreg_to_value_regs (WritableVReg) ValueRegs)
+(rule (writable_vreg_to_value_regs x) (value_reg x))
+(convert WritableVReg ValueRegs writable_vreg_to_value_regs)
+
+;; Allocates a new `WritableVReg`.
+(decl temp_writable_vreg () WritableVReg)
+(rule (temp_writable_vreg) (temp_writable_reg $I8X16))
+
+
+;; Converters
+
+(convert u8 i32 u8_as_i32)
+(decl u8_as_i32 (u8) i32)
+(extern constructor u8_as_i32 u8_as_i32)
+
+;; ISA Extension helpers
+
+(decl pure has_m () bool)
+(extern constructor has_m has_m)
+
+(decl pure has_v () bool)
+(extern constructor has_v has_v)
+
+(decl pure has_zfa () bool)
+(extern constructor has_zfa has_zfa)
+
+(decl pure has_zfh () bool)
+(extern constructor has_zfh has_zfh)
+
+(decl pure has_zbkb () bool)
+(extern constructor has_zbkb has_zbkb)
+
+(decl pure has_zba () bool)
+(extern constructor has_zba has_zba)
+
+(decl pure has_zbb () bool)
+(extern constructor has_zbb has_zbb)
+
+(decl pure has_zbc () bool)
+(extern constructor has_zbc has_zbc)
+
+(decl pure has_zbs () bool)
+(extern constructor has_zbs has_zbs)
+
+(decl pure has_zicond () bool)
+(extern constructor has_zicond has_zicond)
+
+
+;;;; Type Helpers ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; Helper that matches any supported type. This extractor checks the ISA flags
+;; to determine if the type is supported.
+(decl ty_supported (Type) Type)
+(extern extractor ty_supported ty_supported)
+
+;; Helper that matches any scalar floating point type
+(decl ty_supported_float (Type) Type)
+(extern extractor ty_supported_float ty_supported_float)
+
+;; Helper that matches any supported vector type
+(decl ty_supported_vec (Type) Type)
+(extern extractor ty_supported_vec ty_supported_vec)
+
+
+;;;; Instruction Helpers ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; RV32I Base Integer Instruction Set
+
+;; Helper for emitting the `add` instruction.
+;; rd ← rs1 + rs2
+(decl rv_add (XReg XReg) XReg)
+(rule (rv_add rs1 rs2)
+  (alu_rrr (AluOPRRR.Add) rs1 rs2))
+
+;; Helper for emitting the `addi` ("Add Immediate") instruction.
+;; rd ← rs1 + sext(imm)
+(decl rv_addi (XReg Imm12) XReg)
+(rule (rv_addi rs1 imm)
+  (alu_rr_imm12 (AluOPRRI.Addi) rs1 imm))
+
+;; Helper for emitting the `sub` instruction.
+;; rd ← rs1 - rs2
+(decl rv_sub (XReg XReg) XReg)
+(rule (rv_sub rs1 rs2)
+  (alu_rrr (AluOPRRR.Sub) rs1 rs2))
+
+;; Helper for emitting the `neg` instruction.
+;; This instruction is a mnemonic for `sub rd, zero, rs1`.
+(decl rv_neg (XReg) XReg)
+(rule (rv_neg rs1)
+  (alu_rrr (AluOPRRR.Sub) (zero_reg) rs1))
+
+;; Helper for emitting the `sll` ("Shift Left Logical") instruction.
+;; rd ← rs1 << rs2
+(decl rv_sll (XReg XReg) XReg)
+(rule (rv_sll rs1 rs2)
+  (alu_rrr (AluOPRRR.Sll) rs1 rs2))
+
+;; Helper for emitting the `slli` ("Shift Left Logical Immediate") instruction.
+;; rd ← rs1 << uext(imm) +(decl rv_slli (XReg Imm12) XReg) +(rule (rv_slli rs1 imm) + (alu_rr_imm12 (AluOPRRI.Slli) rs1 imm)) + +;; Helper for emitting the `srl` ("Shift Right Logical") instruction. +;; rd ← rs1 >> rs2 +(decl rv_srl (XReg XReg) XReg) +(rule (rv_srl rs1 rs2) + (alu_rrr (AluOPRRR.Srl) rs1 rs2)) + +;; Helper for emitting the `srli` ("Shift Right Logical Immediate") instruction. +;; rd ← rs1 >> uext(imm) +(decl rv_srli (XReg Imm12) XReg) +(rule (rv_srli rs1 imm) + (alu_rr_imm12 (AluOPRRI.Srli) rs1 imm)) + +;; Helper for emitting the `sra` ("Shift Right Arithmetic") instruction. +;; rd ← rs1 >> rs2 +(decl rv_sra (XReg XReg) XReg) +(rule (rv_sra rs1 rs2) + (alu_rrr (AluOPRRR.Sra) rs1 rs2)) + +;; Helper for emitting the `srai` ("Shift Right Arithmetic Immediate") instruction. +;; rd ← rs1 >> uext(imm) +(decl rv_srai (XReg Imm12) XReg) +(rule (rv_srai rs1 imm) + (alu_rr_imm12 (AluOPRRI.Srai) rs1 imm)) + +;; Helper for emitting the `or` instruction. +;; rd ← rs1 ∨ rs2 +(decl rv_or (XReg XReg) XReg) +(rule (rv_or rs1 rs2) + (alu_rrr (AluOPRRR.Or) rs1 rs2)) + +;; Helper for emitting the `ori` ("Or Immediate") instruction. +;; rd ← rs1 ∨ uext(imm) +(decl rv_ori (XReg Imm12) XReg) +(rule (rv_ori rs1 imm) + (alu_rr_imm12 (AluOPRRI.Ori) rs1 imm)) + +;; Helper for emitting the `xor` instruction. +;; rd ← rs1 ⊕ rs2 +(decl rv_xor (XReg XReg) XReg) +(rule (rv_xor rs1 rs2) + (alu_rrr (AluOPRRR.Xor) rs1 rs2)) + +;; Helper for emitting the `xori` ("Exclusive Or Immediate") instruction. +;; rd ← rs1 ⊕ uext(imm) +(decl rv_xori (XReg Imm12) XReg) +(rule (rv_xori rs1 imm) + (alu_rr_imm12 (AluOPRRI.Xori) rs1 imm)) + +;; Helper for emitting the `not` instruction. +;; This instruction is a mnemonic for `xori rd, rs1, -1`. +(decl rv_not (XReg) XReg) +(rule (rv_not rs1) + (rv_xori rs1 (imm12_const -1))) + +;; Helper for emitting the `and` instruction. +;; rd ← rs1 ∧ rs2 +(decl rv_and (XReg XReg) XReg) +(rule (rv_and rs1 rs2) + (alu_rrr (AluOPRRR.And) rs1 rs2)) + +;; Helper for emitting the `andi` ("And Immediate") instruction. +;; rd ← rs1 ∧ uext(imm) +(decl rv_andi (XReg Imm12) XReg) +(rule (rv_andi rs1 imm) + (alu_rr_imm12 (AluOPRRI.Andi) rs1 imm)) + +;; Helper for emitting the `slt` ("Set Less Than") instruction. +;; rd ← rs1 < rs2 +(decl rv_slt (XReg XReg) XReg) +(rule (rv_slt rs1 rs2) + (alu_rrr (AluOPRRR.Slt) rs1 rs2)) + +;; Helper for emitting the `sltu` ("Set Less Than Unsigned") instruction. +;; rd ← rs1 < rs2 +(decl rv_sltu (XReg XReg) XReg) +(rule (rv_sltu rs1 rs2) + (alu_rrr (AluOPRRR.SltU) rs1 rs2)) + +;; Helper for emitting the `snez` instruction. +;; This instruction is a mnemonic for `sltu rd, zero, rs`. +(decl rv_snez (XReg) XReg) +(rule (rv_snez rs1) + (rv_sltu (zero_reg) rs1)) + +;; Helper for emitting the `slti` ("Set Less Than Immediate") instruction. +;; rd ← rs1 < imm +(decl rv_slti (XReg Imm12) XReg) +(rule (rv_slti rs1 imm) + (alu_rr_imm12 (AluOPRRI.Slti) rs1 imm)) + +;; Helper for emitting the `sltiu` ("Set Less Than Immediate Unsigned") instruction. +;; rd ← rs1 < imm +(decl rv_sltiu (XReg Imm12) XReg) +(rule (rv_sltiu rs1 imm) + (alu_rr_imm12 (AluOPRRI.SltiU) rs1 imm)) + +;; Helper for emitting the `seqz` instruction. +;; This instruction is a mnemonic for `sltiu rd, rs, 1`. +(decl rv_seqz (XReg) XReg) +(rule (rv_seqz rs1) + (rv_sltiu rs1 (imm12_const 1))) + + +;; RV64I Base Integer Instruction Set +;; Unlike RV32I instructions these are only present in the 64bit ISA + +;; Helper for emitting the `addw` ("Add Word") instruction. 
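+;; The 32-bit sum is computed and then sign-extended to 64 bits; for example,
+;; adding 1 to 0x7fff_ffff yields 0xffff_ffff_8000_0000.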
+;; rd ← sext32(rs1) + sext32(rs2) +(decl rv_addw (XReg XReg) XReg) +(rule (rv_addw rs1 rs2) + (alu_rrr (AluOPRRR.Addw) rs1 rs2)) + +;; Helper for emitting the `addiw` ("Add Word Immediate") instruction. +;; rd ← sext32(rs1) + imm +(decl rv_addiw (XReg Imm12) XReg) +(rule (rv_addiw rs1 imm) + (alu_rr_imm12 (AluOPRRI.Addiw) rs1 imm)) + +;; Helper for emitting the `sext.w` ("Sign Extend Word") instruction. +;; This instruction is a mnemonic for `addiw rd, rs, zero`. +(decl rv_sextw (XReg) XReg) +(rule (rv_sextw rs1) + (rv_addiw rs1 (imm12_const 0))) + +;; Helper for emitting the `subw` ("Subtract Word") instruction. +;; rd ← sext32(rs1) - sext32(rs2) +(decl rv_subw (XReg XReg) XReg) +(rule (rv_subw rs1 rs2) + (alu_rrr (AluOPRRR.Subw) rs1 rs2)) + +;; Helper for emitting the `sllw` ("Shift Left Logical Word") instruction. +;; rd ← sext32(uext32(rs1) << rs2) +(decl rv_sllw (XReg XReg) XReg) +(rule (rv_sllw rs1 rs2) + (alu_rrr (AluOPRRR.Sllw) rs1 rs2)) + +;; Helper for emitting the `slliw` ("Shift Left Logical Immediate Word") instruction. +;; rd ← sext32(uext32(rs1) << imm) +(decl rv_slliw (XReg Imm12) XReg) +(rule (rv_slliw rs1 imm) + (alu_rr_imm12 (AluOPRRI.Slliw) rs1 imm)) + +;; Helper for emitting the `srlw` ("Shift Right Logical Word") instruction. +;; rd ← sext32(uext32(rs1) >> rs2) +(decl rv_srlw (XReg XReg) XReg) +(rule (rv_srlw rs1 rs2) + (alu_rrr (AluOPRRR.Srlw) rs1 rs2)) + +;; Helper for emitting the `srliw` ("Shift Right Logical Immediate Word") instruction. +;; rd ← sext32(uext32(rs1) >> imm) +(decl rv_srliw (XReg Imm12) XReg) +(rule (rv_srliw rs1 imm) + (alu_rr_imm12 (AluOPRRI.SrliW) rs1 imm)) + +;; Helper for emitting the `sraw` ("Shift Right Arithmetic Word") instruction. +;; rd ← sext32(rs1 >> rs2) +(decl rv_sraw (XReg XReg) XReg) +(rule (rv_sraw rs1 rs2) + (alu_rrr (AluOPRRR.Sraw) rs1 rs2)) + +;; Helper for emitting the `sraiw` ("Shift Right Arithmetic Immediate Word") instruction. +;; rd ← sext32(rs1 >> imm) +(decl rv_sraiw (XReg Imm12) XReg) +(rule (rv_sraiw rs1 imm) + (alu_rr_imm12 (AluOPRRI.Sraiw) rs1 imm)) + + +;; RV32M Extension +;; TODO: Enable these instructions only when we have the M extension + +;; Helper for emitting the `mul` instruction. +;; rd ← rs1 × rs2 +(decl rv_mul (XReg XReg) XReg) +(rule (rv_mul rs1 rs2) + (alu_rrr (AluOPRRR.Mul) rs1 rs2)) + +;; Helper for emitting the `mulh` ("Multiply High Signed Signed") instruction. +;; rd ← (sext(rs1) × sext(rs2)) » xlen +(decl rv_mulh (XReg XReg) XReg) +(rule (rv_mulh rs1 rs2) + (alu_rrr (AluOPRRR.Mulh) rs1 rs2)) + +;; Helper for emitting the `mulhu` ("Multiply High Unsigned Unsigned") instruction. +;; rd ← (uext(rs1) × uext(rs2)) » xlen +(decl rv_mulhu (XReg XReg) XReg) +(rule (rv_mulhu rs1 rs2) + (alu_rrr (AluOPRRR.Mulhu) rs1 rs2)) + +;; Helper for emitting the `div` instruction. +;; rd ← rs1 ÷ rs2 +(decl rv_div (XReg XReg) XReg) +(rule (rv_div rs1 rs2) + (alu_rrr (AluOPRRR.Div) rs1 rs2)) + +;; Helper for emitting the `divu` ("Divide Unsigned") instruction. +;; rd ← rs1 ÷ rs2 +(decl rv_divu (XReg XReg) XReg) +(rule (rv_divu rs1 rs2) + (alu_rrr (AluOPRRR.DivU) rs1 rs2)) + +;; Helper for emitting the `rem` instruction. +;; rd ← rs1 mod rs2 +(decl rv_rem (XReg XReg) XReg) +(rule (rv_rem rs1 rs2) + (alu_rrr (AluOPRRR.Rem) rs1 rs2)) + +;; Helper for emitting the `remu` ("Remainder Unsigned") instruction. 
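+;; Note that RISC-V division never traps: `remu` with a zero divisor returns
+;; the dividend (rs1), and `divu` with a zero divisor returns all ones.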
+;; rd ← rs1 mod rs2 +(decl rv_remu (XReg XReg) XReg) +(rule (rv_remu rs1 rs2) + (alu_rrr (AluOPRRR.RemU) rs1 rs2)) + +;; RV64M Extension +;; TODO: Enable these instructions only when we have the M extension + +;; Helper for emitting the `mulw` ("Multiply Word") instruction. +;; rd ← uext32(rs1) × uext32(rs2) +(decl rv_mulw (XReg XReg) XReg) +(rule (rv_mulw rs1 rs2) + (alu_rrr (AluOPRRR.Mulw) rs1 rs2)) + +;; Helper for emitting the `divw` ("Divide Word") instruction. +;; rd ← sext32(rs1) ÷ sext32(rs2) +(decl rv_divw (XReg XReg) XReg) +(rule (rv_divw rs1 rs2) + (alu_rrr (AluOPRRR.Divw) rs1 rs2)) + +;; Helper for emitting the `divuw` ("Divide Unsigned Word") instruction. +;; rd ← uext32(rs1) ÷ uext32(rs2) +(decl rv_divuw (XReg XReg) XReg) +(rule (rv_divuw rs1 rs2) + (alu_rrr (AluOPRRR.Divuw) rs1 rs2)) + +;; Helper for emitting the `remw` ("Remainder Word") instruction. +;; rd ← sext32(rs1) mod sext32(rs2) +(decl rv_remw (XReg XReg) XReg) +(rule (rv_remw rs1 rs2) + (alu_rrr (AluOPRRR.Remw) rs1 rs2)) + +;; Helper for emitting the `remuw` ("Remainder Unsigned Word") instruction. +;; rd ← uext32(rs1) mod uext32(rs2) +(decl rv_remuw (XReg XReg) XReg) +(rule (rv_remuw rs1 rs2) + (alu_rrr (AluOPRRR.Remuw) rs1 rs2)) + + +;; F and D Extensions +;; TODO: Enable these instructions only when we have the F or D extensions + +;; Helper for emitting the `fadd` instruction. +(decl rv_fadd (Type FRM FReg FReg) FReg) +(rule (rv_fadd ty frm rs1 rs2) (fpu_rrr (FpuOPRRR.Fadd) ty frm rs1 rs2)) + +;; Helper for emitting the `fsub` instruction. +(decl rv_fsub (Type FRM FReg FReg) FReg) +(rule (rv_fsub ty frm rs1 rs2) (fpu_rrr (FpuOPRRR.Fsub) ty frm rs1 rs2)) + +;; Helper for emitting the `fmul` instruction. +(decl rv_fmul (Type FRM FReg FReg) FReg) +(rule (rv_fmul ty frm rs1 rs2) (fpu_rrr (FpuOPRRR.Fmul) ty frm rs1 rs2)) + +;; Helper for emitting the `fdiv` instruction. +(decl rv_fdiv (Type FRM FReg FReg) FReg) +(rule (rv_fdiv ty frm rs1 rs2) (fpu_rrr (FpuOPRRR.Fdiv) ty frm rs1 rs2)) + +;; Helper for emitting the `fsqrt` instruction. +(decl rv_fsqrt (Type FRM FReg) FReg) +(rule (rv_fsqrt ty frm rs1) (fpu_rr (FpuOPRR.Fsqrt) ty frm rs1)) + +;; Helper for emitting the `fmadd` instruction. +(decl rv_fmadd (Type FRM FReg FReg FReg) FReg) +(rule (rv_fmadd ty frm rs1 rs2 rs3) (fpu_rrrr (FpuOPRRRR.Fmadd) ty frm rs1 rs2 rs3)) + +;; Helper for emitting the `fmsub` instruction. +(decl rv_fmsub (Type FRM FReg FReg FReg) FReg) +(rule (rv_fmsub ty frm rs1 rs2 rs3) (fpu_rrrr (FpuOPRRRR.Fmsub) ty frm rs1 rs2 rs3)) + +;; Helper for emitting the `fnmadd` instruction. +(decl rv_fnmadd (Type FRM FReg FReg FReg) FReg) +(rule (rv_fnmadd ty frm rs1 rs2 rs3) (fpu_rrrr (FpuOPRRRR.Fnmadd) ty frm rs1 rs2 rs3)) + +;; Helper for emitting the `fnmsub` instruction. +(decl rv_fnmsub (Type FRM FReg FReg FReg) FReg) +(rule (rv_fnmsub ty frm rs1 rs2 rs3) (fpu_rrrr (FpuOPRRRR.Fnmsub) ty frm rs1 rs2 rs3)) + +;; Helper for emitting the `fmv.x.h` instruction. +(decl rv_fmvxh (FReg) XReg) +(rule (rv_fmvxh r) (fpu_rr_int (FpuOPRR.FmvXFmt) $F16 (FRM.RNE) r)) + +;; Helper for emitting the `fmv.x.w` instruction. +(decl rv_fmvxw (FReg) XReg) +(rule (rv_fmvxw r) (fpu_rr_int (FpuOPRR.FmvXFmt) $F32 (FRM.RNE) r)) + +;; Helper for emitting the `fmv.x.d` instruction. +(decl rv_fmvxd (FReg) XReg) +(rule (rv_fmvxd r) (fpu_rr_int (FpuOPRR.FmvXFmt) $F64 (FRM.RNE) r)) + +;; Helper for emitting the `fmv.h.x` instruction. +(decl rv_fmvhx (XReg) FReg) +(rule (rv_fmvhx r) (fpu_rr (FpuOPRR.FmvFmtX) $F16 (FRM.RNE) r)) + +;; Helper for emitting the `fmv.w.x` instruction. 
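+;; This is a raw move of the 32-bit pattern from the integer register into the
+;; float register; no numeric conversion or rounding takes place.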
+(decl rv_fmvwx (XReg) FReg) +(rule (rv_fmvwx r) (fpu_rr (FpuOPRR.FmvFmtX) $F32 (FRM.RNE) r)) + +;; Helper for emitting the `fmv.d.x` instruction. +(decl rv_fmvdx (XReg) FReg) +(rule (rv_fmvdx r) (fpu_rr (FpuOPRR.FmvFmtX) $F64 (FRM.RNE) r)) + +;; Helper for emitting the `fcvt.d.s` ("Float Convert Double to Single") instruction. +(decl rv_fcvtds (FReg) FReg) +(rule (rv_fcvtds rs1) (fpu_rr (FpuOPRR.FcvtDS) $F64 (FRM.RNE) rs1)) + +;; Helper for emitting the `fcvt.s.d` ("Float Convert Single to Double") instruction. +(decl rv_fcvtsd (FRM FReg) FReg) +(rule (rv_fcvtsd frm rs1) (fpu_rr (FpuOPRR.FcvtSD) $F32 frm rs1)) + +;; Helper for emitting the `fcvt.s.w` instruction. +(decl rv_fcvtsw (FRM XReg) FReg) +(rule (rv_fcvtsw frm rs1) (fpu_rr (FpuOPRR.FcvtFmtW) $F32 frm rs1)) + +;; Helper for emitting the `fcvt.s.wu` instruction. +(decl rv_fcvtswu (FRM XReg) FReg) +(rule (rv_fcvtswu frm rs1) (fpu_rr (FpuOPRR.FcvtFmtWu) $F32 frm rs1)) + +;; Helper for emitting the `fcvt.d.w` instruction. +(decl rv_fcvtdw (XReg) FReg) +(rule (rv_fcvtdw rs1) (fpu_rr (FpuOPRR.FcvtFmtW) $F64 (FRM.RNE) rs1)) + +;; Helper for emitting the `fcvt.d.wu` instruction. +(decl rv_fcvtdwu (XReg) FReg) +(rule (rv_fcvtdwu rs1) (fpu_rr (FpuOPRR.FcvtFmtWu) $F64 (FRM.RNE) rs1)) + +;; Helper for emitting the `fcvt.s.l` instruction. +(decl rv_fcvtsl (FRM XReg) FReg) +(rule (rv_fcvtsl frm rs1) (fpu_rr (FpuOPRR.FcvtFmtL) $F32 frm rs1)) + +;; Helper for emitting the `fcvt.s.lu` instruction. +(decl rv_fcvtslu (FRM XReg) FReg) +(rule (rv_fcvtslu frm rs1) (fpu_rr (FpuOPRR.FcvtFmtLu) $F32 frm rs1)) + +;; Helper for emitting the `fcvt.d.l` instruction. +(decl rv_fcvtdl (FRM XReg) FReg) +(rule (rv_fcvtdl frm rs1) (fpu_rr (FpuOPRR.FcvtFmtL) $F64 frm rs1)) + +;; Helper for emitting the `fcvt.d.lu` instruction. +(decl rv_fcvtdlu (FRM XReg) FReg) +(rule (rv_fcvtdlu frm rs1) (fpu_rr (FpuOPRR.FcvtFmtLu) $F64 frm rs1)) + +;; Helper for emitting the `fcvt.w.s` instruction. +(decl rv_fcvtws (FRM FReg) XReg) +(rule (rv_fcvtws frm rs1) (fpu_rr_int (FpuOPRR.FcvtWFmt) $F32 frm rs1)) + +;; Helper for emitting the `fcvt.l.s` instruction. +(decl rv_fcvtls (FRM FReg) XReg) +(rule (rv_fcvtls frm rs1) (fpu_rr_int (FpuOPRR.FcvtLFmt) $F32 frm rs1)) + +;; Helper for emitting the `fcvt.wu.s` instruction. +(decl rv_fcvtwus (FRM FReg) XReg) +(rule (rv_fcvtwus frm rs1) (fpu_rr_int (FpuOPRR.FcvtWuFmt) $F32 frm rs1)) + +;; Helper for emitting the `fcvt.lu.s` instruction. +(decl rv_fcvtlus (FRM FReg) XReg) +(rule (rv_fcvtlus frm rs1) (fpu_rr_int (FpuOPRR.FcvtLuFmt) $F32 frm rs1)) + +;; Helper for emitting the `fcvt.w.d` instruction. +(decl rv_fcvtwd (FRM FReg) XReg) +(rule (rv_fcvtwd frm rs1) (fpu_rr_int (FpuOPRR.FcvtWFmt) $F64 frm rs1)) + +;; Helper for emitting the `fcvt.l.d` instruction. +(decl rv_fcvtld (FRM FReg) XReg) +(rule (rv_fcvtld frm rs1) (fpu_rr_int (FpuOPRR.FcvtLFmt) $F64 frm rs1)) + +;; Helper for emitting the `fcvt.wu.d` instruction. +(decl rv_fcvtwud (FRM FReg) XReg) +(rule (rv_fcvtwud frm rs1) (fpu_rr_int (FpuOPRR.FcvtWuFmt) $F64 frm rs1)) + +;; Helper for emitting the `fcvt.lu.d` instruction. +(decl rv_fcvtlud (FRM FReg) XReg) +(rule (rv_fcvtlud frm rs1) (fpu_rr_int (FpuOPRR.FcvtLuFmt) $F64 frm rs1)) + +;; Helper for emitting the `fcvt.w.*` instructions. +(decl rv_fcvtw (Type FRM FReg) XReg) +(rule (rv_fcvtw $F32 frm rs1) (rv_fcvtws frm rs1)) +(rule (rv_fcvtw $F64 frm rs1) (rv_fcvtwd frm rs1)) + +;; Helper for emitting the `fcvt.l.*` instructions. 
+(decl rv_fcvtl (Type FRM FReg) XReg) +(rule (rv_fcvtl $F32 frm rs1) (rv_fcvtls frm rs1)) +(rule (rv_fcvtl $F64 frm rs1) (rv_fcvtld frm rs1)) + +;; Helper for emitting the `fcvt.wu.*` instructions. +(decl rv_fcvtwu (Type FRM FReg) XReg) +(rule (rv_fcvtwu $F32 frm rs1) (rv_fcvtwus frm rs1)) +(rule (rv_fcvtwu $F64 frm rs1) (rv_fcvtwud frm rs1)) + +;; Helper for emitting the `fcvt.lu.*` instructions. +(decl rv_fcvtlu (Type FRM FReg) XReg) +(rule (rv_fcvtlu $F32 frm rs1) (rv_fcvtlus frm rs1)) +(rule (rv_fcvtlu $F64 frm rs1) (rv_fcvtlud frm rs1)) + +;; Helper for emitting the `fsgnj` ("Floating Point Sign Injection") instruction. +;; The output of this instruction is `rs1` with the sign bit from `rs2` +;; This implements the `copysign` operation +(decl rv_fsgnj (Type FReg FReg) FReg) +(rule (rv_fsgnj ty rs1 rs2) (fpu_rrr (FpuOPRRR.Fsgnj) ty (FRM.RNE) rs1 rs2)) + +;; Helper for emitting the `fsgnjn` ("Floating Point Sign Injection Negated") instruction. +;; The output of this instruction is `rs1` with the negated sign bit from `rs2` +;; When `rs1 == rs2` this implements the `neg` operation +(decl rv_fsgnjn (Type FReg FReg) FReg) +(rule (rv_fsgnjn ty rs1 rs2) (fpu_rrr (FpuOPRRR.Fsgnjn) ty (FRM.RTZ) rs1 rs2)) + +;; Helper for emitting the `fneg` ("Floating Point Negate") instruction. +;; This instruction is a mnemonic for `fsgnjn rd, rs1, rs1` +(decl rv_fneg (Type FReg) FReg) +(rule (rv_fneg ty rs1) (rv_fsgnjn ty rs1 rs1)) + +;; Helper for emitting the `fsgnjx` ("Floating Point Sign Injection Exclusive") instruction. +;; The output of this instruction is `rs1` with the XOR of the sign bits from `rs1` and `rs2`. +;; When `rs1 == rs2` this implements `fabs` +(decl rv_fsgnjx (Type FReg FReg) FReg) +(rule (rv_fsgnjx ty rs1 rs2) (fpu_rrr (FpuOPRRR.Fsgnjx) ty (FRM.RDN) rs1 rs2)) + +;; Helper for emitting the `fabs` ("Floating Point Absolute") instruction. +;; This instruction is a mnemonic for `fsgnjx rd, rs1, rs1` +(decl rv_fabs (Type FReg) FReg) +(rule (rv_fabs ty rs1) (rv_fsgnjx ty rs1 rs1)) + +;; Helper for emitting the `feq` ("Float Equal") instruction. +(decl rv_feq (Type FReg FReg) XReg) +(rule (rv_feq ty rs1 rs2) (fpu_rrr_int (FpuOPRRR.Feq) ty (FRM.RDN) rs1 rs2)) + +;; Helper for emitting the `flt` ("Float Less Than") instruction. +(decl rv_flt (Type FReg FReg) XReg) +(rule (rv_flt ty rs1 rs2) (fpu_rrr_int (FpuOPRRR.Flt) ty (FRM.RTZ) rs1 rs2)) + +;; Helper for emitting the `fle` ("Float Less Than or Equal") instruction. +(decl rv_fle (Type FReg FReg) XReg) +(rule (rv_fle ty rs1 rs2) (fpu_rrr_int (FpuOPRRR.Fle) ty (FRM.RNE) rs1 rs2)) + +;; Helper for emitting the `fgt` ("Float Greater Than") instruction. +;; Note: The arguments are reversed +(decl rv_fgt (Type FReg FReg) XReg) +(rule (rv_fgt ty rs1 rs2) (rv_flt ty rs2 rs1)) + +;; Helper for emitting the `fge` ("Float Greater Than or Equal") instruction. +;; Note: The arguments are reversed +(decl rv_fge (Type FReg FReg) XReg) +(rule (rv_fge ty rs1 rs2) (rv_fle ty rs2 rs1)) + +;; Helper for emitting the `fmin` instruction. +(decl rv_fmin (Type FReg FReg) FReg) +(rule (rv_fmin ty rs1 rs2) (fpu_rrr (FpuOPRRR.Fmin) ty (FRM.RNE) rs1 rs2)) + +;; Helper for emitting the `fmax` instruction. +(decl rv_fmax (Type FReg FReg) FReg) +(rule (rv_fmax ty rs1 rs2) (fpu_rrr (FpuOPRRR.Fmax) ty (FRM.RTZ) rs1 rs2)) + +;; `Zfa` Extension Instructions + +;; Helper for emitting the `fminm` instruction. +(decl rv_fminm (Type FReg FReg) FReg) +(rule (rv_fminm ty rs1 rs2) (fpu_rrr (FpuOPRRR.Fminm) ty (FRM.RDN) rs1 rs2)) + +;; Helper for emitting the `fmaxm` instruction. 
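+;; Unlike `fmin`/`fmax`, the Zfa `fminm`/`fmaxm` instructions return a NaN if
+;; either input is NaN.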
+(decl rv_fmaxm (Type FReg FReg) FReg) +(rule (rv_fmaxm ty rs1 rs2) (fpu_rrr (FpuOPRRR.Fmaxm) ty (FRM.RUP) rs1 rs2)) + +;; Helper for emitting the `fround` instruction. +(decl rv_fround (Type FRM FReg) FReg) +(rule (rv_fround ty frm rs) (fpu_rr (FpuOPRR.Fround) ty frm rs)) + +;; Helper for emitting the `fli` instruction. +(decl rv_fli (Type FliConstant) FReg) +(rule (rv_fli ty imm) + (let ((dst WritableFReg (temp_writable_freg)) + (_ Unit (emit (MInst.Fli ty + imm + dst)))) + dst)) + +;; `Zba` Extension Instructions + +;; Helper for emitting the `adduw` ("Add Unsigned Word") instruction. +;; rd ← uext32(rs1) + uext32(rs2) +(decl rv_adduw (XReg XReg) XReg) +(rule (rv_adduw rs1 rs2) + (alu_rrr (AluOPRRR.Adduw) rs1 rs2)) + +;; Helper for emitting the `zext.w` ("Zero Extend Word") instruction. +;; This instruction is a mnemonic for `adduw rd, rs1, zero`. +;; rd ← uext32(rs1) +(decl rv_zextw (XReg) XReg) +(rule (rv_zextw rs1) + (rv_adduw rs1 (zero_reg))) + +;; Helper for emitting the `slli.uw` ("Shift Left Logical Immediate Unsigned Word") instruction. +;; rd ← uext32(rs1) << imm +(decl rv_slliuw (XReg Imm12) XReg) +(rule (rv_slliuw rs1 imm) + (alu_rr_imm12 (AluOPRRI.SlliUw) rs1 imm)) + + +;; `Zbb` Extension Instructions + +;; Helper for emitting the `andn` ("And Negated") instruction. +;; rd ← rs1 ∧ ~(rs2) +(decl rv_andn (XReg XReg) XReg) +(rule (rv_andn rs1 rs2) + (if-let $true (has_zbb)) + (alu_rrr (AluOPRRR.Andn) rs1 rs2)) +(rule (rv_andn rs1 rs2) + (if-let $false (has_zbb)) + (rv_and rs1 (rv_not rs2))) + +;; Helper for emitting the `orn` ("Or Negated") instruction. +;; rd ← rs1 ∨ ~(rs2) +(decl rv_orn (XReg XReg) XReg) +(rule (rv_orn rs1 rs2) + (alu_rrr (AluOPRRR.Orn) rs1 rs2)) + +;; Helper for emitting the `xnor` ("Exclusive NOR") instruction. +;; rd ← ~(rs1 ^ rs2) +(decl rv_xnor (XReg XReg) XReg) +(rule (rv_xnor rs1 rs2) + (alu_rrr (AluOPRRR.Xnor) rs1 rs2)) + +;; Helper for emitting the `clz` ("Count Leading Zero Bits") instruction. +(decl rv_clz (XReg) XReg) +(rule (rv_clz rs1) + (alu_rr_funct12 (AluOPRRI.Clz) rs1)) + +;; Helper for emitting the `clzw` ("Count Leading Zero Bits in Word") instruction. +(decl rv_clzw (XReg) XReg) +(rule (rv_clzw rs1) + (alu_rr_funct12 (AluOPRRI.Clzw) rs1)) + +;; Helper for emitting the `ctz` ("Count Trailing Zero Bits") instruction. +(decl rv_ctz (XReg) XReg) +(rule (rv_ctz rs1) + (alu_rr_funct12 (AluOPRRI.Ctz) rs1)) + +;; Helper for emitting the `ctzw` ("Count Trailing Zero Bits in Word") instruction. +(decl rv_ctzw (XReg) XReg) +(rule (rv_ctzw rs1) + (alu_rr_funct12 (AluOPRRI.Ctzw) rs1)) + +;; Helper for emitting the `cpop` ("Count Population") instruction. +(decl rv_cpop (XReg) XReg) +(rule (rv_cpop rs1) + (alu_rr_funct12 (AluOPRRI.Cpop) rs1)) + +;; Helper for emitting the `cpopw` ("Count Population") instruction. +(decl rv_cpopw (XReg) XReg) +(rule (rv_cpopw rs1) + (alu_rr_funct12 (AluOPRRI.Cpopw) rs1)) + +;; Helper for emitting the `max` instruction. +(decl rv_max (XReg XReg) XReg) +(rule (rv_max rs1 rs2) + (alu_rrr (AluOPRRR.Max) rs1 rs2)) + +;; Helper for emitting the `maxu` instruction. +(decl rv_maxu (XReg XReg) XReg) +(rule (rv_maxu rs1 rs2) + (alu_rrr (AluOPRRR.Maxu) rs1 rs2)) + +;; Helper for emitting the `min` instruction. +(decl rv_min (XReg XReg) XReg) +(rule (rv_min rs1 rs2) + (alu_rrr (AluOPRRR.Min) rs1 rs2)) + +;; Helper for emitting the `minu` instruction. +(decl rv_minu (XReg XReg) XReg) +(rule (rv_minu rs1 rs2) + (alu_rrr (AluOPRRR.Minu) rs1 rs2)) + +;; Helper for emitting the `sext.b` instruction. 
+(decl rv_sextb (XReg) XReg) +(rule (rv_sextb rs1) + (alu_rr_imm12 (AluOPRRI.Sextb) rs1 (imm12_const 0))) + +;; Helper for emitting the `sext.h` instruction. +(decl rv_sexth (XReg) XReg) +(rule (rv_sexth rs1) + (alu_rr_imm12 (AluOPRRI.Sexth) rs1 (imm12_const 0))) + +;; Helper for emitting the `zext.h` instruction. +(decl rv_zexth (XReg) XReg) +(rule (rv_zexth rs1) + (alu_rr_imm12 (AluOPRRI.Zexth) rs1 (imm12_const 0))) + +;; Helper for emitting the `rol` ("Rotate Left") instruction. +(decl rv_rol (XReg XReg) XReg) +(rule (rv_rol rs1 rs2) + (alu_rrr (AluOPRRR.Rol) rs1 rs2)) + +;; Helper for emitting the `rolw` ("Rotate Left Word") instruction. +(decl rv_rolw (XReg XReg) XReg) +(rule (rv_rolw rs1 rs2) + (alu_rrr (AluOPRRR.Rolw) rs1 rs2)) + +;; Helper for emitting the `ror` ("Rotate Right") instruction. +(decl rv_ror (XReg XReg) XReg) +(rule (rv_ror rs1 rs2) + (alu_rrr (AluOPRRR.Ror) rs1 rs2)) + +;; Helper for emitting the `rorw` ("Rotate Right Word") instruction. +(decl rv_rorw (XReg XReg) XReg) +(rule (rv_rorw rs1 rs2) + (alu_rrr (AluOPRRR.Rorw) rs1 rs2)) + +;; Helper for emitting the `rori` ("Rotate Right") instruction. +(decl rv_rori (XReg Imm12) XReg) +(rule (rv_rori rs1 rs2) + (alu_rr_imm12 (AluOPRRI.Rori) rs1 rs2)) + +;; Helper for emitting the `roriw` ("Rotate Right Word") instruction. +(decl rv_roriw (XReg Imm12) XReg) +(rule (rv_roriw rs1 rs2) + (alu_rr_imm12 (AluOPRRI.Roriw) rs1 rs2)) + +;; Helper for emitting the `rev8` ("Byte Reverse") instruction. +(decl rv_rev8 (XReg) XReg) +(rule (rv_rev8 rs1) + (alu_rr_funct12 (AluOPRRI.Rev8) rs1)) + +;; Helper for emitting the `brev8` ("Bit Reverse Inside Bytes") instruction. +;; TODO: This instruction is mentioned in some older versions of the +;; spec, but has since disappeared, we should follow up on this. +;; It probably was renamed to `rev.b` which seems to be the closest match. +(decl rv_brev8 (XReg) XReg) +(rule (rv_brev8 rs1) + (alu_rr_funct12 (AluOPRRI.Brev8) rs1)) + +;; `Zbs` Extension Instructions + +(decl rv_bclr (XReg XReg) XReg) +(rule (rv_bclr rs1 rs2) + (alu_rrr (AluOPRRR.Bclr) rs1 rs2)) + +(decl rv_bclri (XReg Imm12) XReg) +(rule (rv_bclri rs1 imm) + (alu_rr_imm12 (AluOPRRI.Bclri) rs1 imm)) + +(decl rv_bext (XReg XReg) XReg) +(rule (rv_bext rs1 rs2) + (alu_rrr (AluOPRRR.Bext) rs1 rs2)) + +(decl rv_bexti (XReg Imm12) XReg) +(rule (rv_bexti rs1 imm) + (alu_rr_imm12 (AluOPRRI.Bexti) rs1 imm)) + +(decl rv_binv (XReg XReg) XReg) +(rule (rv_binv rs1 rs2) + (alu_rrr (AluOPRRR.Binv) rs1 rs2)) + +(decl rv_binvi (XReg Imm12) XReg) +(rule (rv_binvi rs1 imm) + (alu_rr_imm12 (AluOPRRI.Binvi) rs1 imm)) + +(decl rv_bset (XReg XReg) XReg) +(rule (rv_bset rs1 rs2) + (alu_rrr (AluOPRRR.Bset) rs1 rs2)) + +;; Helper for emitting the `bseti` ("Single-Bit Set Immediate") instruction. +(decl rv_bseti (XReg Imm12) XReg) +(rule (rv_bseti rs1 imm) + (alu_rr_imm12 (AluOPRRI.Bseti) rs1 imm)) + +;; `Zbkb` Extension Instructions + +;; Helper for emitting the `pack` ("Pack low halves of registers") instruction. +(decl rv_pack (XReg XReg) XReg) +(rule (rv_pack rs1 rs2) + (alu_rrr (AluOPRRR.Pack) rs1 rs2)) + +;; Helper for emitting the `packw` ("Pack low 16-bits of registers") instruction. +(decl rv_packw (XReg XReg) XReg) +(rule (rv_packw rs1 rs2) + (alu_rrr (AluOPRRR.Packw) rs1 rs2)) + +;; `ZiCond` Extension Instructions + +;; Helper for emitting the `czero.eqz` ("Conditional zero, if condition is equal to zero") instruction. +;; RS1 is the data source +;; RS2 is the condition +;; +;; rd = (rs2 == 0) ? 
0 : rs1 +(decl rv_czero_eqz (XReg XReg) XReg) +(rule (rv_czero_eqz rs1 rs2) + (alu_rrr (AluOPRRR.CzeroEqz) rs1 rs2)) + +;; Helper for emitting the `czero.nez` ("Conditional zero, if condition is nonzero") instruction. +;; RS1 is the data source +;; RS2 is the condition +;; +;; rd = (rs2 != 0) ? 0 : rs1 +(decl rv_czero_nez (XReg XReg) XReg) +(rule (rv_czero_nez rs1 rs2) + (alu_rrr (AluOPRRR.CzeroNez) rs1 rs2)) + + +;; `Zicsr` Extension Instructions + +;; Helper for emitting the `csrrwi` instruction. +(decl rv_csrrwi (CSR UImm5) XReg) +(rule (rv_csrrwi csr imm) + (csr_imm (CsrImmOP.CsrRWI) csr imm)) + +;; This is a special case of `csrrwi` when the CSR is the `frm` CSR. +(decl rv_fsrmi (FRM) XReg) +(rule (rv_fsrmi frm) (rv_csrrwi (CSR.Frm) frm)) + + +;; Helper for emitting the `csrw` instruction. This is a special case of +;; `csrrw` where the destination register is always `x0`. +(decl rv_csrw (CSR XReg) Unit) +(rule (rv_csrw csr rs) + (csr_reg_dst_zero (CsrRegOP.CsrRW) csr rs)) + +;; This is a special case of `csrw` when the CSR is the `frm` CSR. +(decl rv_fsrm (XReg) Unit) +(rule (rv_fsrm rs) (rv_csrw (CSR.Frm) rs)) + + + + + + +;; Helper for generating a FliConstant from a u64 constant +(decl pure partial fli_constant_from_u64 (Type u64) FliConstant) +(extern constructor fli_constant_from_u64 fli_constant_from_u64) + +;; Helper for generating a FliConstant from a u64 negated constant +(decl pure partial fli_constant_from_negated_u64 (Type u64) FliConstant) +(extern constructor fli_constant_from_negated_u64 fli_constant_from_negated_u64) + +;; Helper for generating a i64 from a pair of Imm20 and Imm12 constants +(decl i64_generate_imm (Imm20 Imm12) i64) +(extern extractor i64_generate_imm i64_generate_imm) + +;; Helper for generating a i64 from a shift of a Imm20 constant with LUI +(decl i64_shift_for_lui (u64 Imm12) i64) +(extern extractor i64_shift_for_lui i64_shift_for_lui) + +;; Helper for generating a i64 from a shift of a Imm20 constant +(decl i64_shift (i64 Imm12) i64) +(extern extractor i64_shift i64_shift) + +;; Immediate Loading rules +;; TODO: Loading the zero reg directly causes a bunch of regalloc errors, we should look into it. +;; TODO: Load floats using `fld` instead of `ld` +(decl imm (Type u64) Reg) + +;; Special-case 0.0 for floats to use the `(zero_reg)` directly. +;; See #7162 for why this doesn't fall out of the rules below. +(rule 9 (imm (ty_supported_float $F16) 0) (gen_bitcast (zero_reg) $I16 $F16)) +(rule 9 (imm (ty_supported_float $F32) 0) (gen_bitcast (zero_reg) $I32 $F32)) +(rule 9 (imm (ty_supported_float $F64) 0) (gen_bitcast (zero_reg) $I64 $F64)) + +;; If Zfa is enabled, we can load certain constants with the `fli` instruction. +(rule 8 (imm (ty_supported_float (ty_32_or_64 ty)) imm) + (if-let $true (has_zfa)) + (if-let const (fli_constant_from_u64 ty imm)) + (rv_fli ty const)) + +;; It is beneficial to load the negated constant with `fli` and then negate it +;; in a register. +;; +;; For f64's this saves one instruction, and for f32's it avoids +;; having to allocate an integer register, reducing integer register pressure. +(rule 7 (imm (ty_supported_float (ty_32_or_64 ty)) imm) + (if-let $true (has_zfa)) + (if-let const (fli_constant_from_negated_u64 ty imm)) + (rv_fneg ty (rv_fli ty const))) + +;; Otherwise floats get loaded as integers and then moved into an F register. 
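+;; For example, materializing 1.0 as an `f32` loads the bit pattern 0x3f80_0000
+;; into an integer register and then bitcasts it into a float register
+;; (typically an `fmv.w.x`).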
+(rule 6 (imm (ty_supported_float $F16) c) (gen_bitcast (imm $I16 c) $I16 $F16)) +(rule 6 (imm (ty_supported_float $F32) c) (gen_bitcast (imm $I32 c) $I32 $F32)) +(rule 6 (imm (ty_supported_float $F64) c) (gen_bitcast (imm $I64 c) $I64 $F64)) + +;; Try to match just an imm12 +(rule 4 (imm (ty_int ty) c) + (if-let (i64_generate_imm (imm20_is_zero) imm12) (i64_sextend_u64 ty c)) + (rv_addi (zero_reg) imm12)) + +;; We can also try to load using a single LUI. +;; LUI takes a 20 bit immediate, places it on bits 13 to 32 of the register. +;; In RV64 this value is then sign extended to 64bits. +(rule 3 (imm (ty_int ty) c) + (if-let (i64_generate_imm imm20 (imm12_is_zero)) (i64_sextend_u64 ty c)) + (rv_lui imm20)) + +;; We can combo addi + lui to represent all 32-bit immediates +;; And some 64-bit immediates as well. +(rule 2 (imm (ty_int ty) c) + (if-let (i64_generate_imm imm20 imm12) (i64_sextend_u64 ty c)) + (rv_addi (rv_lui imm20) imm12)) + +;; If the non-zero bits of the immediate fit in 20 bits, we can use LUI + shift +(rule 1 (imm (ty_int ty) c) + (if-let (i64_shift_for_lui (imm20_from_u64 base) shift) (i64_sextend_u64 ty c)) + (rv_slli (rv_lui base) shift)) + +;; Combine one of the above rules with a shift-left if possible, This chops off +;; all trailing zeros from the input constant and then attempts if the resulting +;; constant can itself use one of the above rules via the `i64_generate_imm` +;; matcher. This will then recurse on the above rules to materialize a smaller +;; constant which is then shifted left to create the desired constant. +(rule 0 (imm (ty_int ty) c) + (if-let (i64_shift c_shifted shift) (i64_sextend_u64 ty c)) ;; constant to make + (if-let (i64_generate_imm _ _) c_shifted) ;; can the smaller constant be made? + (rv_slli (imm ty (i64_as_u64 c_shifted)) shift)) + +;; Otherwise we fall back to loading the immediate from the constant pool. +(rule -1 (imm (ty_int ty) c) + (gen_load + (gen_const_amode (emit_u64_le_const c)) + (LoadOP.Ld) + (mem_flags_trusted))) + +;; Imm12 Rules + +(decl pure imm12_zero () Imm12) +(rule (imm12_zero) (imm12_const 0)) + +(decl pure imm12_const (i32) Imm12) +(extern constructor imm12_const imm12_const) + +(decl load_imm12 (i32) Reg) +(rule + (load_imm12 x) + (rv_addi (zero_reg) (imm12_const x))) + +;; for load immediate +(decl imm_from_bits (u64) Imm12) +(extern constructor imm_from_bits imm_from_bits) + +(decl imm_from_neg_bits (i64) Imm12) +(extern constructor imm_from_neg_bits imm_from_neg_bits) + +(decl imm12_const_add (i32 i32) Imm12) +(extern constructor imm12_const_add imm12_const_add) + +;; Performs a fallible add of the `Imm12` value and the 32-bit value provided. +(decl pure partial imm12_add (Imm12 i32) Imm12) +(extern constructor imm12_add imm12_add) + +(decl imm12_and (Imm12 u64) Imm12) +(extern constructor imm12_and imm12_and) + +;; Imm12 Extractors + +;; Helper to go directly from a `Value`, when it's an `iconst`, to an `Imm12`. +(decl imm12_from_value (Imm12) Value) +(extractor (imm12_from_value n) (i64_from_iconst (imm12_from_i64 n))) + +;; Conceptually the same as `imm12_from_value`, but tries negating the constant +;; value (first sign-extending to handle narrow widths). 
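+;; For example, an 8-bit `iconst 0x80` sign-extends to -128, whose negation
+;; (128) fits in an `Imm12`, so this helper returns 128.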
+(decl pure partial imm12_from_negated_value (Value) Imm12) +(rule + (imm12_from_negated_value (has_type ty (iconst n))) + (if-let (imm12_from_u64 imm) (i64_as_u64 (i64_neg (i64_sextend_imm64 ty n)))) + imm) + +(decl imm12_from_u64 (Imm12) u64) +(extern extractor imm12_from_u64 imm12_from_u64) + +(decl imm12_from_i64 (Imm12) i64) +(extern extractor imm12_from_i64 imm12_from_i64) + +(decl pure partial u64_to_imm12 (u64) Imm12) +(rule (u64_to_imm12 (imm12_from_u64 n)) n) + +(decl pure imm12_is_zero () Imm12) +(extern extractor imm12_is_zero imm12_is_zero) + +;; Imm20 + +;; Extractor that matches if a Imm20 is zero +(decl pure imm20_is_zero () Imm20) +(extern extractor imm20_is_zero imm20_is_zero) + +(decl imm20_from_u64 (Imm20) u64) +(extern extractor imm20_from_u64 imm20_from_u64) + +(decl imm20_from_i64 (Imm20) i64) +(extern extractor imm20_from_i64 imm20_from_i64) + + +;; Imm5 Extractors + +(decl imm5_from_u64 (Imm5) u64) +(extern extractor imm5_from_u64 imm5_from_u64) + +(decl imm5_from_i64 (Imm5) i64) +(extern extractor imm5_from_i64 imm5_from_i64) + +;; Construct a Imm5 from an i8 +(decl pure partial i8_to_imm5 (i8) Imm5) +(extern constructor i8_to_imm5 i8_to_imm5) + +;; Helper to go directly from a `Value` to an `Imm5`. +(decl imm5_from_value (Imm5) Value) +(extractor (imm5_from_value n) (i64_from_iconst (imm5_from_i64 n))) + +;; Like imm5_from_value, but first negates the `Value`. +(decl pure partial imm5_from_negated_value (Value) Imm5) +(rule (imm5_from_negated_value (has_type ty (iconst n))) + (if-let (imm5_from_i64 imm) (i64_neg (i64_sextend_imm64 ty n))) + imm) + +;; Constructor that matches a `Value` equivalent to a replicated Imm5 on all lanes. +(decl pure partial replicated_imm5 (Value) Imm5) +(rule (replicated_imm5 (splat (imm5_from_value n))) n) +(rule (replicated_imm5 (vconst (u128_from_constant n128))) + (if-let (u128_replicated_u64 n64) n128) + (if-let (u64_replicated_u32 n32) n64) + (if-let (u32_replicated_u16 n16) n32) + (if-let (u16_replicated_u8 n8) n16) + (if-let n (i8_to_imm5 (u8_as_i8 n8))) + n) + +;; Like replicated_imm5, but first negates the `Value`. +(decl pure partial negated_replicated_imm5 (Value) Imm5) +(rule (negated_replicated_imm5 (splat n)) + (if-let imm5 (imm5_from_negated_value n)) + imm5) +(rule (negated_replicated_imm5 (vconst (u128_from_constant n128))) + (if-let (u128_replicated_u64 n64) n128) + (if-let (u64_replicated_u32 n32) n64) + (if-let (u32_replicated_u16 n16) n32) + (if-let (u16_replicated_u8 n8) n16) + (if-let n (i8_to_imm5 (i8_neg (u8_as_i8 n8)))) + n) + +;; UImm5 Helpers + +;; Constructor that matches a `Value` equivalent to a replicated UImm5 on all lanes. +(decl pure partial replicated_uimm5 (Value) UImm5) +(rule (replicated_uimm5 (splat (uimm5_from_value n))) n) +(rule 1 (replicated_uimm5 (vconst (u128_from_constant n128))) + (if-let (u128_replicated_u64 n64) n128) + (if-let (u64_replicated_u32 n32) n64) + (if-let (u32_replicated_u16 n16) n32) + (if-let (u16_replicated_u8 n8) n16) + (if-let (uimm5_from_u8 n) n8) + n) + +;; Helper to go directly from a `Value`, when it's an `iconst`, to an `UImm5`. +(decl uimm5_from_value (UImm5) Value) +(extractor (uimm5_from_value n) + (iconst (u64_from_imm64 (uimm5_from_u64 n)))) + +;; Extract a `UImm5` from an `u8`. +(decl pure partial uimm5_from_u8 (UImm5) u8) +(extern extractor uimm5_from_u8 uimm5_from_u8) + +;; Extract a `UImm5` from an `u64`. 
+(decl pure partial uimm5_from_u64 (UImm5) u64) +(extern extractor uimm5_from_u64 uimm5_from_u64) + +;; Convert a `u64` into an `UImm5` +(decl pure partial u64_to_uimm5 (u64) UImm5) +(rule (u64_to_uimm5 (uimm5_from_u64 n)) n) + +(decl uimm5_bitcast_to_imm5 (UImm5) Imm5) +(extern constructor uimm5_bitcast_to_imm5 uimm5_bitcast_to_imm5) + +;; Float Helpers + +;; Returns the bitpattern of the Canonical NaN for the given type. +(decl pure canonical_nan_u64 (Type) u64) +(rule (canonical_nan_u64 $F32) 0x7fc00000) +(rule (canonical_nan_u64 $F64) 0x7ff8000000000000) + +;; Helper for emitting `MInst.FpuRR` instructions. +(decl fpu_rr (FpuOPRR Type FRM Reg) FReg) +(rule (fpu_rr op ty frm src) + (let ((dst WritableFReg (temp_writable_freg)) + (_ Unit (emit (MInst.FpuRR op ty frm dst src)))) + dst)) + +;; Similar to fpu_rr but with an integer destination register +(decl fpu_rr_int (FpuOPRR Type FRM Reg) XReg) +(rule (fpu_rr_int op ty frm src) + (let ((dst WritableXReg (temp_writable_xreg)) + (_ Unit (emit (MInst.FpuRR op ty frm dst src)))) + dst)) + +;; Helper for emitting `MInst.AluRRR` instructions. +(decl alu_rrr (AluOPRRR Reg Reg) Reg) +(rule (alu_rrr op src1 src2) + (let ((dst WritableXReg (temp_writable_xreg)) + (_ Unit (emit (MInst.AluRRR op dst src1 src2)))) + dst)) + +;; Helper for emitting `MInst.FpuRRR` instructions. +(decl fpu_rrr (FpuOPRRR Type FRM Reg Reg) FReg) +(rule (fpu_rrr op ty frm src1 src2) + (let ((dst WritableFReg (temp_writable_freg)) + (_ Unit (emit (MInst.FpuRRR op ty frm dst src1 src2)))) + dst)) + +;; Similar to fpu_rrr but with an integer destination register +(decl fpu_rrr_int (FpuOPRRR Type FRM Reg Reg) XReg) +(rule (fpu_rrr_int op ty frm src1 src2) + (let ((dst WritableXReg (temp_writable_xreg)) + (_ Unit (emit (MInst.FpuRRR op ty frm dst src1 src2)))) + dst)) + +;; Helper for emitting `MInst.FpuRRRR` instructions. +(decl fpu_rrrr (FpuOPRRRR Type FRM Reg Reg Reg) FReg) +(rule (fpu_rrrr op ty frm src1 src2 src3) + (let ((dst WritableFReg (temp_writable_freg)) + (_ Unit (emit (MInst.FpuRRRR op ty frm dst src1 src2 src3)))) + dst)) + + +;; Helper for emitting `MInst.AluRRImm12` instructions. +(decl alu_rr_imm12 (AluOPRRI Reg Imm12) Reg) +(rule (alu_rr_imm12 op src imm) + (let ((dst WritableXReg (temp_writable_xreg)) + (_ Unit (emit (MInst.AluRRImm12 op dst src imm)))) + dst)) + +;; some instruction use imm12 as funct12. +;; so we don't need the imm12 parameter. +(decl alu_rr_funct12 (AluOPRRI Reg) Reg) +(rule (alu_rr_funct12 op src) + (let ((dst WritableXReg (temp_writable_xreg)) + (_ Unit (emit (MInst.AluRRImm12 op dst src (imm12_zero))))) + dst)) + +;; Helper for emitting the `Lui` instruction. +;; TODO: This should be something like `emit_u_type`. And should share the +;; `MInst` with `auipc` since these instructions share the U-Type format. +(decl rv_lui (Imm20) XReg) +(rule (rv_lui imm) + (let ((dst WritableXReg (temp_writable_xreg)) + (_ Unit (emit (MInst.Lui dst imm)))) + dst)) + +;; Helper for emitting `MInst.CsrImm` instructions. +(decl csr_imm (CsrImmOP CSR UImm5) XReg) +(rule (csr_imm op csr imm) + (let ((dst WritableXReg (temp_writable_xreg)) + (_ Unit (emit (MInst.CsrImm op dst imm csr)))) + dst)) + +;; Helper for emitting a `MInst.CsrReg` instruction that writes the result to x0. 
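+;; Using `x0` as the destination discards the old CSR value; the ISA guarantees
+;; that a `csrrw` with `rd=x0` does not read the CSR at all, so no read side
+;; effects occur.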
+(decl csr_reg_dst_zero (CsrRegOP CSR XReg) Unit) +(rule (csr_reg_dst_zero op csr rs) + (emit (MInst.CsrReg op (writable_zero_reg) rs csr))) + + + +(decl select_addi (Type) AluOPRRI) +(rule 1 (select_addi (fits_in_32 ty)) (AluOPRRI.Addiw)) +(rule (select_addi (fits_in_64 ty)) (AluOPRRI.Addi)) + + +(decl gen_andi (XReg u64) XReg) +(rule 1 (gen_andi x (imm12_from_u64 y)) + (rv_andi x y)) + +(rule 0 (gen_andi x y) + (rv_and x (imm $I64 y))) + + +(decl gen_or (Type ValueRegs ValueRegs) ValueRegs) +(rule 1 (gen_or $I128 x y) + (value_regs + (rv_or (value_regs_get x 0) (value_regs_get y 0)) + (rv_or (value_regs_get x 1) (value_regs_get y 1)))) + +(rule 0 (gen_or (fits_in_64 _) x y) + (rv_or (value_regs_get x 0) (value_regs_get y 0))) + + +(decl lower_ctz (Type Reg) Reg) +(rule (lower_ctz ty x) + (gen_cltz $false x ty)) + +(rule 1 (lower_ctz (fits_in_16 ty) x) + (if-let $true (has_zbb)) + (let ((tmp Reg (gen_bseti x (ty_bits ty)))) + (rv_ctzw tmp))) + +(rule 2 (lower_ctz $I32 x) + (if-let $true (has_zbb)) + (rv_ctzw x)) + +(rule 2 (lower_ctz $I64 x) + (if-let $true (has_zbb)) + (rv_ctz x)) + +;; Count leading zeros from a i128 bit value. +;; We count both halves separately and conditionally add them if it makes sense. + +(decl gen_cltz (bool XReg Type) XReg) +(rule (gen_cltz leading rs ty) + (let ((tmp WritableXReg (temp_writable_xreg)) + (step WritableXReg (temp_writable_xreg)) + (sum WritableXReg (temp_writable_xreg)) + (_ Unit (emit (MInst.Cltz leading sum step tmp rs ty)))) + sum)) + +;; Performs a zero extension of the given value +(decl zext (Value) XReg) + +;; In the most generic case, we shift left and then shift right. +(rule 0 (zext val @ (value_type (fits_in_32 ty))) + (let ((shift Imm12 (imm_from_bits (u64_sub 64 (ty_bits ty))))) + (rv_srli (rv_slli val shift) shift))) + +;; If we are zero extending a U8 we can use a `andi` instruction. +(rule 1 (zext val @ (value_type $I8)) + (rv_andi val (imm12_const 0xff))) + +;; No point in trying to use `packh` here to zero extend 8 bit values +;; since we can just use `andi` instead which is part of the base ISA. + +;; If we have the `zbkb` extension `packw` can be used to zero extend 16 bit values +(rule 1 (zext val @ (value_type $I16)) + (if-let $true (has_zbkb)) + (rv_packw val (zero_reg))) + +;; If we have the `zbkb` extension `pack` can be used to zero extend 32 bit registers +(rule 1 (zext val @ (value_type $I32)) + (if-let $true (has_zbkb)) + (rv_pack val (zero_reg))) + +;; If we have the `zbb` extension we can use the dedicated `zext.h` instruction. +(rule 2 (zext val @ (value_type $I16)) + (if-let $true (has_zbb)) + (rv_zexth val)) + +;; With `zba` we have a `zext.w` instruction +(rule 2 (zext val @ (value_type $I32)) + (if-let $true (has_zba)) + (rv_zextw val)) + +;; Ignore sign extensions for values whose representation is already the full +;; register width. +(rule 3 (zext val) + (if (val_already_extended (ExtendOp.Zero) val)) + val) + +;; Performs a signed extension of the given value +(decl sext (Value) XReg) + +;; Same base case as `zext`, shift left-then-right. +(rule 0 (sext val @ (value_type (fits_in_32 ty))) + (let ((shift Imm12 (imm_from_bits (u64_sub 64 (ty_bits ty))))) + (rv_srai (rv_slli val shift) shift))) + +;; If we have the `zbb` extension we can use the dedicated `sext.b` instruction. +(rule 1 (sext val @ (value_type $I8)) + (if-let $true (has_zbb)) + (rv_sextb val)) + +;; If we have the `zbb` extension we can use the dedicated `sext.h` instruction. 
+(rule 1 (sext val @ (value_type $I16))
+  (if-let $true (has_zbb))
+  (rv_sexth val))
+
+;; When sign-extending from 32 to 64 bits we can use an
+;; `addiw val, 0`, also known as `sext.w`.
+(rule 1 (sext val @ (value_type $I32))
+  (rv_sextw val))
+
+;; Ignore sign extensions for values whose representation is already the full
+;; register width.
+(rule 2 (sext val)
+  (if (val_already_extended (ExtendOp.Signed) val))
+  val)
+
+;; Helper matcher for when a value's representation is already sign or zero
+;; extended to the full 64-bit register representation. This is used by `zext`
+;; and `sext` above to skip the extension instruction entirely in some
+;; circumstances.
+(decl pure partial val_already_extended (ExtendOp Value) bool)
+(rule 0 (val_already_extended _ v @ (value_type $I64)) $true)
+
+;; When extending, our backend always extends to the full register width, so
+;; there's no need to extend an extend.
+(rule 1 (val_already_extended (ExtendOp.Zero) (uextend _)) $true)
+(rule 1 (val_already_extended (ExtendOp.Signed) (sextend _)) $true)
+
+;; The result of `icmp`/`fcmp` is zero or one, meaning that it's already sign
+;; extended to the full register width.
+(rule 1 (val_already_extended _ (icmp _ _ _)) $true)
+(rule 1 (val_already_extended _ (fcmp _ _ _)) $true)
+
+;; The lowerings for these operations always sign-extend their results due to
+;; the use of the `*w` instructions in RV64I. Note that this only applies to
+;; extensions from 32 to 64 bits; 16/8-bit operations are explicitly excluded
+;; here. There are no native instructions for the 16/8-bit operations, so they
+;; must fall through to the actual sign extension above.
+(rule 1 (val_already_extended (ExtendOp.Signed) (has_type $I32 (ishl _ _))) $true)
+(rule 1 (val_already_extended (ExtendOp.Signed) (has_type $I32 (ushr _ _))) $true)
+(rule 1 (val_already_extended (ExtendOp.Signed) (has_type $I32 (sshr _ _))) $true)
+(rule 1 (val_already_extended (ExtendOp.Signed) (has_type $I32 (iadd _ _))) $true)
+(rule 1 (val_already_extended (ExtendOp.Signed) (has_type $I32 (isub _ _))) $true)
+
+(type ExtendOp
+  (enum
+    (Zero)
+    (Signed)))
+
+(decl lower_b128_binary (AluOPRRR ValueRegs ValueRegs) ValueRegs)
+(rule
+  (lower_b128_binary op a b)
+  (let
+    ( ;; low part.
+      (low XReg (alu_rrr op (value_regs_get a 0) (value_regs_get b 0)))
+      ;; high part.
+      (high XReg (alu_rrr op (value_regs_get a 1) (value_regs_get b 1))))
+    (value_regs low high)))
+
+(decl lower_smlhi (Type XReg XReg) XReg)
+(rule 1
+  (lower_smlhi $I64 rs1 rs2)
+  (rv_mulh rs1 rs2))
+
+(rule
+  (lower_smlhi ty rs1 rs2)
+  (let
+    ((tmp XReg (rv_mul rs1 rs2)))
+    (rv_srli tmp (imm12_const (ty_bits ty)))))
+
+;; Construct a shift amount. `rotl` on i128 is implemented in terms of shifts,
+;; so it can use this helper as well.
+;; This returns both the shift amount and (ty_bits - shift amount).
+;; If ty_bits is greater than 64 (e.g. for i128), the shift amount falls back
+;; to 64, because this is a 64-bit platform.
+(decl gen_shamt (Type XReg) ValueRegs)
+(extern constructor gen_shamt gen_shamt)
+
+;; bseti: Set a single bit in a register, indexed by a constant.
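+;; For example, `(gen_bseti x 16)` ors `x` with a materialized 0x10000 when Zbs
+;; is unavailable, and emits a single `bseti x, 16` when it is available.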
+(decl gen_bseti (Reg u64) Reg)
+(rule (gen_bseti val bit)
+  (if-let $false (has_zbs))
+  (if-let $false (u64_le bit 12))
+  (let ((const XReg (imm $I64 (u64_shl 1 bit))))
+    (rv_or val const)))
+
+(rule (gen_bseti val bit)
+  (if-let $false (has_zbs))
+  (if-let $true (u64_le bit 12))
+  (rv_ori val (imm12_const (u64_as_i32 (u64_shl 1 bit)))))
+
+(rule (gen_bseti val bit)
+  (if-let $true (has_zbs))
+  (rv_bseti val (imm12_const (u64_as_i32 bit))))
+
+
+(decl gen_popcnt (XReg) Reg)
+(rule (gen_popcnt rs)
+  (let
+    ((tmp WritableXReg (temp_writable_xreg))
+     (step WritableXReg (temp_writable_xreg))
+     (sum WritableXReg (temp_writable_xreg))
+     (_ Unit (emit (MInst.Popcnt sum step tmp rs $I64))))
+    (writable_reg_to_reg sum)))
+
+;; Generates an AMode that points to a register plus an offset.
+(decl gen_reg_offset_amode (Reg i64) AMode)
+(extern constructor gen_reg_offset_amode gen_reg_offset_amode)
+
+;; Generates an AMode that points to an offset from the stack pointer.
+(decl gen_sp_offset_amode (i64) AMode)
+(extern constructor gen_sp_offset_amode gen_sp_offset_amode)
+
+;; Generates an AMode that points to an offset from the frame pointer.
+(decl gen_fp_offset_amode (i64) AMode)
+(extern constructor gen_fp_offset_amode gen_fp_offset_amode)
+
+;; Generates an AMode that points to a stack slot + offset.
+(decl gen_stack_slot_amode (StackSlot i64) AMode)
+(extern constructor gen_stack_slot_amode gen_stack_slot_amode)
+
+;; Generates an AMode that points to a constant in the constant pool.
+(decl gen_const_amode (VCodeConstant) AMode)
+(extern constructor gen_const_amode gen_const_amode)
+
+
+
+;; Tries to match a Value + Offset into an AMode.
+(decl amode (Value i32) AMode)
+(rule 0 (amode addr offset) (amode_inner addr offset))
+
+;; If we are adding a constant offset with an iadd we can instead make that
+;; offset part of the amode offset.
+;;
+;; We can't recurse into `amode` again since that could cause stack overflows.
+;; See: https://github.com/bytecodealliance/wasmtime/pull/6968
+(rule 1 (amode (iadd addr (i32_from_iconst y)) offset)
+  (if-let new_offset (s32_add_fallible y offset))
+  (amode_inner addr new_offset))
+(rule 2 (amode (iadd (i32_from_iconst x) addr) offset)
+  (if-let new_offset (s32_add_fallible x offset))
+  (amode_inner addr new_offset))
+
+
+;; These are the normal rules for generating an AMode.
+(decl amode_inner (Value i32) AMode)
+
+;; In the simplest case we just lower into a Reg+Offset.
+(rule 0 (amode_inner r @ (value_type (ty_addr64 _)) offset)
+  (gen_reg_offset_amode r offset))
+
+;; If the value is a `get_frame_pointer`, we can just use the offset from that.
+(rule 1 (amode_inner (get_frame_pointer) offset)
+  (gen_fp_offset_amode offset))
+
+;; If the value is a `get_stack_pointer`, we can just use the offset from that.
+(rule 1 (amode_inner (get_stack_pointer) offset)
+  (gen_sp_offset_amode offset))
+
+;; Similarly, if the value is a `stack_addr` we can turn that into a stack
+;; slot plus offset.
+(rule 1 (amode_inner (stack_addr ss ss_offset) amode_offset)
+  (if-let combined_offset (s32_add_fallible ss_offset amode_offset))
+  (gen_stack_slot_amode ss combined_offset))
+
+
+;; Helpers for sinkable loads ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; RISC-V doesn't really have sinkable loads, but the regular load instructions
+;; sign- or zero-extend their results to 64 bits, so we can pretend they are an
+;; extend instruction with a sinkable load. This allows us to have better
+;; lowerings in these cases.
+
+;; Extract a sinkable instruction from a value operand.
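+;; For example, a `uextend` of an 8-bit load can be matched this way and
+;; lowered to a single `lbu`, with the original load marked as sunk so it is
+;; not also emitted separately.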
+(decl sinkable_inst (Inst) Value) +(extern extractor sinkable_inst sinkable_inst) + +;; Matches a sinkable load. +(decl sinkable_load (Inst Type MemFlags Value Offset32) Value) +(extractor (sinkable_load inst ty flags addr offset) + (and + (load flags addr offset) + (sinkable_inst (has_type ty inst)))) + +;; Returns a canonical type for a LoadOP. We only return I64 or F64. +(decl load_op_reg_type (LoadOP) Type) +(rule 1 (load_op_reg_type (LoadOP.Fld)) $F64) +(rule 1 (load_op_reg_type (LoadOP.Flw)) $F64) +(rule 1 (load_op_reg_type (LoadOP.Flh)) $F64) +(rule 0 (load_op_reg_type _) $I64) + +;; Helper constructor to build a load instruction. +(decl gen_load (AMode LoadOP MemFlags) Reg) +(rule (gen_load amode op flags) + (let ((dst WritableReg (temp_writable_reg (load_op_reg_type op))) + (_ Unit (emit (MInst.Load dst op flags amode)))) + dst)) + +;; Similar to `gen_load` but marks `Inst` as sunk at the current point. +;; +;; This is only useful for load op's that perform some additional computation +;; such as extending the loaded value. +(decl gen_sunk_load (Inst AMode LoadOP MemFlags) Reg) +(rule (gen_sunk_load inst amode op flags) + (let ((_ Unit (sink_inst inst))) + (gen_load amode op flags))) + + +;; Helper constructor to build a store instruction. +;; +;; This helper contains a special-case for zero constants stored to memory to +;; directly store the `zero` register to memory. See #7162 for some discussion +;; on why this doesn't just fall out. +(decl gen_store (AMode MemFlags Value) InstOutput) +(rule 1 (gen_store amode flags val @ (value_type ty)) + (if-let (u64_from_iconst 0) val) + (rv_store amode (store_op ty) flags (zero_reg))) +(rule 0 (gen_store amode flags val @ (value_type ty)) + (rv_store amode (store_op ty) flags val)) + +;; Emit a raw instruction to store a register into memory. +;; +;; Note that the `src` operand must have the correct type for the `op` +;; specified. +(decl rv_store (AMode StoreOP MemFlags Reg) InstOutput) +(rule (rv_store amode op flags src) + (side_effect (SideEffectNoResult.Inst (MInst.Store amode op flags src)))) + + + + +(decl valid_atomic_transaction (Type) Type) +(extern extractor valid_atomic_transaction valid_atomic_transaction) + +;;helper function. +;;construct an atomic instruction. 
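+;; For example, an `amoadd.w` emitted through this helper atomically adds `src`
+;; to the 32-bit value at `addr` and leaves the previous memory value in the
+;; result register.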
+(decl gen_atomic (AtomicOP Reg Reg AMO) Reg) +(rule + (gen_atomic op addr src amo) + (let + ((tmp WritableXReg (temp_writable_xreg)) + (_ Unit (emit (MInst.Atomic op tmp addr src amo)))) + tmp)) + +;; helper function +(decl get_atomic_rmw_op (Type AtomicRmwOp) AtomicOP) +(rule + (get_atomic_rmw_op $I32 (AtomicRmwOp.Add)) + (AtomicOP.AmoaddW)) +(rule + (get_atomic_rmw_op $I64 (AtomicRmwOp.Add)) + (AtomicOP.AmoaddD)) + +(rule + (get_atomic_rmw_op $I32 (AtomicRmwOp.And)) + (AtomicOP.AmoandW)) + +(rule + (get_atomic_rmw_op $I64 (AtomicRmwOp.And)) + (AtomicOP.AmoandD)) + +(rule + (get_atomic_rmw_op $I32 (AtomicRmwOp.Or)) + (AtomicOP.AmoorW)) + +(rule + (get_atomic_rmw_op $I64 (AtomicRmwOp.Or)) + (AtomicOP.AmoorD)) + +(rule + (get_atomic_rmw_op $I32 (AtomicRmwOp.Smax)) + (AtomicOP.AmomaxW)) + +(rule + (get_atomic_rmw_op $I64 (AtomicRmwOp.Smax)) + (AtomicOP.AmomaxD)) + +(rule + (get_atomic_rmw_op $I32 (AtomicRmwOp.Smin)) + (AtomicOP.AmominW)) + +(rule + (get_atomic_rmw_op $I64 (AtomicRmwOp.Smin)) + (AtomicOP.AmominD)) + +(rule + (get_atomic_rmw_op $I32 (AtomicRmwOp.Umax)) + (AtomicOP.AmomaxuW) +) + +(rule + (get_atomic_rmw_op $I64 (AtomicRmwOp.Umax)) + (AtomicOP.AmomaxuD)) + +(rule + (get_atomic_rmw_op $I32 (AtomicRmwOp.Umin)) + (AtomicOP.AmominuW)) + +(rule + (get_atomic_rmw_op $I64 (AtomicRmwOp.Umin)) + (AtomicOP.AmominuD)) + +(rule + (get_atomic_rmw_op $I32 (AtomicRmwOp.Xchg)) + (AtomicOP.AmoswapW)) + +(rule + (get_atomic_rmw_op $I64 (AtomicRmwOp.Xchg)) + (AtomicOP.AmoswapD)) + +(rule + (get_atomic_rmw_op $I32 (AtomicRmwOp.Xor)) + (AtomicOP.AmoxorW)) + +(rule + (get_atomic_rmw_op $I64 (AtomicRmwOp.Xor)) + (AtomicOP.AmoxorD)) + +(decl atomic_amo () AMO) +(extern constructor atomic_amo atomic_amo) + + +(decl gen_atomic_load (Reg Type) Reg) +(rule + (gen_atomic_load p ty) + (let + ((tmp WritableXReg (temp_writable_xreg)) + (_ Unit (emit (MInst.AtomicLoad tmp ty p)))) + (writable_reg_to_reg tmp))) + +;;; +(decl gen_atomic_store (Reg Type Reg) InstOutput) +(rule + (gen_atomic_store p ty src) + (side_effect (SideEffectNoResult.Inst (MInst.AtomicStore src ty p))) +) + + +;; Rounds a FReg by converting the value into an integer and back with a specified +;; float rounding mode. +(decl float_round_fcvt (Type FRM FReg) FReg) +(rule (float_round_fcvt $F32 frm rs) (rv_fcvtsw frm (rv_fcvtws frm rs))) +(rule (float_round_fcvt $F64 frm rs) (rv_fcvtdl frm (rv_fcvtld frm rs))) + +(decl gen_float_round (FRM FReg Type) FReg) +(rule 0 (gen_float_round frm rs ty) + (let (;; if rs is NaN/+-Infinity/+-Zero or if the exponent is larger than # of bits + ;; in mantissa, the result is the same as src, check for these cases first. + (max FReg (imm ty (float_int_max ty))) + (abs FReg (rv_fabs ty rs)) + (exact XReg (rv_flt ty abs max)) + + ;; Manually round the value using the fcvt instructions + ;; to move the value to an integer register and back. + (fcvt FReg (float_round_fcvt ty frm rs)) + ;; Restore the sign bit from the initial value. + (rounded FReg (rv_fsgnj ty fcvt rs)) + + ;; We want to return a arithmetic nan if the input is a canonical nan. + ;; Convert them by adding 0.0 to the input. + (float_zero FReg (gen_bitcast (zero_reg) (float_int_of_same_size ty) ty)) + (corrected_nan FReg (rv_fadd ty (FRM.RNE) rs float_zero))) + + ;; Check if the value cannot be rounded exactly and return the source input if so + (gen_select_freg (cmp_eqz exact) corrected_nan rounded))) + +;; With Zfa we can use the dedicated `fround` instruction. 
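+;; For example (illustrative), rounding an $F64 towards positive infinity
+;; (i.e. `ceil`) becomes a single `fround.d rd, rs` with the `rup` rounding
+;; mode instead of the fcvt round-trip above.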
+(rule 1 (gen_float_round frm rs ty) + (if-let $true (has_zfa)) + (rv_fround ty frm rs)) + + + +(decl gen_stack_addr (StackSlot Offset32) Reg) +(extern constructor gen_stack_addr gen_stack_addr) + +(decl gen_select_xreg (IntegerCompare XReg XReg) XReg) + +(rule 6 (gen_select_xreg (int_compare_decompose cc x y) x y) + (if-let (IntCC.UnsignedLessThan) (intcc_without_eq cc)) + (if-let $true (has_zbb)) + (rv_minu x y)) + +(rule 6 (gen_select_xreg (int_compare_decompose cc x y) x y) + (if-let (IntCC.SignedLessThan) (intcc_without_eq cc)) + (if-let $true (has_zbb)) + (rv_min x y)) + +(rule 6 (gen_select_xreg (int_compare_decompose cc x y) x y) + (if-let (IntCC.UnsignedGreaterThan) (intcc_without_eq cc)) + (if-let $true (has_zbb)) + (rv_maxu x y)) + +(rule 6 (gen_select_xreg (int_compare_decompose cc x y) x y) + (if-let (IntCC.SignedGreaterThan) (intcc_without_eq cc)) + (if-let $true (has_zbb)) + (rv_max x y)) + +;; Rotate Zero Reg to the right. This allows us to write fewer rules +;; below when matching the zero register +;; +;; Additionally prevent this rule from recursing infinitely by only +;; matching when one of the inputs is the zero register, but not both. + +(rule 5 (gen_select_xreg (int_compare_decompose cc a @ (zero_reg) b @ (non_zero_reg)) x y) + (if-let $true (has_zicond)) + (gen_select_xreg (int_compare (intcc_swap_args cc) b a) x y)) + +(rule 4 (gen_select_xreg c @ (int_compare_decompose cc a b) x @ (zero_reg) y @ (non_zero_reg)) + (if-let $true (has_zicond)) + (gen_select_xreg (int_compare (intcc_complement cc) a b) y x)) + +(rule 3 (gen_select_xreg (int_compare_decompose (IntCC.Equal) c (zero_reg)) x (zero_reg)) + (if-let $true (has_zicond)) + (rv_czero_nez x c)) + +(rule 3 (gen_select_xreg (int_compare_decompose (IntCC.NotEqual) c (zero_reg)) x (zero_reg)) + (if-let $true (has_zicond)) + (rv_czero_eqz x c)) + +(rule 2 (gen_select_xreg (int_compare_decompose (IntCC.Equal) c (zero_reg)) x y) + (if-let $true (has_zicond)) + (rv_or + (rv_czero_nez x c) + (rv_czero_eqz y c))) + +(rule 2 (gen_select_xreg (int_compare_decompose (IntCC.NotEqual) c (zero_reg)) x y) + (if-let $true (has_zicond)) + (rv_or + (rv_czero_eqz x c) + (rv_czero_nez y c))) + +;; It is still beneficial to emit the full compare instruction, and then the 3 instruction +;; select using zicond, so do that here as a last resort. +(rule 1 (gen_select_xreg compare x y) + (if-let $true (has_zicond)) + (gen_select_xreg (cmp_nez (lower_int_compare compare)) x y)) + +;; In the base case we emit a conditional branch and a few moves. 
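+;;
+;; Roughly (illustrative pseudo-assembly for the emitted `MInst.Select`):
+;;
+;;       <branch on cc> rs1, rs2, taken
+;;       mv   dst, y
+;;       j    done
+;;   taken:
+;;       mv   dst, x
+;;   done: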
+ +(rule 0 (gen_select_xreg c x y) + (let + ((dst WritableReg (temp_writable_xreg)) + (_ Unit (emit (MInst.Select dst c x y)))) + (writable_reg_to_reg dst))) + + +(decl gen_select_vreg (IntegerCompare VReg VReg) VReg) +(rule (gen_select_vreg c x y) + (let + ((dst WritableReg (temp_writable_vreg)) + (_ Unit (emit (MInst.Select dst c (vreg_to_reg x) (vreg_to_reg y))))) + (writable_reg_to_reg dst))) +(decl gen_select_freg (IntegerCompare FReg FReg) FReg) +(rule (gen_select_freg c x y) + (let + ((dst WritableReg (temp_writable_freg)) + (_ Unit (emit (MInst.Select dst c (freg_to_reg x) (freg_to_reg y))))) + (writable_reg_to_reg dst))) +(decl gen_select_regs (IntegerCompare ValueRegs ValueRegs) ValueRegs) +(rule (gen_select_regs c x y) + (let + ((dst1 WritableReg (temp_writable_xreg)) + (dst2 WritableReg (temp_writable_xreg)) + (_ Unit (emit (MInst.Select (writable_value_regs dst1 dst2) c x y)))) + (value_regs dst1 dst2))) + +(decl udf (TrapCode) InstOutput) +(rule + (udf code) + (side_effect (SideEffectNoResult.Inst (MInst.Udf code)))) + +(decl load_op (Type) LoadOP) +(extern constructor load_op load_op) + +(decl store_op (Type) StoreOP) +(extern constructor store_op store_op) + + +;;;; load extern name +(decl load_ext_name (ExternalName i64) Reg) +(extern constructor load_ext_name load_ext_name) + +(decl elf_tls_get_addr (ExternalName) Reg) +(rule (elf_tls_get_addr name) + (let ((dst WritableReg (temp_writable_reg $I64)) + (_ Unit (emit (MInst.ElfTlsGetAddr dst name)))) + dst)) + +;;; some float binary operation +;;; 1. need move into x register. +;;; 2. do the operation. +;;; 3. move back. +(decl lower_float_binary (AluOPRRR FReg FReg Type) FReg) +(rule + (lower_float_binary op rs1 rs2 ty) + (let ((x_rs1 XReg (move_f_to_x rs1 ty)) + (x_rs2 XReg (move_f_to_x rs2 ty)) + (tmp XReg (alu_rrr op x_rs1 x_rs2))) + (move_x_to_f tmp (float_int_of_same_size ty)))) + + +(decl i128_sub (ValueRegs ValueRegs) ValueRegs) +(rule + (i128_sub x y ) + (let + (;; low part. + (low XReg (rv_sub (value_regs_get x 0) (value_regs_get y 0))) + ;; compute borrow. + (borrow XReg (rv_sltu (value_regs_get x 0) low)) + ;; + (high_tmp XReg (rv_sub (value_regs_get x 1) (value_regs_get y 1))) + ;; + (high XReg (rv_sub high_tmp borrow))) + (value_regs low high))) + +;; Consume a CmpResult, producing a branch on its result. +(decl cond_br (IntegerCompare CondBrTarget CondBrTarget) SideEffectNoResult) +(rule (cond_br cmp then else) + (SideEffectNoResult.Inst + (MInst.CondBr then else cmp))) + +;; Helper for emitting the `j` mnemonic, an unconditional jump to label. +(decl rv_j (MachLabel) SideEffectNoResult) +(rule (rv_j label) + (SideEffectNoResult.Inst (MInst.Jal label))) + +;; Construct an IntegerCompare value. 
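+;; An `IntegerCompare` pairs an `IntCC` condition with two X registers and is
+;; ultimately emitted as a single conditional branch (see `IntegerCompare` in
+;; `inst/args.rs`).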
+(decl int_compare (IntCC XReg XReg) IntegerCompare)
+(extern constructor int_compare int_compare)
+
+;; Extract the components of an `IntegerCompare`
+(decl int_compare_decompose (IntCC XReg XReg) IntegerCompare)
+(extern extractor infallible int_compare_decompose int_compare_decompose)
+
+(decl label_to_br_target (MachLabel) CondBrTarget)
+(extern constructor label_to_br_target label_to_br_target)
+(convert MachLabel CondBrTarget label_to_br_target)
+
+(decl cmp_eqz (XReg) IntegerCompare)
+(rule (cmp_eqz r) (int_compare (IntCC.Equal) r (zero_reg)))
+
+(decl cmp_nez (XReg) IntegerCompare)
+(rule (cmp_nez r) (int_compare (IntCC.NotEqual) r (zero_reg)))
+
+(decl cmp_eq (XReg XReg) IntegerCompare)
+(rule (cmp_eq rs1 rs2) (int_compare (IntCC.Equal) rs1 rs2))
+
+(decl cmp_ne (XReg XReg) IntegerCompare)
+(rule (cmp_ne rs1 rs2) (int_compare (IntCC.NotEqual) rs1 rs2))
+
+(decl cmp_lt (XReg XReg) IntegerCompare)
+(rule (cmp_lt rs1 rs2) (int_compare (IntCC.SignedLessThan) rs1 rs2))
+
+(decl cmp_ltz (XReg) IntegerCompare)
+(rule (cmp_ltz rs) (int_compare (IntCC.SignedLessThan) rs (zero_reg)))
+
+(decl cmp_gt (XReg XReg) IntegerCompare)
+(rule (cmp_gt rs1 rs2) (int_compare (IntCC.SignedGreaterThan) rs1 rs2))
+
+(decl cmp_ge (XReg XReg) IntegerCompare)
+(rule (cmp_ge rs1 rs2) (int_compare (IntCC.SignedGreaterThanOrEqual) rs1 rs2))
+
+(decl cmp_le (XReg XReg) IntegerCompare)
+(rule (cmp_le rs1 rs2) (int_compare (IntCC.SignedLessThanOrEqual) rs1 rs2))
+
+(decl cmp_gtu (XReg XReg) IntegerCompare)
+(rule (cmp_gtu rs1 rs2) (int_compare (IntCC.UnsignedGreaterThan) rs1 rs2))
+
+(decl cmp_geu (XReg XReg) IntegerCompare)
+(rule (cmp_geu rs1 rs2) (int_compare (IntCC.UnsignedGreaterThanOrEqual) rs1 rs2))
+
+(decl cmp_ltu (XReg XReg) IntegerCompare)
+(rule (cmp_ltu rs1 rs2) (int_compare (IntCC.UnsignedLessThan) rs1 rs2))
+
+(decl cmp_leu (XReg XReg) IntegerCompare)
+(rule (cmp_leu rs1 rs2) (int_compare (IntCC.UnsignedLessThanOrEqual) rs1 rs2))
+
+;; Helper to generate an `IntegerCompare` which represents the "truthy" value of
+;; the input provided.
+;;
+;; This is used in `Select` and `brif` for example to generate conditional
+;; branches. The returned comparison, when taken, represents that `Value` is
+;; nonzero. When not taken the input `Value` is zero.
+(decl is_nonzero_cmp (Value) IntegerCompare)
+
+;; Base case - convert to a "truthy" value and compare it against zero.
+;;
+;; Note that non-64-bit types need to be extended since the upper bits from
+;; Cranelift's point of view are undefined. Favor a zero extension for 8-bit
+;; types because that's a single `andi` instruction, but favor sign-extension
+;; for 16 and 32-bit types because many RISC-V instructions which operate on
+;; the low 32 bits of registers sign-extend their results. Additionally the
+;; base 64-bit ISA has a single instruction for sign-extending from 32 to
+;; 64 bits, which makes that a bit cheaper if used.
+(rule 0 (is_nonzero_cmp val @ (value_type (fits_in_64 _)))
+  (cmp_nez (sext val)))
+(rule 1 (is_nonzero_cmp val @ (value_type $I8))
+  (cmp_nez (zext val)))
+(rule 1 (is_nonzero_cmp val @ (value_type $I128))
+  (cmp_nez (rv_or (value_regs_get val 0) (value_regs_get val 1))))
+
+;; If the input value is itself an `icmp` or `fcmp` we can avoid generating the
+;; result of the comparison and instead move the comparison directly into the
+;; `IntegerCompare` that's returned.
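+;;
+;; For example (illustrative): `(brif (icmp eq x y) then else)` can branch with
+;; a single `beq x, y, then` instead of first materializing the `icmp` result
+;; into a register and then comparing that register against zero.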
+(rule 2 (is_nonzero_cmp (maybe_uextend (icmp cc a b @ (value_type (fits_in_64 _))))) + (icmp_to_int_compare cc a b)) +(rule 2 (is_nonzero_cmp (maybe_uextend (fcmp cc a @ (value_type ty) b))) + (fcmp_to_float_compare cc ty a b)) + +;; Creates an `IntegerCompare` from an `icmp` node's parts. This will extend +;; values as necessary to their full register width to perform the +;; comparison. The returned `IntegerCompare` is suitable to use in conditional +;; branches for example. +;; +;; Note that this should ideally only be used when the `IntegerCompare` returned +;; is fed into a branch. If `IntegerCompare` is materialized this will miss out +;; on optimizations to compare against constants using some native instructions. +(decl icmp_to_int_compare (IntCC Value Value) IntegerCompare) +(rule 0 (icmp_to_int_compare cc a b @ (value_type (fits_in_64 in_ty))) + (int_compare cc (put_value_in_reg_for_icmp cc a) (put_value_in_reg_for_icmp cc b))) +(rule 1 (icmp_to_int_compare cc a b @ (value_type $I128)) + (cmp_nez (lower_icmp_i128 cc a b))) + +;; Places a `Value` into a full register width to prepare for a comparison +;; using `IntCC`. +;; +;; This is largely a glorified means of choosing sign-extension or +;; zero-extension for the `Value` input. +(decl put_value_in_reg_for_icmp (IntCC Value) XReg) + +;; Base cases, use the `cc` to determine whether to zero or sign extend. +(rule 0 (put_value_in_reg_for_icmp cc val) + (zext val)) +(rule 1 (put_value_in_reg_for_icmp cc val) + (if (signed_cond_code cc)) + (sext val)) + +;; For equality and inequality favor sign extension since it's generally +;; easier to perform sign extension on RV64 via native instructions. For 8-bit +;; types though use zero-extension since that's a single instruction `and`. +(rule 2 (put_value_in_reg_for_icmp (IntCC.Equal) val @ (value_type (fits_in_64 _))) + (sext val)) +(rule 2 (put_value_in_reg_for_icmp (IntCC.NotEqual) val @ (value_type (fits_in_64 _))) + (sext val)) +(rule 3 (put_value_in_reg_for_icmp (IntCC.Equal) val @ (value_type $I8)) + (zext val)) +(rule 3 (put_value_in_reg_for_icmp (IntCC.NotEqual) val @ (value_type $I8)) + (zext val)) + +;; As a special case use `x0` directly if a constant is 0. +(rule 4 (put_value_in_reg_for_icmp _ (i64_from_iconst 0)) + (zero_reg)) + + +(decl partial lower_branch (Inst MachLabelSlice) Unit) +(rule (lower_branch (jump _) (single_target label)) + (emit_side_effect (rv_j label))) + +(rule (lower_branch (brif v _ _) (two_targets then else)) + (emit_side_effect (cond_br (is_nonzero_cmp v) then else))) + +(decl lower_br_table (Reg MachLabelSlice) Unit) +(extern constructor lower_br_table lower_br_table) + +(rule (lower_branch (br_table index _) targets) + (lower_br_table index targets)) + +(decl load_ra () Reg) +(extern constructor load_ra load_ra) + + +;; Generates a bitcast instruction. +;; Args are: src, src_ty, dst_ty +(decl gen_bitcast (Reg Type Type) Reg) + +;; To support FP16 vfmv.* we need to check for the `zvfh` isa flag, which we currently don't +;; support, so restrict the floating point types to 32/64 bits. 
+(rule 5 (gen_bitcast r (ty_supported_float (ty_32_or_64 src_ty)) (ty_supported_vec _)) (rv_vfmv_sf r src_ty)) +(rule 4 (gen_bitcast r (ty_supported_vec _) (ty_supported_float (ty_32_or_64 dst_ty))) (rv_vfmv_fs r dst_ty)) + +(rule 3 (gen_bitcast r (ty_int_ref_scalar_64 src_ty) (ty_supported_vec _)) (rv_vmv_sx r src_ty)) +(rule 2 (gen_bitcast r (ty_supported_vec _) (ty_int_ref_scalar_64 dst_ty)) (rv_vmv_xs r dst_ty)) +(rule 1 (gen_bitcast r $F16 $I16) (rv_fmvxh r)) +(rule 1 (gen_bitcast r $F32 $I32) (rv_fmvxw r)) +(rule 1 (gen_bitcast r $F64 $I64) (rv_fmvxd r)) +(rule 1 (gen_bitcast r $I16 $F16) (rv_fmvhx r)) +(rule 1 (gen_bitcast r $I32 $F32) (rv_fmvwx r)) +(rule 1 (gen_bitcast r $I64 $F64) (rv_fmvdx r)) +(rule (gen_bitcast r _ _) r) + +(decl move_f_to_x (FReg Type) XReg) +(rule (move_f_to_x r $F32) (gen_bitcast r $F32 $I32)) +(rule (move_f_to_x r $F64) (gen_bitcast r $F64 $I64)) + +(decl move_x_to_f (XReg Type) FReg) +(rule (move_x_to_f r $I32) (gen_bitcast r $I32 $F32)) +(rule (move_x_to_f r $I64) (gen_bitcast r $I64 $F64)) + +(decl float_int_of_same_size (Type) Type) +(rule (float_int_of_same_size $F32) $I32) +(rule (float_int_of_same_size $F64) $I64) + + +(decl gen_brev8 (Reg Type) Reg) +(rule 1 + (gen_brev8 rs _) + (if-let $true (has_zbkb)) + (rv_brev8 rs)) +(rule + (gen_brev8 rs ty) + (if-let $false (has_zbkb)) + (let + ((tmp WritableXReg (temp_writable_xreg)) + (tmp2 WritableXReg (temp_writable_xreg)) + (step WritableXReg (temp_writable_xreg)) + (rd WritableXReg (temp_writable_xreg)) + (_ Unit (emit (MInst.Brev8 rs ty step tmp tmp2 rd)))) + (writable_reg_to_reg rd))) + +;; Negates x +;; Equivalent to 0 - x +(decl neg (Type ValueRegs) ValueRegs) +(rule 1 (neg (fits_in_64 (ty_int ty)) val) + (value_reg + (rv_neg (value_regs_get val 0)))) + +(rule 2 (neg $I128 val) + (i128_sub (value_regs_zero) val)) + + +;; Builds an instruction sequence that traps if the comparison succeeds. +(decl gen_trapif (IntCC XReg XReg TrapCode) InstOutput) +(rule (gen_trapif cc a b trap_code) + (side_effect (SideEffectNoResult.Inst (MInst.TrapIf a b cc trap_code)))) + +;; Builds an instruction sequence that traps if the input is non-zero. +(decl gen_trapnz (XReg TrapCode) InstOutput) +(rule (gen_trapnz test trap_code) + (gen_trapif (IntCC.NotEqual) test (zero_reg) trap_code)) + +;; Builds an instruction sequence that traps if the input is zero. +(decl gen_trapz (XReg TrapCode) InstOutput) +(rule (gen_trapz test trap_code) + (gen_trapif (IntCC.Equal) test (zero_reg) trap_code)) + +;;;; Helpers for Emitting Calls ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(decl gen_call (SigRef ExternalName RelocDistance ValueSlice) InstOutput) +(extern constructor gen_call gen_call) + +(decl gen_call_indirect (SigRef Value ValueSlice) InstOutput) +(extern constructor gen_call_indirect gen_call_indirect) + +;;; this is trying to imitate aarch64 `madd` instruction. +(decl madd (XReg XReg XReg) XReg) +(rule + (madd n m a) + (let + ((t XReg (rv_mul n m))) + (rv_add t a))) + +;;;; Helpers for bmask ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; Generates either 0 if `Value` is zero or -1 otherwise. +(decl gen_bmask (Value) XReg) + +;; Base cases: use `snez` after a sign extension to ensure that the entire +;; register is defined. For i128 we test both the upper and lower half. 
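+;;
+;; For example (illustrative), a nonzero $I32 input lowers roughly to
+;; `sext.w t, x; snez t, t; neg t, t`, producing all ones, while a zero input
+;; produces zero.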
+(rule 0 (gen_bmask val @ (value_type (fits_in_64 _))) + (let ((non_zero XReg (rv_snez (sext val)))) + (rv_neg non_zero))) +(rule 1 (gen_bmask val @ (value_type $I128)) + (let ((non_zero XReg (rv_snez (rv_or (value_regs_get val 0) (value_regs_get val 1))))) + (rv_neg non_zero))) + +;; If the input value is an `icmp` or an `fcmp` directly then the `snez` can +;; be omitted because the result of the icmp or fcmp is a 0 or 1 directly. This +;; means we can go straight to the `neg` instruction to produce the final +;; result. +(rule 2 (gen_bmask val @ (maybe_uextend (icmp _ _ _))) (rv_neg val)) +(rule 2 (gen_bmask val @ (maybe_uextend (fcmp _ _ _))) (rv_neg val)) + +(decl lower_bmask (Value Type) ValueRegs) +(rule 0 (lower_bmask val (fits_in_64 _)) + (value_reg (gen_bmask val))) +(rule 1 (lower_bmask val $I128) + (let ((bits XReg (gen_bmask val))) + (value_regs bits bits))) + +;;;; Helpers for physical registers ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(decl gen_mov_from_preg (PReg) Reg) + +(rule + (gen_mov_from_preg rm) + (let ((rd WritableXReg (temp_writable_xreg)) + (_ Unit (emit (MInst.MovFromPReg rd rm)))) + rd)) + +(decl fp_reg () PReg) +(extern constructor fp_reg fp_reg) + +(decl sp_reg () PReg) +(extern constructor sp_reg sp_reg) + +;; Extractor that matches all registers, except the zero register +(decl non_zero_reg () XReg) +(extern extractor non_zero_reg is_non_zero_reg) + +;; Helper for creating the zero register. +(decl zero_reg () XReg) +(extern constructor zero_reg zero_reg) +(extern extractor zero_reg is_zero_reg) + +(decl value_regs_zero () ValueRegs) +(rule (value_regs_zero) + (value_regs (imm $I64 0) (imm $I64 0))) + +(decl writable_zero_reg () WritableReg) +(extern constructor writable_zero_reg writable_zero_reg) + + +;;;; Helpers for floating point comparisons ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(type FloatCompare (enum + ;; The comparison succeeded if `r` is one + (One (r XReg)) + ;; The comparison succeeded if `r` is zero + (Zero (r XReg)) +)) + +(decl float_compare_invert (FloatCompare) FloatCompare) +(rule (float_compare_invert (FloatCompare.One r)) (FloatCompare.Zero r)) +(rule (float_compare_invert (FloatCompare.Zero r)) (FloatCompare.One r)) + +(decl float_to_int_compare (FloatCompare) IntegerCompare) +(rule (float_to_int_compare (FloatCompare.One r)) (cmp_nez r)) +(rule (float_to_int_compare (FloatCompare.Zero r)) (cmp_eqz r)) +(convert FloatCompare IntegerCompare float_to_int_compare) + +;; Compare two floating point numbers and return a zero/non-zero result. +(decl fcmp_to_float_compare (FloatCC Type FReg FReg) FloatCompare) + +;; Direct codegen for unordered comparisons is not that efficient, so invert +;; the comparison to get an ordered comparison and generate that. Then invert +;; the result to produce the final fcmp result. 
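+;;
+;; For example (illustrative): `FloatCC.UnorderedGreaterThan` is lowered as the
+;; complement of `FloatCC.LessThanOrEqual`, i.e. an `fle` whose result is then
+;; tested for zero via `FloatCompare.Zero`.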
+(rule 0 (fcmp_to_float_compare cc ty a b) + (if-let $true (floatcc_unordered cc)) + (float_compare_invert (fcmp_to_float_compare (floatcc_complement cc) ty a b))) + +;; a is not nan && b is not nan +(rule 1 (fcmp_to_float_compare (FloatCC.Ordered) ty a b) + (FloatCompare.One (rv_and (is_not_nan ty a) (is_not_nan ty b)))) + +(decl is_not_nan (Type FReg) XReg) +(rule (is_not_nan ty a) (rv_feq ty a a)) + +;; a == b +(rule 1 (fcmp_to_float_compare (FloatCC.Equal) ty a b) + (FloatCompare.One (rv_feq ty a b))) + +;; a != b +;; == !(a == b) +(rule 1 (fcmp_to_float_compare (FloatCC.NotEqual) ty a b) + (FloatCompare.Zero (rv_feq ty a b))) + +;; a < b || a > b +(rule 1 (fcmp_to_float_compare (FloatCC.OrderedNotEqual) ty a b) + (FloatCompare.One (rv_or (rv_flt ty a b) (rv_fgt ty a b)))) + +;; a < b +(rule 1 (fcmp_to_float_compare (FloatCC.LessThan) ty a b) + (FloatCompare.One (rv_flt ty a b))) + +;; a <= b +(rule 1 (fcmp_to_float_compare (FloatCC.LessThanOrEqual) ty a b) + (FloatCompare.One (rv_fle ty a b))) + +;; a > b +(rule 1 (fcmp_to_float_compare (FloatCC.GreaterThan) ty a b) + (FloatCompare.One (rv_fgt ty a b))) + +;; a >= b +(rule 1 (fcmp_to_float_compare (FloatCC.GreaterThanOrEqual) ty a b) + (FloatCompare.One (rv_fge ty a b))) diff --git a/hbcb/src/inst/args.rs b/hbcb/src/inst/args.rs new file mode 100644 index 0000000..d28e59b --- /dev/null +++ b/hbcb/src/inst/args.rs @@ -0,0 +1,1957 @@ +//! Riscv64 ISA definitions: instruction arguments. + +use super::*; +use crate::ir::condcodes::CondCode; + +use crate::lower::isle::generated_code::{ + COpcodeSpace, CaOp, CbOp, CiOp, CiwOp, ClOp, CrOp, CsOp, CssOp, CsznOp, FpuOPWidth, ZcbMemOp, +}; +use crate::machinst::isle::WritableReg; + +use std::fmt::Result; + +/// A macro for defining a newtype of `Reg` that enforces some invariant about +/// the wrapped `Reg` (such as that it is of a particular register class). +macro_rules! newtype_of_reg { + ( + $newtype_reg:ident, + $newtype_writable_reg:ident, + |$check_reg:ident| $check:expr + ) => { + /// A newtype wrapper around `Reg`. + #[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)] + pub struct $newtype_reg(Reg); + + impl PartialEq for $newtype_reg { + fn eq(&self, other: &Reg) -> bool { + self.0 == *other + } + } + + impl From<$newtype_reg> for Reg { + fn from(r: $newtype_reg) -> Self { + r.0 + } + } + + impl $newtype_reg { + /// Create this newtype from the given register, or return `None` if the register + /// is not a valid instance of this newtype. + pub fn new($check_reg: Reg) -> Option { + if $check { + Some(Self($check_reg)) + } else { + None + } + } + + /// Get this newtype's underlying `Reg`. + pub fn to_reg(self) -> Reg { + self.0 + } + } + + // Convenience impl so that people working with this newtype can use it + // "just like" a plain `Reg`. + // + // NB: We cannot implement `DerefMut` because that would let people do + // nasty stuff like `*my_xreg.deref_mut() = some_freg`, breaking the + // invariants that `XReg` provides. + impl std::ops::Deref for $newtype_reg { + type Target = Reg; + + fn deref(&self) -> &Reg { + &self.0 + } + } + + /// Writable Reg. + pub type $newtype_writable_reg = Writable<$newtype_reg>; + }; +} + +// Newtypes for registers classes. +newtype_of_reg!(XReg, WritableXReg, |reg| reg.class() == RegClass::Int); +newtype_of_reg!(FReg, WritableFReg, |reg| reg.class() == RegClass::Float); +newtype_of_reg!(VReg, WritableVReg, |reg| reg.class() == RegClass::Vector); + +/// An addressing mode specified for a load/store operation. 
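+///
+/// For example (illustrative), `AMode::RegOffset(a0, 16)` is printed as
+/// `16(a0)`; offsets that do not fit in a 12-bit immediate are handled with
+/// extra instructions during emission.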
+#[derive(Clone, Debug, Copy)] +pub enum AMode { + /// Arbitrary offset from a register. Converted to generation of large + /// offsets with multiple instructions as necessary during code emission. + RegOffset(Reg, i64), + /// Offset from the stack pointer. + SPOffset(i64), + + /// Offset from the frame pointer. + FPOffset(i64), + + /// Offset into the slot area of the stack, which lies just above the + /// outgoing argument area that's setup by the function prologue. + /// At emission time, this is converted to `SPOffset` with a fixup added to + /// the offset constant. The fixup is a running value that is tracked as + /// emission iterates through instructions in linear order, and can be + /// adjusted up and down with [Inst::VirtualSPOffsetAdj]. + /// + /// The standard ABI is in charge of handling this (by emitting the + /// adjustment meta-instructions). See the diagram in the documentation + /// for [crate::isa::aarch64::abi](the ABI module) for more details. + SlotOffset(i64), + + /// Offset into the argument area. + IncomingArg(i64), + + /// A reference to a constant which is placed outside of the function's + /// body, typically at the end. + Const(VCodeConstant), + + /// A reference to a label. + Label(MachLabel), +} + +impl AMode { + /// Add the registers referenced by this AMode to `collector`. + pub(crate) fn get_operands(&mut self, collector: &mut impl OperandVisitor) { + match self { + AMode::RegOffset(reg, ..) => collector.reg_use(reg), + // Registers used in these modes aren't allocatable. + AMode::SPOffset(..) + | AMode::FPOffset(..) + | AMode::SlotOffset(..) + | AMode::IncomingArg(..) + | AMode::Const(..) + | AMode::Label(..) => {} + } + } + + pub(crate) fn get_base_register(&self) -> Option { + match self { + &AMode::RegOffset(reg, ..) => Some(reg), + &AMode::SPOffset(..) => Some(stack_reg()), + &AMode::FPOffset(..) => Some(fp_reg()), + &AMode::SlotOffset(..) => Some(stack_reg()), + &AMode::IncomingArg(..) => Some(stack_reg()), + &AMode::Const(..) | AMode::Label(..) => None, + } + } + + pub(crate) fn get_offset_with_state(&self, state: &EmitState) -> i64 { + match self { + &AMode::SlotOffset(offset) => { + offset + i64::from(state.frame_layout().outgoing_args_size) + } + + // Compute the offset into the incoming argument area relative to SP + &AMode::IncomingArg(offset) => { + let frame_layout = state.frame_layout(); + let sp_offset = frame_layout.tail_args_size + + frame_layout.setup_area_size + + frame_layout.clobber_size + + frame_layout.fixed_frame_storage_size + + frame_layout.outgoing_args_size; + i64::from(sp_offset) - offset + } + + &AMode::RegOffset(_, offset) => offset, + &AMode::SPOffset(offset) => offset, + &AMode::FPOffset(offset) => offset, + &AMode::Const(_) | &AMode::Label(_) => 0, + } + } + + /// Retrieve a MachLabel that corresponds to this addressing mode, if it exists. + pub(crate) fn get_label_with_sink(&self, sink: &mut MachBuffer) -> Option { + match self { + &AMode::Const(addr) => Some(sink.get_label_for_constant(addr)), + &AMode::Label(label) => Some(label), + &AMode::RegOffset(..) + | &AMode::SPOffset(..) + | &AMode::FPOffset(..) + | &AMode::IncomingArg(..) + | &AMode::SlotOffset(..) => None, + } + } +} + +impl Display for AMode { + fn fmt(&self, f: &mut Formatter<'_>) -> Result { + match self { + &AMode::RegOffset(r, offset, ..) => { + write!(f, "{}({})", offset, reg_name(r)) + } + &AMode::SPOffset(offset, ..) => { + write!(f, "{offset}(sp)") + } + &AMode::SlotOffset(offset, ..) 
=> { + write!(f, "{offset}(slot)") + } + &AMode::IncomingArg(offset) => { + write!(f, "-{offset}(incoming_arg)") + } + &AMode::FPOffset(offset, ..) => { + write!(f, "{offset}(fp)") + } + &AMode::Const(addr, ..) => { + write!(f, "[const({})]", addr.as_u32()) + } + &AMode::Label(label) => { + write!(f, "[label{}]", label.as_u32()) + } + } + } +} + +impl Into for StackAMode { + fn into(self) -> AMode { + match self { + StackAMode::IncomingArg(offset, stack_args_size) => { + AMode::IncomingArg(i64::from(stack_args_size) - offset) + } + StackAMode::OutgoingArg(offset) => AMode::SPOffset(offset), + StackAMode::Slot(offset) => AMode::SlotOffset(offset), + } + } +} + +/// risc-v always take two register to compare +#[derive(Clone, Copy, Debug)] +pub struct IntegerCompare { + pub(crate) kind: IntCC, + pub(crate) rs1: Reg, + pub(crate) rs2: Reg, +} + +pub(crate) enum BranchFunct3 { + // == + Eq, + // != + Ne, + // signed < + Lt, + // signed >= + Ge, + // unsigned < + Ltu, + // unsigned >= + Geu, +} + +impl BranchFunct3 { + pub(crate) fn funct3(self) -> u32 { + match self { + BranchFunct3::Eq => 0b000, + BranchFunct3::Ne => 0b001, + BranchFunct3::Lt => 0b100, + BranchFunct3::Ge => 0b101, + BranchFunct3::Ltu => 0b110, + BranchFunct3::Geu => 0b111, + } + } +} + +impl IntegerCompare { + pub(crate) fn op_code(self) -> u32 { + 0b1100011 + } + + // funct3 and if need inverse the register + pub(crate) fn funct3(&self) -> (BranchFunct3, bool) { + match self.kind { + IntCC::Equal => (BranchFunct3::Eq, false), + IntCC::NotEqual => (BranchFunct3::Ne, false), + IntCC::SignedLessThan => (BranchFunct3::Lt, false), + IntCC::SignedGreaterThanOrEqual => (BranchFunct3::Ge, false), + + IntCC::SignedGreaterThan => (BranchFunct3::Lt, true), + IntCC::SignedLessThanOrEqual => (BranchFunct3::Ge, true), + + IntCC::UnsignedLessThan => (BranchFunct3::Ltu, false), + IntCC::UnsignedGreaterThanOrEqual => (BranchFunct3::Geu, false), + + IntCC::UnsignedGreaterThan => (BranchFunct3::Ltu, true), + IntCC::UnsignedLessThanOrEqual => (BranchFunct3::Geu, true), + } + } + + #[inline] + pub(crate) fn op_name(&self) -> &'static str { + match self.kind { + IntCC::Equal => "beq", + IntCC::NotEqual => "bne", + IntCC::SignedLessThan => "blt", + IntCC::SignedGreaterThanOrEqual => "bge", + IntCC::SignedGreaterThan => "bgt", + IntCC::SignedLessThanOrEqual => "ble", + IntCC::UnsignedLessThan => "bltu", + IntCC::UnsignedGreaterThanOrEqual => "bgeu", + IntCC::UnsignedGreaterThan => "bgtu", + IntCC::UnsignedLessThanOrEqual => "bleu", + } + } + + pub(crate) fn emit(self) -> u32 { + let (funct3, reverse) = self.funct3(); + let (rs1, rs2) = if reverse { + (self.rs2, self.rs1) + } else { + (self.rs1, self.rs2) + }; + + self.op_code() + | funct3.funct3() << 12 + | reg_to_gpr_num(rs1) << 15 + | reg_to_gpr_num(rs2) << 20 + } + + pub(crate) fn inverse(self) -> Self { + Self { + kind: self.kind.complement(), + ..self + } + } + + pub(crate) fn regs(&self) -> [Reg; 2] { + [self.rs1, self.rs2] + } +} + +#[derive(Debug, Clone, Copy, PartialEq)] +pub struct FliConstant(u8); + +impl FliConstant { + pub(crate) fn new(value: u8) -> Self { + debug_assert!(value <= 31, "Invalid FliConstant: {value}"); + Self(value) + } + + pub(crate) fn maybe_from_u64(ty: Type, imm: u64) -> Option { + // Convert the value into an F64, this allows us to represent + // values from both f32 and f64 in the same value. 
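+        // For example (illustrative): `maybe_from_u64(F32, 0x3f80_0000)` is
+        // 1.0f32, which maps to table index 16 and can therefore be
+        // materialized with a single `fli.s`.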
+ let value = match ty { + F32 => f32::from_bits(imm as u32) as f64, + F64 => f64::from_bits(imm), + _ => unimplemented!(), + }; + + Some(match (ty, value) { + (_, f) if f == -1.0 => Self::new(0), + + // Since f64 can represent all f32 values, f32::min_positive won't be + // the same as f64::min_positive, so we need to check for both indepenendtly + (F32, f) if f == (f32::MIN_POSITIVE as f64) => Self::new(1), + (F64, f) if f == f64::MIN_POSITIVE => Self::new(1), + + (_, f) if f == 2.0f64.powi(-16) => Self::new(2), + (_, f) if f == 2.0f64.powi(-15) => Self::new(3), + (_, f) if f == 2.0f64.powi(-8) => Self::new(4), + (_, f) if f == 2.0f64.powi(-7) => Self::new(5), + (_, f) if f == 0.0625 => Self::new(6), + (_, f) if f == 0.125 => Self::new(7), + (_, f) if f == 0.25 => Self::new(8), + (_, f) if f == 0.3125 => Self::new(9), + (_, f) if f == 0.375 => Self::new(10), + (_, f) if f == 0.4375 => Self::new(11), + (_, f) if f == 0.5 => Self::new(12), + (_, f) if f == 0.625 => Self::new(13), + (_, f) if f == 0.75 => Self::new(14), + (_, f) if f == 0.875 => Self::new(15), + (_, f) if f == 1.0 => Self::new(16), + (_, f) if f == 1.25 => Self::new(17), + (_, f) if f == 1.5 => Self::new(18), + (_, f) if f == 1.75 => Self::new(19), + (_, f) if f == 2.0 => Self::new(20), + (_, f) if f == 2.5 => Self::new(21), + (_, f) if f == 3.0 => Self::new(22), + (_, f) if f == 4.0 => Self::new(23), + (_, f) if f == 8.0 => Self::new(24), + (_, f) if f == 16.0 => Self::new(25), + (_, f) if f == 128.0 => Self::new(26), + (_, f) if f == 256.0 => Self::new(27), + (_, f) if f == 32768.0 => Self::new(28), + (_, f) if f == 65536.0 => Self::new(29), + (_, f) if f == f64::INFINITY => Self::new(30), + + // NaN's are not guaranteed to preserve the sign / payload bits, so we need to check + // the original bits directly. + (F32, f) if f.is_nan() && imm == 0x7fc0_0000 => Self::new(31), // Canonical NaN + (F64, f) if f.is_nan() && imm == 0x7ff8_0000_0000_0000 => Self::new(31), // Canonical NaN + _ => return None, + }) + } + + pub(crate) fn format(self) -> &'static str { + // The preferred assembly syntax for entries 1, 30, and 31 is min, inf, and nan, respectively. + // For entries 0 through 29 (including entry 1), the assembler will accept decimal constants + // in C-like syntax. 
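+        //
+        // For example (illustrative), index 1 is printed as `min` here, so a
+        // disassembly shows `fli.s ft0, min` rather than the decimal value of
+        // `f32::MIN_POSITIVE`.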
+ match self.0 { + 0 => "-1.0", + 1 => "min", + 2 => "2^-16", + 3 => "2^-15", + 4 => "2^-8", + 5 => "2^-7", + 6 => "0.0625", + 7 => "0.125", + 8 => "0.25", + 9 => "0.3125", + 10 => "0.375", + 11 => "0.4375", + 12 => "0.5", + 13 => "0.625", + 14 => "0.75", + 15 => "0.875", + 16 => "1.0", + 17 => "1.25", + 18 => "1.5", + 19 => "1.75", + 20 => "2.0", + 21 => "2.5", + 22 => "3.0", + 23 => "4.0", + 24 => "8.0", + 25 => "16.0", + 26 => "128.0", + 27 => "256.0", + 28 => "32768.0", + 29 => "65536.0", + 30 => "inf", + 31 => "nan", + _ => panic!("Invalid FliConstant"), + } + } + + pub(crate) fn bits(self) -> u8 { + self.0 + } +} + +impl FpuOPRRRR { + pub(crate) fn op_name(self, width: FpuOPWidth) -> String { + match self { + Self::Fmadd => format!("fmadd.{width}"), + Self::Fmsub => format!("fmsub.{width}"), + Self::Fnmsub => format!("fnmsub.{width}"), + Self::Fnmadd => format!("fnmadd.{width}"), + } + } + + pub(crate) fn opcode(self) -> u32 { + match self { + Self::Fmadd => 0b1000011, + Self::Fmsub => 0b1000111, + Self::Fnmsub => 0b1001011, + Self::Fnmadd => 0b1001111, + } + } +} + +impl FpuOPRR { + pub(crate) fn op_name(self, width: FpuOPWidth) -> String { + let fmv_width = match width { + FpuOPWidth::H => "h", + FpuOPWidth::S => "w", + FpuOPWidth::D => "d", + FpuOPWidth::Q => "q", + }; + match self { + Self::Fsqrt => format!("fsqrt.{width}"), + Self::Fround => format!("fround.{width}"), + Self::Fclass => format!("fclass.{width}"), + Self::FcvtWFmt => format!("fcvt.w.{width}"), + Self::FcvtWuFmt => format!("fcvt.wu.{width}"), + Self::FcvtLFmt => format!("fcvt.l.{width}"), + Self::FcvtLuFmt => format!("fcvt.lu.{width}"), + Self::FcvtFmtW => format!("fcvt.{width}.w"), + Self::FcvtFmtWu => format!("fcvt.{width}.wu"), + Self::FcvtFmtL => format!("fcvt.{width}.l"), + Self::FcvtFmtLu => format!("fcvt.{width}.lu"), + + // fmv instructions deviate from the normal encoding and instead + // encode the width as "w" instead of "s". The ISA manual gives this rationale: + // + // Instructions FMV.S.X and FMV.X.S were renamed to FMV.W.X and FMV.X.W respectively + // to be more consistent with their semantics, which did not change. The old names will continue + // to be supported in the tools. 
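+            //
+            // Hence `fmv_width` above maps `FpuOPWidth::S` to "w", so the
+            // single-precision moves print as `fmv.x.w` / `fmv.w.x`.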
+ Self::FmvXFmt => format!("fmv.x.{fmv_width}"), + Self::FmvFmtX => format!("fmv.{fmv_width}.x"), + + Self::FcvtSD => "fcvt.s.d".to_string(), + Self::FcvtDS => "fcvt.d.s".to_string(), + } + } + + pub(crate) fn is_convert_to_int(self) -> bool { + match self { + Self::FcvtWFmt | Self::FcvtWuFmt | Self::FcvtLFmt | Self::FcvtLuFmt => true, + _ => false, + } + } + + pub(crate) fn has_frm(self) -> bool { + match self { + FpuOPRR::FmvXFmt | FpuOPRR::FmvFmtX | FpuOPRR::Fclass => false, + _ => true, + } + } + + pub(crate) fn opcode(self) -> u32 { + // OP-FP Major opcode + 0b1010011 + } + + pub(crate) fn rs2(self) -> u32 { + match self { + Self::Fsqrt => 0b00000, + Self::Fround => 0b00100, + Self::Fclass => 0b00000, + Self::FcvtWFmt => 0b00000, + Self::FcvtWuFmt => 0b00001, + Self::FcvtLFmt => 0b00010, + Self::FcvtLuFmt => 0b00011, + Self::FcvtFmtW => 0b00000, + Self::FcvtFmtWu => 0b00001, + Self::FcvtFmtL => 0b00010, + Self::FcvtFmtLu => 0b00011, + Self::FmvXFmt => 0b00000, + Self::FmvFmtX => 0b00000, + Self::FcvtSD => 0b00001, + Self::FcvtDS => 0b00000, + } + } + + pub(crate) fn funct5(self) -> u32 { + match self { + Self::Fsqrt => 0b01011, + Self::Fround => 0b01000, + Self::Fclass => 0b11100, + Self::FcvtWFmt => 0b11000, + Self::FcvtWuFmt => 0b11000, + Self::FcvtLFmt => 0b11000, + Self::FcvtLuFmt => 0b11000, + Self::FcvtFmtW => 0b11010, + Self::FcvtFmtWu => 0b11010, + Self::FcvtFmtL => 0b11010, + Self::FcvtFmtLu => 0b11010, + Self::FmvXFmt => 0b11100, + Self::FmvFmtX => 0b11110, + Self::FcvtSD => 0b01000, + Self::FcvtDS => 0b01000, + } + } + + pub(crate) fn funct7(self, width: FpuOPWidth) -> u32 { + (self.funct5() << 2) | width.as_u32() + } +} + +impl FpuOPRRR { + pub(crate) fn op_name(self, width: FpuOPWidth) -> String { + match self { + Self::Fadd => format!("fadd.{width}"), + Self::Fsub => format!("fsub.{width}"), + Self::Fmul => format!("fmul.{width}"), + Self::Fdiv => format!("fdiv.{width}"), + Self::Fsgnj => format!("fsgnj.{width}"), + Self::Fsgnjn => format!("fsgnjn.{width}"), + Self::Fsgnjx => format!("fsgnjx.{width}"), + Self::Fmin => format!("fmin.{width}"), + Self::Fmax => format!("fmax.{width}"), + Self::Feq => format!("feq.{width}"), + Self::Flt => format!("flt.{width}"), + Self::Fle => format!("fle.{width}"), + Self::Fminm => format!("fminm.{width}"), + Self::Fmaxm => format!("fmaxm.{width}"), + } + } + + pub(crate) fn opcode(self) -> u32 { + // OP-FP Major opcode + 0b1010011 + } + + pub(crate) const fn funct5(self) -> u32 { + match self { + Self::Fadd => 0b00000, + Self::Fsub => 0b00001, + Self::Fmul => 0b00010, + Self::Fdiv => 0b00011, + Self::Fsgnj => 0b00100, + Self::Fsgnjn => 0b00100, + Self::Fsgnjx => 0b00100, + Self::Fmin => 0b00101, + Self::Fmax => 0b00101, + Self::Feq => 0b10100, + Self::Flt => 0b10100, + Self::Fle => 0b10100, + Self::Fminm => 0b00101, + Self::Fmaxm => 0b00101, + } + } + + pub(crate) fn funct7(self, width: FpuOPWidth) -> u32 { + (self.funct5() << 2) | width.as_u32() + } + + pub(crate) fn has_frm(self) -> bool { + match self { + FpuOPRRR::Fsgnj + | FpuOPRRR::Fsgnjn + | FpuOPRRR::Fsgnjx + | FpuOPRRR::Fmin + | FpuOPRRR::Fmax + | FpuOPRRR::Feq + | FpuOPRRR::Flt + | FpuOPRRR::Fle => false, + _ => true, + } + } +} + +impl Display for FpuOPWidth { + fn fmt(&self, f: &mut Formatter<'_>) -> Result { + write!( + f, + "{}", + match self { + FpuOPWidth::H => "h", + FpuOPWidth::S => "s", + FpuOPWidth::D => "d", + FpuOPWidth::Q => "q", + } + ) + } +} + +impl TryFrom for FpuOPWidth { + type Error = &'static str; + + fn try_from(value: Type) -> std::result::Result { + 
match value { + F16 => Ok(FpuOPWidth::H), + F32 => Ok(FpuOPWidth::S), + F64 => Ok(FpuOPWidth::D), + F128 => Ok(FpuOPWidth::Q), + _ => Err("Invalid type for FpuOPWidth"), + } + } +} + +impl FpuOPWidth { + pub(crate) fn as_u32(&self) -> u32 { + match self { + FpuOPWidth::S => 0b00, + FpuOPWidth::D => 0b01, + FpuOPWidth::H => 0b10, + FpuOPWidth::Q => 0b11, + } + } +} + +impl AluOPRRR { + pub(crate) const fn op_name(self) -> &'static str { + match self { + Self::Add => "add", + Self::Sub => "sub", + Self::Sll => "sll", + Self::Slt => "slt", + Self::Sgt => "sgt", + Self::SltU => "sltu", + Self::Sgtu => "sgtu", + Self::Xor => "xor", + Self::Srl => "srl", + Self::Sra => "sra", + Self::Or => "or", + Self::And => "and", + Self::Addw => "addw", + Self::Subw => "subw", + Self::Sllw => "sllw", + Self::Srlw => "srlw", + Self::Sraw => "sraw", + Self::Mul => "mul", + Self::Mulh => "mulh", + Self::Mulhsu => "mulhsu", + Self::Mulhu => "mulhu", + Self::Div => "div", + Self::DivU => "divu", + Self::Rem => "rem", + Self::RemU => "remu", + Self::Mulw => "mulw", + Self::Divw => "divw", + Self::Divuw => "divuw", + Self::Remw => "remw", + Self::Remuw => "remuw", + Self::Adduw => "add.uw", + Self::Andn => "andn", + Self::Bclr => "bclr", + Self::Bext => "bext", + Self::Binv => "binv", + Self::Bset => "bset", + Self::Clmul => "clmul", + Self::Clmulh => "clmulh", + Self::Clmulr => "clmulr", + Self::Max => "max", + Self::Maxu => "maxu", + Self::Min => "min", + Self::Minu => "minu", + Self::Orn => "orn", + Self::Rol => "rol", + Self::Rolw => "rolw", + Self::Ror => "ror", + Self::Rorw => "rorw", + Self::Sh1add => "sh1add", + Self::Sh1adduw => "sh1add.uw", + Self::Sh2add => "sh2add", + Self::Sh2adduw => "sh2add.uw", + Self::Sh3add => "sh3add", + Self::Sh3adduw => "sh3add.uw", + Self::Xnor => "xnor", + Self::Pack => "pack", + Self::Packw => "packw", + Self::Packh => "packh", + Self::CzeroEqz => "czero.eqz", + Self::CzeroNez => "czero.nez", + } + } + + pub fn funct3(self) -> u32 { + match self { + AluOPRRR::Add => 0b000, + AluOPRRR::Sll => 0b001, + AluOPRRR::Slt => 0b010, + AluOPRRR::Sgt => 0b010, + AluOPRRR::SltU => 0b011, + AluOPRRR::Sgtu => 0b011, + AluOPRRR::Xor => 0b100, + AluOPRRR::Srl => 0b101, + AluOPRRR::Sra => 0b101, + AluOPRRR::Or => 0b110, + AluOPRRR::And => 0b111, + AluOPRRR::Sub => 0b000, + + AluOPRRR::Addw => 0b000, + AluOPRRR::Subw => 0b000, + AluOPRRR::Sllw => 0b001, + AluOPRRR::Srlw => 0b101, + AluOPRRR::Sraw => 0b101, + + AluOPRRR::Mul => 0b000, + AluOPRRR::Mulh => 0b001, + AluOPRRR::Mulhsu => 0b010, + AluOPRRR::Mulhu => 0b011, + AluOPRRR::Div => 0b100, + AluOPRRR::DivU => 0b101, + AluOPRRR::Rem => 0b110, + AluOPRRR::RemU => 0b111, + + AluOPRRR::Mulw => 0b000, + AluOPRRR::Divw => 0b100, + AluOPRRR::Divuw => 0b101, + AluOPRRR::Remw => 0b110, + AluOPRRR::Remuw => 0b111, + + // Zbb + AluOPRRR::Adduw => 0b000, + AluOPRRR::Andn => 0b111, + AluOPRRR::Bclr => 0b001, + AluOPRRR::Bext => 0b101, + AluOPRRR::Binv => 0b001, + AluOPRRR::Bset => 0b001, + AluOPRRR::Clmul => 0b001, + AluOPRRR::Clmulh => 0b011, + AluOPRRR::Clmulr => 0b010, + AluOPRRR::Max => 0b110, + AluOPRRR::Maxu => 0b111, + AluOPRRR::Min => 0b100, + AluOPRRR::Minu => 0b101, + AluOPRRR::Orn => 0b110, + AluOPRRR::Rol => 0b001, + AluOPRRR::Rolw => 0b001, + AluOPRRR::Ror => 0b101, + AluOPRRR::Rorw => 0b101, + AluOPRRR::Sh1add => 0b010, + AluOPRRR::Sh1adduw => 0b010, + AluOPRRR::Sh2add => 0b100, + AluOPRRR::Sh2adduw => 0b100, + AluOPRRR::Sh3add => 0b110, + AluOPRRR::Sh3adduw => 0b110, + AluOPRRR::Xnor => 0b100, + + // Zbkb + AluOPRRR::Pack => 0b100, + 
AluOPRRR::Packw => 0b100, + AluOPRRR::Packh => 0b111, + + // ZiCond + AluOPRRR::CzeroEqz => 0b101, + AluOPRRR::CzeroNez => 0b111, + } + } + + pub fn op_code(self) -> u32 { + match self { + AluOPRRR::Add + | AluOPRRR::Sub + | AluOPRRR::Sll + | AluOPRRR::Slt + | AluOPRRR::Sgt + | AluOPRRR::SltU + | AluOPRRR::Sgtu + | AluOPRRR::Xor + | AluOPRRR::Srl + | AluOPRRR::Sra + | AluOPRRR::Or + | AluOPRRR::And + | AluOPRRR::Pack + | AluOPRRR::Packh => 0b0110011, + + AluOPRRR::Addw + | AluOPRRR::Subw + | AluOPRRR::Sllw + | AluOPRRR::Srlw + | AluOPRRR::Sraw + | AluOPRRR::Packw => 0b0111011, + + AluOPRRR::Mul + | AluOPRRR::Mulh + | AluOPRRR::Mulhsu + | AluOPRRR::Mulhu + | AluOPRRR::Div + | AluOPRRR::DivU + | AluOPRRR::Rem + | AluOPRRR::RemU => 0b0110011, + + AluOPRRR::Mulw + | AluOPRRR::Divw + | AluOPRRR::Divuw + | AluOPRRR::Remw + | AluOPRRR::Remuw => 0b0111011, + + AluOPRRR::Adduw => 0b0111011, + AluOPRRR::Andn + | AluOPRRR::Bclr + | AluOPRRR::Bext + | AluOPRRR::Binv + | AluOPRRR::Bset + | AluOPRRR::Clmul + | AluOPRRR::Clmulh + | AluOPRRR::Clmulr + | AluOPRRR::Max + | AluOPRRR::Maxu + | AluOPRRR::Min + | AluOPRRR::Minu + | AluOPRRR::Orn + | AluOPRRR::Rol + | AluOPRRR::Ror + | AluOPRRR::Sh1add + | AluOPRRR::Sh2add + | AluOPRRR::Sh3add + | AluOPRRR::Xnor + | AluOPRRR::CzeroEqz + | AluOPRRR::CzeroNez => 0b0110011, + + AluOPRRR::Rolw + | AluOPRRR::Rorw + | AluOPRRR::Sh2adduw + | AluOPRRR::Sh3adduw + | AluOPRRR::Sh1adduw => 0b0111011, + } + } + + pub const fn funct7(self) -> u32 { + match self { + AluOPRRR::Add => 0b0000000, + AluOPRRR::Sub => 0b0100000, + AluOPRRR::Sll => 0b0000000, + AluOPRRR::Slt => 0b0000000, + AluOPRRR::Sgt => 0b0000000, + AluOPRRR::SltU => 0b0000000, + AluOPRRR::Sgtu => 0b0000000, + + AluOPRRR::Xor => 0b0000000, + AluOPRRR::Srl => 0b0000000, + AluOPRRR::Sra => 0b0100000, + AluOPRRR::Or => 0b0000000, + AluOPRRR::And => 0b0000000, + + AluOPRRR::Addw => 0b0000000, + AluOPRRR::Subw => 0b0100000, + AluOPRRR::Sllw => 0b0000000, + AluOPRRR::Srlw => 0b0000000, + AluOPRRR::Sraw => 0b0100000, + + AluOPRRR::Mul => 0b0000001, + AluOPRRR::Mulh => 0b0000001, + AluOPRRR::Mulhsu => 0b0000001, + AluOPRRR::Mulhu => 0b0000001, + AluOPRRR::Div => 0b0000001, + AluOPRRR::DivU => 0b0000001, + AluOPRRR::Rem => 0b0000001, + AluOPRRR::RemU => 0b0000001, + + AluOPRRR::Mulw => 0b0000001, + AluOPRRR::Divw => 0b0000001, + AluOPRRR::Divuw => 0b0000001, + AluOPRRR::Remw => 0b0000001, + AluOPRRR::Remuw => 0b0000001, + AluOPRRR::Adduw => 0b0000100, + AluOPRRR::Andn => 0b0100000, + AluOPRRR::Bclr => 0b0100100, + AluOPRRR::Bext => 0b0100100, + AluOPRRR::Binv => 0b0110100, + AluOPRRR::Bset => 0b0010100, + AluOPRRR::Clmul => 0b0000101, + AluOPRRR::Clmulh => 0b0000101, + AluOPRRR::Clmulr => 0b0000101, + AluOPRRR::Max => 0b0000101, + AluOPRRR::Maxu => 0b0000101, + AluOPRRR::Min => 0b0000101, + AluOPRRR::Minu => 0b0000101, + AluOPRRR::Orn => 0b0100000, + AluOPRRR::Rol => 0b0110000, + AluOPRRR::Rolw => 0b0110000, + AluOPRRR::Ror => 0b0110000, + AluOPRRR::Rorw => 0b0110000, + AluOPRRR::Sh1add => 0b0010000, + AluOPRRR::Sh1adduw => 0b0010000, + AluOPRRR::Sh2add => 0b0010000, + AluOPRRR::Sh2adduw => 0b0010000, + AluOPRRR::Sh3add => 0b0010000, + AluOPRRR::Sh3adduw => 0b0010000, + AluOPRRR::Xnor => 0b0100000, + + // Zbkb + AluOPRRR::Pack => 0b0000100, + AluOPRRR::Packw => 0b0000100, + AluOPRRR::Packh => 0b0000100, + + // ZiCond + AluOPRRR::CzeroEqz => 0b0000111, + AluOPRRR::CzeroNez => 0b0000111, + } + } + + pub(crate) fn reverse_rs(self) -> bool { + // special case. + // sgt and sgtu is not defined in isa. 
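+        // They are pseudo-instructions: `sgt rd, a, b` is `slt rd, b, a`, so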
+ // emit should reverse rs1 and rs2. + self == AluOPRRR::Sgt || self == AluOPRRR::Sgtu + } +} + +impl AluOPRRI { + pub(crate) fn option_funct6(self) -> Option { + let x: Option = match self { + Self::Slli => Some(0b00_0000), + Self::Srli => Some(0b00_0000), + Self::Srai => Some(0b01_0000), + Self::Bclri => Some(0b010010), + Self::Bexti => Some(0b010010), + Self::Binvi => Some(0b011010), + Self::Bseti => Some(0b001010), + Self::Rori => Some(0b011000), + Self::SlliUw => Some(0b000010), + _ => None, + }; + x + } + + pub(crate) fn option_funct7(self) -> Option { + let x = match self { + Self::Slliw => Some(0b000_0000), + Self::SrliW => Some(0b000_0000), + Self::Sraiw => Some(0b010_0000), + Self::Roriw => Some(0b0110000), + _ => None, + }; + x + } + + pub(crate) fn imm12(self, imm12: Imm12) -> u32 { + let x = imm12.bits(); + if let Some(func) = self.option_funct6() { + func << 6 | (x & 0b11_1111) + } else if let Some(func) = self.option_funct7() { + func << 5 | (x & 0b1_1111) + } else if let Some(func) = self.option_funct12() { + func + } else { + x + } + } + + pub(crate) fn option_funct12(self) -> Option { + match self { + Self::Clz => Some(0b011000000000), + Self::Clzw => Some(0b011000000000), + Self::Cpop => Some(0b011000000010), + Self::Cpopw => Some(0b011000000010), + Self::Ctz => Some(0b011000000001), + Self::Ctzw => Some(0b011000000001), + Self::Rev8 => Some(0b011010111000), + Self::Sextb => Some(0b011000000100), + Self::Sexth => Some(0b011000000101), + Self::Zexth => Some(0b000010000000), + Self::Orcb => Some(0b001010000111), + Self::Brev8 => Some(0b0110_1000_0111), + _ => None, + } + } + + pub(crate) fn op_name(self) -> &'static str { + match self { + Self::Addi => "addi", + Self::Slti => "slti", + Self::SltiU => "sltiu", + Self::Xori => "xori", + Self::Ori => "ori", + Self::Andi => "andi", + Self::Slli => "slli", + Self::Srli => "srli", + Self::Srai => "srai", + Self::Addiw => "addiw", + Self::Slliw => "slliw", + Self::SrliW => "srliw", + Self::Sraiw => "sraiw", + Self::Bclri => "bclri", + Self::Bexti => "bexti", + Self::Binvi => "binvi", + Self::Bseti => "bseti", + Self::Rori => "rori", + Self::Roriw => "roriw", + Self::SlliUw => "slli.uw", + Self::Clz => "clz", + Self::Clzw => "clzw", + Self::Cpop => "cpop", + Self::Cpopw => "cpopw", + Self::Ctz => "ctz", + Self::Ctzw => "ctzw", + Self::Rev8 => "rev8", + Self::Sextb => "sext.b", + Self::Sexth => "sext.h", + Self::Zexth => "zext.h", + Self::Orcb => "orc.b", + Self::Brev8 => "brev8", + } + } + + pub fn funct3(self) -> u32 { + match self { + AluOPRRI::Addi => 0b000, + AluOPRRI::Slti => 0b010, + AluOPRRI::SltiU => 0b011, + AluOPRRI::Xori => 0b100, + AluOPRRI::Ori => 0b110, + AluOPRRI::Andi => 0b111, + AluOPRRI::Slli => 0b001, + AluOPRRI::Srli => 0b101, + AluOPRRI::Srai => 0b101, + AluOPRRI::Addiw => 0b000, + AluOPRRI::Slliw => 0b001, + AluOPRRI::SrliW => 0b101, + AluOPRRI::Sraiw => 0b101, + AluOPRRI::Bclri => 0b001, + AluOPRRI::Bexti => 0b101, + AluOPRRI::Binvi => 0b001, + AluOPRRI::Bseti => 0b001, + AluOPRRI::Rori => 0b101, + AluOPRRI::Roriw => 0b101, + AluOPRRI::SlliUw => 0b001, + AluOPRRI::Clz => 0b001, + AluOPRRI::Clzw => 0b001, + AluOPRRI::Cpop => 0b001, + AluOPRRI::Cpopw => 0b001, + AluOPRRI::Ctz => 0b001, + AluOPRRI::Ctzw => 0b001, + AluOPRRI::Rev8 => 0b101, + AluOPRRI::Sextb => 0b001, + AluOPRRI::Sexth => 0b001, + AluOPRRI::Zexth => 0b100, + AluOPRRI::Orcb => 0b101, + AluOPRRI::Brev8 => 0b101, + } + } + + pub fn op_code(self) -> u32 { + match self { + AluOPRRI::Addi + | AluOPRRI::Slti + | AluOPRRI::SltiU + | AluOPRRI::Xori + | 
AluOPRRI::Ori + | AluOPRRI::Andi + | AluOPRRI::Slli + | AluOPRRI::Srli + | AluOPRRI::Srai + | AluOPRRI::Bclri + | AluOPRRI::Bexti + | AluOPRRI::Binvi + | AluOPRRI::Bseti + | AluOPRRI::Rori + | AluOPRRI::Clz + | AluOPRRI::Cpop + | AluOPRRI::Ctz + | AluOPRRI::Rev8 + | AluOPRRI::Sextb + | AluOPRRI::Sexth + | AluOPRRI::Orcb + | AluOPRRI::Brev8 => 0b0010011, + + AluOPRRI::Addiw + | AluOPRRI::Slliw + | AluOPRRI::SrliW + | AluOPRRI::Sraiw + | AluOPRRI::Roriw + | AluOPRRI::SlliUw + | AluOPRRI::Clzw + | AluOPRRI::Cpopw + | AluOPRRI::Ctzw => 0b0011011, + AluOPRRI::Zexth => 0b0111011, + } + } +} + +impl Default for FRM { + fn default() -> Self { + Self::Fcsr + } +} + +/// float rounding mode. +impl FRM { + pub(crate) fn to_static_str(self) -> &'static str { + match self { + FRM::RNE => "rne", + FRM::RTZ => "rtz", + FRM::RDN => "rdn", + FRM::RUP => "rup", + FRM::RMM => "rmm", + FRM::Fcsr => "fcsr", + } + } + + #[inline] + pub(crate) fn bits(self) -> u8 { + match self { + FRM::RNE => 0b000, + FRM::RTZ => 0b001, + FRM::RDN => 0b010, + FRM::RUP => 0b011, + FRM::RMM => 0b100, + FRM::Fcsr => 0b111, + } + } + pub(crate) fn as_u32(self) -> u32 { + self.bits() as u32 + } +} + +impl FFlagsException { + #[inline] + #[allow(dead_code)] + pub(crate) fn mask(self) -> u32 { + match self { + FFlagsException::NV => 1 << 4, + FFlagsException::DZ => 1 << 3, + FFlagsException::OF => 1 << 2, + FFlagsException::UF => 1 << 1, + FFlagsException::NX => 1 << 0, + } + } +} + +impl LoadOP { + pub(crate) fn op_name(self) -> &'static str { + match self { + Self::Lb => "lb", + Self::Lh => "lh", + Self::Lw => "lw", + Self::Lbu => "lbu", + Self::Lhu => "lhu", + Self::Lwu => "lwu", + Self::Ld => "ld", + Self::Flh => "flh", + Self::Flw => "flw", + Self::Fld => "fld", + } + } + + pub(crate) fn from_type(ty: Type) -> Self { + match ty { + F16 => Self::Flh, + F32 => Self::Flw, + F64 => Self::Fld, + I8 => Self::Lb, + I16 => Self::Lh, + I32 => Self::Lw, + I64 => Self::Ld, + _ => unreachable!(), + } + } + + pub(crate) fn size(&self) -> i64 { + match self { + Self::Lb | Self::Lbu => 1, + Self::Lh | Self::Lhu | Self::Flh => 2, + Self::Lw | Self::Lwu | Self::Flw => 4, + Self::Ld | Self::Fld => 8, + } + } + + pub(crate) fn op_code(self) -> u32 { + match self { + Self::Lb | Self::Lh | Self::Lw | Self::Lbu | Self::Lhu | Self::Lwu | Self::Ld => { + 0b0000011 + } + Self::Flh | Self::Flw | Self::Fld => 0b0000111, + } + } + pub(crate) fn funct3(self) -> u32 { + match self { + Self::Lb => 0b000, + Self::Lh => 0b001, + Self::Lw => 0b010, + Self::Lwu => 0b110, + Self::Lbu => 0b100, + Self::Lhu => 0b101, + Self::Ld => 0b011, + Self::Flh => 0b001, + Self::Flw => 0b010, + Self::Fld => 0b011, + } + } +} + +impl StoreOP { + pub(crate) fn op_name(self) -> &'static str { + match self { + Self::Sb => "sb", + Self::Sh => "sh", + Self::Sw => "sw", + Self::Sd => "sd", + Self::Fsh => "fsh", + Self::Fsw => "fsw", + Self::Fsd => "fsd", + } + } + pub(crate) fn from_type(ty: Type) -> Self { + match ty { + F16 => Self::Fsh, + F32 => Self::Fsw, + F64 => Self::Fsd, + I8 => Self::Sb, + I16 => Self::Sh, + I32 => Self::Sw, + I64 => Self::Sd, + _ => unreachable!(), + } + } + + pub(crate) fn size(&self) -> i64 { + match self { + Self::Sb => 1, + Self::Sh | Self::Fsh => 2, + Self::Sw | Self::Fsw => 4, + Self::Sd | Self::Fsd => 8, + } + } + + pub(crate) fn op_code(self) -> u32 { + match self { + Self::Sb | Self::Sh | Self::Sw | Self::Sd => 0b0100011, + Self::Fsh | Self::Fsw | Self::Fsd => 0b0100111, + } + } + pub(crate) fn funct3(self) -> u32 { + match self { + Self::Sb => 
0b000, + Self::Sh => 0b001, + Self::Sw => 0b010, + Self::Sd => 0b011, + Self::Fsh => 0b001, + Self::Fsw => 0b010, + Self::Fsd => 0b011, + } + } +} + +#[allow(dead_code)] +impl FClassResult { + pub(crate) const fn bit(self) -> u32 { + match self { + FClassResult::NegInfinite => 1 << 0, + FClassResult::NegNormal => 1 << 1, + FClassResult::NegSubNormal => 1 << 2, + FClassResult::NegZero => 1 << 3, + FClassResult::PosZero => 1 << 4, + FClassResult::PosSubNormal => 1 << 5, + FClassResult::PosNormal => 1 << 6, + FClassResult::PosInfinite => 1 << 7, + FClassResult::SNaN => 1 << 8, + FClassResult::QNaN => 1 << 9, + } + } + + #[inline] + pub(crate) const fn is_nan_bits() -> u32 { + Self::SNaN.bit() | Self::QNaN.bit() + } + #[inline] + pub(crate) fn is_zero_bits() -> u32 { + Self::NegZero.bit() | Self::PosZero.bit() + } + + #[inline] + pub(crate) fn is_infinite_bits() -> u32 { + Self::PosInfinite.bit() | Self::NegInfinite.bit() + } +} + +impl AtomicOP { + #[inline] + pub(crate) fn is_load(self) -> bool { + match self { + Self::LrW | Self::LrD => true, + _ => false, + } + } + + #[inline] + pub(crate) fn op_name(self, amo: AMO) -> String { + let s = match self { + Self::LrW => "lr.w", + Self::ScW => "sc.w", + + Self::AmoswapW => "amoswap.w", + Self::AmoaddW => "amoadd.w", + Self::AmoxorW => "amoxor.w", + Self::AmoandW => "amoand.w", + Self::AmoorW => "amoor.w", + Self::AmominW => "amomin.w", + Self::AmomaxW => "amomax.w", + Self::AmominuW => "amominu.w", + Self::AmomaxuW => "amomaxu.w", + Self::LrD => "lr.d", + Self::ScD => "sc.d", + Self::AmoswapD => "amoswap.d", + Self::AmoaddD => "amoadd.d", + Self::AmoxorD => "amoxor.d", + Self::AmoandD => "amoand.d", + Self::AmoorD => "amoor.d", + Self::AmominD => "amomin.d", + Self::AmomaxD => "amomax.d", + Self::AmominuD => "amominu.d", + Self::AmomaxuD => "amomaxu.d", + }; + format!("{}{}", s, amo.to_static_str()) + } + #[inline] + pub(crate) fn op_code(self) -> u32 { + 0b0101111 + } + + #[inline] + pub(crate) fn funct7(self, amo: AMO) -> u32 { + self.funct5() << 2 | amo.as_u32() & 0b11 + } + + pub(crate) fn funct3(self) -> u32 { + match self { + AtomicOP::LrW + | AtomicOP::ScW + | AtomicOP::AmoswapW + | AtomicOP::AmoaddW + | AtomicOP::AmoxorW + | AtomicOP::AmoandW + | AtomicOP::AmoorW + | AtomicOP::AmominW + | AtomicOP::AmomaxW + | AtomicOP::AmominuW + | AtomicOP::AmomaxuW => 0b010, + AtomicOP::LrD + | AtomicOP::ScD + | AtomicOP::AmoswapD + | AtomicOP::AmoaddD + | AtomicOP::AmoxorD + | AtomicOP::AmoandD + | AtomicOP::AmoorD + | AtomicOP::AmominD + | AtomicOP::AmomaxD + | AtomicOP::AmominuD + | AtomicOP::AmomaxuD => 0b011, + } + } + pub(crate) fn funct5(self) -> u32 { + match self { + AtomicOP::LrW => 0b00010, + AtomicOP::ScW => 0b00011, + AtomicOP::AmoswapW => 0b00001, + AtomicOP::AmoaddW => 0b00000, + AtomicOP::AmoxorW => 0b00100, + AtomicOP::AmoandW => 0b01100, + AtomicOP::AmoorW => 0b01000, + AtomicOP::AmominW => 0b10000, + AtomicOP::AmomaxW => 0b10100, + AtomicOP::AmominuW => 0b11000, + AtomicOP::AmomaxuW => 0b11100, + AtomicOP::LrD => 0b00010, + AtomicOP::ScD => 0b00011, + AtomicOP::AmoswapD => 0b00001, + AtomicOP::AmoaddD => 0b00000, + AtomicOP::AmoxorD => 0b00100, + AtomicOP::AmoandD => 0b01100, + AtomicOP::AmoorD => 0b01000, + AtomicOP::AmominD => 0b10000, + AtomicOP::AmomaxD => 0b10100, + AtomicOP::AmominuD => 0b11000, + AtomicOP::AmomaxuD => 0b11100, + } + } + + pub(crate) fn load_op(t: Type) -> Self { + if t.bits() <= 32 { + Self::LrW + } else { + Self::LrD + } + } + pub(crate) fn store_op(t: Type) -> Self { + if t.bits() <= 32 { + Self::ScW + } 
else { + Self::ScD + } + } + + /// extract + pub(crate) fn extract(rd: WritableReg, offset: Reg, rs: Reg, ty: Type) -> SmallInstVec { + let mut insts = SmallInstVec::new(); + insts.push(Inst::AluRRR { + alu_op: AluOPRRR::Srl, + rd: rd, + rs1: rs, + rs2: offset, + }); + // + insts.push(Inst::Extend { + rd: rd, + rn: rd.to_reg(), + signed: false, + from_bits: ty.bits() as u8, + to_bits: 64, + }); + insts + } + + /// like extract but sign extend the value. + /// suitable for smax,etc. + pub(crate) fn extract_sext( + rd: WritableReg, + offset: Reg, + rs: Reg, + ty: Type, + ) -> SmallInstVec { + let mut insts = SmallInstVec::new(); + insts.push(Inst::AluRRR { + alu_op: AluOPRRR::Srl, + rd: rd, + rs1: rs, + rs2: offset, + }); + // + insts.push(Inst::Extend { + rd: rd, + rn: rd.to_reg(), + signed: true, + from_bits: ty.bits() as u8, + to_bits: 64, + }); + insts + } + + pub(crate) fn unset( + rd: WritableReg, + tmp: WritableReg, + offset: Reg, + ty: Type, + ) -> SmallInstVec { + assert!(rd != tmp); + let mut insts = SmallInstVec::new(); + insts.extend(Inst::load_int_mask(tmp, ty)); + insts.push(Inst::AluRRR { + alu_op: AluOPRRR::Sll, + rd: tmp, + rs1: tmp.to_reg(), + rs2: offset, + }); + insts.push(Inst::construct_bit_not(tmp, tmp.to_reg())); + insts.push(Inst::AluRRR { + alu_op: AluOPRRR::And, + rd: rd, + rs1: rd.to_reg(), + rs2: tmp.to_reg(), + }); + insts + } + + pub(crate) fn set( + rd: WritableReg, + tmp: WritableReg, + offset: Reg, + rs: Reg, + ty: Type, + ) -> SmallInstVec { + assert!(rd != tmp); + let mut insts = SmallInstVec::new(); + // make rs into tmp. + insts.push(Inst::Extend { + rd: tmp, + rn: rs, + signed: false, + from_bits: ty.bits() as u8, + to_bits: 64, + }); + insts.push(Inst::AluRRR { + alu_op: AluOPRRR::Sll, + rd: tmp, + rs1: tmp.to_reg(), + rs2: offset, + }); + insts.push(Inst::AluRRR { + alu_op: AluOPRRR::Or, + rd: rd, + rs1: rd.to_reg(), + rs2: tmp.to_reg(), + }); + insts + } + + /// Merge reset part of rs into rd. + /// Call this function must make sure that other part of value is already in rd. + pub(crate) fn merge( + rd: WritableReg, + tmp: WritableReg, + offset: Reg, + rs: Reg, + ty: Type, + ) -> SmallInstVec { + let mut insts = Self::unset(rd, tmp, offset, ty); + insts.extend(Self::set(rd, tmp, offset, rs, ty)); + insts + } +} + +///Atomic Memory ordering. +#[derive(Copy, Clone, Debug)] +pub enum AMO { + Relax = 0b00, + Release = 0b01, + Aquire = 0b10, + SeqCst = 0b11, +} + +impl AMO { + pub(crate) fn to_static_str(self) -> &'static str { + match self { + AMO::Relax => "", + AMO::Release => ".rl", + AMO::Aquire => ".aq", + AMO::SeqCst => ".aqrl", + } + } + pub(crate) fn as_u32(self) -> u32 { + self as u32 + } +} + +impl Inst { + /// fence request bits. 
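+ /// The bits follow the RISC-V `fence` operand order `i`, `o`, `r`, `w`
+ /// (most- to least-significant). For example, the full `fence rw, rw`
+ /// barrier emitted before an atomic load below uses
+ /// `pred = succ = FENCE_REQ_R | FENCE_REQ_W` (i.e. `0b0011`), which
+ /// `fence_req_to_string` renders as "rw".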
+ pub(crate) const FENCE_REQ_I: u8 = 1 << 3; + pub(crate) const FENCE_REQ_O: u8 = 1 << 2; + pub(crate) const FENCE_REQ_R: u8 = 1 << 1; + pub(crate) const FENCE_REQ_W: u8 = 1 << 0; + pub(crate) fn fence_req_to_string(x: u8) -> String { + let mut s = String::default(); + if x & Self::FENCE_REQ_I != 0 { + s.push_str("i"); + } + if x & Self::FENCE_REQ_O != 0 { + s.push_str("o"); + } + if x & Self::FENCE_REQ_R != 0 { + s.push_str("r"); + } + if x & Self::FENCE_REQ_W != 0 { + s.push_str("w"); + } + s + } +} + +pub(crate) fn f32_cvt_to_int_bounds(signed: bool, out_bits: u32) -> (f32, f32) { + match (signed, out_bits) { + (true, 8) => (i8::min_value() as f32 - 1., i8::max_value() as f32 + 1.), + (true, 16) => (i16::min_value() as f32 - 1., i16::max_value() as f32 + 1.), + (true, 32) => (-2147483904.0, 2147483648.0), + (true, 64) => (-9223373136366403584.0, 9223372036854775808.0), + (false, 8) => (-1., u8::max_value() as f32 + 1.), + (false, 16) => (-1., u16::max_value() as f32 + 1.), + (false, 32) => (-1., 4294967296.0), + (false, 64) => (-1., 18446744073709551616.0), + _ => unreachable!(), + } +} + +pub(crate) fn f64_cvt_to_int_bounds(signed: bool, out_bits: u32) -> (f64, f64) { + match (signed, out_bits) { + (true, 8) => (i8::min_value() as f64 - 1., i8::max_value() as f64 + 1.), + (true, 16) => (i16::min_value() as f64 - 1., i16::max_value() as f64 + 1.), + (true, 32) => (-2147483649.0, 2147483648.0), + (true, 64) => (-9223372036854777856.0, 9223372036854775808.0), + (false, 8) => (-1., u8::max_value() as f64 + 1.), + (false, 16) => (-1., u16::max_value() as f64 + 1.), + (false, 32) => (-1., 4294967296.0), + (false, 64) => (-1., 18446744073709551616.0), + _ => unreachable!(), + } +} + +impl CsrRegOP { + pub(crate) fn funct3(self) -> u32 { + match self { + CsrRegOP::CsrRW => 0b001, + CsrRegOP::CsrRS => 0b010, + CsrRegOP::CsrRC => 0b011, + } + } + + pub(crate) fn opcode(self) -> u32 { + 0b1110011 + } + + pub(crate) fn name(self) -> &'static str { + match self { + CsrRegOP::CsrRW => "csrrw", + CsrRegOP::CsrRS => "csrrs", + CsrRegOP::CsrRC => "csrrc", + } + } +} + +impl Display for CsrRegOP { + fn fmt(&self, f: &mut Formatter<'_>) -> Result { + write!(f, "{}", self.name()) + } +} + +impl CsrImmOP { + pub(crate) fn funct3(self) -> u32 { + match self { + CsrImmOP::CsrRWI => 0b101, + CsrImmOP::CsrRSI => 0b110, + CsrImmOP::CsrRCI => 0b111, + } + } + + pub(crate) fn opcode(self) -> u32 { + 0b1110011 + } + + pub(crate) fn name(self) -> &'static str { + match self { + CsrImmOP::CsrRWI => "csrrwi", + CsrImmOP::CsrRSI => "csrrsi", + CsrImmOP::CsrRCI => "csrrci", + } + } +} + +impl Display for CsrImmOP { + fn fmt(&self, f: &mut Formatter<'_>) -> Result { + write!(f, "{}", self.name()) + } +} + +impl CSR { + pub(crate) fn bits(self) -> Imm12 { + Imm12::from_i16(match self { + CSR::Frm => 0x0002, + }) + } + + pub(crate) fn name(self) -> &'static str { + match self { + CSR::Frm => "frm", + } + } +} + +impl Display for CSR { + fn fmt(&self, f: &mut Formatter<'_>) -> Result { + write!(f, "{}", self.name()) + } +} + +impl COpcodeSpace { + pub fn bits(&self) -> u32 { + match self { + COpcodeSpace::C0 => 0b00, + COpcodeSpace::C1 => 0b01, + COpcodeSpace::C2 => 0b10, + } + } +} + +impl CrOp { + pub fn funct4(&self) -> u32 { + // https://five-embeddev.com/riscv-isa-manual/latest/rvc-opcode-map.html#rvcopcodemap + match self { + // `c.jr` has the same op/funct4 as C.MV, but RS2 is 0, which is illegal for mv. 
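+ // Similarly, `c.jalr` and `c.ebreak` share funct4 0b1001 with `c.add` and
+ // are distinguished only by their register fields (rs2 = x0 for c.jalr,
+ // rd = rs2 = x0 for c.ebreak), e.g. `c.add a0, a1` vs. `c.jalr a1`.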
+ CrOp::CMv | CrOp::CJr => 0b1000, + CrOp::CAdd | CrOp::CJalr | CrOp::CEbreak => 0b1001, + } + } + + pub fn op(&self) -> COpcodeSpace { + // https://five-embeddev.com/riscv-isa-manual/latest/rvc-opcode-map.html#rvcopcodemap + match self { + CrOp::CMv | CrOp::CAdd | CrOp::CJr | CrOp::CJalr | CrOp::CEbreak => COpcodeSpace::C2, + } + } +} + +impl CaOp { + pub fn funct2(&self) -> u32 { + // https://github.com/michaeljclark/riscv-meta/blob/master/opcodes + match self { + CaOp::CAnd => 0b11, + CaOp::COr => 0b10, + CaOp::CXor => 0b01, + CaOp::CSub => 0b00, + CaOp::CAddw => 0b01, + CaOp::CSubw => 0b00, + CaOp::CMul => 0b10, + } + } + + pub fn funct6(&self) -> u32 { + // https://github.com/michaeljclark/riscv-meta/blob/master/opcodes + match self { + CaOp::CAnd | CaOp::COr | CaOp::CXor | CaOp::CSub => 0b100_011, + CaOp::CSubw | CaOp::CAddw | CaOp::CMul => 0b100_111, + } + } + + pub fn op(&self) -> COpcodeSpace { + // https://five-embeddev.com/riscv-isa-manual/latest/rvc-opcode-map.html#rvcopcodemap + match self { + CaOp::CAnd + | CaOp::COr + | CaOp::CXor + | CaOp::CSub + | CaOp::CAddw + | CaOp::CSubw + | CaOp::CMul => COpcodeSpace::C1, + } + } +} + +impl CjOp { + pub fn funct3(&self) -> u32 { + // https://github.com/michaeljclark/riscv-meta/blob/master/opcodes + match self { + CjOp::CJ => 0b101, + } + } + + pub fn op(&self) -> COpcodeSpace { + // https://five-embeddev.com/riscv-isa-manual/latest/rvc-opcode-map.html#rvcopcodemap + match self { + CjOp::CJ => COpcodeSpace::C1, + } + } +} + +impl CiOp { + pub fn funct3(&self) -> u32 { + // https://github.com/michaeljclark/riscv-meta/blob/master/opcodes + match self { + CiOp::CAddi | CiOp::CSlli => 0b000, + CiOp::CAddiw | CiOp::CFldsp => 0b001, + CiOp::CLi | CiOp::CLwsp => 0b010, + CiOp::CAddi16sp | CiOp::CLui | CiOp::CLdsp => 0b011, + } + } + + pub fn op(&self) -> COpcodeSpace { + // https://five-embeddev.com/riscv-isa-manual/latest/rvc-opcode-map.html#rvcopcodemap + match self { + CiOp::CAddi | CiOp::CAddiw | CiOp::CAddi16sp | CiOp::CLi | CiOp::CLui => { + COpcodeSpace::C1 + } + CiOp::CSlli | CiOp::CLwsp | CiOp::CLdsp | CiOp::CFldsp => COpcodeSpace::C2, + } + } +} + +impl CiwOp { + pub fn funct3(&self) -> u32 { + // https://github.com/michaeljclark/riscv-meta/blob/master/opcodes + match self { + CiwOp::CAddi4spn => 0b000, + } + } + + pub fn op(&self) -> COpcodeSpace { + // https://five-embeddev.com/riscv-isa-manual/latest/rvc-opcode-map.html#rvcopcodemap + match self { + CiwOp::CAddi4spn => COpcodeSpace::C0, + } + } +} + +impl CbOp { + pub fn funct3(&self) -> u32 { + // https://github.com/michaeljclark/riscv-meta/blob/master/opcodes + match self { + CbOp::CSrli | CbOp::CSrai | CbOp::CAndi => 0b100, + } + } + + pub fn funct2(&self) -> u32 { + // https://github.com/michaeljclark/riscv-meta/blob/master/opcodes + match self { + CbOp::CSrli => 0b00, + CbOp::CSrai => 0b01, + CbOp::CAndi => 0b10, + } + } + + pub fn op(&self) -> COpcodeSpace { + // https://five-embeddev.com/riscv-isa-manual/latest/rvc-opcode-map.html#rvcopcodemap + match self { + CbOp::CSrli | CbOp::CSrai | CbOp::CAndi => COpcodeSpace::C1, + } + } +} + +impl CssOp { + pub fn funct3(&self) -> u32 { + // https://github.com/michaeljclark/riscv-meta/blob/master/opcodes + match self { + CssOp::CFsdsp => 0b101, + CssOp::CSwsp => 0b110, + CssOp::CSdsp => 0b111, + } + } + + pub fn op(&self) -> COpcodeSpace { + // https://five-embeddev.com/riscv-isa-manual/latest/rvc-opcode-map.html#rvcopcodemap + match self { + CssOp::CSwsp | CssOp::CSdsp | CssOp::CFsdsp => COpcodeSpace::C2, + } + } +} + +impl CsOp { 
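+ // CS-format stores write a compressible source register to a compressible
+ // base register plus a small scaled offset; they reuse the funct3 values of
+ // the corresponding CSS stack-pointer stores above
+ // (c.fsd/c.sw/c.sd = 0b101/0b110/0b111).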
+ pub fn funct3(&self) -> u32 { + // https://github.com/michaeljclark/riscv-meta/blob/master/opcodes + match self { + CsOp::CFsd => 0b101, + CsOp::CSw => 0b110, + CsOp::CSd => 0b111, + } + } + + pub fn op(&self) -> COpcodeSpace { + // https://five-embeddev.com/riscv-isa-manual/latest/rvc-opcode-map.html#rvcopcodemap + match self { + CsOp::CSw | CsOp::CSd | CsOp::CFsd => COpcodeSpace::C0, + } + } +} + +impl ClOp { + pub fn funct3(&self) -> u32 { + // https://github.com/michaeljclark/riscv-meta/blob/master/opcodes + match self { + ClOp::CFld => 0b001, + ClOp::CLw => 0b010, + ClOp::CLd => 0b011, + } + } + + pub fn op(&self) -> COpcodeSpace { + // https://five-embeddev.com/riscv-isa-manual/latest/rvc-opcode-map.html#rvcopcodemap + match self { + ClOp::CLw | ClOp::CLd | ClOp::CFld => COpcodeSpace::C0, + } + } +} + +impl CsznOp { + pub fn funct6(&self) -> u32 { + // https://github.com/michaeljclark/riscv-meta/blob/master/opcodes + match self { + CsznOp::CNot + | CsznOp::CZextw + | CsznOp::CZextb + | CsznOp::CZexth + | CsznOp::CSextb + | CsznOp::CSexth => 0b100_111, + } + } + + pub fn funct5(&self) -> u32 { + // https://github.com/michaeljclark/riscv-meta/blob/master/opcodes + match self { + CsznOp::CNot => 0b11_101, + CsznOp::CZextb => 0b11_000, + CsznOp::CZexth => 0b11_010, + CsznOp::CZextw => 0b11_100, + CsznOp::CSextb => 0b11_001, + CsznOp::CSexth => 0b11_011, + } + } + + pub fn op(&self) -> COpcodeSpace { + // https://five-embeddev.com/riscv-isa-manual/latest/rvc-opcode-map.html#rvcopcodemap + match self { + CsznOp::CNot + | CsznOp::CZextb + | CsznOp::CZexth + | CsznOp::CZextw + | CsznOp::CSextb + | CsznOp::CSexth => COpcodeSpace::C1, + } + } +} + +impl ZcbMemOp { + pub fn funct6(&self) -> u32 { + // https://github.com/michaeljclark/riscv-meta/blob/master/opcodes + match self { + ZcbMemOp::CLbu => 0b100_000, + // These two opcodes are differentiated in the imm field of the instruction. + ZcbMemOp::CLhu | ZcbMemOp::CLh => 0b100_001, + ZcbMemOp::CSb => 0b100_010, + ZcbMemOp::CSh => 0b100_011, + } + } + + pub fn imm_bits(&self) -> u8 { + match self { + ZcbMemOp::CLhu | ZcbMemOp::CLh | ZcbMemOp::CSh => 1, + ZcbMemOp::CLbu | ZcbMemOp::CSb => 2, + } + } + + pub fn op(&self) -> COpcodeSpace { + // https://five-embeddev.com/riscv-isa-manual/latest/rvc-opcode-map.html#rvcopcodemap + match self { + ZcbMemOp::CLbu | ZcbMemOp::CLhu | ZcbMemOp::CLh | ZcbMemOp::CSb | ZcbMemOp::CSh => { + COpcodeSpace::C0 + } + } + } +} diff --git a/hbcb/src/inst/emit.rs b/hbcb/src/inst/emit.rs new file mode 100644 index 0000000..96e21a1 --- /dev/null +++ b/hbcb/src/inst/emit.rs @@ -0,0 +1,2685 @@ +//! Riscv64 ISA: binary code emission. 
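+//!
+//! Emission first tries the 16-bit compressed (RVC) form of each instruction
+//! and falls back to the full 32-bit encoding when no compressed form
+//! applies. Roughly (a sketch of the flow in `MachInstEmit::emit` below):
+//!
+//! ```text
+//! if inst.try_emit_compressed(sink, emit_info, state, &mut start_off).is_none() {
+//!     inst.emit_uncompressed(sink, emit_info, state, &mut start_off);
+//! }
+//! ```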
+ +use crate::ir::{self, LibCall, TrapCode}; +use crate::inst::*; +use crate::lower::isle::generated_code::{ + CaOp, CbOp, CiOp, CiwOp, ClOp, CrOp, CsOp, CssOp, CsznOp, FpuOPWidth, ZcbMemOp, +}; +use cranelift_control::ControlPlane; + +pub struct EmitInfo { + shared_flag: settings::Flags, + isa_flags: super::super::riscv_settings::Flags, +} + +impl EmitInfo { + pub(crate) fn new( + shared_flag: settings::Flags, + isa_flags: super::super::riscv_settings::Flags, + ) -> Self { + Self { + shared_flag, + isa_flags, + } + } +} + +pub(crate) fn reg_to_gpr_num(m: Reg) -> u32 { + u32::try_from(m.to_real_reg().unwrap().hw_enc() & 31).unwrap() +} + +pub(crate) fn reg_to_compressed_gpr_num(m: Reg) -> u32 { + let real_reg = m.to_real_reg().unwrap().hw_enc(); + debug_assert!(real_reg >= 8 && real_reg < 16); + let compressed_reg = real_reg - 8; + u32::try_from(compressed_reg).unwrap() +} + +#[derive(Clone, Debug, PartialEq, Default)] +pub enum EmitVState { + #[default] + Unknown, + Known(VState), +} + +/// State carried between emissions of a sequence of instructions. +#[derive(Default, Clone, Debug)] +pub struct EmitState { + /// The user stack map for the upcoming instruction, as provided to + /// `pre_safepoint()`. + user_stack_map: Option, + + /// Only used during fuzz-testing. Otherwise, it is a zero-sized struct and + /// optimized away at compiletime. See [cranelift_control]. + ctrl_plane: ControlPlane, + + /// Vector State + /// Controls the current state of the vector unit at the emission point. + vstate: EmitVState, + + frame_layout: FrameLayout, +} + +impl EmitState { + fn take_stack_map(&mut self) -> Option { + self.user_stack_map.take() + } +} + +impl MachInstEmitState for EmitState { + fn new( + abi: &Callee, + ctrl_plane: ControlPlane, + ) -> Self { + EmitState { + user_stack_map: None, + ctrl_plane, + vstate: EmitVState::Unknown, + frame_layout: abi.frame_layout().clone(), + } + } + + fn pre_safepoint(&mut self, user_stack_map: Option) { + self.user_stack_map = user_stack_map; + } + + fn ctrl_plane_mut(&mut self) -> &mut ControlPlane { + &mut self.ctrl_plane + } + + fn take_ctrl_plane(self) -> ControlPlane { + self.ctrl_plane + } + + fn on_new_block(&mut self) { + // Reset the vector state. + self.vstate = EmitVState::Unknown; + } + + fn frame_layout(&self) -> &FrameLayout { + &self.frame_layout + } +} + +impl Inst { + /// Load int mask. + /// If ty is int then 0xff in rd. + pub(crate) fn load_int_mask(rd: Writable, ty: Type) -> SmallInstVec { + let mut insts = SmallInstVec::new(); + assert!(ty.is_int() && ty.bits() <= 64); + match ty { + I64 => { + insts.push(Inst::load_imm12(rd, Imm12::from_i16(-1))); + } + I32 | I16 => { + insts.push(Inst::load_imm12(rd, Imm12::from_i16(-1))); + insts.push(Inst::Extend { + rd: rd, + rn: rd.to_reg(), + signed: false, + from_bits: ty.bits() as u8, + to_bits: 64, + }); + } + I8 => { + insts.push(Inst::load_imm12(rd, Imm12::from_i16(255))); + } + _ => unreachable!("ty:{:?}", ty), + } + insts + } + /// inverse all bit + pub(crate) fn construct_bit_not(rd: Writable, rs: Reg) -> Inst { + Inst::AluRRImm12 { + alu_op: AluOPRRI::Xori, + rd, + rs, + imm12: Imm12::from_i16(-1), + } + } + + /// Returns Some(VState) if this instruction is expecting a specific vector state + /// before emission. + fn expected_vstate(&self) -> Option<&VState> { + match self { + Inst::Nop0 + | Inst::Nop4 + | Inst::BrTable { .. } + | Inst::Auipc { .. } + | Inst::Fli { .. } + | Inst::Lui { .. } + | Inst::LoadInlineConst { .. } + | Inst::AluRRR { .. } + | Inst::FpuRRR { .. 
} + | Inst::AluRRImm12 { .. } + | Inst::CsrReg { .. } + | Inst::CsrImm { .. } + | Inst::Load { .. } + | Inst::Store { .. } + | Inst::Args { .. } + | Inst::Rets { .. } + | Inst::Ret { .. } + | Inst::Extend { .. } + | Inst::Call { .. } + | Inst::CallInd { .. } + | Inst::ReturnCall { .. } + | Inst::ReturnCallInd { .. } + | Inst::Jal { .. } + | Inst::CondBr { .. } + | Inst::LoadExtName { .. } + | Inst::ElfTlsGetAddr { .. } + | Inst::LoadAddr { .. } + | Inst::Mov { .. } + | Inst::MovFromPReg { .. } + | Inst::Fence { .. } + | Inst::EBreak + | Inst::Udf { .. } + | Inst::FpuRR { .. } + | Inst::FpuRRRR { .. } + | Inst::Jalr { .. } + | Inst::Atomic { .. } + | Inst::Select { .. } + | Inst::AtomicCas { .. } + | Inst::RawData { .. } + | Inst::AtomicStore { .. } + | Inst::AtomicLoad { .. } + | Inst::AtomicRmwLoop { .. } + | Inst::TrapIf { .. } + | Inst::Unwind { .. } + | Inst::DummyUse { .. } + | Inst::Popcnt { .. } + | Inst::Cltz { .. } + | Inst::Brev8 { .. } + | Inst::StackProbeLoop { .. } => None, + + // VecSetState does not expect any vstate, rather it updates it. + Inst::VecSetState { .. } => None, + + // `vmv` instructions copy a set of registers and ignore vstate. + Inst::VecAluRRImm5 { op: VecAluOpRRImm5::VmvrV, .. } => None, + + Inst::VecAluRR { vstate, .. } | + Inst::VecAluRRR { vstate, .. } | + Inst::VecAluRRRR { vstate, .. } | + Inst::VecAluRImm5 { vstate, .. } | + Inst::VecAluRRImm5 { vstate, .. } | + Inst::VecAluRRRImm5 { vstate, .. } | + // TODO: Unit-stride loads and stores only need the AVL to be correct, not + // the full vtype. A future optimization could be to decouple these two when + // updating vstate. This would allow us to avoid emitting a VecSetState in + // some cases. + Inst::VecLoad { vstate, .. } + | Inst::VecStore { vstate, .. } => Some(vstate), + } + } +} + +impl MachInstEmit for Inst { + type State = EmitState; + type Info = EmitInfo; + + fn emit(&self, sink: &mut MachBuffer, emit_info: &Self::Info, state: &mut EmitState) { + // Check if we need to update the vector state before emitting this instruction + if let Some(expected) = self.expected_vstate() { + if state.vstate != EmitVState::Known(*expected) { + // Update the vector state. + Inst::VecSetState { + rd: writable_zero_reg(), + vstate: *expected, + } + .emit(sink, emit_info, state); + } + } + + // N.B.: we *must* not exceed the "worst-case size" used to compute + // where to insert islands, except when islands are explicitly triggered + // (with an `EmitIsland`). We check this in debug builds. This is `mut` + // to allow disabling the check for `JTSequence`, which is always + // emitted following an `EmitIsland`. + let mut start_off = sink.cur_offset(); + + // First try to emit this as a compressed instruction + let res = self.try_emit_compressed(sink, emit_info, state, &mut start_off); + if res.is_none() { + // If we can't lets emit it as a normal instruction + self.emit_uncompressed(sink, emit_info, state, &mut start_off); + } + + // We exclude br_table and return call from these checks since they emit + // their own islands, and thus are allowed to exceed the worst case size. + if !matches!( + self, + Inst::BrTable { .. } | Inst::ReturnCall { .. } | Inst::ReturnCallInd { .. 
} + ) { + let end_off = sink.cur_offset(); + assert!( + (end_off - start_off) <= Inst::worst_case_size(), + "Inst:{:?} length:{} worst_case_size:{}", + self, + end_off - start_off, + Inst::worst_case_size() + ); + } + } + + fn pretty_print_inst(&self, state: &mut Self::State) -> String { + self.print_with_state(state) + } +} + +impl Inst { + /// Tries to emit an instruction as compressed, if we can't return false. + fn try_emit_compressed( + &self, + sink: &mut MachBuffer, + emit_info: &EmitInfo, + state: &mut EmitState, + start_off: &mut u32, + ) -> Option<()> { + let has_m = emit_info.isa_flags.has_m(); + let has_zba = emit_info.isa_flags.has_zba(); + let has_zbb = emit_info.isa_flags.has_zbb(); + let has_zca = emit_info.isa_flags.has_zca(); + let has_zcb = emit_info.isa_flags.has_zcb(); + let has_zcd = emit_info.isa_flags.has_zcd(); + + // Currently all compressed extensions (Zcb, Zcd, Zcmp, Zcmt, etc..) require Zca + // to be enabled, so check it early. + if !has_zca { + return None; + } + + fn reg_is_compressible(r: Reg) -> bool { + r.to_real_reg() + .map(|r| r.hw_enc() >= 8 && r.hw_enc() < 16) + .unwrap_or(false) + } + + match *self { + // C.ADD + Inst::AluRRR { + alu_op: AluOPRRR::Add, + rd, + rs1, + rs2, + } if (rd.to_reg() == rs1 || rd.to_reg() == rs2) + && rs1 != zero_reg() + && rs2 != zero_reg() => + { + // Technically `c.add rd, rs` expands to `add rd, rd, rs`, but we can + // also swap rs1 with rs2 and we get an equivalent instruction. i.e we + // can also compress `add rd, rs, rd` into `c.add rd, rs`. + let src = if rd.to_reg() == rs1 { rs2 } else { rs1 }; + + sink.put2(encode_cr_type(CrOp::CAdd, rd, src)); + } + + // C.MV + Inst::AluRRImm12 { + alu_op: AluOPRRI::Addi | AluOPRRI::Ori, + rd, + rs, + imm12, + } if rd.to_reg() != rs + && rd.to_reg() != zero_reg() + && rs != zero_reg() + && imm12.as_i16() == 0 => + { + sink.put2(encode_cr_type(CrOp::CMv, rd, rs)); + } + + // CA Ops + Inst::AluRRR { + alu_op: + alu_op @ (AluOPRRR::And + | AluOPRRR::Or + | AluOPRRR::Xor + | AluOPRRR::Addw + | AluOPRRR::Mul), + rd, + rs1, + rs2, + } if (rd.to_reg() == rs1 || rd.to_reg() == rs2) + && reg_is_compressible(rs1) + && reg_is_compressible(rs2) => + { + let op = match alu_op { + AluOPRRR::And => CaOp::CAnd, + AluOPRRR::Or => CaOp::COr, + AluOPRRR::Xor => CaOp::CXor, + AluOPRRR::Addw => CaOp::CAddw, + AluOPRRR::Mul if has_zcb && has_m => CaOp::CMul, + _ => return None, + }; + // The canonical expansion for these instruction has `rd == rs1`, but + // these are all commutative operations, so we can swap the operands. + let src = if rd.to_reg() == rs1 { rs2 } else { rs1 }; + + sink.put2(encode_ca_type(op, rd, src)); + } + + // The sub instructions are non commutative, so we can't swap the operands. 
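+ // For example `sub a0, a0, a1` can still be compressed to `c.sub a0, a1`,
+ // but `sub a0, a1, a0` cannot, because c.sub always expands to
+ // `sub rd, rd, rs2`.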
+ Inst::AluRRR { + alu_op: alu_op @ (AluOPRRR::Sub | AluOPRRR::Subw), + rd, + rs1, + rs2, + } if rd.to_reg() == rs1 && reg_is_compressible(rs1) && reg_is_compressible(rs2) => { + let op = match alu_op { + AluOPRRR::Sub => CaOp::CSub, + AluOPRRR::Subw => CaOp::CSubw, + _ => return None, + }; + sink.put2(encode_ca_type(op, rd, rs2)); + } + + // c.j + // + // We don't have a separate JAL as that is only available in RV32C + Inst::Jal { label } => { + sink.use_label_at_offset(*start_off, label, LabelUse::RVCJump); + sink.add_uncond_branch(*start_off, *start_off + 2, label); + sink.put2(encode_cj_type(CjOp::CJ, Imm12::ZERO)); + } + + // c.jr + Inst::Jalr { rd, base, offset } + if rd.to_reg() == zero_reg() && base != zero_reg() && offset.as_i16() == 0 => + { + sink.put2(encode_cr2_type(CrOp::CJr, base)); + } + + // c.jalr + Inst::Jalr { rd, base, offset } + if rd.to_reg() == link_reg() && base != zero_reg() && offset.as_i16() == 0 => + { + sink.put2(encode_cr2_type(CrOp::CJalr, base)); + } + + // c.ebreak + Inst::EBreak => { + sink.put2(encode_cr_type( + CrOp::CEbreak, + writable_zero_reg(), + zero_reg(), + )); + } + + // c.unimp + Inst::Udf { trap_code } => { + sink.add_trap(trap_code); + sink.put2(0x0000); + } + // c.addi16sp + // + // c.addi16sp shares the opcode with c.lui, but has a destination field of x2. + // c.addi16sp adds the non-zero sign-extended 6-bit immediate to the value in the stack pointer (sp=x2), + // where the immediate is scaled to represent multiples of 16 in the range (-512,496). c.addi16sp is used + // to adjust the stack pointer in procedure prologues and epilogues. It expands into addi x2, x2, nzimm. c.addi16sp + // is only valid when nzimm≠0; the code point with nzimm=0 is reserved. + Inst::AluRRImm12 { + alu_op: AluOPRRI::Addi, + rd, + rs, + imm12, + } if rd.to_reg() == rs + && rs == stack_reg() + && imm12.as_i16() != 0 + && (imm12.as_i16() % 16) == 0 + && Imm6::maybe_from_i16(imm12.as_i16() / 16).is_some() => + { + let imm6 = Imm6::maybe_from_i16(imm12.as_i16() / 16).unwrap(); + sink.put2(encode_c_addi16sp(imm6)); + } + + // c.addi4spn + // + // c.addi4spn is a CIW-format instruction that adds a zero-extended non-zero + // immediate, scaled by 4, to the stack pointer, x2, and writes the result to + // rd. This instruction is used to generate pointers to stack-allocated variables + // and expands to addi rd, x2, nzuimm. c.addi4spn is only valid when nzuimm≠0; + // the code points with nzuimm=0 are reserved. 
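+ // For example `addi a0, sp, 16` (a non-zero multiple of 4 written to a
+ // compressible register) can be emitted as `c.addi4spn a0, sp, 16`, with
+ // 16 / 4 = 4 stored in the scaled immediate field.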
+ Inst::AluRRImm12 { + alu_op: AluOPRRI::Addi, + rd, + rs, + imm12, + } if reg_is_compressible(rd.to_reg()) + && rs == stack_reg() + && imm12.as_i16() != 0 + && (imm12.as_i16() % 4) == 0 + && u8::try_from(imm12.as_i16() / 4).is_ok() => + { + let imm = u8::try_from(imm12.as_i16() / 4).unwrap(); + sink.put2(encode_ciw_type(CiwOp::CAddi4spn, rd, imm)); + } + + // c.li + Inst::AluRRImm12 { + alu_op: AluOPRRI::Addi, + rd, + rs, + imm12, + } if rd.to_reg() != zero_reg() && rs == zero_reg() => { + let imm6 = Imm6::maybe_from_imm12(imm12)?; + sink.put2(encode_ci_type(CiOp::CLi, rd, imm6)); + } + + // c.addi + Inst::AluRRImm12 { + alu_op: AluOPRRI::Addi, + rd, + rs, + imm12, + } if rd.to_reg() == rs && rs != zero_reg() && imm12.as_i16() != 0 => { + let imm6 = Imm6::maybe_from_imm12(imm12)?; + sink.put2(encode_ci_type(CiOp::CAddi, rd, imm6)); + } + + // c.addiw + Inst::AluRRImm12 { + alu_op: AluOPRRI::Addiw, + rd, + rs, + imm12, + } if rd.to_reg() == rs && rs != zero_reg() => { + let imm6 = Imm6::maybe_from_imm12(imm12)?; + sink.put2(encode_ci_type(CiOp::CAddiw, rd, imm6)); + } + + // c.lui + // + // c.lui loads the non-zero 6-bit immediate field into bits 17–12 + // of the destination register, clears the bottom 12 bits, and + // sign-extends bit 17 into all higher bits of the destination. + Inst::Lui { rd, imm: imm20 } + if rd.to_reg() != zero_reg() + && rd.to_reg() != stack_reg() + && imm20.as_i32() != 0 => + { + // Check that the top bits are sign extended + let imm = imm20.as_i32() << 14 >> 14; + if imm != imm20.as_i32() { + return None; + } + let imm6 = Imm6::maybe_from_i32(imm)?; + sink.put2(encode_ci_type(CiOp::CLui, rd, imm6)); + } + + // c.slli + Inst::AluRRImm12 { + alu_op: AluOPRRI::Slli, + rd, + rs, + imm12, + } if rd.to_reg() == rs && rs != zero_reg() && imm12.as_i16() != 0 => { + // The shift amount is unsigned, but we encode it as signed. + let shift = imm12.as_i16() & 0x3f; + let imm6 = Imm6::maybe_from_i16(shift << 10 >> 10).unwrap(); + sink.put2(encode_ci_type(CiOp::CSlli, rd, imm6)); + } + + // c.srli / c.srai + Inst::AluRRImm12 { + alu_op: op @ (AluOPRRI::Srli | AluOPRRI::Srai), + rd, + rs, + imm12, + } if rd.to_reg() == rs && reg_is_compressible(rs) && imm12.as_i16() != 0 => { + let op = match op { + AluOPRRI::Srli => CbOp::CSrli, + AluOPRRI::Srai => CbOp::CSrai, + _ => unreachable!(), + }; + + // The shift amount is unsigned, but we encode it as signed. + let shift = imm12.as_i16() & 0x3f; + let imm6 = Imm6::maybe_from_i16(shift << 10 >> 10).unwrap(); + sink.put2(encode_cb_type(op, rd, imm6)); + } + + // c.zextb + // + // This is an alias for `andi rd, rd, 0xff` + Inst::AluRRImm12 { + alu_op: AluOPRRI::Andi, + rd, + rs, + imm12, + } if has_zcb + && rd.to_reg() == rs + && reg_is_compressible(rs) + && imm12.as_i16() == 0xff => + { + sink.put2(encode_cszn_type(CsznOp::CZextb, rd)); + } + + // c.andi + Inst::AluRRImm12 { + alu_op: AluOPRRI::Andi, + rd, + rs, + imm12, + } if rd.to_reg() == rs && reg_is_compressible(rs) => { + let imm6 = Imm6::maybe_from_imm12(imm12)?; + sink.put2(encode_cb_type(CbOp::CAndi, rd, imm6)); + } + + // Stack Based Loads + Inst::Load { + rd, + op: op @ (LoadOP::Lw | LoadOP::Ld | LoadOP::Fld), + from, + flags, + } if from.get_base_register() == Some(stack_reg()) + && (from.get_offset_with_state(state) % op.size()) == 0 => + { + // We encode the offset in multiples of the load size. 
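+ // For example an 8-byte `ld a0, 24(sp)` stores 24 / 8 = 3 in the
+ // compressed immediate field of `c.ldsp`.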
+ let offset = from.get_offset_with_state(state); + let imm6 = u8::try_from(offset / op.size()) + .ok() + .and_then(Uimm6::maybe_from_u8)?; + + // Some additional constraints on these instructions. + // + // Integer loads are not allowed to target x0, but floating point loads + // are, since f0 is not a special register. + // + // Floating point loads are not included in the base Zca extension + // but in a separate Zcd extension. Both of these are part of the C Extension. + let rd_is_zero = rd.to_reg() == zero_reg(); + let op = match op { + LoadOP::Lw if !rd_is_zero => CiOp::CLwsp, + LoadOP::Ld if !rd_is_zero => CiOp::CLdsp, + LoadOP::Fld if has_zcd => CiOp::CFldsp, + _ => return None, + }; + + if let Some(trap_code) = flags.trap_code() { + // Register the offset at which the actual load instruction starts. + sink.add_trap(trap_code); + } + sink.put2(encode_ci_sp_load(op, rd, imm6)); + } + + // Regular Loads + Inst::Load { + rd, + op: + op + @ (LoadOP::Lw | LoadOP::Ld | LoadOP::Fld | LoadOP::Lbu | LoadOP::Lhu | LoadOP::Lh), + from, + flags, + } if reg_is_compressible(rd.to_reg()) + && from + .get_base_register() + .map(reg_is_compressible) + .unwrap_or(false) + && (from.get_offset_with_state(state) % op.size()) == 0 => + { + let base = from.get_base_register().unwrap(); + + // We encode the offset in multiples of the store size. + let offset = from.get_offset_with_state(state); + let offset = u8::try_from(offset / op.size()).ok()?; + + // We mix two different formats here. + // + // c.lw / c.ld / c.fld instructions are available in the standard Zca + // extension using the CL format. + // + // c.lbu / c.lhu / c.lh are only available in the Zcb extension and + // are also encoded differently. Technically they each have a different + // format, but they are similar enough that we can group them. + let is_zcb_load = matches!(op, LoadOP::Lbu | LoadOP::Lhu | LoadOP::Lh); + let encoded = if is_zcb_load { + if !has_zcb { + return None; + } + + let op = match op { + LoadOP::Lbu => ZcbMemOp::CLbu, + LoadOP::Lhu => ZcbMemOp::CLhu, + LoadOP::Lh => ZcbMemOp::CLh, + _ => unreachable!(), + }; + + // Byte stores & loads have 2 bits of immediate offset. Halfword stores + // and loads only have 1 bit. + let imm2 = Uimm2::maybe_from_u8(offset)?; + if (offset & !((1 << op.imm_bits()) - 1)) != 0 { + return None; + } + + encode_zcbmem_load(op, rd, base, imm2) + } else { + // Floating point loads are not included in the base Zca extension + // but in a separate Zcd extension. Both of these are part of the C Extension. + let op = match op { + LoadOP::Lw => ClOp::CLw, + LoadOP::Ld => ClOp::CLd, + LoadOP::Fld if has_zcd => ClOp::CFld, + _ => return None, + }; + let imm5 = Uimm5::maybe_from_u8(offset)?; + + encode_cl_type(op, rd, base, imm5) + }; + + if let Some(trap_code) = flags.trap_code() { + // Register the offset at which the actual load instruction starts. + sink.add_trap(trap_code); + } + sink.put2(encoded); + } + + // Stack Based Stores + Inst::Store { + src, + op: op @ (StoreOP::Sw | StoreOP::Sd | StoreOP::Fsd), + to, + flags, + } if to.get_base_register() == Some(stack_reg()) + && (to.get_offset_with_state(state) % op.size()) == 0 => + { + // We encode the offset in multiples of the store size. + let offset = to.get_offset_with_state(state); + let imm6 = u8::try_from(offset / op.size()) + .ok() + .and_then(Uimm6::maybe_from_u8)?; + + // Floating point stores are not included in the base Zca extension + // but in a separate Zcd extension. Both of these are part of the C Extension. 
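+ // For example `sd ra, 8(sp)` becomes `c.sdsp ra, 8` with 8 / 8 = 1 in the
+ // immediate field, while `fsd fa0, 8(sp)` additionally requires Zcd.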
+ let op = match op { + StoreOP::Sw => CssOp::CSwsp, + StoreOP::Sd => CssOp::CSdsp, + StoreOP::Fsd if has_zcd => CssOp::CFsdsp, + _ => return None, + }; + + if let Some(trap_code) = flags.trap_code() { + // Register the offset at which the actual load instruction starts. + sink.add_trap(trap_code); + } + sink.put2(encode_css_type(op, src, imm6)); + } + + // Regular Stores + Inst::Store { + src, + op: op @ (StoreOP::Sw | StoreOP::Sd | StoreOP::Fsd | StoreOP::Sh | StoreOP::Sb), + to, + flags, + } if reg_is_compressible(src) + && to + .get_base_register() + .map(reg_is_compressible) + .unwrap_or(false) + && (to.get_offset_with_state(state) % op.size()) == 0 => + { + let base = to.get_base_register().unwrap(); + + // We encode the offset in multiples of the store size. + let offset = to.get_offset_with_state(state); + let offset = u8::try_from(offset / op.size()).ok()?; + + // We mix two different formats here. + // + // c.sw / c.sd / c.fsd instructions are available in the standard Zca + // extension using the CL format. + // + // c.sb / c.sh are only available in the Zcb extension and are also + // encoded differently. + let is_zcb_store = matches!(op, StoreOP::Sh | StoreOP::Sb); + let encoded = if is_zcb_store { + if !has_zcb { + return None; + } + + let op = match op { + StoreOP::Sh => ZcbMemOp::CSh, + StoreOP::Sb => ZcbMemOp::CSb, + _ => unreachable!(), + }; + + // Byte stores & loads have 2 bits of immediate offset. Halfword stores + // and loads only have 1 bit. + let imm2 = Uimm2::maybe_from_u8(offset)?; + if (offset & !((1 << op.imm_bits()) - 1)) != 0 { + return None; + } + + encode_zcbmem_store(op, src, base, imm2) + } else { + // Floating point stores are not included in the base Zca extension + // but in a separate Zcd extension. Both of these are part of the C Extension. + let op = match op { + StoreOP::Sw => CsOp::CSw, + StoreOP::Sd => CsOp::CSd, + StoreOP::Fsd if has_zcd => CsOp::CFsd, + _ => return None, + }; + let imm5 = Uimm5::maybe_from_u8(offset)?; + + encode_cs_type(op, src, base, imm5) + }; + + if let Some(trap_code) = flags.trap_code() { + // Register the offset at which the actual load instruction starts. + sink.add_trap(trap_code); + } + sink.put2(encoded); + } + + // c.not + // + // This is an alias for `xori rd, rd, -1` + Inst::AluRRImm12 { + alu_op: AluOPRRI::Xori, + rd, + rs, + imm12, + } if has_zcb + && rd.to_reg() == rs + && reg_is_compressible(rs) + && imm12.as_i16() == -1 => + { + sink.put2(encode_cszn_type(CsznOp::CNot, rd)); + } + + // c.sext.b / c.sext.h / c.zext.h + // + // These are all the extend instructions present in `Zcb`, they + // also require `Zbb` since they aren't available in the base ISA. 
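+ // For example, with both Zcb and Zbb enabled, `sext.b a0, a0` (encoded here
+ // as an AluRRImm12 with a zero immediate) compresses to `c.sext.b a0`.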
+ Inst::AluRRImm12 { + alu_op: alu_op @ (AluOPRRI::Sextb | AluOPRRI::Sexth | AluOPRRI::Zexth), + rd, + rs, + imm12, + } if has_zcb + && has_zbb + && rd.to_reg() == rs + && reg_is_compressible(rs) + && imm12.as_i16() == 0 => + { + let op = match alu_op { + AluOPRRI::Sextb => CsznOp::CSextb, + AluOPRRI::Sexth => CsznOp::CSexth, + AluOPRRI::Zexth => CsznOp::CZexth, + _ => unreachable!(), + }; + sink.put2(encode_cszn_type(op, rd)); + } + + // c.zext.w + // + // This is an alias for `add.uw rd, rd, zero` + Inst::AluRRR { + alu_op: AluOPRRR::Adduw, + rd, + rs1, + rs2, + } if has_zcb + && has_zba + && rd.to_reg() == rs1 + && reg_is_compressible(rs1) + && rs2 == zero_reg() => + { + sink.put2(encode_cszn_type(CsznOp::CZextw, rd)); + } + + _ => return None, + } + + return Some(()); + } + + fn emit_uncompressed( + &self, + sink: &mut MachBuffer, + emit_info: &EmitInfo, + state: &mut EmitState, + start_off: &mut u32, + ) { + match self { + &Inst::Nop0 => { + // do nothing + } + // Addi x0, x0, 0 + &Inst::Nop4 => { + let x = Inst::AluRRImm12 { + alu_op: AluOPRRI::Addi, + rd: Writable::from_reg(zero_reg()), + rs: zero_reg(), + imm12: Imm12::ZERO, + }; + x.emit(sink, emit_info, state) + } + &Inst::RawData { ref data } => { + // Right now we only put a u32 or u64 in this instruction. + // It is not very long, no need to check if need `emit_island`. + // If data is very long , this is a bug because RawData is typically + // use to load some data and rely on some position in the code stream. + // and we may exceed `Inst::worst_case_size`. + // for more information see https://github.com/bytecodealliance/wasmtime/pull/5612. + sink.put_data(&data[..]); + } + &Inst::Lui { rd, ref imm } => { + let x: u32 = 0b0110111 | reg_to_gpr_num(rd.to_reg()) << 7 | (imm.bits() << 12); + sink.put4(x); + } + &Inst::Fli { rd, ty, imm } => { + sink.put4(encode_fli(ty, imm, rd)); + } + &Inst::LoadInlineConst { rd, ty, imm } => { + let data = &imm.to_le_bytes()[..ty.bytes() as usize]; + + let label_data: MachLabel = sink.get_label(); + let label_end: MachLabel = sink.get_label(); + + // Load into rd + Inst::Load { + rd, + op: LoadOP::from_type(ty), + flags: MemFlags::new(), + from: AMode::Label(label_data), + } + .emit(sink, emit_info, state); + + // Jump over the inline pool + Inst::gen_jump(label_end).emit(sink, emit_info, state); + + // Emit the inline data + sink.bind_label(label_data, &mut state.ctrl_plane); + Inst::RawData { data: data.into() }.emit(sink, emit_info, state); + + sink.bind_label(label_end, &mut state.ctrl_plane); + } + &Inst::FpuRR { + alu_op, + width, + frm, + rd, + rs, + } => { + if alu_op.is_convert_to_int() { + sink.add_trap(TrapCode::BadConversionToInteger); + } + sink.put4(encode_fp_rr(alu_op, width, frm, rd, rs)); + } + &Inst::FpuRRRR { + alu_op, + rd, + rs1, + rs2, + rs3, + frm, + width, + } => { + sink.put4(encode_fp_rrrr(alu_op, width, frm, rd, rs1, rs2, rs3)); + } + &Inst::FpuRRR { + alu_op, + width, + frm, + rd, + rs1, + rs2, + } => { + sink.put4(encode_fp_rrr(alu_op, width, frm, rd, rs1, rs2)); + } + &Inst::Unwind { ref inst } => { + sink.add_unwind(inst.clone()); + } + &Inst::DummyUse { .. } => { + // This has already been handled by Inst::allocate. 
+ } + &Inst::AluRRR { + alu_op, + rd, + rs1, + rs2, + } => { + let (rs1, rs2) = if alu_op.reverse_rs() { + (rs2, rs1) + } else { + (rs1, rs2) + }; + + sink.put4(encode_r_type( + alu_op.op_code(), + rd, + alu_op.funct3(), + rs1, + rs2, + alu_op.funct7(), + )); + } + &Inst::AluRRImm12 { + alu_op, + rd, + rs, + imm12, + } => { + let x = alu_op.op_code() + | reg_to_gpr_num(rd.to_reg()) << 7 + | alu_op.funct3() << 12 + | reg_to_gpr_num(rs) << 15 + | alu_op.imm12(imm12) << 20; + sink.put4(x); + } + &Inst::CsrReg { op, rd, rs, csr } => { + sink.put4(encode_csr_reg(op, rd, rs, csr)); + } + &Inst::CsrImm { op, rd, csr, imm } => { + sink.put4(encode_csr_imm(op, rd, csr, imm)); + } + &Inst::Load { + rd, + op, + from, + flags, + } => { + let base = from.get_base_register(); + let offset = from.get_offset_with_state(state); + let offset_imm12 = Imm12::maybe_from_i64(offset); + let label = from.get_label_with_sink(sink); + + let (addr, imm12) = match (base, offset_imm12, label) { + // When loading from a Reg+Offset, if the offset fits into an imm12 we can directly encode it. + (Some(base), Some(imm12), None) => (base, imm12), + + // Otherwise, if the offset does not fit into a imm12, we need to materialize it into a + // register and load from that. + (Some(_), None, None) => { + let tmp = writable_spilltmp_reg(); + Inst::LoadAddr { rd: tmp, mem: from }.emit(sink, emit_info, state); + (tmp.to_reg(), Imm12::ZERO) + } + + // If the AMode contains a label we can emit an internal relocation that gets + // resolved with the correct address later. + (None, Some(imm), Some(label)) => { + debug_assert_eq!(imm.as_i16(), 0); + + // Get the current PC. + sink.use_label_at_offset(sink.cur_offset(), label, LabelUse::PCRelHi20); + Inst::Auipc { + rd, + imm: Imm20::ZERO, + } + .emit_uncompressed(sink, emit_info, state, start_off); + + // Emit a relocation for the load. This patches the offset into the instruction. + sink.use_label_at_offset(sink.cur_offset(), label, LabelUse::PCRelLo12I); + + // Imm12 here is meaningless since it's going to get replaced. + (rd.to_reg(), Imm12::ZERO) + } + + // These cases are impossible with the current AModes that we have. We either + // always have a register, or always have a label. Never both, and never neither. + (None, None, None) + | (None, Some(_), None) + | (Some(_), None, Some(_)) + | (Some(_), Some(_), Some(_)) + | (None, None, Some(_)) => { + unreachable!("Invalid load address") + } + }; + + if let Some(trap_code) = flags.trap_code() { + // Register the offset at which the actual load instruction starts. + sink.add_trap(trap_code); + } + + sink.put4(encode_i_type(op.op_code(), rd, op.funct3(), addr, imm12)); + } + &Inst::Store { op, src, flags, to } => { + let base = to.get_base_register(); + let offset = to.get_offset_with_state(state); + let offset_imm12 = Imm12::maybe_from_i64(offset); + + let (addr, imm12) = match (base, offset_imm12) { + // If the offset fits into an imm12 we can directly encode it. + (Some(base), Some(imm12)) => (base, imm12), + // Otherwise load the address it into a reg and load from it. + _ => { + let tmp = writable_spilltmp_reg(); + Inst::LoadAddr { rd: tmp, mem: to }.emit(sink, emit_info, state); + (tmp.to_reg(), Imm12::ZERO) + } + }; + + if let Some(trap_code) = flags.trap_code() { + // Register the offset at which the actual load instruction starts. + sink.add_trap(trap_code); + } + + sink.put4(encode_s_type(op.op_code(), op.funct3(), addr, src, imm12)); + } + &Inst::Args { .. } | &Inst::Rets { .. 
} => { + // Nothing: this is a pseudoinstruction that serves + // only to constrain registers at a certain point. + } + &Inst::Ret {} => { + // RISC-V does not have a dedicated ret instruction, instead we emit the equivalent + // `jalr x0, x1, 0` that jumps to the return address. + Inst::Jalr { + rd: writable_zero_reg(), + base: link_reg(), + offset: Imm12::ZERO, + } + .emit(sink, emit_info, state); + } + + &Inst::Extend { + rd, + rn, + signed, + from_bits, + to_bits: _to_bits, + } => { + let mut insts = SmallInstVec::new(); + let shift_bits = (64 - from_bits) as i16; + let is_u8 = || from_bits == 8 && signed == false; + if is_u8() { + // special for u8. + insts.push(Inst::AluRRImm12 { + alu_op: AluOPRRI::Andi, + rd, + rs: rn, + imm12: Imm12::from_i16(255), + }); + } else { + insts.push(Inst::AluRRImm12 { + alu_op: AluOPRRI::Slli, + rd, + rs: rn, + imm12: Imm12::from_i16(shift_bits), + }); + insts.push(Inst::AluRRImm12 { + alu_op: if signed { + AluOPRRI::Srai + } else { + AluOPRRI::Srli + }, + rd, + rs: rd.to_reg(), + imm12: Imm12::from_i16(shift_bits), + }); + } + insts + .into_iter() + .for_each(|i| i.emit(sink, emit_info, state)); + } + + &Inst::Call { ref info } => { + sink.add_call_site(); + sink.add_reloc(Reloc::RiscvCallPlt, &info.dest, 0); + + Inst::construct_auipc_and_jalr(Some(writable_link_reg()), writable_link_reg(), 0) + .into_iter() + .for_each(|i| i.emit_uncompressed(sink, emit_info, state, start_off)); + + if let Some(s) = state.take_stack_map() { + let offset = sink.cur_offset(); + sink.push_user_stack_map(state, offset, s); + } + + let callee_pop_size = i32::try_from(info.callee_pop_size).unwrap(); + if callee_pop_size > 0 { + for inst in Riscv64MachineDeps::gen_sp_reg_adjust(-callee_pop_size) { + inst.emit(sink, emit_info, state); + } + } + } + &Inst::CallInd { ref info } => { + Inst::Jalr { + rd: writable_link_reg(), + base: info.dest, + offset: Imm12::ZERO, + } + .emit(sink, emit_info, state); + + if let Some(s) = state.take_stack_map() { + let offset = sink.cur_offset(); + sink.push_user_stack_map(state, offset, s); + } + + sink.add_call_site(); + + let callee_pop_size = i32::try_from(info.callee_pop_size).unwrap(); + if callee_pop_size > 0 { + for inst in Riscv64MachineDeps::gen_sp_reg_adjust(-callee_pop_size) { + inst.emit(sink, emit_info, state); + } + } + } + + &Inst::ReturnCall { ref info } => { + emit_return_call_common_sequence(sink, emit_info, state, info); + + sink.add_call_site(); + sink.add_reloc(Reloc::RiscvCallPlt, &info.dest, 0); + Inst::construct_auipc_and_jalr(None, writable_spilltmp_reg(), 0) + .into_iter() + .for_each(|i| i.emit_uncompressed(sink, emit_info, state, start_off)); + } + + &Inst::ReturnCallInd { ref info } => { + emit_return_call_common_sequence(sink, emit_info, state, &info); + + Inst::Jalr { + rd: writable_zero_reg(), + base: info.dest, + offset: Imm12::ZERO, + } + .emit(sink, emit_info, state); + } + &Inst::Jal { label } => { + sink.use_label_at_offset(*start_off, label, LabelUse::Jal20); + sink.add_uncond_branch(*start_off, *start_off + 4, label); + sink.put4(0b1101111); + } + &Inst::CondBr { + taken, + not_taken, + kind, + } => { + match taken { + CondBrTarget::Label(label) => { + let code = kind.emit(); + let code_inverse = kind.inverse().emit().to_le_bytes(); + sink.use_label_at_offset(*start_off, label, LabelUse::B12); + sink.add_cond_branch(*start_off, *start_off + 4, label, &code_inverse); + sink.put4(code); + } + CondBrTarget::Fallthrough => panic!("Cannot fallthrough in taken target"), + } + + match not_taken { + 
CondBrTarget::Label(label) => { + Inst::gen_jump(label).emit(sink, emit_info, state) + } + CondBrTarget::Fallthrough => {} + }; + } + + &Inst::Mov { rd, rm, ty } => { + debug_assert_eq!(rd.to_reg().class(), rm.class()); + if rd.to_reg() == rm { + return; + } + + match rm.class() { + RegClass::Int => Inst::AluRRImm12 { + alu_op: AluOPRRI::Addi, + rd: rd, + rs: rm, + imm12: Imm12::ZERO, + }, + RegClass::Float => Inst::FpuRRR { + alu_op: FpuOPRRR::Fsgnj, + width: FpuOPWidth::try_from(ty).unwrap(), + frm: FRM::RNE, + rd: rd, + rs1: rm, + rs2: rm, + }, + RegClass::Vector => Inst::VecAluRRImm5 { + op: VecAluOpRRImm5::VmvrV, + vd: rd, + vs2: rm, + // Imm 0 means copy 1 register. + imm: Imm5::maybe_from_i8(0).unwrap(), + mask: VecOpMasking::Disabled, + // Vstate for this instruction is ignored. + vstate: VState::from_type(ty), + }, + } + .emit(sink, emit_info, state); + } + + &Inst::MovFromPReg { rd, rm } => { + Inst::gen_move(rd, Reg::from(rm), I64).emit(sink, emit_info, state); + } + + &Inst::BrTable { + index, + tmp1, + tmp2, + ref targets, + } => { + let ext_index = writable_spilltmp_reg(); + + let label_compute_target = sink.get_label(); + + // The default target is passed in as the 0th element of `targets` + // separate it here for clarity. + let default_target = targets[0]; + let targets = &targets[1..]; + + // We are going to potentially emit a large amount of instructions, so ensure that we emit an island + // now if we need one. + // + // The worse case PC calculations are 12 instructions. And each entry in the jump table is 2 instructions. + // Check if we need to emit a jump table here to support that jump. + let inst_count = 12 + (targets.len() * 2); + let distance = (inst_count * Inst::UNCOMPRESSED_INSTRUCTION_SIZE as usize) as u32; + if sink.island_needed(distance) { + let jump_around_label = sink.get_label(); + Inst::gen_jump(jump_around_label).emit(sink, emit_info, state); + sink.emit_island(distance + 4, &mut state.ctrl_plane); + sink.bind_label(jump_around_label, &mut state.ctrl_plane); + } + + // We emit a bounds check on the index, if the index is larger than the number of + // jump table entries, we jump to the default block. Otherwise we compute a jump + // offset by multiplying the index by 8 (the size of each entry) and then jump to + // that offset. Each jump table entry is a regular auipc+jalr which we emit sequentially. + // + // Build the following sequence: + // + // extend_index: + // zext.w ext_index, index + // bounds_check: + // li tmp, n_labels + // bltu ext_index, tmp, compute_target + // jump_to_default_block: + // auipc pc, 0 + // jalr zero, pc, default_block + // compute_target: + // auipc pc, 0 + // slli tmp, ext_index, 3 + // add pc, pc, tmp + // jalr zero, pc, 0x10 + // jump_table: + // ; This repeats for each entry in the jumptable + // auipc pc, 0 + // jalr zero, pc, block_target + + // Extend the index to 64 bits. + // + // This prevents us branching on the top 32 bits of the index, which + // are undefined. + Inst::Extend { + rd: ext_index, + rn: index, + signed: false, + from_bits: 32, + to_bits: 64, + } + .emit(sink, emit_info, state); + + // Bounds check. + // + // Check if the index passed in is larger than the number of jumptable + // entries that we have. If it is, we fallthrough to a jump into the + // default block. 
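+ // For example, with three non-default targets, indices 0..=2 branch to
+ // `compute_target` below, while any index >= 3 falls through to the
+ // unconditional jump to the default block.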
+ Inst::load_constant_u32(tmp2, targets.len() as u64) + .iter() + .for_each(|i| i.emit(sink, emit_info, state)); + Inst::CondBr { + taken: CondBrTarget::Label(label_compute_target), + not_taken: CondBrTarget::Fallthrough, + kind: IntegerCompare { + kind: IntCC::UnsignedLessThan, + rs1: ext_index.to_reg(), + rs2: tmp2.to_reg(), + }, + } + .emit(sink, emit_info, state); + + sink.use_label_at_offset(sink.cur_offset(), default_target, LabelUse::PCRel32); + Inst::construct_auipc_and_jalr(None, tmp2, 0) + .iter() + .for_each(|i| i.emit_uncompressed(sink, emit_info, state, start_off)); + + // Compute the jump table offset. + // We need to emit a PC relative offset, + sink.bind_label(label_compute_target, &mut state.ctrl_plane); + + // Get the current PC. + Inst::Auipc { + rd: tmp1, + imm: Imm20::ZERO, + } + .emit_uncompressed(sink, emit_info, state, start_off); + + // These instructions must be emitted as uncompressed since we + // are manually computing the offset from the PC. + + // Multiply the index by 8, since that is the size in + // bytes of each jump table entry + Inst::AluRRImm12 { + alu_op: AluOPRRI::Slli, + rd: tmp2, + rs: ext_index.to_reg(), + imm12: Imm12::from_i16(3), + } + .emit_uncompressed(sink, emit_info, state, start_off); + + // Calculate the base of the jump, PC + the offset from above. + Inst::AluRRR { + alu_op: AluOPRRR::Add, + rd: tmp1, + rs1: tmp1.to_reg(), + rs2: tmp2.to_reg(), + } + .emit_uncompressed(sink, emit_info, state, start_off); + + // Jump to the middle of the jump table. + // We add a 16 byte offset here, since we used 4 instructions + // since the AUIPC that was used to get the PC. + Inst::Jalr { + rd: writable_zero_reg(), + base: tmp1.to_reg(), + offset: Imm12::from_i16((4 * Inst::UNCOMPRESSED_INSTRUCTION_SIZE) as i16), + } + .emit_uncompressed(sink, emit_info, state, start_off); + + // Emit the jump table. + // + // Each entry is a auipc + jalr to the target block. We also start with a island + // if necessary. + + // Emit the jumps back to back + for target in targets.iter() { + sink.use_label_at_offset(sink.cur_offset(), *target, LabelUse::PCRel32); + + Inst::construct_auipc_and_jalr(None, tmp2, 0) + .iter() + .for_each(|i| i.emit_uncompressed(sink, emit_info, state, start_off)); + } + + // We've just emitted an island that is safe up to *here*. + // Mark it as such so that we don't needlessly emit additional islands. 
+ *start_off = sink.cur_offset(); + } + + &Inst::Atomic { + op, + rd, + addr, + src, + amo, + } => { + // TODO: get flags from original CLIF atomic instruction + let flags = MemFlags::new(); + if let Some(trap_code) = flags.trap_code() { + sink.add_trap(trap_code); + } + let x = op.op_code() + | reg_to_gpr_num(rd.to_reg()) << 7 + | op.funct3() << 12 + | reg_to_gpr_num(addr) << 15 + | reg_to_gpr_num(src) << 20 + | op.funct7(amo) << 25; + + sink.put4(x); + } + &Inst::Fence { pred, succ } => { + let x = 0b0001111 + | 0b00000 << 7 + | 0b000 << 12 + | 0b00000 << 15 + | (succ as u32) << 20 + | (pred as u32) << 24; + + sink.put4(x); + } + &Inst::Auipc { rd, imm } => { + sink.put4(enc_auipc(rd, imm)); + } + + &Inst::LoadAddr { rd, mem } => { + let base = mem.get_base_register(); + let offset = mem.get_offset_with_state(state); + let offset_imm12 = Imm12::maybe_from_i64(offset); + + match (mem, base, offset_imm12) { + (_, Some(rs), Some(imm12)) => { + Inst::AluRRImm12 { + alu_op: AluOPRRI::Addi, + rd, + rs, + imm12, + } + .emit(sink, emit_info, state); + } + (_, Some(rs), None) => { + let mut insts = Inst::load_constant_u64(rd, offset as u64); + insts.push(Inst::AluRRR { + alu_op: AluOPRRR::Add, + rd, + rs1: rd.to_reg(), + rs2: rs, + }); + insts + .into_iter() + .for_each(|inst| inst.emit(sink, emit_info, state)); + } + (AMode::Const(addr), None, _) => { + // Get an address label for the constant and recurse. + let label = sink.get_label_for_constant(addr); + Inst::LoadAddr { + rd, + mem: AMode::Label(label), + } + .emit(sink, emit_info, state); + } + (AMode::Label(label), None, _) => { + // Get the current PC. + sink.use_label_at_offset(sink.cur_offset(), label, LabelUse::PCRelHi20); + let inst = Inst::Auipc { + rd, + imm: Imm20::ZERO, + }; + inst.emit_uncompressed(sink, emit_info, state, start_off); + + // Emit an add to the address with a relocation. + // This later gets patched up with the correct offset. + sink.use_label_at_offset(sink.cur_offset(), label, LabelUse::PCRelLo12I); + Inst::AluRRImm12 { + alu_op: AluOPRRI::Addi, + rd, + rs: rd.to_reg(), + imm12: Imm12::ZERO, + } + .emit_uncompressed(sink, emit_info, state, start_off); + } + (amode, _, _) => { + unimplemented!("LoadAddr: {:?}", amode); + } + } + } + + &Inst::Select { + ref dst, + condition, + ref x, + ref y, + } => { + // The general form for this select is the following: + // + // mv rd, x + // b{cond} rcond, label_end + // mv rd, y + // label_end: + // ... etc + // + // This is built on the assumption that moves are cheap, but branches and jumps + // are not. So with this format we always avoid one jump instruction at the expense + // of an unconditional move. + // + // We also perform another optimization here. If the destination register is the same + // as one of the input registers, we can avoid emitting the first unconditional move + // and emit just the branch and the second move. + // + // To make sure that this happens as often as possible, we also try to invert the + // condition, so that if either of the input registers are the same as the destination + // we avoid that move. + + let label_end = sink.get_label(); + + let xregs = x.regs(); + let yregs = y.regs(); + let dstregs: Vec = dst.regs().into_iter().map(|r| r.to_reg()).collect(); + let condregs = condition.regs(); + + // We are going to write to the destination register before evaluating + // the condition, so we need to make sure that the destination register + // is not one of the condition registers. 
+ // + // This should never happen, since hopefully the regalloc constraints + // for this register are set up correctly. + debug_assert_ne!(dstregs, condregs); + + // Check if we can invert the condition and avoid moving the y registers into + // the destination. This allows us to only emit the branch and one of the moves. + let (uncond_move, cond_move, condition) = if yregs == dstregs { + (yregs, xregs, condition.inverse()) + } else { + (xregs, yregs, condition) + }; + + // Unconditionally move one of the values to the destination register. + // + // These moves may not end up being emitted if the source and + // destination registers are the same. That logic is built into + // the emit function for `Inst::Mov`. + for i in gen_moves(dst.regs(), uncond_move) { + i.emit(sink, emit_info, state); + } + + // If the condition passes we skip over the conditional move + Inst::CondBr { + taken: CondBrTarget::Label(label_end), + not_taken: CondBrTarget::Fallthrough, + kind: condition, + } + .emit(sink, emit_info, state); + + // Move the conditional value to the destination register. + for i in gen_moves(dst.regs(), cond_move) { + i.emit(sink, emit_info, state); + } + + sink.bind_label(label_end, &mut state.ctrl_plane); + } + &Inst::Jalr { rd, base, offset } => { + sink.put4(enc_jalr(rd, base, offset)); + } + &Inst::EBreak => { + sink.put4(0x00100073); + } + &Inst::AtomicCas { + offset, + t0, + dst, + e, + addr, + v, + ty, + } => { + // # addr holds address of memory location + // # e holds expected value + // # v holds desired value + // # dst holds return value + // cas: + // lr.w dst, (addr) # Load original value. + // bne dst, e, fail # Doesn’t match, so fail. + // sc.w t0, v, (addr) # Try to update. + // bnez t0 , cas # if store not ok,retry. + // fail: + let fail_label = sink.get_label(); + let cas_lebel = sink.get_label(); + sink.bind_label(cas_lebel, &mut state.ctrl_plane); + Inst::Atomic { + op: AtomicOP::load_op(ty), + rd: dst, + addr, + src: zero_reg(), + amo: AMO::SeqCst, + } + .emit(sink, emit_info, state); + if ty.bits() < 32 { + AtomicOP::extract(dst, offset, dst.to_reg(), ty) + .iter() + .for_each(|i| i.emit(sink, emit_info, state)); + } else if ty.bits() == 32 { + Inst::Extend { + rd: dst, + rn: dst.to_reg(), + signed: false, + from_bits: 32, + to_bits: 64, + } + .emit(sink, emit_info, state); + } + Inst::CondBr { + taken: CondBrTarget::Label(fail_label), + not_taken: CondBrTarget::Fallthrough, + kind: IntegerCompare { + kind: IntCC::NotEqual, + rs1: e, + rs2: dst.to_reg(), + }, + } + .emit(sink, emit_info, state); + let store_value = if ty.bits() < 32 { + // reload value to t0. + Inst::Atomic { + op: AtomicOP::load_op(ty), + rd: t0, + addr, + src: zero_reg(), + amo: AMO::SeqCst, + } + .emit(sink, emit_info, state); + // set reset part. + AtomicOP::merge(t0, writable_spilltmp_reg(), offset, v, ty) + .iter() + .for_each(|i| i.emit(sink, emit_info, state)); + t0.to_reg() + } else { + v + }; + Inst::Atomic { + op: AtomicOP::store_op(ty), + rd: t0, + addr, + src: store_value, + amo: AMO::SeqCst, + } + .emit(sink, emit_info, state); + // check is our value stored. 
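+ // `sc.w`/`sc.d` writes 0 to its destination on success and a non-zero code
+ // on failure, so this is effectively `bnez t0, cas`: retry until the
+ // store-conditional succeeds.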
+ Inst::CondBr { + taken: CondBrTarget::Label(cas_lebel), + not_taken: CondBrTarget::Fallthrough, + kind: IntegerCompare { + kind: IntCC::NotEqual, + rs1: t0.to_reg(), + rs2: zero_reg(), + }, + } + .emit(sink, emit_info, state); + sink.bind_label(fail_label, &mut state.ctrl_plane); + } + &Inst::AtomicRmwLoop { + offset, + op, + dst, + ty, + p, + x, + t0, + } => { + let retry = sink.get_label(); + sink.bind_label(retry, &mut state.ctrl_plane); + // load old value. + Inst::Atomic { + op: AtomicOP::load_op(ty), + rd: dst, + addr: p, + src: zero_reg(), + amo: AMO::SeqCst, + } + .emit(sink, emit_info, state); + // + + let store_value: Reg = match op { + crate::ir::AtomicRmwOp::Add + | crate::ir::AtomicRmwOp::Sub + | crate::ir::AtomicRmwOp::And + | crate::ir::AtomicRmwOp::Or + | crate::ir::AtomicRmwOp::Xor => { + AtomicOP::extract(dst, offset, dst.to_reg(), ty) + .iter() + .for_each(|i| i.emit(sink, emit_info, state)); + Inst::AluRRR { + alu_op: match op { + crate::ir::AtomicRmwOp::Add => AluOPRRR::Add, + crate::ir::AtomicRmwOp::Sub => AluOPRRR::Sub, + crate::ir::AtomicRmwOp::And => AluOPRRR::And, + crate::ir::AtomicRmwOp::Or => AluOPRRR::Or, + crate::ir::AtomicRmwOp::Xor => AluOPRRR::Xor, + _ => unreachable!(), + }, + rd: t0, + rs1: dst.to_reg(), + rs2: x, + } + .emit(sink, emit_info, state); + Inst::Atomic { + op: AtomicOP::load_op(ty), + rd: writable_spilltmp_reg2(), + addr: p, + src: zero_reg(), + amo: AMO::SeqCst, + } + .emit(sink, emit_info, state); + AtomicOP::merge( + writable_spilltmp_reg2(), + writable_spilltmp_reg(), + offset, + t0.to_reg(), + ty, + ) + .iter() + .for_each(|i| i.emit(sink, emit_info, state)); + spilltmp_reg2() + } + crate::ir::AtomicRmwOp::Nand => { + if ty.bits() < 32 { + AtomicOP::extract(dst, offset, dst.to_reg(), ty) + .iter() + .for_each(|i| i.emit(sink, emit_info, state)); + } + Inst::AluRRR { + alu_op: AluOPRRR::And, + rd: t0, + rs1: x, + rs2: dst.to_reg(), + } + .emit(sink, emit_info, state); + Inst::construct_bit_not(t0, t0.to_reg()).emit(sink, emit_info, state); + if ty.bits() < 32 { + Inst::Atomic { + op: AtomicOP::load_op(ty), + rd: writable_spilltmp_reg2(), + addr: p, + src: zero_reg(), + amo: AMO::SeqCst, + } + .emit(sink, emit_info, state); + AtomicOP::merge( + writable_spilltmp_reg2(), + writable_spilltmp_reg(), + offset, + t0.to_reg(), + ty, + ) + .iter() + .for_each(|i| i.emit(sink, emit_info, state)); + spilltmp_reg2() + } else { + t0.to_reg() + } + } + + crate::ir::AtomicRmwOp::Umin + | crate::ir::AtomicRmwOp::Umax + | crate::ir::AtomicRmwOp::Smin + | crate::ir::AtomicRmwOp::Smax => { + let label_select_dst = sink.get_label(); + let label_select_done = sink.get_label(); + if op == crate::ir::AtomicRmwOp::Umin || op == crate::ir::AtomicRmwOp::Umax + { + AtomicOP::extract(dst, offset, dst.to_reg(), ty) + } else { + AtomicOP::extract_sext(dst, offset, dst.to_reg(), ty) + } + .iter() + .for_each(|i| i.emit(sink, emit_info, state)); + + Inst::CondBr { + taken: CondBrTarget::Label(label_select_dst), + not_taken: CondBrTarget::Fallthrough, + kind: IntegerCompare { + kind: match op { + crate::ir::AtomicRmwOp::Umin => IntCC::UnsignedLessThan, + crate::ir::AtomicRmwOp::Umax => IntCC::UnsignedGreaterThan, + crate::ir::AtomicRmwOp::Smin => IntCC::SignedLessThan, + crate::ir::AtomicRmwOp::Smax => IntCC::SignedGreaterThan, + _ => unreachable!(), + }, + rs1: dst.to_reg(), + rs2: x, + }, + } + .emit(sink, emit_info, state); + // here we select x. 
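+ // i.e. the loaded value did not win the comparison above (for Umin it was
+ // not `< x`), so `x` is the value that should be written back.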
+ Inst::gen_move(t0, x, I64).emit(sink, emit_info, state); + Inst::gen_jump(label_select_done).emit(sink, emit_info, state); + sink.bind_label(label_select_dst, &mut state.ctrl_plane); + Inst::gen_move(t0, dst.to_reg(), I64).emit(sink, emit_info, state); + sink.bind_label(label_select_done, &mut state.ctrl_plane); + Inst::Atomic { + op: AtomicOP::load_op(ty), + rd: writable_spilltmp_reg2(), + addr: p, + src: zero_reg(), + amo: AMO::SeqCst, + } + .emit(sink, emit_info, state); + AtomicOP::merge( + writable_spilltmp_reg2(), + writable_spilltmp_reg(), + offset, + t0.to_reg(), + ty, + ) + .iter() + .for_each(|i| i.emit(sink, emit_info, state)); + spilltmp_reg2() + } + crate::ir::AtomicRmwOp::Xchg => { + AtomicOP::extract(dst, offset, dst.to_reg(), ty) + .iter() + .for_each(|i| i.emit(sink, emit_info, state)); + Inst::Atomic { + op: AtomicOP::load_op(ty), + rd: writable_spilltmp_reg2(), + addr: p, + src: zero_reg(), + amo: AMO::SeqCst, + } + .emit(sink, emit_info, state); + AtomicOP::merge( + writable_spilltmp_reg2(), + writable_spilltmp_reg(), + offset, + x, + ty, + ) + .iter() + .for_each(|i| i.emit(sink, emit_info, state)); + spilltmp_reg2() + } + }; + + Inst::Atomic { + op: AtomicOP::store_op(ty), + rd: t0, + addr: p, + src: store_value, + amo: AMO::SeqCst, + } + .emit(sink, emit_info, state); + + // if store is not ok,retry. + Inst::CondBr { + taken: CondBrTarget::Label(retry), + not_taken: CondBrTarget::Fallthrough, + kind: IntegerCompare { + kind: IntCC::NotEqual, + rs1: t0.to_reg(), + rs2: zero_reg(), + }, + } + .emit(sink, emit_info, state); + } + + &Inst::LoadExtName { + rd, + ref name, + offset, + } => { + if emit_info.shared_flag.is_pic() { + // Load a PC-relative address into a register. + // RISC-V does this slightly differently from other arches. We emit a relocation + // with a label, instead of the symbol itself. + // + // See: https://github.com/riscv-non-isa/riscv-elf-psabi-doc/blob/master/riscv-elf.adoc#pc-relative-symbol-addresses + // + // Emit the following code: + // label: + // auipc rd, 0 # R_RISCV_GOT_HI20 (symbol_name) + // ld rd, rd, 0 # R_RISCV_PCREL_LO12_I (label) + + // Create the label that is going to be published to the final binary object. + let auipc_label = sink.get_label(); + sink.bind_label(auipc_label, &mut state.ctrl_plane); + + // Get the current PC. + sink.add_reloc(Reloc::RiscvGotHi20, &**name, 0); + Inst::Auipc { + rd: rd, + imm: Imm20::from_i32(0), + } + .emit_uncompressed(sink, emit_info, state, start_off); + + // The `ld` here, points to the `auipc` label instead of directly to the symbol. + sink.add_reloc(Reloc::RiscvPCRelLo12I, &auipc_label, 0); + Inst::Load { + rd, + op: LoadOP::Ld, + flags: MemFlags::trusted(), + from: AMode::RegOffset(rd.to_reg(), 0), + } + .emit_uncompressed(sink, emit_info, state, start_off); + } else { + // In the non PIC sequence we relocate the absolute address into + // a prealocatted space, load it into a register and jump over it. 
+ // + // Emit the following code: + // ld rd, label_data + // j label_end + // label_data: + // <8 byte space> # ABS8 + // label_end: + + let label_data = sink.get_label(); + let label_end = sink.get_label(); + + // Load the value from a label + Inst::Load { + rd, + op: LoadOP::Ld, + flags: MemFlags::trusted(), + from: AMode::Label(label_data), + } + .emit(sink, emit_info, state); + + // Jump over the data + Inst::gen_jump(label_end).emit(sink, emit_info, state); + + sink.bind_label(label_data, &mut state.ctrl_plane); + sink.add_reloc(Reloc::Abs8, name.as_ref(), offset); + sink.put8(0); + + sink.bind_label(label_end, &mut state.ctrl_plane); + } + } + + &Inst::ElfTlsGetAddr { rd, ref name } => { + // RISC-V's TLS GD model is slightly different from other arches. + // + // We have a relocation (R_RISCV_TLS_GD_HI20) that loads the high 20 bits + // of the address relative to the GOT entry. This relocation points to + // the symbol as usual. + // + // However when loading the bottom 12bits of the address, we need to + // use a label that points to the previous AUIPC instruction. + // + // label: + // auipc a0,0 # R_RISCV_TLS_GD_HI20 (symbol) + // addi a0,a0,0 # R_RISCV_PCREL_LO12_I (label) + // + // https://github.com/riscv-non-isa/riscv-elf-psabi-doc/blob/master/riscv-elf.adoc#global-dynamic + + // Create the label that is going to be published to the final binary object. + let auipc_label = sink.get_label(); + sink.bind_label(auipc_label, &mut state.ctrl_plane); + + // Get the current PC. + sink.add_reloc(Reloc::RiscvTlsGdHi20, &**name, 0); + Inst::Auipc { + rd: rd, + imm: Imm20::from_i32(0), + } + .emit_uncompressed(sink, emit_info, state, start_off); + + // The `addi` here, points to the `auipc` label instead of directly to the symbol. + sink.add_reloc(Reloc::RiscvPCRelLo12I, &auipc_label, 0); + Inst::AluRRImm12 { + alu_op: AluOPRRI::Addi, + rd: rd, + rs: rd.to_reg(), + imm12: Imm12::from_i16(0), + } + .emit_uncompressed(sink, emit_info, state, start_off); + + Inst::Call { + info: Box::new(CallInfo::empty( + ExternalName::LibCall(LibCall::ElfTlsGetAddr), + CallConv::SystemV, + )), + } + .emit_uncompressed(sink, emit_info, state, start_off); + } + + &Inst::TrapIf { + rs1, + rs2, + cc, + trap_code, + } => { + let label_end = sink.get_label(); + let cond = IntegerCompare { kind: cc, rs1, rs2 }; + + // Jump over the trap if we the condition is false. + Inst::CondBr { + taken: CondBrTarget::Label(label_end), + not_taken: CondBrTarget::Fallthrough, + kind: cond.inverse(), + } + .emit(sink, emit_info, state); + Inst::Udf { trap_code }.emit(sink, emit_info, state); + + sink.bind_label(label_end, &mut state.ctrl_plane); + } + &Inst::Udf { trap_code } => { + sink.add_trap(trap_code); + sink.put_data(Inst::TRAP_OPCODE); + } + &Inst::AtomicLoad { rd, ty, p } => { + // emit the fence. + Inst::Fence { + pred: Inst::FENCE_REQ_R | Inst::FENCE_REQ_W, + succ: Inst::FENCE_REQ_R | Inst::FENCE_REQ_W, + } + .emit(sink, emit_info, state); + // load. 
+ Inst::Load { + rd: rd, + op: LoadOP::from_type(ty), + flags: MemFlags::new(), + from: AMode::RegOffset(p, 0), + } + .emit(sink, emit_info, state); + Inst::Fence { + pred: Inst::FENCE_REQ_R, + succ: Inst::FENCE_REQ_R | Inst::FENCE_REQ_W, + } + .emit(sink, emit_info, state); + } + &Inst::AtomicStore { src, ty, p } => { + Inst::Fence { + pred: Inst::FENCE_REQ_R | Inst::FENCE_REQ_W, + succ: Inst::FENCE_REQ_W, + } + .emit(sink, emit_info, state); + Inst::Store { + to: AMode::RegOffset(p, 0), + op: StoreOP::from_type(ty), + flags: MemFlags::new(), + src, + } + .emit(sink, emit_info, state); + } + + &Inst::Popcnt { + sum, + tmp, + step, + rs, + ty, + } => { + // load 0 to sum , init. + Inst::gen_move(sum, zero_reg(), I64).emit(sink, emit_info, state); + // load + Inst::load_imm12(step, Imm12::from_i16(ty.bits() as i16)) + .emit(sink, emit_info, state); + // + Inst::load_imm12(tmp, Imm12::ONE).emit(sink, emit_info, state); + Inst::AluRRImm12 { + alu_op: AluOPRRI::Slli, + rd: tmp, + rs: tmp.to_reg(), + imm12: Imm12::from_i16((ty.bits() - 1) as i16), + } + .emit(sink, emit_info, state); + let label_done = sink.get_label(); + let label_loop = sink.get_label(); + sink.bind_label(label_loop, &mut state.ctrl_plane); + Inst::CondBr { + taken: CondBrTarget::Label(label_done), + not_taken: CondBrTarget::Fallthrough, + kind: IntegerCompare { + kind: IntCC::SignedLessThanOrEqual, + rs1: step.to_reg(), + rs2: zero_reg(), + }, + } + .emit(sink, emit_info, state); + // test and add sum. + { + Inst::AluRRR { + alu_op: AluOPRRR::And, + rd: writable_spilltmp_reg2(), + rs1: tmp.to_reg(), + rs2: rs, + } + .emit(sink, emit_info, state); + let label_over = sink.get_label(); + Inst::CondBr { + taken: CondBrTarget::Label(label_over), + not_taken: CondBrTarget::Fallthrough, + kind: IntegerCompare { + kind: IntCC::Equal, + rs1: zero_reg(), + rs2: spilltmp_reg2(), + }, + } + .emit(sink, emit_info, state); + Inst::AluRRImm12 { + alu_op: AluOPRRI::Addi, + rd: sum, + rs: sum.to_reg(), + imm12: Imm12::ONE, + } + .emit(sink, emit_info, state); + sink.bind_label(label_over, &mut state.ctrl_plane); + } + // set step and tmp. + { + Inst::AluRRImm12 { + alu_op: AluOPRRI::Addi, + rd: step, + rs: step.to_reg(), + imm12: Imm12::from_i16(-1), + } + .emit(sink, emit_info, state); + Inst::AluRRImm12 { + alu_op: AluOPRRI::Srli, + rd: tmp, + rs: tmp.to_reg(), + imm12: Imm12::ONE, + } + .emit(sink, emit_info, state); + Inst::gen_jump(label_loop).emit(sink, emit_info, state); + } + sink.bind_label(label_done, &mut state.ctrl_plane); + } + &Inst::Cltz { + sum, + tmp, + step, + rs, + leading, + ty, + } => { + // load 0 to sum , init. + Inst::gen_move(sum, zero_reg(), I64).emit(sink, emit_info, state); + // load + Inst::load_imm12(step, Imm12::from_i16(ty.bits() as i16)) + .emit(sink, emit_info, state); + // + Inst::load_imm12(tmp, Imm12::ONE).emit(sink, emit_info, state); + if leading { + Inst::AluRRImm12 { + alu_op: AluOPRRI::Slli, + rd: tmp, + rs: tmp.to_reg(), + imm12: Imm12::from_i16((ty.bits() - 1) as i16), + } + .emit(sink, emit_info, state); + } + let label_done = sink.get_label(); + let label_loop = sink.get_label(); + sink.bind_label(label_loop, &mut state.ctrl_plane); + Inst::CondBr { + taken: CondBrTarget::Label(label_done), + not_taken: CondBrTarget::Fallthrough, + kind: IntegerCompare { + kind: IntCC::SignedLessThanOrEqual, + rs1: step.to_reg(), + rs2: zero_reg(), + }, + } + .emit(sink, emit_info, state); + // test and add sum. 
+ { + Inst::AluRRR { + alu_op: AluOPRRR::And, + rd: writable_spilltmp_reg2(), + rs1: tmp.to_reg(), + rs2: rs, + } + .emit(sink, emit_info, state); + Inst::CondBr { + taken: CondBrTarget::Label(label_done), + not_taken: CondBrTarget::Fallthrough, + kind: IntegerCompare { + kind: IntCC::NotEqual, + rs1: zero_reg(), + rs2: spilltmp_reg2(), + }, + } + .emit(sink, emit_info, state); + Inst::AluRRImm12 { + alu_op: AluOPRRI::Addi, + rd: sum, + rs: sum.to_reg(), + imm12: Imm12::ONE, + } + .emit(sink, emit_info, state); + } + // set step and tmp. + { + Inst::AluRRImm12 { + alu_op: AluOPRRI::Addi, + rd: step, + rs: step.to_reg(), + imm12: Imm12::from_i16(-1), + } + .emit(sink, emit_info, state); + Inst::AluRRImm12 { + alu_op: if leading { + AluOPRRI::Srli + } else { + AluOPRRI::Slli + }, + rd: tmp, + rs: tmp.to_reg(), + imm12: Imm12::ONE, + } + .emit(sink, emit_info, state); + Inst::gen_jump(label_loop).emit(sink, emit_info, state); + } + sink.bind_label(label_done, &mut state.ctrl_plane); + } + &Inst::Brev8 { + rs, + ty, + step, + tmp, + tmp2, + rd, + } => { + Inst::gen_move(rd, zero_reg(), I64).emit(sink, emit_info, state); + Inst::load_imm12(step, Imm12::from_i16(ty.bits() as i16)) + .emit(sink, emit_info, state); + // + Inst::load_imm12(tmp, Imm12::ONE).emit(sink, emit_info, state); + Inst::AluRRImm12 { + alu_op: AluOPRRI::Slli, + rd: tmp, + rs: tmp.to_reg(), + imm12: Imm12::from_i16((ty.bits() - 1) as i16), + } + .emit(sink, emit_info, state); + Inst::load_imm12(tmp2, Imm12::ONE).emit(sink, emit_info, state); + Inst::AluRRImm12 { + alu_op: AluOPRRI::Slli, + rd: tmp2, + rs: tmp2.to_reg(), + imm12: Imm12::from_i16((ty.bits() - 8) as i16), + } + .emit(sink, emit_info, state); + + let label_done = sink.get_label(); + let label_loop = sink.get_label(); + sink.bind_label(label_loop, &mut state.ctrl_plane); + Inst::CondBr { + taken: CondBrTarget::Label(label_done), + not_taken: CondBrTarget::Fallthrough, + kind: IntegerCompare { + kind: IntCC::SignedLessThanOrEqual, + rs1: step.to_reg(), + rs2: zero_reg(), + }, + } + .emit(sink, emit_info, state); + // test and set bit. + { + Inst::AluRRR { + alu_op: AluOPRRR::And, + rd: writable_spilltmp_reg2(), + rs1: tmp.to_reg(), + rs2: rs, + } + .emit(sink, emit_info, state); + let label_over = sink.get_label(); + Inst::CondBr { + taken: CondBrTarget::Label(label_over), + not_taken: CondBrTarget::Fallthrough, + kind: IntegerCompare { + kind: IntCC::Equal, + rs1: zero_reg(), + rs2: spilltmp_reg2(), + }, + } + .emit(sink, emit_info, state); + Inst::AluRRR { + alu_op: AluOPRRR::Or, + rd: rd, + rs1: rd.to_reg(), + rs2: tmp2.to_reg(), + } + .emit(sink, emit_info, state); + sink.bind_label(label_over, &mut state.ctrl_plane); + } + // set step and tmp. 
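+            // `tmp` walks the source bits from MSB to LSB while `tmp2` tracks the
+            // destination bit: it shifts left within a byte and drops by 15 positions at
+            // each byte boundary, which reverses the bits within every byte.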
+ { + Inst::AluRRImm12 { + alu_op: AluOPRRI::Addi, + rd: step, + rs: step.to_reg(), + imm12: Imm12::from_i16(-1), + } + .emit(sink, emit_info, state); + Inst::AluRRImm12 { + alu_op: AluOPRRI::Srli, + rd: tmp, + rs: tmp.to_reg(), + imm12: Imm12::ONE, + } + .emit(sink, emit_info, state); + { + // reset tmp2 + // if (step %=8 == 0) then tmp2 = tmp2 >> 15 + // if (step %=8 != 0) then tmp2 = tmp2 << 1 + let label_over = sink.get_label(); + let label_sll_1 = sink.get_label(); + Inst::load_imm12(writable_spilltmp_reg2(), Imm12::from_i16(8)) + .emit(sink, emit_info, state); + Inst::AluRRR { + alu_op: AluOPRRR::Rem, + rd: writable_spilltmp_reg2(), + rs1: step.to_reg(), + rs2: spilltmp_reg2(), + } + .emit(sink, emit_info, state); + Inst::CondBr { + taken: CondBrTarget::Label(label_sll_1), + not_taken: CondBrTarget::Fallthrough, + kind: IntegerCompare { + kind: IntCC::NotEqual, + rs1: spilltmp_reg2(), + rs2: zero_reg(), + }, + } + .emit(sink, emit_info, state); + Inst::AluRRImm12 { + alu_op: AluOPRRI::Srli, + rd: tmp2, + rs: tmp2.to_reg(), + imm12: Imm12::from_i16(15), + } + .emit(sink, emit_info, state); + Inst::gen_jump(label_over).emit(sink, emit_info, state); + sink.bind_label(label_sll_1, &mut state.ctrl_plane); + Inst::AluRRImm12 { + alu_op: AluOPRRI::Slli, + rd: tmp2, + rs: tmp2.to_reg(), + imm12: Imm12::ONE, + } + .emit(sink, emit_info, state); + sink.bind_label(label_over, &mut state.ctrl_plane); + } + Inst::gen_jump(label_loop).emit(sink, emit_info, state); + } + sink.bind_label(label_done, &mut state.ctrl_plane); + } + &Inst::StackProbeLoop { + guard_size, + probe_count, + tmp: guard_size_tmp, + } => { + let step = writable_spilltmp_reg(); + Inst::load_constant_u64(step, (guard_size as u64) * (probe_count as u64)) + .iter() + .for_each(|i| i.emit(sink, emit_info, state)); + Inst::load_constant_u64(guard_size_tmp, guard_size as u64) + .iter() + .for_each(|i| i.emit(sink, emit_info, state)); + + let loop_start = sink.get_label(); + let label_done = sink.get_label(); + sink.bind_label(loop_start, &mut state.ctrl_plane); + Inst::CondBr { + taken: CondBrTarget::Label(label_done), + not_taken: CondBrTarget::Fallthrough, + kind: IntegerCompare { + kind: IntCC::UnsignedLessThanOrEqual, + rs1: step.to_reg(), + rs2: guard_size_tmp.to_reg(), + }, + } + .emit(sink, emit_info, state); + // compute address. + Inst::AluRRR { + alu_op: AluOPRRR::Sub, + rd: writable_spilltmp_reg2(), + rs1: stack_reg(), + rs2: step.to_reg(), + } + .emit(sink, emit_info, state); + Inst::Store { + to: AMode::RegOffset(spilltmp_reg2(), 0), + op: StoreOP::Sb, + flags: MemFlags::new(), + src: zero_reg(), + } + .emit(sink, emit_info, state); + // reset step. + Inst::AluRRR { + alu_op: AluOPRRR::Sub, + rd: step, + rs1: step.to_reg(), + rs2: guard_size_tmp.to_reg(), + } + .emit(sink, emit_info, state); + Inst::gen_jump(loop_start).emit(sink, emit_info, state); + sink.bind_label(label_done, &mut state.ctrl_plane); + } + &Inst::VecAluRRRImm5 { + op, + vd, + vd_src, + imm, + vs2, + ref mask, + .. + } => { + debug_assert_eq!(vd.to_reg(), vd_src); + + sink.put4(encode_valu_rrr_imm(op, vd, imm, vs2, *mask)); + } + &Inst::VecAluRRRR { + op, + vd, + vd_src, + vs1, + vs2, + ref mask, + .. + } => { + debug_assert_eq!(vd.to_reg(), vd_src); + + sink.put4(encode_valu_rrrr(op, vd, vs2, vs1, *mask)); + } + &Inst::VecAluRRR { + op, + vd, + vs1, + vs2, + ref mask, + .. + } => { + sink.put4(encode_valu(op, vd, vs1, vs2, *mask)); + } + &Inst::VecAluRRImm5 { + op, + vd, + imm, + vs2, + ref mask, + .. 
+ } => { + sink.put4(encode_valu_rr_imm(op, vd, imm, vs2, *mask)); + } + &Inst::VecAluRR { + op, + vd, + vs, + ref mask, + .. + } => { + sink.put4(encode_valu_rr(op, vd, vs, *mask)); + } + &Inst::VecAluRImm5 { + op, + vd, + imm, + ref mask, + .. + } => { + sink.put4(encode_valu_r_imm(op, vd, imm, *mask)); + } + &Inst::VecSetState { rd, ref vstate } => { + sink.put4(encode_vcfg_imm( + 0x57, + rd.to_reg(), + vstate.avl.unwrap_static(), + &vstate.vtype, + )); + + // Update the current vector emit state. + state.vstate = EmitVState::Known(*vstate); + } + + &Inst::VecLoad { + eew, + to, + ref from, + ref mask, + flags, + .. + } => { + // Vector Loads don't support immediate offsets, so we need to load it into a register. + let addr = match from { + VecAMode::UnitStride { base } => { + let base_reg = base.get_base_register(); + let offset = base.get_offset_with_state(state); + + // Reg+0 Offset can be directly encoded + if let (Some(base_reg), 0) = (base_reg, offset) { + base_reg + } else { + // Otherwise load the address it into a reg and load from it. + let tmp = writable_spilltmp_reg(); + Inst::LoadAddr { + rd: tmp, + mem: *base, + } + .emit(sink, emit_info, state); + tmp.to_reg() + } + } + }; + + if let Some(trap_code) = flags.trap_code() { + // Register the offset at which the actual load instruction starts. + sink.add_trap(trap_code); + } + + sink.put4(encode_vmem_load( + 0x07, + to.to_reg(), + eew, + addr, + from.lumop(), + *mask, + from.mop(), + from.nf(), + )); + } + + &Inst::VecStore { + eew, + ref to, + from, + ref mask, + flags, + .. + } => { + // Vector Stores don't support immediate offsets, so we need to load it into a register. + let addr = match to { + VecAMode::UnitStride { base } => { + let base_reg = base.get_base_register(); + let offset = base.get_offset_with_state(state); + + // Reg+0 Offset can be directly encoded + if let (Some(base_reg), 0) = (base_reg, offset) { + base_reg + } else { + // Otherwise load the address it into a reg and load from it. + let tmp = writable_spilltmp_reg(); + Inst::LoadAddr { + rd: tmp, + mem: *base, + } + .emit(sink, emit_info, state); + tmp.to_reg() + } + } + }; + + if let Some(trap_code) = flags.trap_code() { + // Register the offset at which the actual load instruction starts. + sink.add_trap(trap_code); + } + + sink.put4(encode_vmem_store( + 0x27, + from, + eew, + addr, + to.sumop(), + *mask, + to.mop(), + to.nf(), + )); + } + }; + } +} + +fn emit_return_call_common_sequence( + sink: &mut MachBuffer, + emit_info: &EmitInfo, + state: &mut EmitState, + info: &ReturnCallInfo, +) { + // The return call sequence can potentially emit a lot of instructions (up to 634 bytes!) + // So lets emit an island here if we need it. + // + // It is difficult to calculate exactly how many instructions are going to be emitted, so + // we calculate it by emitting it into a disposable buffer, and then checking how many instructions + // were actually emitted. + let mut buffer = MachBuffer::new(); + let mut fake_emit_state = state.clone(); + + return_call_emit_impl(&mut buffer, emit_info, &mut fake_emit_state, info); + + // Finalize the buffer and get the number of bytes emitted. + let buffer = buffer.finish(&Default::default(), &mut Default::default()); + let length = buffer.data().len() as u32; + + // And now emit the island inline with this instruction. 
+ if sink.island_needed(length) { + let jump_around_label = sink.get_label(); + Inst::gen_jump(jump_around_label).emit(sink, emit_info, state); + sink.emit_island(length + 4, &mut state.ctrl_plane); + sink.bind_label(jump_around_label, &mut state.ctrl_plane); + } + + // Now that we're done, emit the *actual* return sequence. + return_call_emit_impl(sink, emit_info, state, info); +} + +/// This should not be called directly, Instead prefer to call [emit_return_call_common_sequence]. +fn return_call_emit_impl( + sink: &mut MachBuffer, + emit_info: &EmitInfo, + state: &mut EmitState, + info: &ReturnCallInfo, +) { + let sp_to_fp_offset = { + let frame_layout = state.frame_layout(); + i64::from( + frame_layout.clobber_size + + frame_layout.fixed_frame_storage_size + + frame_layout.outgoing_args_size, + ) + }; + + let mut clobber_offset = sp_to_fp_offset - 8; + for reg in state.frame_layout().clobbered_callee_saves.clone() { + let rreg = reg.to_reg(); + let ty = match rreg.class() { + RegClass::Int => I64, + RegClass::Float => F64, + RegClass::Vector => unimplemented!("Vector Clobber Restores"), + }; + + Inst::gen_load( + reg.map(Reg::from), + AMode::SPOffset(clobber_offset), + ty, + MemFlags::trusted(), + ) + .emit(sink, emit_info, state); + + clobber_offset -= 8 + } + + // Restore the link register and frame pointer + let setup_area_size = i64::from(state.frame_layout().setup_area_size); + if setup_area_size > 0 { + Inst::gen_load( + writable_link_reg(), + AMode::SPOffset(sp_to_fp_offset + 8), + I64, + MemFlags::trusted(), + ) + .emit(sink, emit_info, state); + + Inst::gen_load( + writable_fp_reg(), + AMode::SPOffset(sp_to_fp_offset), + I64, + MemFlags::trusted(), + ) + .emit(sink, emit_info, state); + } + + // If we over-allocated the incoming args area in the prologue, resize down to what the callee + // is expecting. 
+ let incoming_args_diff = + i64::from(state.frame_layout().tail_args_size - info.new_stack_arg_size); + + // Increment SP all at once + let sp_increment = sp_to_fp_offset + setup_area_size + incoming_args_diff; + if sp_increment > 0 { + for inst in Riscv64MachineDeps::gen_sp_reg_adjust(i32::try_from(sp_increment).unwrap()) { + inst.emit(sink, emit_info, state); + } + } +} diff --git a/hbcb/src/inst/emit_tests.rs b/hbcb/src/inst/emit_tests.rs new file mode 100644 index 0000000..668e170 --- /dev/null +++ b/hbcb/src/inst/emit_tests.rs @@ -0,0 +1,2277 @@ +#[allow(unused)] +use crate::ir::LibCall; +use crate::inst::*; +use crate::lower::isle::generated_code::FpuOPWidth; +use std::borrow::Cow; + +fn fa7() -> Reg { + f_reg(17) +} + +#[test] +fn test_riscv64_binemit() { + struct TestUnit { + inst: Inst, + assembly: &'static str, + code: TestEncoding, + } + + struct TestEncoding(Cow<'static, str>); + + impl From<&'static str> for TestEncoding { + fn from(value: &'static str) -> Self { + Self(value.into()) + } + } + + impl From for TestEncoding { + fn from(value: u32) -> Self { + let value = value.swap_bytes(); + let value = format!("{value:08X}"); + Self(value.into()) + } + } + + impl TestUnit { + fn new(inst: Inst, assembly: &'static str, code: impl Into) -> Self { + let code = code.into(); + Self { + inst, + assembly, + code, + } + } + } + + let mut insns = Vec::::with_capacity(500); + + insns.push(TestUnit::new(Inst::Ret {}, "ret", 0x00008067)); + + insns.push(TestUnit::new( + Inst::Mov { + rd: writable_fa0(), + rm: fa1(), + ty: F32, + }, + "fmv.s fa0,fa1", + 0x20b58553, + )); + + insns.push(TestUnit::new( + Inst::Mov { + rd: writable_fa0(), + rm: fa1(), + ty: F64, + }, + "fmv.d fa0,fa1", + 0x22b58553, + )); + + insns.push(TestUnit::new( + Inst::AluRRImm12 { + alu_op: AluOPRRI::Brev8, + rd: writable_a1(), + rs: a0(), + imm12: Imm12::ZERO, + }, + "brev8 a1,a0", + 0x68755593, + )); + insns.push(TestUnit::new( + Inst::AluRRImm12 { + alu_op: AluOPRRI::Rev8, + rd: writable_a1(), + rs: a0(), + imm12: Imm12::ZERO, + }, + "rev8 a1,a0", + 0x6b855593, + )); + + // + insns.push(TestUnit::new( + Inst::AluRRImm12 { + alu_op: AluOPRRI::Bclri, + rd: writable_a1(), + rs: a0(), + imm12: Imm12::from_i16(5), + }, + "bclri a1,a0,5", + 0x48551593, + )); + insns.push(TestUnit::new( + Inst::AluRRImm12 { + alu_op: AluOPRRI::Bexti, + rd: writable_a1(), + rs: a0(), + imm12: Imm12::from_i16(5), + }, + "bexti a1,a0,5", + 0x48555593, + )); + + insns.push(TestUnit::new( + Inst::AluRRImm12 { + alu_op: AluOPRRI::Binvi, + rd: writable_a1(), + rs: a0(), + imm12: Imm12::from_i16(5), + }, + "binvi a1,a0,5", + 0x68551593, + )); + + insns.push(TestUnit::new( + Inst::AluRRImm12 { + alu_op: AluOPRRI::Bseti, + rd: writable_a1(), + rs: a0(), + imm12: Imm12::from_i16(5), + }, + "bseti a1,a0,5", + 0x28551593, + )); + + insns.push(TestUnit::new( + Inst::AluRRImm12 { + alu_op: AluOPRRI::Rori, + rd: writable_a1(), + rs: a0(), + imm12: Imm12::from_i16(5), + }, + "rori a1,a0,5", + 0x60555593, + )); + insns.push(TestUnit::new( + Inst::AluRRImm12 { + alu_op: AluOPRRI::Roriw, + rd: writable_a1(), + rs: a0(), + imm12: Imm12::from_i16(5), + }, + "roriw a1,a0,5", + 0x6055559b, + )); + + insns.push(TestUnit::new( + Inst::AluRRImm12 { + alu_op: AluOPRRI::SlliUw, + rd: writable_a1(), + rs: a0(), + imm12: Imm12::from_i16(5), + }, + "slli.uw a1,a0,5", + 0x855159b, + )); + + insns.push(TestUnit::new( + Inst::AluRRImm12 { + alu_op: AluOPRRI::Clz, + rd: writable_a1(), + rs: a0(), + imm12: Imm12::ZERO, + }, + "clz a1,a0", + 0x60051593, + )); + + 
insns.push(TestUnit::new( + Inst::AluRRImm12 { + alu_op: AluOPRRI::Clzw, + rd: writable_a1(), + rs: a0(), + imm12: Imm12::ZERO, + }, + "clzw a1,a0", + 0x6005159b, + )); + + insns.push(TestUnit::new( + Inst::AluRRImm12 { + alu_op: AluOPRRI::Cpop, + rd: writable_a1(), + rs: a0(), + imm12: Imm12::ZERO, + }, + "cpop a1,a0", + 0x60251593, + )); + + insns.push(TestUnit::new( + Inst::AluRRImm12 { + alu_op: AluOPRRI::Cpopw, + rd: writable_a1(), + rs: a0(), + imm12: Imm12::ZERO, + }, + "cpopw a1,a0", + 0x6025159b, + )); + + insns.push(TestUnit::new( + Inst::AluRRImm12 { + alu_op: AluOPRRI::Ctz, + rd: writable_a1(), + rs: a0(), + imm12: Imm12::ZERO, + }, + "ctz a1,a0", + 0x60151593, + )); + + insns.push(TestUnit::new( + Inst::AluRRImm12 { + alu_op: AluOPRRI::Ctzw, + rd: writable_a1(), + rs: a0(), + imm12: Imm12::ZERO, + }, + "ctzw a1,a0", + 0x6015159b, + )); + + insns.push(TestUnit::new( + Inst::AluRRImm12 { + alu_op: AluOPRRI::Sextb, + rd: writable_a1(), + rs: a0(), + imm12: Imm12::ZERO, + }, + "sext.b a1,a0", + 0x60451593, + )); + insns.push(TestUnit::new( + Inst::AluRRImm12 { + alu_op: AluOPRRI::Sexth, + rd: writable_a1(), + rs: a0(), + imm12: Imm12::ZERO, + }, + "sext.h a1,a0", + 0x60551593, + )); + insns.push(TestUnit::new( + Inst::AluRRImm12 { + alu_op: AluOPRRI::Zexth, + rd: writable_a1(), + rs: a0(), + imm12: Imm12::ZERO, + }, + "zext.h a1,a0", + 0x80545bb, + )); + insns.push(TestUnit::new( + Inst::AluRRImm12 { + alu_op: AluOPRRI::Orcb, + rd: writable_a1(), + rs: a0(), + imm12: Imm12::ZERO, + }, + "orc.b a1,a0", + 0x28755593, + )); + + insns.push(TestUnit::new( + Inst::AluRRR { + alu_op: AluOPRRR::Adduw, + rd: writable_a1(), + rs1: a0(), + rs2: zero_reg(), + }, + "zext.w a1,a0", + 0x80505bb, + )); + + insns.push(TestUnit::new( + Inst::AluRRR { + alu_op: AluOPRRR::Adduw, + rd: writable_a1(), + rs1: a0(), + rs2: a1(), + }, + "add.uw a1,a0,a1", + 0x08b505bb, + )); + + insns.push(TestUnit::new( + Inst::AluRRR { + alu_op: AluOPRRR::Andn, + rd: writable_a1(), + rs1: a0(), + rs2: zero_reg(), + }, + "andn a1,a0,zero", + 0x400575b3, + )); + insns.push(TestUnit::new( + Inst::AluRRR { + alu_op: AluOPRRR::Bclr, + rd: writable_a1(), + rs1: a0(), + rs2: zero_reg(), + }, + "bclr a1,a0,zero", + 0x480515b3, + )); + + insns.push(TestUnit::new( + Inst::AluRRR { + alu_op: AluOPRRR::Bext, + rd: writable_a1(), + rs1: a0(), + rs2: zero_reg(), + }, + "bext a1,a0,zero", + 0x480555b3, + )); + + insns.push(TestUnit::new( + Inst::AluRRR { + alu_op: AluOPRRR::Binv, + rd: writable_a1(), + rs1: a0(), + rs2: zero_reg(), + }, + "binv a1,a0,zero", + 0x680515b3, + )); + insns.push(TestUnit::new( + Inst::AluRRR { + alu_op: AluOPRRR::Bset, + rd: writable_a1(), + rs1: a0(), + rs2: zero_reg(), + }, + "bset a1,a0,zero", + 0x280515b3, + )); + + insns.push(TestUnit::new( + Inst::AluRRR { + alu_op: AluOPRRR::Clmul, + rd: writable_a1(), + rs1: a0(), + rs2: zero_reg(), + }, + "clmul a1,a0,zero", + 0xa0515b3, + )); + + insns.push(TestUnit::new( + Inst::AluRRR { + alu_op: AluOPRRR::Clmulh, + rd: writable_a1(), + rs1: a0(), + rs2: zero_reg(), + }, + "clmulh a1,a0,zero", + 0xa0535b3, + )); + + insns.push(TestUnit::new( + Inst::AluRRR { + alu_op: AluOPRRR::Clmulr, + rd: writable_a1(), + rs1: a0(), + rs2: zero_reg(), + }, + "clmulr a1,a0,zero", + 0xa0525b3, + )); + + insns.push(TestUnit::new( + Inst::AluRRR { + alu_op: AluOPRRR::Max, + rd: writable_a1(), + rs1: a0(), + rs2: zero_reg(), + }, + "max a1,a0,zero", + 0xa0565b3, + )); + + insns.push(TestUnit::new( + Inst::AluRRR { + alu_op: AluOPRRR::Maxu, + rd: writable_a1(), + rs1: a0(), + rs2: 
zero_reg(), + }, + "maxu a1,a0,zero", + 0xa0575b3, + )); + + insns.push(TestUnit::new( + Inst::AluRRR { + alu_op: AluOPRRR::Min, + rd: writable_a1(), + rs1: a0(), + rs2: zero_reg(), + }, + "min a1,a0,zero", + 0xa0545b3, + )); + + insns.push(TestUnit::new( + Inst::AluRRR { + alu_op: AluOPRRR::Minu, + rd: writable_a1(), + rs1: a0(), + rs2: zero_reg(), + }, + "minu a1,a0,zero", + 0xa0555b3, + )); + + insns.push(TestUnit::new( + Inst::AluRRR { + alu_op: AluOPRRR::Orn, + rd: writable_a1(), + rs1: a0(), + rs2: zero_reg(), + }, + "orn a1,a0,zero", + 0x400565b3, + )); + + insns.push(TestUnit::new( + Inst::AluRRR { + alu_op: AluOPRRR::Rol, + rd: writable_a1(), + rs1: a0(), + rs2: zero_reg(), + }, + "rol a1,a0,zero", + 0x600515b3, + )); + + insns.push(TestUnit::new( + Inst::AluRRR { + alu_op: AluOPRRR::Rolw, + rd: writable_a1(), + rs1: a0(), + rs2: zero_reg(), + }, + "rolw a1,a0,zero", + 0x600515bb, + )); + insns.push(TestUnit::new( + Inst::AluRRR { + alu_op: AluOPRRR::Ror, + rd: writable_a1(), + rs1: a0(), + rs2: zero_reg(), + }, + "ror a1,a0,zero", + 0x600555b3, + )); + insns.push(TestUnit::new( + Inst::AluRRR { + alu_op: AluOPRRR::Rorw, + rd: writable_a1(), + rs1: a0(), + rs2: zero_reg(), + }, + "rorw a1,a0,zero", + 0x600555bb, + )); + insns.push(TestUnit::new( + Inst::AluRRR { + alu_op: AluOPRRR::Sh1add, + rd: writable_a1(), + rs1: a0(), + rs2: zero_reg(), + }, + "sh1add a1,a0,zero", + 0x200525b3, + )); + + insns.push(TestUnit::new( + Inst::AluRRR { + alu_op: AluOPRRR::Sh1adduw, + rd: writable_a1(), + rs1: a0(), + rs2: zero_reg(), + }, + "sh1add.uw a1,a0,zero", + 0x200525bb, + )); + insns.push(TestUnit::new( + Inst::AluRRR { + alu_op: AluOPRRR::Sh2add, + rd: writable_a1(), + rs1: a0(), + rs2: zero_reg(), + }, + "sh2add a1,a0,zero", + 0x200545b3, + )); + insns.push(TestUnit::new( + Inst::AluRRR { + alu_op: AluOPRRR::Sh2adduw, + rd: writable_a1(), + rs1: a0(), + rs2: zero_reg(), + }, + "sh2add.uw a1,a0,zero", + 0x200545bb, + )); + insns.push(TestUnit::new( + Inst::AluRRR { + alu_op: AluOPRRR::Sh3add, + rd: writable_a1(), + rs1: a0(), + rs2: zero_reg(), + }, + "sh3add a1,a0,zero", + 0x200565b3, + )); + insns.push(TestUnit::new( + Inst::AluRRR { + alu_op: AluOPRRR::Sh3adduw, + rd: writable_a1(), + rs1: a0(), + rs2: zero_reg(), + }, + "sh3add.uw a1,a0,zero", + 0x200565bb, + )); + insns.push(TestUnit::new( + Inst::AluRRR { + alu_op: AluOPRRR::Xnor, + rd: writable_a1(), + rs1: a0(), + rs2: zero_reg(), + }, + "xnor a1,a0,zero", + 0x400545b3, + )); + + // Zbkb + insns.push(TestUnit::new( + Inst::AluRRR { + alu_op: AluOPRRR::Pack, + rd: writable_a1(), + rs1: a0(), + rs2: zero_reg(), + }, + "pack a1,a0,zero", + 0x080545b3, + )); + insns.push(TestUnit::new( + Inst::AluRRR { + alu_op: AluOPRRR::Packw, + rd: writable_a1(), + rs1: a0(), + rs2: zero_reg(), + }, + "packw a1,a0,zero", + 0x080545bb, + )); + insns.push(TestUnit::new( + Inst::AluRRR { + alu_op: AluOPRRR::Packh, + rd: writable_a1(), + rs1: a0(), + rs2: zero_reg(), + }, + "packh a1,a0,zero", + 0x080575b3, + )); + + // + insns.push(TestUnit::new( + Inst::AluRRR { + alu_op: AluOPRRR::Add, + rd: writable_fp_reg(), + rs1: fp_reg(), + rs2: zero_reg(), + }, + "add fp,fp,zero", + 0x40433, + )); + insns.push(TestUnit::new( + Inst::AluRRImm12 { + alu_op: AluOPRRI::Addi, + rd: writable_fp_reg(), + rs: stack_reg(), + imm12: Imm12::maybe_from_u64(100).unwrap(), + }, + "addi fp,sp,100", + 0x6410413, + )); + insns.push(TestUnit::new( + Inst::Lui { + rd: writable_zero_reg(), + imm: Imm20::from_i32(120), + }, + "lui zero,120", + 0x78037, + )); + 
insns.push(TestUnit::new( + Inst::Auipc { + rd: writable_zero_reg(), + imm: Imm20::from_i32(120), + }, + "auipc zero,120", + 0x78017, + )); + + insns.push(TestUnit::new( + Inst::Jalr { + rd: writable_a0(), + base: a0(), + offset: Imm12::from_i16(100), + }, + "jalr a0,100(a0)", + 0x6450567, + )); + + insns.push(TestUnit::new( + Inst::Load { + rd: writable_a0(), + op: LoadOP::Lb, + flags: MemFlags::new(), + from: AMode::RegOffset(a1(), 100), + }, + "lb a0,100(a1)", + 0x6458503, + )); + insns.push(TestUnit::new( + Inst::Load { + rd: writable_a0(), + op: LoadOP::Lh, + flags: MemFlags::new(), + from: AMode::RegOffset(a1(), 100), + }, + "lh a0,100(a1)", + 0x6459503, + )); + + insns.push(TestUnit::new( + Inst::Load { + rd: writable_a0(), + op: LoadOP::Lw, + flags: MemFlags::new(), + from: AMode::RegOffset(a1(), 100), + }, + "lw a0,100(a1)", + 0x645a503, + )); + + insns.push(TestUnit::new( + Inst::Load { + rd: writable_a0(), + op: LoadOP::Ld, + flags: MemFlags::new(), + from: AMode::RegOffset(a1(), 100), + }, + "ld a0,100(a1)", + 0x645b503, + )); + insns.push(TestUnit::new( + Inst::Load { + rd: Writable::from_reg(fa0()), + op: LoadOP::Flw, + flags: MemFlags::new(), + from: AMode::RegOffset(a1(), 100), + }, + "flw fa0,100(a1)", + 0x645a507, + )); + + insns.push(TestUnit::new( + Inst::Load { + rd: Writable::from_reg(fa0()), + op: LoadOP::Fld, + flags: MemFlags::new(), + from: AMode::RegOffset(a1(), 100), + }, + "fld fa0,100(a1)", + 0x645b507, + )); + insns.push(TestUnit::new( + Inst::Store { + to: AMode::SPOffset(100), + op: StoreOP::Sb, + flags: MemFlags::new(), + src: a0(), + }, + "sb a0,100(sp)", + 0x6a10223, + )); + insns.push(TestUnit::new( + Inst::Store { + to: AMode::SPOffset(100), + op: StoreOP::Sh, + flags: MemFlags::new(), + src: a0(), + }, + "sh a0,100(sp)", + 0x6a11223, + )); + insns.push(TestUnit::new( + Inst::Store { + to: AMode::SPOffset(100), + op: StoreOP::Sw, + flags: MemFlags::new(), + src: a0(), + }, + "sw a0,100(sp)", + 0x6a12223, + )); + insns.push(TestUnit::new( + Inst::Store { + to: AMode::SPOffset(100), + op: StoreOP::Sd, + flags: MemFlags::new(), + src: a0(), + }, + "sd a0,100(sp)", + 0x6a13223, + )); + insns.push(TestUnit::new( + Inst::Store { + to: AMode::SPOffset(100), + op: StoreOP::Fsw, + flags: MemFlags::new(), + src: fa0(), + }, + "fsw fa0,100(sp)", + 0x6a12227, + )); + insns.push(TestUnit::new( + Inst::Store { + to: AMode::SPOffset(100), + op: StoreOP::Fsd, + flags: MemFlags::new(), + src: fa0(), + }, + "fsd fa0,100(sp)", + 0x6a13227, + )); + insns.push(TestUnit::new( + Inst::AluRRImm12 { + alu_op: AluOPRRI::Addi, + rd: writable_a0(), + rs: a0(), + imm12: Imm12::from_i16(100), + }, + "addi a0,a0,100", + 0x6450513, + )); + insns.push(TestUnit::new( + Inst::AluRRImm12 { + alu_op: AluOPRRI::Slti, + rd: writable_a0(), + rs: a0(), + imm12: Imm12::from_i16(100), + }, + "slti a0,a0,100", + 0x6452513, + )); + insns.push(TestUnit::new( + Inst::AluRRImm12 { + alu_op: AluOPRRI::SltiU, + rd: writable_a0(), + rs: a0(), + imm12: Imm12::from_i16(100), + }, + "sltiu a0,a0,100", + 0x6453513, + )); + insns.push(TestUnit::new( + Inst::AluRRImm12 { + alu_op: AluOPRRI::Xori, + rd: writable_a0(), + rs: a0(), + imm12: Imm12::from_i16(100), + }, + "xori a0,a0,100", + 0x6454513, + )); + insns.push(TestUnit::new( + Inst::AluRRImm12 { + alu_op: AluOPRRI::Andi, + rd: writable_a0(), + rs: a0(), + imm12: Imm12::from_i16(100), + }, + "andi a0,a0,100", + 0x6457513, + )); + insns.push(TestUnit::new( + Inst::AluRRImm12 { + alu_op: AluOPRRI::Slli, + rd: writable_a0(), + rs: a0(), + imm12: 
Imm12::from_i16(5), + }, + "slli a0,a0,5", + 0x551513, + )); + insns.push(TestUnit::new( + Inst::AluRRImm12 { + alu_op: AluOPRRI::Srli, + rd: writable_a0(), + rs: a0(), + imm12: Imm12::from_i16(5), + }, + "srli a0,a0,5", + 0x555513, + )); + insns.push(TestUnit::new( + Inst::AluRRImm12 { + alu_op: AluOPRRI::Srai, + rd: writable_a0(), + rs: a0(), + imm12: Imm12::from_i16(5), + }, + "srai a0,a0,5", + 0x40555513, + )); + insns.push(TestUnit::new( + Inst::AluRRImm12 { + alu_op: AluOPRRI::Addiw, + rd: writable_a0(), + rs: a0(), + imm12: Imm12::from_i16(120), + }, + "addiw a0,a0,120", + 0x785051b, + )); + insns.push(TestUnit::new( + Inst::AluRRImm12 { + alu_op: AluOPRRI::Slliw, + rd: writable_a0(), + rs: a0(), + imm12: Imm12::from_i16(5), + }, + "slliw a0,a0,5", + 0x55151b, + )); + insns.push(TestUnit::new( + Inst::AluRRImm12 { + alu_op: AluOPRRI::SrliW, + rd: writable_a0(), + rs: a0(), + imm12: Imm12::from_i16(5), + }, + "srliw a0,a0,5", + 0x55551b, + )); + insns.push(TestUnit::new( + Inst::AluRRImm12 { + alu_op: AluOPRRI::Sraiw, + rd: writable_a0(), + rs: a0(), + imm12: Imm12::from_i16(5), + }, + "sraiw a0,a0,5", + 0x4055551b, + )); + + insns.push(TestUnit::new( + Inst::AluRRImm12 { + alu_op: AluOPRRI::Sraiw, + rd: writable_a0(), + rs: a0(), + imm12: Imm12::from_i16(5), + }, + "sraiw a0,a0,5", + 0x4055551b, + )); + insns.push(TestUnit::new( + Inst::AluRRR { + alu_op: AluOPRRR::Add, + rd: writable_a0(), + rs1: a0(), + rs2: a1(), + }, + "add a0,a0,a1", + 0xb50533, + )); + insns.push(TestUnit::new( + Inst::AluRRR { + alu_op: AluOPRRR::Sub, + rd: writable_a0(), + rs1: a0(), + rs2: a1(), + }, + "sub a0,a0,a1", + 0x40b50533, + )); + insns.push(TestUnit::new( + Inst::AluRRR { + alu_op: AluOPRRR::Sll, + rd: writable_a0(), + rs1: a0(), + rs2: a1(), + }, + "sll a0,a0,a1", + 0xb51533, + )); + + insns.push(TestUnit::new( + Inst::AluRRR { + alu_op: AluOPRRR::Slt, + rd: writable_a0(), + rs1: a0(), + rs2: a1(), + }, + "slt a0,a0,a1", + 0xb52533, + )); + insns.push(TestUnit::new( + Inst::AluRRR { + alu_op: AluOPRRR::SltU, + rd: writable_a0(), + rs1: a0(), + rs2: a1(), + }, + "sltu a0,a0,a1", + 0xb53533, + )); + insns.push(TestUnit::new( + Inst::AluRRR { + alu_op: AluOPRRR::Xor, + rd: writable_a0(), + rs1: a0(), + rs2: a1(), + }, + "xor a0,a0,a1", + 0xb54533, + )); + insns.push(TestUnit::new( + Inst::AluRRR { + alu_op: AluOPRRR::Srl, + rd: writable_a0(), + rs1: a0(), + rs2: a1(), + }, + "srl a0,a0,a1", + 0xb55533, + )); + insns.push(TestUnit::new( + Inst::AluRRR { + alu_op: AluOPRRR::Sra, + rd: writable_a0(), + rs1: a0(), + rs2: a1(), + }, + "sra a0,a0,a1", + 0x40b55533, + )); + insns.push(TestUnit::new( + Inst::AluRRR { + alu_op: AluOPRRR::Or, + rd: writable_a0(), + rs1: a0(), + rs2: a1(), + }, + "or a0,a0,a1", + 0xb56533, + )); + insns.push(TestUnit::new( + Inst::AluRRR { + alu_op: AluOPRRR::And, + rd: writable_a0(), + rs1: a0(), + rs2: a1(), + }, + "and a0,a0,a1", + 0xb57533, + )); + insns.push(TestUnit::new( + Inst::AluRRR { + alu_op: AluOPRRR::Addw, + rd: writable_a0(), + rs1: a0(), + rs2: a1(), + }, + "addw a0,a0,a1", + 0xb5053b, + )); + insns.push(TestUnit::new( + Inst::AluRRR { + alu_op: AluOPRRR::Subw, + rd: writable_a0(), + rs1: a0(), + rs2: a1(), + }, + "subw a0,a0,a1", + 0x40b5053b, + )); + insns.push(TestUnit::new( + Inst::AluRRR { + alu_op: AluOPRRR::Sllw, + rd: writable_a0(), + rs1: a0(), + rs2: a1(), + }, + "sllw a0,a0,a1", + 0xb5153b, + )); + insns.push(TestUnit::new( + Inst::AluRRR { + alu_op: AluOPRRR::Srlw, + rd: writable_a0(), + rs1: a0(), + rs2: a1(), + }, + "srlw a0,a0,a1", + 0xb5553b, + 
)); + insns.push(TestUnit::new( + Inst::AluRRR { + alu_op: AluOPRRR::Sraw, + rd: writable_a0(), + rs1: a0(), + rs2: a1(), + }, + "sraw a0,a0,a1", + 0x40b5553b, + )); + + insns.push(TestUnit::new( + Inst::AluRRR { + alu_op: AluOPRRR::Mul, + rd: writable_a0(), + rs1: a0(), + rs2: a1(), + }, + "mul a0,a0,a1", + 0x2b50533, + )); + + insns.push(TestUnit::new( + Inst::AluRRR { + alu_op: AluOPRRR::Mulh, + rd: writable_a0(), + rs1: a0(), + rs2: a1(), + }, + "mulh a0,a0,a1", + 0x2b51533, + )); + insns.push(TestUnit::new( + Inst::AluRRR { + alu_op: AluOPRRR::Mulhsu, + rd: writable_a0(), + rs1: a0(), + rs2: a1(), + }, + "mulhsu a0,a0,a1", + 0x2b52533, + )); + insns.push(TestUnit::new( + Inst::AluRRR { + alu_op: AluOPRRR::Mulhu, + rd: writable_a0(), + rs1: a0(), + rs2: a1(), + }, + "mulhu a0,a0,a1", + 0x2b53533, + )); + insns.push(TestUnit::new( + Inst::AluRRR { + alu_op: AluOPRRR::Div, + rd: writable_a0(), + rs1: a0(), + rs2: a1(), + }, + "div a0,a0,a1", + 0x2b54533, + )); + insns.push(TestUnit::new( + Inst::AluRRR { + alu_op: AluOPRRR::DivU, + rd: writable_a0(), + rs1: a0(), + rs2: a1(), + }, + "divu a0,a0,a1", + 0x2b55533, + )); + insns.push(TestUnit::new( + Inst::AluRRR { + alu_op: AluOPRRR::Rem, + rd: writable_a0(), + rs1: a0(), + rs2: a1(), + }, + "rem a0,a0,a1", + 0x2b56533, + )); + insns.push(TestUnit::new( + Inst::AluRRR { + alu_op: AluOPRRR::RemU, + rd: writable_a0(), + rs1: a0(), + rs2: a1(), + }, + "remu a0,a0,a1", + 0x2b57533, + )); + + insns.push(TestUnit::new( + Inst::AluRRR { + alu_op: AluOPRRR::Mulw, + rd: writable_a0(), + rs1: a0(), + rs2: a1(), + }, + "mulw a0,a0,a1", + 0x2b5053b, + )); + insns.push(TestUnit::new( + Inst::AluRRR { + alu_op: AluOPRRR::Divw, + rd: writable_a0(), + rs1: a0(), + rs2: a1(), + }, + "divw a0,a0,a1", + 0x2b5453b, + )); + insns.push(TestUnit::new( + Inst::AluRRR { + alu_op: AluOPRRR::Remw, + rd: writable_a0(), + rs1: a0(), + rs2: a1(), + }, + "remw a0,a0,a1", + 0x2b5653b, + )); + insns.push(TestUnit::new( + Inst::AluRRR { + alu_op: AluOPRRR::Remuw, + rd: writable_a0(), + rs1: a0(), + rs2: a1(), + }, + "remuw a0,a0,a1", + 0x2b5753b, + )); + + // + insns.push(TestUnit::new( + Inst::FpuRRR { + frm: FRM::RNE, + width: FpuOPWidth::S, + alu_op: FpuOPRRR::Fadd, + rd: writable_fa0(), + rs1: fa0(), + rs2: fa1(), + }, + "fadd.s fa0,fa0,fa1,rne", + 0xb50553, + )); + insns.push(TestUnit::new( + Inst::FpuRRR { + frm: FRM::RTZ, + width: FpuOPWidth::S, + alu_op: FpuOPRRR::Fsub, + rd: writable_fa0(), + rs1: fa0(), + rs2: fa1(), + }, + "fsub.s fa0,fa0,fa1,rtz", + 0x8b51553, + )); + insns.push(TestUnit::new( + Inst::FpuRRR { + frm: FRM::RUP, + width: FpuOPWidth::S, + alu_op: FpuOPRRR::Fmul, + rd: writable_fa0(), + rs1: fa0(), + rs2: fa1(), + }, + "fmul.s fa0,fa0,fa1,rup", + 0x10b53553, + )); + insns.push(TestUnit::new( + Inst::FpuRRR { + frm: FRM::Fcsr, + width: FpuOPWidth::S, + alu_op: FpuOPRRR::Fdiv, + rd: writable_fa0(), + rs1: fa0(), + rs2: fa1(), + }, + "fdiv.s fa0,fa0,fa1,fcsr", + 0x18b57553, + )); + insns.push(TestUnit::new( + Inst::FpuRRR { + frm: FRM::RNE, + width: FpuOPWidth::S, + alu_op: FpuOPRRR::Fsgnj, + rd: writable_fa0(), + rs1: fa0(), + rs2: fa1(), + }, + "fsgnj.s fa0,fa0,fa1", + 0x20b50553, + )); + insns.push(TestUnit::new( + Inst::FpuRRR { + frm: FRM::RTZ, + width: FpuOPWidth::S, + alu_op: FpuOPRRR::Fsgnjn, + rd: writable_fa0(), + rs1: fa0(), + rs2: fa1(), + }, + "fsgnjn.s fa0,fa0,fa1", + 0x20b51553, + )); + + insns.push(TestUnit::new( + Inst::FpuRRR { + frm: FRM::RDN, + width: FpuOPWidth::S, + alu_op: FpuOPRRR::Fsgnjx, + rd: writable_fa0(), + rs1: fa0(), + 
rs2: fa1(), + }, + "fsgnjx.s fa0,fa0,fa1", + 0x20b52553, + )); + insns.push(TestUnit::new( + Inst::FpuRRR { + frm: FRM::RNE, + width: FpuOPWidth::S, + alu_op: FpuOPRRR::Fmin, + rd: writable_fa0(), + rs1: fa0(), + rs2: fa1(), + }, + "fmin.s fa0,fa0,fa1", + 0x28b50553, + )); + + insns.push(TestUnit::new( + Inst::FpuRRR { + frm: FRM::RTZ, + width: FpuOPWidth::S, + alu_op: FpuOPRRR::Fmax, + rd: writable_fa0(), + rs1: fa0(), + rs2: fa1(), + }, + "fmax.s fa0,fa0,fa1", + 0x28b51553, + )); + insns.push(TestUnit::new( + Inst::FpuRRR { + frm: FRM::RDN, + width: FpuOPWidth::S, + alu_op: FpuOPRRR::Feq, + rd: writable_a0(), + rs1: fa0(), + rs2: fa1(), + }, + "feq.s a0,fa0,fa1", + 0xa0b52553, + )); + insns.push(TestUnit::new( + Inst::FpuRRR { + frm: FRM::RTZ, + width: FpuOPWidth::S, + alu_op: FpuOPRRR::Flt, + rd: writable_a0(), + rs1: fa0(), + rs2: fa1(), + }, + "flt.s a0,fa0,fa1", + 0xa0b51553, + )); + insns.push(TestUnit::new( + Inst::FpuRRR { + frm: FRM::RNE, + width: FpuOPWidth::S, + alu_op: FpuOPRRR::Fle, + rd: writable_a0(), + rs1: fa0(), + rs2: fa1(), + }, + "fle.s a0,fa0,fa1", + 0xa0b50553, + )); + + // + insns.push(TestUnit::new( + Inst::FpuRRR { + frm: FRM::Fcsr, + width: FpuOPWidth::D, + alu_op: FpuOPRRR::Fadd, + rd: writable_fa0(), + rs1: fa0(), + rs2: fa1(), + }, + "fadd.d fa0,fa0,fa1,fcsr", + 0x2b57553, + )); + insns.push(TestUnit::new( + Inst::FpuRRR { + frm: FRM::Fcsr, + width: FpuOPWidth::D, + alu_op: FpuOPRRR::Fsub, + rd: writable_fa0(), + rs1: fa0(), + rs2: fa1(), + }, + "fsub.d fa0,fa0,fa1,fcsr", + 0xab57553, + )); + insns.push(TestUnit::new( + Inst::FpuRRR { + frm: FRM::Fcsr, + width: FpuOPWidth::D, + alu_op: FpuOPRRR::Fmul, + rd: writable_fa0(), + rs1: fa0(), + rs2: fa1(), + }, + "fmul.d fa0,fa0,fa1,fcsr", + 0x12b57553, + )); + insns.push(TestUnit::new( + Inst::FpuRRR { + frm: FRM::Fcsr, + width: FpuOPWidth::D, + alu_op: FpuOPRRR::Fdiv, + rd: writable_fa0(), + rs1: fa0(), + rs2: fa1(), + }, + "fdiv.d fa0,fa0,fa1,fcsr", + 0x1ab57553, + )); + insns.push(TestUnit::new( + Inst::FpuRRR { + frm: FRM::RNE, + width: FpuOPWidth::D, + alu_op: FpuOPRRR::Fsgnj, + rd: writable_fa0(), + rs1: fa0(), + rs2: fa1(), + }, + "fsgnj.d fa0,fa0,fa1", + 0x22b50553, + )); + insns.push(TestUnit::new( + Inst::FpuRRR { + frm: FRM::RTZ, + width: FpuOPWidth::D, + alu_op: FpuOPRRR::Fsgnjn, + rd: writable_fa0(), + rs1: fa0(), + rs2: fa1(), + }, + "fsgnjn.d fa0,fa0,fa1", + 0x22b51553, + )); + + insns.push(TestUnit::new( + Inst::FpuRRR { + frm: FRM::RDN, + width: FpuOPWidth::D, + alu_op: FpuOPRRR::Fsgnjx, + rd: writable_fa0(), + rs1: fa0(), + rs2: fa1(), + }, + "fsgnjx.d fa0,fa0,fa1", + 0x22b52553, + )); + insns.push(TestUnit::new( + Inst::FpuRRR { + frm: FRM::RNE, + width: FpuOPWidth::D, + alu_op: FpuOPRRR::Fmin, + rd: writable_fa0(), + rs1: fa0(), + rs2: fa1(), + }, + "fmin.d fa0,fa0,fa1", + 0x2ab50553, + )); + + insns.push(TestUnit::new( + Inst::FpuRRR { + frm: FRM::RTZ, + width: FpuOPWidth::D, + alu_op: FpuOPRRR::Fmax, + rd: writable_fa0(), + rs1: fa0(), + rs2: fa1(), + }, + "fmax.d fa0,fa0,fa1", + 0x2ab51553, + )); + insns.push(TestUnit::new( + Inst::FpuRRR { + frm: FRM::RDN, + width: FpuOPWidth::D, + alu_op: FpuOPRRR::Feq, + rd: writable_a0(), + rs1: fa0(), + rs2: fa1(), + }, + "feq.d a0,fa0,fa1", + 0xa2b52553, + )); + insns.push(TestUnit::new( + Inst::FpuRRR { + frm: FRM::RTZ, + width: FpuOPWidth::D, + alu_op: FpuOPRRR::Flt, + rd: writable_a0(), + rs1: fa0(), + rs2: fa1(), + }, + "flt.d a0,fa0,fa1", + 0xa2b51553, + )); + insns.push(TestUnit::new( + Inst::FpuRRR { + frm: FRM::RNE, + width: FpuOPWidth::D, + 
alu_op: FpuOPRRR::Fle, + rd: writable_a0(), + rs1: fa0(), + rs2: fa1(), + }, + "fle.d a0,fa0,fa1", + 0xa2b50553, + )); + + // + insns.push(TestUnit::new( + Inst::FpuRR { + frm: FRM::RNE, + width: FpuOPWidth::S, + alu_op: FpuOPRR::Fsqrt, + rd: writable_fa0(), + rs: fa1(), + }, + "fsqrt.s fa0,fa1,rne", + 0x58058553, + )); + insns.push(TestUnit::new( + Inst::FpuRR { + frm: FRM::Fcsr, + width: FpuOPWidth::S, + alu_op: FpuOPRR::FcvtWFmt, + rd: writable_a0(), + rs: fa1(), + }, + "fcvt.w.s a0,fa1,fcsr", + 0xc005f553, + )); + + insns.push(TestUnit::new( + Inst::FpuRR { + frm: FRM::Fcsr, + width: FpuOPWidth::S, + alu_op: FpuOPRR::FcvtWuFmt, + rd: writable_a0(), + rs: fa1(), + }, + "fcvt.wu.s a0,fa1,fcsr", + 0xc015f553, + )); + insns.push(TestUnit::new( + Inst::FpuRR { + frm: FRM::RNE, + width: FpuOPWidth::S, + alu_op: FpuOPRR::FmvXFmt, + rd: writable_a0(), + rs: fa1(), + }, + "fmv.x.w a0,fa1", + 0xe0058553, + )); + insns.push(TestUnit::new( + Inst::FpuRR { + frm: FRM::RTZ, + width: FpuOPWidth::S, + alu_op: FpuOPRR::Fclass, + rd: writable_a0(), + rs: fa1(), + }, + "fclass.s a0,fa1", + 0xe0059553, + )); + + insns.push(TestUnit::new( + Inst::FpuRR { + frm: FRM::Fcsr, + width: FpuOPWidth::S, + alu_op: FpuOPRR::FcvtFmtW, + rd: writable_fa0(), + rs: a0(), + }, + "fcvt.s.w fa0,a0,fcsr", + 0xd0057553, + )); + insns.push(TestUnit::new( + Inst::FpuRR { + frm: FRM::Fcsr, + width: FpuOPWidth::S, + alu_op: FpuOPRR::FcvtFmtWu, + rd: writable_fa0(), + rs: a0(), + }, + "fcvt.s.wu fa0,a0,fcsr", + 0xd0157553, + )); + + insns.push(TestUnit::new( + Inst::FpuRR { + frm: FRM::RNE, + width: FpuOPWidth::S, + alu_op: FpuOPRR::FmvFmtX, + rd: writable_fa0(), + rs: a0(), + }, + "fmv.w.x fa0,a0", + 0xf0050553, + )); + insns.push(TestUnit::new( + Inst::FpuRR { + frm: FRM::Fcsr, + width: FpuOPWidth::S, + alu_op: FpuOPRR::FcvtLFmt, + rd: writable_a0(), + rs: fa0(), + }, + "fcvt.l.s a0,fa0,fcsr", + 0xc0257553, + )); + insns.push(TestUnit::new( + Inst::FpuRR { + frm: FRM::Fcsr, + width: FpuOPWidth::S, + alu_op: FpuOPRR::FcvtLuFmt, + rd: writable_a0(), + rs: fa0(), + }, + "fcvt.lu.s a0,fa0,fcsr", + 0xc0357553, + )); + insns.push(TestUnit::new( + Inst::FpuRR { + frm: FRM::Fcsr, + width: FpuOPWidth::S, + alu_op: FpuOPRR::FcvtFmtL, + rd: writable_fa0(), + rs: a0(), + }, + "fcvt.s.l fa0,a0,fcsr", + 0xd0257553, + )); + insns.push(TestUnit::new( + Inst::FpuRR { + frm: FRM::Fcsr, + width: FpuOPWidth::S, + alu_op: FpuOPRR::FcvtFmtLu, + rd: writable_fa0(), + rs: a0(), + }, + "fcvt.s.lu fa0,a0,fcsr", + 0xd0357553, + )); + + insns.push(TestUnit::new( + Inst::FpuRR { + frm: FRM::Fcsr, + width: FpuOPWidth::D, + alu_op: FpuOPRR::Fsqrt, + rd: writable_fa0(), + rs: fa1(), + }, + "fsqrt.d fa0,fa1,fcsr", + 0x5a05f553, + )); + insns.push(TestUnit::new( + Inst::FpuRR { + frm: FRM::Fcsr, + width: FpuOPWidth::D, + alu_op: FpuOPRR::FcvtWFmt, + rd: writable_a0(), + rs: fa1(), + }, + "fcvt.w.d a0,fa1,fcsr", + 0xc205f553, + )); + + insns.push(TestUnit::new( + Inst::FpuRR { + frm: FRM::Fcsr, + width: FpuOPWidth::D, + alu_op: FpuOPRR::FcvtWuFmt, + rd: writable_a0(), + rs: fa1(), + }, + "fcvt.wu.d a0,fa1,fcsr", + 0xc215f553, + )); + insns.push(TestUnit::new( + Inst::FpuRR { + frm: FRM::RNE, + width: FpuOPWidth::D, + alu_op: FpuOPRR::FmvXFmt, + rd: writable_a0(), + rs: fa1(), + }, + "fmv.x.d a0,fa1", + 0xe2058553, + )); + insns.push(TestUnit::new( + Inst::FpuRR { + frm: FRM::RTZ, + width: FpuOPWidth::D, + alu_op: FpuOPRR::Fclass, + rd: writable_a0(), + rs: fa1(), + }, + "fclass.d a0,fa1", + 0xe2059553, + )); + + insns.push(TestUnit::new( + Inst::FpuRR { + frm: 
FRM::Fcsr, + width: FpuOPWidth::S, + alu_op: FpuOPRR::FcvtSD, + rd: writable_fa0(), + rs: fa0(), + }, + "fcvt.s.d fa0,fa0,fcsr", + 0x40157553, + )); + insns.push(TestUnit::new( + Inst::FpuRR { + frm: FRM::RNE, + width: FpuOPWidth::D, + alu_op: FpuOPRR::FcvtFmtWu, + rd: writable_fa0(), + rs: a0(), + }, + "fcvt.d.wu fa0,a0,rne", + 0xd2150553, + )); + + insns.push(TestUnit::new( + Inst::FpuRR { + frm: FRM::RNE, + width: FpuOPWidth::D, + alu_op: FpuOPRR::FmvFmtX, + rd: writable_fa0(), + rs: a0(), + }, + "fmv.d.x fa0,a0", + 0xf2050553, + )); + insns.push(TestUnit::new( + Inst::FpuRR { + frm: FRM::Fcsr, + width: FpuOPWidth::D, + alu_op: FpuOPRR::FcvtLFmt, + rd: writable_a0(), + rs: fa0(), + }, + "fcvt.l.d a0,fa0,fcsr", + 0xc2257553, + )); + insns.push(TestUnit::new( + Inst::FpuRR { + frm: FRM::Fcsr, + width: FpuOPWidth::D, + alu_op: FpuOPRR::FcvtLuFmt, + rd: writable_a0(), + rs: fa0(), + }, + "fcvt.lu.d a0,fa0,fcsr", + 0xc2357553, + )); + insns.push(TestUnit::new( + Inst::FpuRR { + frm: FRM::Fcsr, + width: FpuOPWidth::D, + alu_op: FpuOPRR::FcvtFmtL, + rd: writable_fa0(), + rs: a0(), + }, + "fcvt.d.l fa0,a0,fcsr", + 0xd2257553, + )); + insns.push(TestUnit::new( + Inst::FpuRR { + frm: FRM::Fcsr, + width: FpuOPWidth::D, + alu_op: FpuOPRR::FcvtFmtLu, + rd: writable_fa0(), + rs: a0(), + }, + "fcvt.d.lu fa0,a0,fcsr", + 0xd2357553, + )); + ////////////////////// + + insns.push(TestUnit::new( + Inst::FpuRRRR { + frm: FRM::RNE, + width: FpuOPWidth::S, + alu_op: FpuOPRRRR::Fmadd, + rd: writable_fa0(), + rs1: fa0(), + rs2: fa1(), + rs3: fa7(), + }, + "fmadd.s fa0,fa0,fa1,fa7,rne", + 0x88b50543, + )); + insns.push(TestUnit::new( + Inst::FpuRRRR { + frm: FRM::Fcsr, + width: FpuOPWidth::S, + alu_op: FpuOPRRRR::Fmsub, + rd: writable_fa0(), + rs1: fa0(), + rs2: fa1(), + rs3: fa7(), + }, + "fmsub.s fa0,fa0,fa1,fa7,fcsr", + 0x88b57547, + )); + insns.push(TestUnit::new( + Inst::FpuRRRR { + frm: FRM::Fcsr, + width: FpuOPWidth::S, + alu_op: FpuOPRRRR::Fnmsub, + rd: writable_fa0(), + rs1: fa0(), + rs2: fa1(), + rs3: fa7(), + }, + "fnmsub.s fa0,fa0,fa1,fa7,fcsr", + 0x88b5754b, + )); + insns.push(TestUnit::new( + Inst::FpuRRRR { + frm: FRM::Fcsr, + width: FpuOPWidth::S, + alu_op: FpuOPRRRR::Fnmadd, + rd: writable_fa0(), + rs1: fa0(), + rs2: fa1(), + rs3: fa7(), + }, + "fnmadd.s fa0,fa0,fa1,fa7,fcsr", + 0x88b5754f, + )); + + insns.push(TestUnit::new( + Inst::FpuRRRR { + frm: FRM::Fcsr, + width: FpuOPWidth::D, + alu_op: FpuOPRRRR::Fmadd, + rd: writable_fa0(), + rs1: fa0(), + rs2: fa1(), + rs3: fa7(), + }, + "fmadd.d fa0,fa0,fa1,fa7,fcsr", + 0x8ab57543, + )); + insns.push(TestUnit::new( + Inst::FpuRRRR { + frm: FRM::Fcsr, + width: FpuOPWidth::D, + alu_op: FpuOPRRRR::Fmsub, + rd: writable_fa0(), + rs1: fa0(), + rs2: fa1(), + rs3: fa7(), + }, + "fmsub.d fa0,fa0,fa1,fa7,fcsr", + 0x8ab57547, + )); + insns.push(TestUnit::new( + Inst::FpuRRRR { + frm: FRM::Fcsr, + width: FpuOPWidth::D, + alu_op: FpuOPRRRR::Fnmsub, + rd: writable_fa0(), + rs1: fa0(), + rs2: fa1(), + rs3: fa7(), + }, + "fnmsub.d fa0,fa0,fa1,fa7,fcsr", + 0x8ab5754b, + )); + insns.push(TestUnit::new( + Inst::FpuRRRR { + frm: FRM::Fcsr, + width: FpuOPWidth::D, + alu_op: FpuOPRRRR::Fnmadd, + rd: writable_fa0(), + rs1: fa0(), + rs2: fa1(), + rs3: fa7(), + }, + "fnmadd.d fa0,fa0,fa1,fa7,fcsr", + 0x8ab5754f, + )); + + insns.push(TestUnit::new( + Inst::Atomic { + op: AtomicOP::LrW, + rd: writable_a0(), + addr: a1(), + src: zero_reg(), + amo: AMO::Relax, + }, + "lr.w a0,(a1)", + 0x1005a52f, + )); + insns.push(TestUnit::new( + Inst::Atomic { + op: AtomicOP::ScW, + rd: 
writable_a0(), + addr: a1(), + src: a2(), + amo: AMO::Release, + }, + "sc.w.rl a0,a2,(a1)", + 0x1ac5a52f, + )); + insns.push(TestUnit::new( + Inst::Atomic { + op: AtomicOP::AmoswapW, + rd: writable_a0(), + addr: a1(), + src: a2(), + amo: AMO::Aquire, + }, + "amoswap.w.aq a0,a2,(a1)", + 0xcc5a52f, + )); + + insns.push(TestUnit::new( + Inst::Atomic { + op: AtomicOP::AmoaddW, + rd: writable_a0(), + addr: a1(), + src: a2(), + amo: AMO::SeqCst, + }, + "amoadd.w.aqrl a0,a2,(a1)", + 0x6c5a52f, + )); + insns.push(TestUnit::new( + Inst::Atomic { + op: AtomicOP::AmoxorW, + rd: writable_a0(), + addr: a1(), + src: a2(), + amo: AMO::Relax, + }, + "amoxor.w a0,a2,(a1)", + 0x20c5a52f, + )); + insns.push(TestUnit::new( + Inst::Atomic { + op: AtomicOP::AmoandW, + rd: writable_a0(), + addr: a1(), + src: a2(), + amo: AMO::Relax, + }, + "amoand.w a0,a2,(a1)", + 0x60c5a52f, + )); + + insns.push(TestUnit::new( + Inst::Atomic { + op: AtomicOP::AmoorW, + rd: writable_a0(), + addr: a1(), + src: a2(), + amo: AMO::Relax, + }, + "amoor.w a0,a2,(a1)", + 0x40c5a52f, + )); + insns.push(TestUnit::new( + Inst::Atomic { + op: AtomicOP::AmominW, + rd: writable_a0(), + addr: a1(), + src: a2(), + amo: AMO::Relax, + }, + "amomin.w a0,a2,(a1)", + 0x80c5a52f, + )); + insns.push(TestUnit::new( + Inst::Atomic { + op: AtomicOP::AmomaxW, + rd: writable_a0(), + addr: a1(), + src: a2(), + amo: AMO::Relax, + }, + "amomax.w a0,a2,(a1)", + 0xa0c5a52f, + )); + insns.push(TestUnit::new( + Inst::Atomic { + op: AtomicOP::AmominuW, + rd: writable_a0(), + addr: a1(), + src: a2(), + amo: AMO::Relax, + }, + "amominu.w a0,a2,(a1)", + 0xc0c5a52f, + )); + insns.push(TestUnit::new( + Inst::Atomic { + op: AtomicOP::AmomaxuW, + rd: writable_a0(), + addr: a1(), + src: a2(), + amo: AMO::Relax, + }, + "amomaxu.w a0,a2,(a1)", + 0xe0c5a52f, + )); + + insns.push(TestUnit::new( + Inst::Atomic { + op: AtomicOP::LrD, + rd: writable_a0(), + addr: a1(), + src: zero_reg(), + amo: AMO::Relax, + }, + "lr.d a0,(a1)", + 0x1005b52f, + )); + insns.push(TestUnit::new( + Inst::Atomic { + op: AtomicOP::ScD, + rd: writable_a0(), + addr: a1(), + src: a2(), + amo: AMO::Relax, + }, + "sc.d a0,a2,(a1)", + 0x18c5b52f, + )); + + insns.push(TestUnit::new( + Inst::Atomic { + op: AtomicOP::AmoswapD, + rd: writable_a0(), + addr: a1(), + src: a2(), + amo: AMO::Relax, + }, + "amoswap.d a0,a2,(a1)", + 0x8c5b52f, + )); + + insns.push(TestUnit::new( + Inst::Atomic { + op: AtomicOP::AmoaddD, + rd: writable_a0(), + addr: a1(), + src: a2(), + amo: AMO::Relax, + }, + "amoadd.d a0,a2,(a1)", + 0xc5b52f, + )); + insns.push(TestUnit::new( + Inst::Atomic { + op: AtomicOP::AmoxorD, + rd: writable_a0(), + addr: a1(), + src: a2(), + amo: AMO::Relax, + }, + "amoxor.d a0,a2,(a1)", + 0x20c5b52f, + )); + insns.push(TestUnit::new( + Inst::Atomic { + op: AtomicOP::AmoandD, + rd: writable_a0(), + addr: a1(), + src: a2(), + amo: AMO::Relax, + }, + "amoand.d a0,a2,(a1)", + 0x60c5b52f, + )); + + insns.push(TestUnit::new( + Inst::Atomic { + op: AtomicOP::AmoorD, + rd: writable_a0(), + addr: a1(), + src: a2(), + amo: AMO::Relax, + }, + "amoor.d a0,a2,(a1)", + 0x40c5b52f, + )); + insns.push(TestUnit::new( + Inst::Atomic { + op: AtomicOP::AmominD, + rd: writable_a0(), + addr: a1(), + src: a2(), + amo: AMO::Relax, + }, + "amomin.d a0,a2,(a1)", + 0x80c5b52f, + )); + insns.push(TestUnit::new( + Inst::Atomic { + op: AtomicOP::AmomaxD, + rd: writable_a0(), + addr: a1(), + src: a2(), + amo: AMO::Relax, + }, + "amomax.d a0,a2,(a1)", + 0xa0c5b52f, + )); + insns.push(TestUnit::new( + Inst::Atomic { + op: 
AtomicOP::AmominuD, + rd: writable_a0(), + addr: a1(), + src: a2(), + amo: AMO::Relax, + }, + "amominu.d a0,a2,(a1)", + 0xc0c5b52f, + )); + insns.push(TestUnit::new( + Inst::Atomic { + op: AtomicOP::AmomaxuD, + rd: writable_a0(), + addr: a1(), + src: a2(), + amo: AMO::Relax, + }, + "amomaxu.d a0,a2,(a1)", + 0xe0c5b52f, + )); + + ///////// + insns.push(TestUnit::new( + Inst::Fence { + pred: 1, + succ: 1 << 1, + }, + "fence w,r", + 0x120000f, + )); + insns.push(TestUnit::new(Inst::EBreak {}, "ebreak", 0x100073)); + + insns.push(TestUnit::new( + Inst::FpuRRR { + alu_op: FpuOPRRR::Fsgnj, + width: FpuOPWidth::S, + frm: FRM::RNE, + rd: writable_fa0(), + rs1: fa1(), + rs2: fa1(), + }, + "fmv.s fa0,fa1", + 0x20b58553, + )); + insns.push(TestUnit::new( + Inst::FpuRRR { + alu_op: FpuOPRRR::Fsgnj, + width: FpuOPWidth::D, + frm: FRM::RNE, + rd: writable_fa0(), + rs1: fa1(), + rs2: fa1(), + }, + "fmv.d fa0,fa1", + 0x22b58553, + )); + + insns.push(TestUnit::new( + Inst::FpuRRR { + alu_op: FpuOPRRR::Fsgnjn, + width: FpuOPWidth::S, + frm: FRM::RTZ, + rd: writable_fa0(), + rs1: fa1(), + rs2: fa1(), + }, + "fneg.s fa0,fa1", + 0x20b59553, + )); + insns.push(TestUnit::new( + Inst::FpuRRR { + alu_op: FpuOPRRR::Fsgnjn, + width: FpuOPWidth::D, + frm: FRM::RTZ, + rd: writable_fa0(), + rs1: fa1(), + rs2: fa1(), + }, + "fneg.d fa0,fa1", + 0x22b59553, + )); + + insns.push(TestUnit::new( + Inst::Fli { + ty: F32, + rd: writable_fa0(), + imm: FliConstant::new(0), + }, + "fli.s fa0,-1.0", + 0xf0100553, + )); + + insns.push(TestUnit::new( + Inst::Fli { + ty: F64, + rd: writable_fa0(), + imm: FliConstant::new(13), + }, + "fli.d fa0,0.625", + 0xf2168553, + )); + + let (flags, isa_flags) = make_test_flags(); + let emit_info = EmitInfo::new(flags, isa_flags); + + for unit in insns.iter() { + println!("Riscv64: {:?}, {}", unit.inst, unit.assembly); + // Check the printed text is as expected. + let actual_printing = unit.inst.print_with_state(&mut EmitState::default()); + assert_eq!(unit.assembly, actual_printing); + let mut buffer = MachBuffer::new(); + unit.inst + .emit(&mut buffer, &emit_info, &mut Default::default()); + let buffer = buffer.finish(&Default::default(), &mut Default::default()); + let actual_encoding = buffer.stringify_code_bytes(); + + assert_eq!(actual_encoding, unit.code.0); + } +} + +fn make_test_flags() -> (settings::Flags, super::super::riscv_settings::Flags) { + let b = settings::builder(); + let flags = settings::Flags::new(b.clone()); + let b2 = super::super::riscv_settings::builder(); + let isa_flags = super::super::riscv_settings::Flags::new(&flags, &b2); + (flags, isa_flags) +} + +#[test] +fn riscv64_worst_case_instruction_size() { + let (flags, isa_flags) = make_test_flags(); + let emit_info = EmitInfo::new(flags, isa_flags); + + // These are all candidate instructions with potential to generate a lot of bytes. 
+ let mut candidates: Vec = vec![]; + + candidates.push(Inst::Popcnt { + sum: writable_a0(), + tmp: writable_a0(), + step: writable_a0(), + rs: a0(), + ty: I64, + }); + + candidates.push(Inst::Cltz { + sum: writable_a0(), + tmp: writable_a0(), + step: writable_a0(), + rs: a0(), + leading: true, + ty: I64, + }); + + candidates.push(Inst::Brev8 { + rd: writable_a0(), + tmp: writable_a0(), + step: writable_a0(), + tmp2: writable_a0(), + rs: a0(), + ty: I64, + }); + + candidates.push(Inst::AtomicCas { + offset: a0(), + t0: writable_a0(), + dst: writable_a0(), + e: a0(), + addr: a0(), + v: a0(), + ty: I64, + }); + + candidates.push(Inst::AtomicCas { + offset: a0(), + t0: writable_a0(), + dst: writable_a0(), + e: a0(), + addr: a0(), + v: a0(), + ty: I16, + }); + + candidates.extend( + crate::ir::AtomicRmwOp::all() + .iter() + .map(|op| Inst::AtomicRmwLoop { + op: *op, + offset: a0(), + dst: writable_a1(), + ty: I16, + p: a1(), + x: a2(), + t0: writable_a0(), + }), + ); + + // Return Call Indirect and BrTable are the largest instructions possible. However they + // emit their own island, so we don't account them here. + + let mut max: (u32, MInst) = (0, Inst::Nop0); + for i in candidates { + let mut buffer = MachBuffer::new(); + let mut emit_state = Default::default(); + i.emit(&mut buffer, &emit_info, &mut emit_state); + let buffer = buffer.finish(&Default::default(), &mut Default::default()); + let length = buffer.data().len() as u32; + if length > max.0 { + let length = buffer.data().len() as u32; + max = (length, i.clone()); + } + println!("insn:{i:?} length: {length}"); + } + println!("calculate max size is {} , inst is {:?}", max.0, max.1); + assert!(max.0 <= Inst::worst_case_size()); +} diff --git a/hbcb/src/inst/encode.rs b/hbcb/src/inst/encode.rs new file mode 100644 index 0000000..0e2d4c4 --- /dev/null +++ b/hbcb/src/inst/encode.rs @@ -0,0 +1,721 @@ +//! Contains the RISC-V instruction encoding logic. +//! +//! These formats are specified in the RISC-V specification in section 2.2. +//! See: +//! +//! Some instructions especially in extensions have slight variations from +//! the base RISC-V specification. + +use super::*; +use crate::lower::isle::generated_code::{ + COpcodeSpace, CaOp, CbOp, CiOp, CiwOp, ClOp, CrOp, CsOp, CssOp, CsznOp, FpuOPWidth, + VecAluOpRImm5, VecAluOpRR, VecAluOpRRRImm5, VecAluOpRRRR, VecOpCategory, ZcbMemOp, +}; +use crate::machinst::isle::WritableReg; + +fn unsigned_field_width(value: u32, width: u8) -> u32 { + debug_assert_eq!(value & (!0 << width), 0); + value +} + +/// Layout: +/// 0-------6-7-------11-12------14-15------19-20------24-25-------31 +/// | Opcode | rd | funct3 | rs1 | rs2 | funct7 | +fn encode_r_type_bits(opcode: u32, rd: u32, funct3: u32, rs1: u32, rs2: u32, funct7: u32) -> u32 { + let mut bits = 0; + bits |= unsigned_field_width(opcode, 7); + bits |= unsigned_field_width(rd, 5) << 7; + bits |= unsigned_field_width(funct3, 3) << 12; + bits |= unsigned_field_width(rs1, 5) << 15; + bits |= unsigned_field_width(rs2, 5) << 20; + bits |= unsigned_field_width(funct7, 7) << 25; + bits +} + +/// Encode an R-type instruction. 
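+///
+/// As a sanity check (standard RV64I, not specific to this crate): `add a0,a1,a2`
+/// has opcode=0b0110011, rd=10, funct3=0, rs1=11, rs2=12, funct7=0, which
+/// assembles to 0x00c58533.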
+pub fn encode_r_type( + opcode: u32, + rd: WritableReg, + funct3: u32, + rs1: Reg, + rs2: Reg, + funct7: u32, +) -> u32 { + encode_r_type_bits( + opcode, + reg_to_gpr_num(rd.to_reg()), + funct3, + reg_to_gpr_num(rs1), + reg_to_gpr_num(rs2), + funct7, + ) +} + +/// Layout: +/// 0-------6-7-------11-12------14-15------19-20------------------31 +/// | Opcode | rd | width | rs1 | Offset[11:0] | +fn encode_i_type_bits(opcode: u32, rd: u32, funct3: u32, rs1: u32, offset: u32) -> u32 { + let mut bits = 0; + bits |= unsigned_field_width(opcode, 7); + bits |= unsigned_field_width(rd, 5) << 7; + bits |= unsigned_field_width(funct3, 3) << 12; + bits |= unsigned_field_width(rs1, 5) << 15; + bits |= unsigned_field_width(offset, 12) << 20; + bits +} + +/// Encode an I-type instruction. +pub fn encode_i_type(opcode: u32, rd: WritableReg, width: u32, rs1: Reg, offset: Imm12) -> u32 { + encode_i_type_bits( + opcode, + reg_to_gpr_num(rd.to_reg()), + width, + reg_to_gpr_num(rs1), + offset.bits(), + ) +} + +/// Encode an S-type instruction. +/// +/// Layout: +/// 0-------6-7-------11-12------14-15------19-20---24-25-------------31 +/// | Opcode | imm[4:0] | width | base | src | imm[11:5] | +pub fn encode_s_type(opcode: u32, width: u32, base: Reg, src: Reg, offset: Imm12) -> u32 { + let mut bits = 0; + bits |= unsigned_field_width(opcode, 7); + bits |= (offset.bits() & 0b11111) << 7; + bits |= unsigned_field_width(width, 3) << 12; + bits |= reg_to_gpr_num(base) << 15; + bits |= reg_to_gpr_num(src) << 20; + bits |= unsigned_field_width(offset.bits() >> 5, 7) << 25; + bits +} + +/// Encodes a Vector ALU instruction. +/// +/// Fields: +/// - opcode (7 bits) +/// - vd (5 bits) +/// - funct3 (3 bits) +/// - vs1 (5 bits) +/// - vs2 (5 bits) +/// - vm (1 bit) +/// - funct6 (6 bits) +/// +/// See: https://github.com/riscv/riscv-v-spec/blob/master/valu-format.adoc +pub fn encode_valu( + op: VecAluOpRRR, + vd: WritableReg, + vs1: Reg, + vs2: Reg, + masking: VecOpMasking, +) -> u32 { + let funct7 = (op.funct6() << 1) | masking.encode(); + encode_r_type_bits( + op.opcode(), + reg_to_gpr_num(vd.to_reg()), + op.funct3(), + reg_to_gpr_num(vs1), + reg_to_gpr_num(vs2), + funct7, + ) +} + +/// Encodes a Vector ALU+Imm instruction. +/// This is just a Vector ALU instruction with an immediate in the VS1 field. 
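+/// (This is the OPIVI-style form, where a 5-bit sign-extended immediate, as in
+/// `vadd.vi`, takes the place of `vs1`.)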
+/// +/// Fields: +/// - opcode (7 bits) +/// - vd (5 bits) +/// - funct3 (3 bits) +/// - imm (5 bits) +/// - vs2 (5 bits) +/// - vm (1 bit) +/// - funct6 (6 bits) +/// +/// See: https://github.com/riscv/riscv-v-spec/blob/master/valu-format.adoc +pub fn encode_valu_rr_imm( + op: VecAluOpRRImm5, + vd: WritableReg, + imm: Imm5, + vs2: Reg, + masking: VecOpMasking, +) -> u32 { + let funct7 = (op.funct6() << 1) | masking.encode(); + let imm = imm.bits() as u32; + encode_r_type_bits( + op.opcode(), + reg_to_gpr_num(vd.to_reg()), + op.funct3(), + imm, + reg_to_gpr_num(vs2), + funct7, + ) +} + +pub fn encode_valu_rrrr( + op: VecAluOpRRRR, + vd: WritableReg, + vs2: Reg, + vs1: Reg, + masking: VecOpMasking, +) -> u32 { + let funct7 = (op.funct6() << 1) | masking.encode(); + encode_r_type_bits( + op.opcode(), + reg_to_gpr_num(vd.to_reg()), + op.funct3(), + reg_to_gpr_num(vs1), + reg_to_gpr_num(vs2), + funct7, + ) +} + +pub fn encode_valu_rrr_imm( + op: VecAluOpRRRImm5, + vd: WritableReg, + imm: Imm5, + vs2: Reg, + masking: VecOpMasking, +) -> u32 { + let funct7 = (op.funct6() << 1) | masking.encode(); + let imm = imm.bits() as u32; + encode_r_type_bits( + op.opcode(), + reg_to_gpr_num(vd.to_reg()), + op.funct3(), + imm, + reg_to_gpr_num(vs2), + funct7, + ) +} + +pub fn encode_valu_rr(op: VecAluOpRR, vd: WritableReg, vs: Reg, masking: VecOpMasking) -> u32 { + let funct7 = (op.funct6() << 1) | masking.encode(); + + let (vs1, vs2) = if op.vs_is_vs2_encoded() { + (op.aux_encoding(), reg_to_gpr_num(vs)) + } else { + (reg_to_gpr_num(vs), op.aux_encoding()) + }; + + encode_r_type_bits( + op.opcode(), + reg_to_gpr_num(vd.to_reg()), + op.funct3(), + vs1, + vs2, + funct7, + ) +} + +pub fn encode_valu_r_imm( + op: VecAluOpRImm5, + vd: WritableReg, + imm: Imm5, + masking: VecOpMasking, +) -> u32 { + let funct7 = (op.funct6() << 1) | masking.encode(); + + // This is true for this opcode, not sure if there are any other ones. + debug_assert_eq!(op, VecAluOpRImm5::VmvVI); + let vs1 = imm.bits() as u32; + let vs2 = op.aux_encoding(); + + encode_r_type_bits( + op.opcode(), + reg_to_gpr_num(vd.to_reg()), + op.funct3(), + vs1, + vs2, + funct7, + ) +} + +/// Encodes a Vector CFG Imm instruction. +/// +/// See: https://github.com/riscv/riscv-v-spec/blob/master/vcfg-format.adoc +// TODO: Check if this is any of the known instruction types in the spec. +pub fn encode_vcfg_imm(opcode: u32, rd: Reg, imm: UImm5, vtype: &VType) -> u32 { + let mut bits = 0; + bits |= unsigned_field_width(opcode, 7); + bits |= reg_to_gpr_num(rd) << 7; + bits |= VecOpCategory::OPCFG.encode() << 12; + bits |= unsigned_field_width(imm.bits(), 5) << 15; + bits |= unsigned_field_width(vtype.encode(), 10) << 20; + bits |= 0b11 << 30; + bits +} + +/// Encodes a Vector Mem Unit Stride Load instruction. +/// +/// See: https://github.com/riscv/riscv-v-spec/blob/master/vmem-format.adoc +/// TODO: These instructions share opcode space with LOAD-FP and STORE-FP +pub fn encode_vmem_load( + opcode: u32, + vd: Reg, + width: VecElementWidth, + rs1: Reg, + lumop: u32, + masking: VecOpMasking, + mop: u32, + nf: u32, +) -> u32 { + // Width is encoded differently to avoid a clash with the FP load/store sizes. 
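+    // The scalar FP loads already use widths 0b001/0b010/0b011 for 16/32/64-bit,
+    // so the vector element widths are mapped to 0b101/0b110/0b111 instead, with
+    // 8-bit keeping 0b000.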
+ let width = match width { + VecElementWidth::E8 => 0b000, + VecElementWidth::E16 => 0b101, + VecElementWidth::E32 => 0b110, + VecElementWidth::E64 => 0b111, + }; + + let mut bits = 0; + bits |= unsigned_field_width(opcode, 7); + bits |= reg_to_gpr_num(vd) << 7; + bits |= width << 12; + bits |= reg_to_gpr_num(rs1) << 15; + bits |= unsigned_field_width(lumop, 5) << 20; + bits |= masking.encode() << 25; + bits |= unsigned_field_width(mop, 2) << 26; + + // The mew bit (inst[28]) when set is expected to be used to encode expanded + // memory sizes of 128 bits and above, but these encodings are currently reserved. + bits |= 0b0 << 28; + + bits |= unsigned_field_width(nf, 3) << 29; + bits +} + +/// Encodes a Vector Mem Unit Stride Load instruction. +/// +/// See: https://github.com/riscv/riscv-v-spec/blob/master/vmem-format.adoc +/// TODO: These instructions share opcode space with LOAD-FP and STORE-FP +pub fn encode_vmem_store( + opcode: u32, + vs3: Reg, + width: VecElementWidth, + rs1: Reg, + sumop: u32, + masking: VecOpMasking, + mop: u32, + nf: u32, +) -> u32 { + // This is pretty much the same as the load instruction, just + // with different names on the fields. + encode_vmem_load(opcode, vs3, width, rs1, sumop, masking, mop, nf) +} + +// The CSR Reg instruction is really just an I type instruction with the CSR in +// the immediate field. +pub fn encode_csr_reg(op: CsrRegOP, rd: WritableReg, rs: Reg, csr: CSR) -> u32 { + encode_i_type(op.opcode(), rd, op.funct3(), rs, csr.bits()) +} + +// The CSR Imm instruction is an I type instruction with the CSR in +// the immediate field and the value to be set in the `rs1` field. +pub fn encode_csr_imm(op: CsrImmOP, rd: WritableReg, csr: CSR, imm: UImm5) -> u32 { + encode_i_type_bits( + op.opcode(), + reg_to_gpr_num(rd.to_reg()), + op.funct3(), + imm.bits(), + csr.bits().bits(), + ) +} + +// Encode a CR type instruction. +// +// 0--1-2-----6-7-------11-12-------15 +// |op | rs2 | rd/rs1 | funct4 | +pub fn encode_cr_type(op: CrOp, rd: WritableReg, rs2: Reg) -> u16 { + let mut bits = 0; + bits |= unsigned_field_width(op.op().bits(), 2); + bits |= reg_to_gpr_num(rs2) << 2; + bits |= reg_to_gpr_num(rd.to_reg()) << 7; + bits |= unsigned_field_width(op.funct4(), 4) << 12; + bits.try_into().unwrap() +} + +// This isn't technically a instruction format that exists. It's just a CR type +// where the source is rs1, rs2 is zero. rs1 is never written to. +// +// Used for C.JR and C.JALR +pub fn encode_cr2_type(op: CrOp, rs1: Reg) -> u16 { + encode_cr_type(op, WritableReg::from_reg(rs1), zero_reg()) +} + +// Encode a CA type instruction. +// +// 0--1-2-----4-5--------6-7--------9-10------15 +// |op | rs2 | funct2 | rd/rs1 | funct6 | +pub fn encode_ca_type(op: CaOp, rd: WritableReg, rs2: Reg) -> u16 { + let mut bits = 0; + bits |= unsigned_field_width(op.op().bits(), 2); + bits |= reg_to_compressed_gpr_num(rs2) << 2; + bits |= unsigned_field_width(op.funct2(), 2) << 5; + bits |= reg_to_compressed_gpr_num(rd.to_reg()) << 7; + bits |= unsigned_field_width(op.funct6(), 6) << 10; + bits.try_into().unwrap() +} + +// Encode a CJ type instruction. +// +// The imm field is a 11 bit signed immediate that is shifted left by 1. +// +// 0--1-2-----12-13--------15 +// |op | imm | funct3 | +pub fn encode_cj_type(op: CjOp, imm: Imm12) -> u16 { + let imm = imm.bits(); + debug_assert!(imm & 1 == 0); + + // The offset bits are in rather weird positions. 
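+    // (For example, offset bit 1 ends up in imm_field bit 1 below, i.e. bit 3 of
+    // the encoded instruction.)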
+ // [11|4|9:8|10|6|7|3:1|5] + let mut imm_field = 0; + imm_field |= ((imm >> 11) & 1) << 10; + imm_field |= ((imm >> 4) & 1) << 9; + imm_field |= ((imm >> 8) & 3) << 7; + imm_field |= ((imm >> 10) & 1) << 6; + imm_field |= ((imm >> 6) & 1) << 5; + imm_field |= ((imm >> 7) & 1) << 4; + imm_field |= ((imm >> 1) & 7) << 1; + imm_field |= ((imm >> 5) & 1) << 0; + + let mut bits = 0; + bits |= unsigned_field_width(op.op().bits(), 2); + bits |= unsigned_field_width(imm_field, 11) << 2; + bits |= unsigned_field_width(op.funct3(), 3) << 13; + bits.try_into().unwrap() +} + +// Encode a CI type instruction. +// +// The imm field is a 6 bit signed immediate. +// +// 0--1-2-------6-7-------11-12-----12-13-----15 +// |op | imm[4:0] | src | imm[5] | funct3 | +pub fn encode_ci_type(op: CiOp, rd: WritableReg, imm: Imm6) -> u16 { + let imm = imm.bits(); + + let mut bits = 0; + bits |= unsigned_field_width(op.op().bits(), 2); + bits |= unsigned_field_width((imm & 0x1f) as u32, 5) << 2; + bits |= reg_to_gpr_num(rd.to_reg()) << 7; + bits |= unsigned_field_width(((imm >> 5) & 1) as u32, 1) << 12; + bits |= unsigned_field_width(op.funct3(), 3) << 13; + bits.try_into().unwrap() +} + +// Stack-Pointer relative loads are regular CI instructions, but, the immediate +// is zero extended, and with a slightly different immediate field encoding. +pub fn encode_ci_sp_load(op: CiOp, rd: WritableReg, imm: Uimm6) -> u16 { + let imm = imm.bits(); + + // These are the spec encoded offsets. + // LWSP: [5|4:2|7:6] + // LDSP: [5|4:3|8:6] + // FLDSP: [5|4:3|8:6] + // + // We don't receive the entire offset in `imm`, just a multiple of the load-size. + + // Number of bits in the lowest position of imm. 3 for lwsp, 2 for {f,}ldsp. + let low_bits = match op { + CiOp::CLwsp => 3, // [4:2] + CiOp::CLdsp | CiOp::CFldsp => 2, // [4:3] + _ => unreachable!(), + }; + let high_bits = 6 - 1 - low_bits; + let mut enc_imm = 0; + + // Encode [7:6] at the bottom of imm + enc_imm |= imm >> (6 - high_bits); + + // Next place [4:2] in the middle + enc_imm |= (imm & ((1 << low_bits) - 1)) << high_bits; + + // Finally place [5] at the top + enc_imm |= ((imm >> low_bits) & 1) << 5; + + let enc_imm = Imm6::maybe_from_i16((enc_imm as i16) << 10 >> 10).unwrap(); + + encode_ci_type(op, rd, enc_imm) +} + +/// c.addi16sp is a regular CI op, but the immediate field is encoded in a weird way +pub fn encode_c_addi16sp(imm: Imm6) -> u16 { + let imm = imm.bits(); + + // [6|1|3|5:4|2] + let mut enc_imm = 0; + enc_imm |= ((imm >> 5) & 1) << 5; + enc_imm |= ((imm >> 0) & 1) << 4; + enc_imm |= ((imm >> 2) & 1) << 3; + enc_imm |= ((imm >> 3) & 3) << 1; + enc_imm |= ((imm >> 1) & 1) << 0; + let enc_imm = Imm6::maybe_from_i16((enc_imm as i16) << 10 >> 10).unwrap(); + + encode_ci_type(CiOp::CAddi16sp, writable_stack_reg(), enc_imm) +} + +// Encode a CIW type instruction. +// +// 0--1-2------4-5------12-13--------15 +// |op | rd | imm | funct3 | +pub fn encode_ciw_type(op: CiwOp, rd: WritableReg, imm: u8) -> u16 { + // [3:2|7:4|0|1] + let mut imm_field = 0; + imm_field |= ((imm >> 1) & 1) << 0; + imm_field |= ((imm >> 0) & 1) << 1; + imm_field |= ((imm >> 4) & 15) << 2; + imm_field |= ((imm >> 2) & 3) << 6; + + let mut bits = 0; + bits |= unsigned_field_width(op.op().bits(), 2); + bits |= reg_to_compressed_gpr_num(rd.to_reg()) << 2; + bits |= unsigned_field_width(imm_field as u32, 8) << 5; + bits |= unsigned_field_width(op.funct3(), 3) << 13; + bits.try_into().unwrap() +} + +// Encode a CB type instruction. +// +// The imm field is a 6 bit signed immediate. 
+// +// 0--1-2-------6-7-------9-10-------11-12-------13--------15 +// |op | imm[4:0] | dst | funct2 | imm[5] | funct3 | +pub fn encode_cb_type(op: CbOp, rd: WritableReg, imm: Imm6) -> u16 { + let imm = imm.bits(); + + let mut bits = 0; + bits |= unsigned_field_width(op.op().bits(), 2); + bits |= unsigned_field_width((imm & 0x1f) as u32, 5) << 2; + bits |= reg_to_compressed_gpr_num(rd.to_reg()) << 7; + bits |= unsigned_field_width(op.funct2(), 2) << 10; + bits |= unsigned_field_width(((imm >> 5) & 1) as u32, 1) << 12; + bits |= unsigned_field_width(op.funct3(), 3) << 13; + bits.try_into().unwrap() +} + +// Encode a CSS type instruction. +// +// The imm field is a 6 bit unsigned immediate. +// +// 0--1-2-------6-7--------12-13-------15 +// |op | src | imm | funct3 | +pub fn encode_css_type(op: CssOp, src: Reg, imm: Uimm6) -> u16 { + let imm = imm.bits(); + + // These are the spec encoded offsets. + // c.swsp: [5:2|7:6] + // c.sdsp: [5:3|8:6] + // c.fsdsp: [5:3|8:6] + // + // We don't receive the entire offset in `imm`, just a multiple of the load-size. + + // Number of bits in the lowest position of imm. 4 for c.swsp, 3 for c.{f,}sdsp. + let low_bits = match op { + CssOp::CSwsp => 4, // [5:2] + CssOp::CSdsp | CssOp::CFsdsp => 3, // [5:3] + }; + let high_bits = 6 - low_bits; + + let mut enc_imm = 0; + enc_imm |= (imm & ((1 << low_bits) - 1)) << high_bits; + enc_imm |= imm >> low_bits; + + let mut bits = 0; + bits |= unsigned_field_width(op.op().bits(), 2); + bits |= reg_to_gpr_num(src) << 2; + bits |= unsigned_field_width(enc_imm as u32, 6) << 7; + bits |= unsigned_field_width(op.funct3(), 3) << 13; + bits.try_into().unwrap() +} + +// Encode a CS type instruction. +// +// The imm field is a 5 bit unsigned immediate. +// +// 0--1-2-----4-5----------6-7---------9-10----------12-13-----15 +// |op | src | imm(2-bit) | base | imm(3-bit) | funct3 | +pub fn encode_cs_type(op: CsOp, src: Reg, base: Reg, imm: Uimm5) -> u16 { + let size = match op { + CsOp::CFsd | CsOp::CSd => 8, + CsOp::CSw => 4, + }; + + encode_cs_cl_type_bits(op.op(), op.funct3(), size, src, base, imm) +} + +// Encode a CL type instruction. +// +// The imm field is a 5 bit unsigned immediate. +// +// 0--1-2------4-5----------6-7---------9-10----------12-13-----15 +// |op | dest | imm(2-bit) | base | imm(3-bit) | funct3 | +pub fn encode_cl_type(op: ClOp, dest: WritableReg, base: Reg, imm: Uimm5) -> u16 { + let size = match op { + ClOp::CFld | ClOp::CLd => 8, + ClOp::CLw => 4, + }; + + encode_cs_cl_type_bits(op.op(), op.funct3(), size, dest.to_reg(), base, imm) +} + +// CL and CS type instructions have the same physical layout. 
+// +// 0--1-2----------4-5----------6-7---------9-10----------12-13-----15 +// |op | dest/src | imm(2-bit) | base | imm(3-bit) | funct3 | +fn encode_cs_cl_type_bits( + op: COpcodeSpace, + funct3: u32, + size: u32, + dest_src: Reg, + base: Reg, + imm: Uimm5, +) -> u16 { + let imm = imm.bits(); + + // c.sw / c.lw: [2|6] + // c.sd / c.ld: [7:6] + // c.fsd / c.fld: [7:6] + // + // We differentiate these based on the operation size + let imm2 = match size { + 4 => ((imm >> 4) & 1) | ((imm & 1) << 1), + 8 => (imm >> 3) & 0b11, + _ => unreachable!(), + }; + + // [5:3] on all opcodes + let imm3 = match size { + 4 => (imm >> 1) & 0b111, + 8 => (imm >> 0) & 0b111, + _ => unreachable!(), + }; + + let mut bits = 0; + bits |= unsigned_field_width(op.bits(), 2); + bits |= reg_to_compressed_gpr_num(dest_src) << 2; + bits |= unsigned_field_width(imm2 as u32, 2) << 5; + bits |= reg_to_compressed_gpr_num(base) << 7; + bits |= unsigned_field_width(imm3 as u32, 3) << 10; + bits |= unsigned_field_width(funct3, 3) << 13; + bits.try_into().unwrap() +} + +// Encode a CSZN type instruction. +// +// This is an additional encoding format that is introduced in the Zcb extension. +// +// 0--1-2---------6-7--------9-10------15 +// |op | funct5 | rd/rs1 | funct6 | +pub fn encode_cszn_type(op: CsznOp, rd: WritableReg) -> u16 { + let mut bits = 0; + bits |= unsigned_field_width(op.op().bits(), 2); + bits |= unsigned_field_width(op.funct5(), 5) << 2; + bits |= reg_to_compressed_gpr_num(rd.to_reg()) << 7; + bits |= unsigned_field_width(op.funct6(), 6) << 10; + bits.try_into().unwrap() +} + +// Encodes the various memory operations in the Zcb extension. +// +// 0--1-2----------4-5----------6-7---------9-10-------15 +// |op | dest/src | imm(2-bit) | base | funct6 | +fn encode_zcbmem_bits(op: ZcbMemOp, dest_src: Reg, base: Reg, imm: Uimm2) -> u16 { + let imm = imm.bits(); + + // For these ops, bit 6 is part of the opcode, and bit 5 encodes the imm offset. + let imm = match op { + ZcbMemOp::CLh | ZcbMemOp::CLhu | ZcbMemOp::CSh => { + debug_assert_eq!(imm & !1, 0); + // Only c.lh has this bit as 1 + let opcode_bit = (op == ZcbMemOp::CLh) as u8; + imm | (opcode_bit << 1) + } + // In the rest of the ops the imm is reversed. + _ => ((imm & 1) << 1) | ((imm >> 1) & 1), + }; + + let mut bits = 0; + bits |= unsigned_field_width(op.op().bits(), 2); + bits |= reg_to_compressed_gpr_num(dest_src) << 2; + bits |= unsigned_field_width(imm as u32, 2) << 5; + bits |= reg_to_compressed_gpr_num(base) << 7; + bits |= unsigned_field_width(op.funct6(), 6) << 10; + bits.try_into().unwrap() +} + +pub fn encode_zcbmem_load(op: ZcbMemOp, rd: WritableReg, base: Reg, imm: Uimm2) -> u16 { + encode_zcbmem_bits(op, rd.to_reg(), base, imm) +} + +pub fn encode_zcbmem_store(op: ZcbMemOp, src: Reg, base: Reg, imm: Uimm2) -> u16 { + encode_zcbmem_bits(op, src, base, imm) +} + +pub fn encode_fli(ty: Type, imm: FliConstant, rd: WritableReg) -> u32 { + // FLI.{S,D} is encoded as a FMV.{W,D} instruction with rs2 set to the + // immediate value to be loaded. 
+ let op = FpuOPRR::FmvFmtX; + let width = FpuOPWidth::try_from(ty).unwrap(); + let frm = 0; // FRM is hard coded to 0 in both instructions + let rs2 = 1; // rs2 set to 1 is what differentiates FLI from FMV + + let mut bits = 0; + bits |= unsigned_field_width(op.opcode(), 7); + bits |= reg_to_gpr_num(rd.to_reg()) << 7; + bits |= unsigned_field_width(frm, 3) << 12; + bits |= unsigned_field_width(imm.bits() as u32, 5) << 15; + bits |= unsigned_field_width(rs2, 6) << 20; + bits |= unsigned_field_width(op.funct7(width), 7) << 25; + bits +} + +pub fn encode_fp_rr(op: FpuOPRR, width: FpuOPWidth, frm: FRM, rd: WritableReg, rs: Reg) -> u32 { + encode_r_type_bits( + op.opcode(), + reg_to_gpr_num(rd.to_reg()), + frm.as_u32(), + reg_to_gpr_num(rs), + op.rs2(), + op.funct7(width), + ) +} + +pub fn encode_fp_rrr( + op: FpuOPRRR, + width: FpuOPWidth, + frm: FRM, + rd: WritableReg, + rs1: Reg, + rs2: Reg, +) -> u32 { + encode_r_type_bits( + op.opcode(), + reg_to_gpr_num(rd.to_reg()), + frm.as_u32(), + reg_to_gpr_num(rs1), + reg_to_gpr_num(rs2), + op.funct7(width), + ) +} + +pub fn encode_fp_rrrr( + op: FpuOPRRRR, + width: FpuOPWidth, + frm: FRM, + rd: WritableReg, + rs1: Reg, + rs2: Reg, + rs3: Reg, +) -> u32 { + let funct7 = (reg_to_gpr_num(rs3) << 2) | width.as_u32(); + encode_r_type_bits( + op.opcode(), + reg_to_gpr_num(rd.to_reg()), + frm.as_u32(), + reg_to_gpr_num(rs1), + reg_to_gpr_num(rs2), + funct7, + ) +} diff --git a/hbcb/src/inst/imms.rs b/hbcb/src/inst/imms.rs new file mode 100644 index 0000000..28f2791 --- /dev/null +++ b/hbcb/src/inst/imms.rs @@ -0,0 +1,374 @@ +//! Riscv64 ISA definitions: immediate constants. + +// Some variants are never constructed, but we still want them as options in the future. +use super::Inst; +#[allow(dead_code)] +use std::fmt::{Debug, Display, Formatter, Result}; + +#[derive(Copy, Clone, Debug, Default)] +pub struct Imm12 { + /// 16-bit container where the low 12 bits are the data payload. + /// + /// Acquiring the underlying value requires sign-extending the 12th bit. + bits: u16, +} + +impl Imm12 { + pub(crate) const ZERO: Self = Self { bits: 0 }; + pub(crate) const ONE: Self = Self { bits: 1 }; + + pub fn maybe_from_u64(val: u64) -> Option { + Self::maybe_from_i64(val as i64) + } + + pub fn maybe_from_i64(val: i64) -> Option { + if val >= -2048 && val <= 2047 { + Some(Imm12 { + bits: val as u16 & 0xfff, + }) + } else { + None + } + } + + #[inline] + pub fn from_i16(bits: i16) -> Self { + assert!(bits >= -2048 && bits <= 2047); + Self { + bits: (bits & 0xfff) as u16, + } + } + + #[inline] + pub fn as_i16(self) -> i16 { + (self.bits << 4) as i16 >> 4 + } + + #[inline] + pub fn bits(&self) -> u32 { + self.bits.into() + } +} + +impl Into for Imm12 { + fn into(self) -> i64 { + self.as_i16().into() + } +} + +impl Display for Imm12 { + fn fmt(&self, f: &mut Formatter<'_>) -> Result { + write!(f, "{:+}", self.as_i16()) + } +} + +// signed +#[derive(Clone, Copy, Default)] +pub struct Imm20 { + /// 32-bit container where the low 20 bits are the data payload. + /// + /// Acquiring the underlying value requires sign-extending the 20th bit. 
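+    /// For example, a stored value of 0xf_ffff decodes to -1.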
+ bits: u32, +} + +impl Imm20 { + pub(crate) const ZERO: Self = Self { bits: 0 }; + + pub fn maybe_from_u64(val: u64) -> Option { + Self::maybe_from_i64(val as i64) + } + + pub fn maybe_from_i64(val: i64) -> Option { + if val >= -(0x7_ffff + 1) && val <= 0x7_ffff { + Some(Imm20 { bits: val as u32 }) + } else { + None + } + } + + #[inline] + pub fn from_i32(bits: i32) -> Self { + assert!(bits >= -(0x7_ffff + 1) && bits <= 0x7_ffff); + Self { + bits: (bits as u32) & 0xf_ffff, + } + } + + #[inline] + pub fn as_i32(&self) -> i32 { + ((self.bits << 12) as i32) >> 12 + } + + #[inline] + pub fn bits(&self) -> u32 { + self.bits + } +} + +impl Debug for Imm20 { + fn fmt(&self, f: &mut Formatter<'_>) -> Result { + write!(f, "{}", self.as_i32()) + } +} + +impl Display for Imm20 { + fn fmt(&self, f: &mut Formatter<'_>) -> Result { + write!(f, "{}", self.bits) + } +} + +/// An unsigned 5-bit immediate. +#[derive(Clone, Copy, Debug, PartialEq)] +pub struct UImm5 { + value: u8, +} + +impl UImm5 { + /// Create an unsigned 5-bit immediate from u8. + pub fn maybe_from_u8(value: u8) -> Option { + if value < 32 { + Some(UImm5 { value }) + } else { + None + } + } + + /// Bits for encoding. + pub fn bits(&self) -> u32 { + u32::from(self.value) + } +} + +impl Display for UImm5 { + fn fmt(&self, f: &mut Formatter<'_>) -> Result { + write!(f, "{}", self.value) + } +} + +/// A Signed 5-bit immediate. +#[derive(Clone, Copy, Debug, PartialEq)] +pub struct Imm5 { + value: i8, +} + +impl Imm5 { + /// Create an signed 5-bit immediate from an i8. + pub fn maybe_from_i8(value: i8) -> Option { + if value >= -16 && value <= 15 { + Some(Imm5 { value }) + } else { + None + } + } + + pub fn from_bits(value: u8) -> Imm5 { + assert_eq!(value & 0x1f, value); + let signed = ((value << 3) as i8) >> 3; + Imm5 { value: signed } + } + + /// Bits for encoding. + pub fn bits(&self) -> u8 { + self.value as u8 & 0x1f + } +} + +impl Display for Imm5 { + fn fmt(&self, f: &mut Formatter<'_>) -> Result { + write!(f, "{}", self.value) + } +} + +/// A Signed 6-bit immediate. +#[derive(Clone, Copy, Debug, PartialEq)] +pub struct Imm6 { + value: i8, +} + +impl Imm6 { + /// Create an signed 6-bit immediate from an i16 + pub fn maybe_from_i16(value: i16) -> Option { + if value >= -32 && value <= 31 { + Some(Self { value: value as i8 }) + } else { + None + } + } + + pub fn maybe_from_i32(value: i32) -> Option { + value.try_into().ok().and_then(Imm6::maybe_from_i16) + } + + pub fn maybe_from_imm12(value: Imm12) -> Option { + Imm6::maybe_from_i16(value.as_i16()) + } + + /// Bits for encoding. + pub fn bits(&self) -> u8 { + self.value as u8 & 0x3f + } +} + +impl Display for Imm6 { + fn fmt(&self, f: &mut Formatter<'_>) -> Result { + write!(f, "{}", self.value) + } +} + +/// A unsigned 6-bit immediate. +#[derive(Clone, Copy, Debug, PartialEq)] +pub struct Uimm6 { + value: u8, +} + +impl Uimm6 { + /// Create an unsigned 6-bit immediate from an u8 + pub fn maybe_from_u8(value: u8) -> Option { + if value <= 63 { + Some(Self { value }) + } else { + None + } + } + + /// Bits for encoding. + pub fn bits(&self) -> u8 { + self.value & 0x3f + } +} + +impl Display for Uimm6 { + fn fmt(&self, f: &mut Formatter<'_>) -> Result { + write!(f, "{}", self.value) + } +} + +/// A unsigned 5-bit immediate. 
+#[derive(Clone, Copy, Debug, PartialEq)] +pub struct Uimm5 { + value: u8, +} + +impl Uimm5 { + /// Create an unsigned 5-bit immediate from an u8 + pub fn maybe_from_u8(value: u8) -> Option { + if value <= 31 { + Some(Self { value }) + } else { + None + } + } + + /// Bits for encoding. + pub fn bits(&self) -> u8 { + self.value & 0x1f + } +} + +impl Display for Uimm5 { + fn fmt(&self, f: &mut Formatter<'_>) -> Result { + write!(f, "{}", self.value) + } +} + +/// A unsigned 2-bit immediate. +#[derive(Clone, Copy, Debug, PartialEq)] +pub struct Uimm2 { + value: u8, +} + +impl Uimm2 { + /// Create an unsigned 2-bit immediate from an u8 + pub fn maybe_from_u8(value: u8) -> Option { + if value <= 3 { + Some(Self { value }) + } else { + None + } + } + + /// Bits for encoding. + pub fn bits(&self) -> u8 { + self.value & 0x3 + } +} + +impl Display for Uimm2 { + fn fmt(&self, f: &mut Formatter<'_>) -> Result { + write!(f, "{}", self.value) + } +} + +impl Inst { + pub(crate) fn imm_min() -> i64 { + let imm20_max: i64 = (1 << 19) << 12; + let imm12_max = 1 << 11; + -imm20_max - imm12_max + } + pub(crate) fn imm_max() -> i64 { + let imm20_max: i64 = ((1 << 19) - 1) << 12; + let imm12_max = (1 << 11) - 1; + imm20_max + imm12_max + } + + /// An imm20 immediate and an Imm12 immediate can generate a 32-bit immediate. + /// This helper produces an imm12, imm20, or both to generate the value. + /// + /// `value` must be between `imm_min()` and `imm_max()`, or else + /// this helper returns `None`. + pub(crate) fn generate_imm(value: u64) -> Option<(Imm20, Imm12)> { + if let Some(imm12) = Imm12::maybe_from_u64(value) { + // can be load using single imm12. + return Some((Imm20::ZERO, imm12)); + } + let value = value as i64; + if !(value >= Self::imm_min() && value <= Self::imm_max()) { + // not in range, return None. + return None; + } + const MOD_NUM: i64 = 4096; + let (imm20, imm12) = if value > 0 { + let mut imm20 = value / MOD_NUM; + let mut imm12 = value % MOD_NUM; + if imm12 >= 2048 { + imm12 -= MOD_NUM; + imm20 += 1; + } + assert!(imm12 >= -2048 && imm12 <= 2047); + (imm20, imm12) + } else { + // this is the abs value. + let value_abs = value.abs(); + let imm20 = value_abs / MOD_NUM; + let imm12 = value_abs % MOD_NUM; + let mut imm20 = -imm20; + let mut imm12 = -imm12; + if imm12 < -2048 { + imm12 += MOD_NUM; + imm20 -= 1; + } + (imm20, imm12) + }; + assert!(imm20 != 0 || imm12 != 0); + let imm20 = i32::try_from(imm20).unwrap(); + let imm12 = i16::try_from(imm12).unwrap(); + Some((Imm20::from_i32(imm20), Imm12::from_i16(imm12))) + } +} + +#[cfg(test)] +mod test { + use super::*; + #[test] + fn test_imm12() { + let x = Imm12::ZERO; + assert_eq!(0, x.bits()); + Imm12::maybe_from_u64(0xffff_ffff_ffff_ffff).unwrap(); + } + + #[test] + fn imm20_and_imm12() { + assert!(Inst::imm_max() == (i32::MAX - 2048) as i64); + assert!(Inst::imm_min() == i32::MIN as i64 - 2048); + } +} diff --git a/hbcb/src/inst/mod.rs b/hbcb/src/inst/mod.rs new file mode 100644 index 0000000..6440c57 --- /dev/null +++ b/hbcb/src/inst/mod.rs @@ -0,0 +1,1559 @@ +//! This module defines riscv64-specific machine instruction types. 
+ +pub use crate::ir::{ + condcodes::{FloatCC, IntCC}, + ExternalName, MemFlags, Type, +}; +use { + super::lower::isle::generated_code::{VecAMode, VecElementWidth, VecOpMasking}, + alloc::vec::Vec, + cranelift_codegen::{ + binemit::{Addend, CodeOffset, Reloc}, + ir::types::{self, F128, F16, F32, F64, I128, I16, I32, I64, I8, I8X16}, + isa::{CallConv, FunctionAlignment}, + machinst::*, + settings, CodegenError, CodegenResult, + }, + regalloc2::RegClass, + smallvec::{smallvec, SmallVec}, + std::{ + boxed::Box, + fmt::Write, + string::{String, ToString}, + }, +}; + +pub mod regs; +pub use self::regs::*; +pub mod imms; +pub use self::imms::*; +pub mod args; +pub use self::args::*; +pub mod emit; +pub use self::emit::*; +pub mod vector; +pub use self::vector::*; +pub mod encode; +pub use self::encode::*; +pub mod unwind; + +use crate::abi::Riscv64MachineDeps; + +#[cfg(test)] +mod emit_tests; + +use std::fmt::{Display, Formatter}; + +pub(crate) type VecU8 = Vec; + +//============================================================================= +// Instructions (top level): definition + +pub use crate::lower::isle::generated_code::{ + AluOPRRI, AluOPRRR, AtomicOP, CsrImmOP, CsrRegOP, FClassResult, FFlagsException, FpuOPRR, + FpuOPRRR, FpuOPRRRR, LoadOP, MInst as Inst, StoreOP, CSR, FRM, +}; +use crate::lower::isle::generated_code::{CjOp, MInst, VecAluOpRRImm5, VecAluOpRRR}; + +/// Additional information for `return_call[_ind]` instructions, left out of +/// line to lower the size of the `Inst` enum. +#[derive(Clone, Debug)] +pub struct ReturnCallInfo { + pub dest: T, + pub uses: CallArgList, + pub new_stack_arg_size: u32, +} + +/// A conditional branch target. +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum CondBrTarget { + /// An unresolved reference to a Label, as passed into + /// `lower_branch_group()`. + Label(MachLabel), + /// No jump; fall through to the next instruction. + Fallthrough, +} + +impl CondBrTarget { + /// Return the target's label, if it is a label-based target. + pub(crate) fn as_label(self) -> Option { + match self { + CondBrTarget::Label(l) => Some(l), + _ => None, + } + } + + pub(crate) fn is_fallthrouh(&self) -> bool { + self == &CondBrTarget::Fallthrough + } +} + +impl Display for CondBrTarget { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + match self { + CondBrTarget::Label(l) => write!(f, "{}", l.to_string()), + CondBrTarget::Fallthrough => write!(f, "0"), + } + } +} + +pub(crate) fn enc_auipc(rd: Writable, imm: Imm20) -> u32 { + let x = 0b0010111 | reg_to_gpr_num(rd.to_reg()) << 7 | imm.bits() << 12; + x +} + +pub(crate) fn enc_jalr(rd: Writable, base: Reg, offset: Imm12) -> u32 { + let x = 0b1100111 + | reg_to_gpr_num(rd.to_reg()) << 7 + | 0b000 << 12 + | reg_to_gpr_num(base) << 15 + | offset.bits() << 20; + x +} + +/// rd and src must have the same length. +pub(crate) fn gen_moves(rd: &[Writable], src: &[Reg]) -> SmallInstVec { + assert!(rd.len() == src.len()); + assert!(rd.len() > 0); + let mut insts = SmallInstVec::new(); + for (dst, src) in rd.iter().zip(src.iter()) { + let ty = Inst::canonical_type_for_rc(dst.to_reg().class()); + insts.push(Inst::gen_move(*dst, *src, ty)); + } + insts +} + +impl Inst { + /// RISC-V can have multiple instruction sizes. 2 bytes for compressed + /// instructions, 4 for regular instructions, 6 and 8 byte instructions + /// are also being considered. 
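+    ///
+    /// Only the uncompressed (4-byte) size is used below, e.g. when computing the
+    /// jump over an inline 8-byte constant.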
+ const UNCOMPRESSED_INSTRUCTION_SIZE: i32 = 4; + + #[inline] + pub(crate) fn load_imm12(rd: Writable, imm: Imm12) -> Inst { + Inst::AluRRImm12 { alu_op: AluOPRRI::Addi, rd, rs: zero_reg(), imm12: imm } + } + + /// Immediates can be loaded using lui and addi instructions. + fn load_const_imm(rd: Writable, value: u64) -> Option> { + Inst::generate_imm(value).map(|(imm20, imm12)| { + let mut insts = SmallVec::new(); + + let imm20_is_zero = imm20.as_i32() == 0; + let imm12_is_zero = imm12.as_i16() == 0; + + let rs = if !imm20_is_zero { + insts.push(Inst::Lui { rd, imm: imm20 }); + rd.to_reg() + } else { + zero_reg() + }; + + // We also need to emit the addi if the value is 0, otherwise we just + // won't produce any instructions. + if !imm12_is_zero || (imm20_is_zero && imm12_is_zero) { + insts.push(Inst::AluRRImm12 { alu_op: AluOPRRI::Addi, rd, rs, imm12 }) + } + + insts + }) + } + + pub(crate) fn load_constant_u32(rd: Writable, value: u64) -> SmallInstVec { + let insts = Inst::load_const_imm(rd, value); + insts.unwrap_or_else(|| smallvec![Inst::LoadInlineConst { rd, ty: I32, imm: value }]) + } + + pub fn load_constant_u64(rd: Writable, value: u64) -> SmallInstVec { + let insts = Inst::load_const_imm(rd, value); + insts.unwrap_or_else(|| smallvec![Inst::LoadInlineConst { rd, ty: I64, imm: value }]) + } + + pub(crate) fn construct_auipc_and_jalr( + link: Option>, + tmp: Writable, + offset: i64, + ) -> [Inst; 2] { + Inst::generate_imm(offset as u64) + .map(|(imm20, imm12)| { + let a = Inst::Auipc { rd: tmp, imm: imm20 }; + let b = Inst::Jalr { + rd: link.unwrap_or(writable_zero_reg()), + base: tmp.to_reg(), + offset: imm12, + }; + [a, b] + }) + .expect("code range is too big.") + } + + /// Generic constructor for a load (zero-extending where appropriate). + pub fn gen_load(into_reg: Writable, mem: AMode, ty: Type, flags: MemFlags) -> Inst { + if ty.is_vector() { + Inst::VecLoad { + eew: VecElementWidth::from_type(ty), + to: into_reg, + from: VecAMode::UnitStride { base: mem }, + flags, + mask: VecOpMasking::Disabled, + vstate: VState::from_type(ty), + } + } else { + Inst::Load { rd: into_reg, op: LoadOP::from_type(ty), from: mem, flags } + } + } + + /// Generic constructor for a store. + pub fn gen_store(mem: AMode, from_reg: Reg, ty: Type, flags: MemFlags) -> Inst { + if ty.is_vector() { + Inst::VecStore { + eew: VecElementWidth::from_type(ty), + to: VecAMode::UnitStride { base: mem }, + from: from_reg, + flags, + mask: VecOpMasking::Disabled, + vstate: VState::from_type(ty), + } + } else { + Inst::Store { src: from_reg, op: StoreOP::from_type(ty), to: mem, flags } + } + } +} + +//============================================================================= + +fn vec_mask_operands(mask: &mut VecOpMasking, collector: &mut impl OperandVisitor) { + match mask { + VecOpMasking::Enabled { reg } => { + collector.reg_fixed_use(reg, pv_reg(0).into()); + } + VecOpMasking::Disabled => {} + } +} +fn vec_mask_late_operands(mask: &mut VecOpMasking, collector: &mut impl OperandVisitor) { + match mask { + VecOpMasking::Enabled { reg } => { + collector.reg_fixed_late_use(reg, pv_reg(0).into()); + } + VecOpMasking::Disabled => {} + } +} + +fn riscv64_get_operands(inst: &mut Inst, collector: &mut impl OperandVisitor) { + match inst { + Inst::Nop0 | Inst::Nop4 => {} + Inst::BrTable { index, tmp1, tmp2, .. } => { + collector.reg_use(index); + collector.reg_early_def(tmp1); + collector.reg_early_def(tmp2); + } + Inst::Auipc { rd, .. } => collector.reg_def(rd), + Inst::Lui { rd, .. 
} => collector.reg_def(rd), + Inst::Fli { rd, .. } => collector.reg_def(rd), + Inst::LoadInlineConst { rd, .. } => collector.reg_def(rd), + Inst::AluRRR { rd, rs1, rs2, .. } => { + collector.reg_use(rs1); + collector.reg_use(rs2); + collector.reg_def(rd); + } + Inst::FpuRRR { rd, rs1, rs2, .. } => { + collector.reg_use(rs1); + collector.reg_use(rs2); + collector.reg_def(rd); + } + Inst::AluRRImm12 { rd, rs, .. } => { + collector.reg_use(rs); + collector.reg_def(rd); + } + Inst::CsrReg { rd, rs, .. } => { + collector.reg_use(rs); + collector.reg_def(rd); + } + Inst::CsrImm { rd, .. } => { + collector.reg_def(rd); + } + Inst::Load { rd, from, .. } => { + from.get_operands(collector); + collector.reg_def(rd); + } + Inst::Store { to, src, .. } => { + to.get_operands(collector); + collector.reg_use(src); + } + + Inst::Args { args } => { + for ArgPair { vreg, preg } in args { + collector.reg_fixed_def(vreg, *preg); + } + } + Inst::Rets { rets } => { + for RetPair { vreg, preg } in rets { + collector.reg_fixed_use(vreg, *preg); + } + } + Inst::Ret { .. } => {} + + Inst::Extend { rd, rn, .. } => { + collector.reg_use(rn); + collector.reg_def(rd); + } + Inst::Call { info, .. } => { + let CallInfo { uses, defs, .. } = &mut **info; + for CallArgPair { vreg, preg } in uses { + collector.reg_fixed_use(vreg, *preg); + } + for CallRetPair { vreg, preg } in defs { + collector.reg_fixed_def(vreg, *preg); + } + collector.reg_clobbers(info.clobbers); + } + Inst::CallInd { info } => { + let CallInfo { dest, uses, defs, .. } = &mut **info; + collector.reg_use(dest); + for CallArgPair { vreg, preg } in uses { + collector.reg_fixed_use(vreg, *preg); + } + for CallRetPair { vreg, preg } in defs { + collector.reg_fixed_def(vreg, *preg); + } + collector.reg_clobbers(info.clobbers); + } + Inst::ReturnCall { info } => { + for CallArgPair { vreg, preg } in &mut info.uses { + collector.reg_fixed_use(vreg, *preg); + } + } + Inst::ReturnCallInd { info } => { + // TODO(https://github.com/bytecodealliance/regalloc2/issues/145): + // This shouldn't be a fixed register constraint. + collector.reg_fixed_use(&mut info.dest, x_reg(5)); + + for CallArgPair { vreg, preg } in &mut info.uses { + collector.reg_fixed_use(vreg, *preg); + } + } + Inst::Jal { .. } => { + // JAL technically has a rd register, but we currently always + // hardcode it to x0. + } + Inst::CondBr { kind: IntegerCompare { rs1, rs2, .. }, .. } => { + collector.reg_use(rs1); + collector.reg_use(rs2); + } + Inst::LoadExtName { rd, .. } => { + collector.reg_def(rd); + } + Inst::ElfTlsGetAddr { rd, .. } => { + // x10 is a0 which is both the first argument and the first return value. + collector.reg_fixed_def(rd, a0()); + let mut clobbers = Riscv64MachineDeps::get_regs_clobbered_by_call(CallConv::SystemV); + clobbers.remove(px_reg(10)); + collector.reg_clobbers(clobbers); + } + Inst::LoadAddr { rd, mem } => { + mem.get_operands(collector); + collector.reg_early_def(rd); + } + + Inst::Mov { rd, rm, .. } => { + collector.reg_use(rm); + collector.reg_def(rd); + } + Inst::MovFromPReg { rd, rm } => { + debug_assert!([px_reg(2), px_reg(8)].contains(rm)); + collector.reg_def(rd); + } + Inst::Fence { .. } => {} + Inst::EBreak => {} + Inst::Udf { .. } => {} + Inst::FpuRR { rd, rs, .. } => { + collector.reg_use(rs); + collector.reg_def(rd); + } + Inst::FpuRRRR { rd, rs1, rs2, rs3, .. } => { + collector.reg_use(rs1); + collector.reg_use(rs2); + collector.reg_use(rs3); + collector.reg_def(rd); + } + + Inst::Jalr { rd, base, .. 
} => { + collector.reg_use(base); + collector.reg_def(rd); + } + Inst::Atomic { rd, addr, src, .. } => { + collector.reg_use(addr); + collector.reg_use(src); + collector.reg_def(rd); + } + Inst::Select { dst, condition: IntegerCompare { rs1, rs2, .. }, x, y, .. } => { + // Mark the condition registers as late use so that they don't overlap with the destination + // register. We may potentially write to the destination register before evaluating the + // condition. + collector.reg_late_use(rs1); + collector.reg_late_use(rs2); + + for reg in x.regs_mut() { + collector.reg_use(reg); + } + for reg in y.regs_mut() { + collector.reg_use(reg); + } + + // If there's more than one destination register then use + // `reg_early_def` to prevent destination registers from overlapping + // with any operands. This ensures that the lowering doesn't have to + // deal with a situation such as when the input registers need to be + // swapped when moved to the destination. + // + // When there's only one destination register though don't use an + // early def because once the register is written no other inputs + // are read so it's ok for the destination to overlap the sources. + // The condition registers are already marked as late use so they + // won't overlap with the destination. + match dst.regs_mut() { + [reg] => collector.reg_def(reg), + regs => { + for d in regs { + collector.reg_early_def(d); + } + } + } + } + Inst::AtomicCas { offset, t0, dst, e, addr, v, .. } => { + collector.reg_use(offset); + collector.reg_use(e); + collector.reg_use(addr); + collector.reg_use(v); + collector.reg_early_def(t0); + collector.reg_early_def(dst); + } + + Inst::RawData { .. } => {} + Inst::AtomicStore { src, p, .. } => { + collector.reg_use(src); + collector.reg_use(p); + } + Inst::AtomicLoad { rd, p, .. } => { + collector.reg_use(p); + collector.reg_def(rd); + } + Inst::AtomicRmwLoop { offset, dst, p, x, t0, .. } => { + collector.reg_use(offset); + collector.reg_use(p); + collector.reg_use(x); + collector.reg_early_def(t0); + collector.reg_early_def(dst); + } + Inst::TrapIf { rs1, rs2, .. } => { + collector.reg_use(rs1); + collector.reg_use(rs2); + } + Inst::Unwind { .. } => {} + Inst::DummyUse { reg } => { + collector.reg_use(reg); + } + Inst::Popcnt { sum, step, rs, tmp, .. } => { + collector.reg_use(rs); + collector.reg_early_def(tmp); + collector.reg_early_def(step); + collector.reg_early_def(sum); + } + Inst::Cltz { sum, step, tmp, rs, .. } => { + collector.reg_use(rs); + collector.reg_early_def(tmp); + collector.reg_early_def(step); + collector.reg_early_def(sum); + } + Inst::Brev8 { rs, rd, step, tmp, tmp2, .. } => { + collector.reg_use(rs); + collector.reg_early_def(step); + collector.reg_early_def(tmp); + collector.reg_early_def(tmp2); + collector.reg_early_def(rd); + } + Inst::StackProbeLoop { .. } => { + // StackProbeLoop has a tmp register and StackProbeLoop used at gen_prologue. + // t3 will do the job. (t3 is caller-save register and not used directly by compiler like writable_spilltmp_reg) + // gen_prologue is called at emit stage. + // no need let reg alloc know. + } + Inst::VecAluRRRR { op, vd, vd_src, vs1, vs2, mask, .. } => { + debug_assert_eq!(vd_src.class(), RegClass::Vector); + debug_assert_eq!(vd.to_reg().class(), RegClass::Vector); + debug_assert_eq!(vs2.class(), RegClass::Vector); + debug_assert_eq!(vs1.class(), op.vs1_regclass()); + + collector.reg_late_use(vs1); + collector.reg_late_use(vs2); + collector.reg_use(vd_src); + collector.reg_reuse_def(vd, 2); // `vd` == `vd_src`. 
+ vec_mask_late_operands(mask, collector); + } + Inst::VecAluRRRImm5 { op, vd, vd_src, vs2, mask, .. } => { + debug_assert_eq!(vd_src.class(), RegClass::Vector); + debug_assert_eq!(vd.to_reg().class(), RegClass::Vector); + debug_assert_eq!(vs2.class(), RegClass::Vector); + + // If the operation forbids source/destination overlap we need to + // ensure that the source and destination registers are different. + if op.forbids_overlaps(mask) { + collector.reg_late_use(vs2); + collector.reg_use(vd_src); + collector.reg_reuse_def(vd, 1); // `vd` == `vd_src`. + vec_mask_late_operands(mask, collector); + } else { + collector.reg_use(vs2); + collector.reg_use(vd_src); + collector.reg_reuse_def(vd, 1); // `vd` == `vd_src`. + vec_mask_operands(mask, collector); + } + } + Inst::VecAluRRR { op, vd, vs1, vs2, mask, .. } => { + debug_assert_eq!(vd.to_reg().class(), RegClass::Vector); + debug_assert_eq!(vs2.class(), RegClass::Vector); + debug_assert_eq!(vs1.class(), op.vs1_regclass()); + + collector.reg_use(vs1); + collector.reg_use(vs2); + + // If the operation forbids source/destination overlap, then we must + // register it as an early_def. This encodes the constraint that + // these must not overlap. + if op.forbids_overlaps(mask) { + collector.reg_early_def(vd); + } else { + collector.reg_def(vd); + } + + vec_mask_operands(mask, collector); + } + Inst::VecAluRRImm5 { op, vd, vs2, mask, .. } => { + debug_assert_eq!(vd.to_reg().class(), RegClass::Vector); + debug_assert_eq!(vs2.class(), RegClass::Vector); + + collector.reg_use(vs2); + + // If the operation forbids source/destination overlap, then we must + // register it as an early_def. This encodes the constraint that + // these must not overlap. + if op.forbids_overlaps(mask) { + collector.reg_early_def(vd); + } else { + collector.reg_def(vd); + } + + vec_mask_operands(mask, collector); + } + Inst::VecAluRR { op, vd, vs, mask, .. } => { + debug_assert_eq!(vd.to_reg().class(), op.dst_regclass()); + debug_assert_eq!(vs.class(), op.src_regclass()); + + collector.reg_use(vs); + + // If the operation forbids source/destination overlap, then we must + // register it as an early_def. This encodes the constraint that + // these must not overlap. + if op.forbids_overlaps(mask) { + collector.reg_early_def(vd); + } else { + collector.reg_def(vd); + } + + vec_mask_operands(mask, collector); + } + Inst::VecAluRImm5 { op, vd, mask, .. } => { + debug_assert_eq!(vd.to_reg().class(), RegClass::Vector); + debug_assert!(!op.forbids_overlaps(mask)); + + collector.reg_def(vd); + vec_mask_operands(mask, collector); + } + Inst::VecSetState { rd, .. } => { + collector.reg_def(rd); + } + Inst::VecLoad { to, from, mask, .. } => { + from.get_operands(collector); + collector.reg_def(to); + vec_mask_operands(mask, collector); + } + Inst::VecStore { to, from, mask, .. } => { + to.get_operands(collector); + collector.reg_use(from); + vec_mask_operands(mask, collector); + } + } +} + +impl MachInst for Inst { + type ABIMachineSpec = Riscv64MachineDeps; + type LabelUse = LabelUse; + + // https://github.com/riscv/riscv-isa-manual/issues/850 + // all zero will cause invalid opcode. + const TRAP_OPCODE: &'static [u8] = &[0; 4]; + + fn gen_dummy_use(reg: Reg) -> Self { + Inst::DummyUse { reg } + } + + fn canonical_type_for_rc(rc: RegClass) -> Type { + match rc { + regalloc2::RegClass::Int => I64, + regalloc2::RegClass::Float => F64, + regalloc2::RegClass::Vector => I8X16, + } + } + + fn is_safepoint(&self) -> bool { + match self { + Inst::Call { .. } | Inst::CallInd { .. 
} => true, + _ => false, + } + } + + fn get_operands(&mut self, collector: &mut impl OperandVisitor) { + riscv64_get_operands(self, collector); + } + + fn is_move(&self) -> Option<(Writable, Reg)> { + match self { + Inst::Mov { rd, rm, .. } => Some((*rd, *rm)), + _ => None, + } + } + + fn is_included_in_clobbers(&self) -> bool { + match self { + &Inst::Args { .. } => false, + _ => true, + } + } + + fn is_trap(&self) -> bool { + match self { + Self::Udf { .. } => true, + _ => false, + } + } + + fn is_args(&self) -> bool { + match self { + Self::Args { .. } => true, + _ => false, + } + } + + fn is_term(&self) -> MachTerminator { + match self { + &Inst::Jal { .. } => MachTerminator::Uncond, + &Inst::CondBr { .. } => MachTerminator::Cond, + &Inst::Jalr { .. } => MachTerminator::Uncond, + &Inst::Rets { .. } => MachTerminator::Ret, + &Inst::BrTable { .. } => MachTerminator::Indirect, + &Inst::ReturnCall { .. } | &Inst::ReturnCallInd { .. } => MachTerminator::RetCall, + _ => MachTerminator::None, + } + } + + fn is_mem_access(&self) -> bool { + panic!("TODO FILL ME OUT") + } + + fn gen_move(to_reg: Writable, from_reg: Reg, ty: Type) -> Inst { + let x = Inst::Mov { rd: to_reg, rm: from_reg, ty }; + x + } + + fn gen_nop(preferred_size: usize) -> Inst { + if preferred_size == 0 { + return Inst::Nop0; + } + // We can't give a NOP (or any insn) < 4 bytes. + assert!(preferred_size >= 4); + Inst::Nop4 + } + + fn rc_for_type(ty: Type) -> CodegenResult<(&'static [RegClass], &'static [Type])> { + match ty { + I8 => Ok((&[RegClass::Int], &[I8])), + I16 => Ok((&[RegClass::Int], &[I16])), + I32 => Ok((&[RegClass::Int], &[I32])), + I64 => Ok((&[RegClass::Int], &[I64])), + F16 => Ok((&[RegClass::Float], &[F16])), + F32 => Ok((&[RegClass::Float], &[F32])), + F64 => Ok((&[RegClass::Float], &[F64])), + I128 => Ok((&[RegClass::Int, RegClass::Int], &[I64, I64])), + _ if ty.is_vector() => { + debug_assert!(ty.bits() <= 512); + + // Here we only need to return a SIMD type with the same size as `ty`. + // We use these types for spills and reloads, so prefer types with lanes <= 31 + // since that fits in the immediate field of `vsetivli`. + const SIMD_TYPES: [[Type; 1]; 6] = [ + [types::I8X2], + [types::I8X4], + [types::I8X8], + [types::I8X16], + [types::I16X16], + [types::I32X16], + ]; + let idx = (ty.bytes().ilog2() - 1) as usize; + let ty = &SIMD_TYPES[idx][..]; + + Ok((&[RegClass::Vector], ty)) + } + _ => Err(CodegenError::Unsupported(format!("Unexpected SSA-value type: {ty}"))), + } + } + + fn gen_jump(target: MachLabel) -> Inst { + Inst::Jal { label: target } + } + + fn worst_case_size() -> CodeOffset { + // Our worst case size is determined by the riscv64_worst_case_instruction_size test + 84 + } + + fn ref_type_regclass(_settings: &settings::Flags) -> RegClass { + RegClass::Int + } + + fn function_alignment() -> FunctionAlignment { + FunctionAlignment { minimum: 2, preferred: 4 } + } +} + +//============================================================================= +// Pretty-printing of instructions. 
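+// Registers are printed with their RISC-V ABI names (e.g. x10 as "a0", f10 as
+// "fa0", vector registers as "v0".."v31"); virtual registers fall back to their
+// debug formatting.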
+pub fn reg_name(reg: Reg) -> String { + match reg.to_real_reg() { + Some(real) => match real.class() { + RegClass::Int => match real.hw_enc() { + 0 => "zero".into(), + 1 => "ra".into(), + 2 => "sp".into(), + 3 => "gp".into(), + 4 => "tp".into(), + 5..=7 => format!("t{}", real.hw_enc() - 5), + 8 => "fp".into(), + 9 => "s1".into(), + 10..=17 => format!("a{}", real.hw_enc() - 10), + 18..=27 => format!("s{}", real.hw_enc() - 16), + 28..=31 => format!("t{}", real.hw_enc() - 25), + _ => unreachable!(), + }, + RegClass::Float => match real.hw_enc() { + 0..=7 => format!("ft{}", real.hw_enc() - 0), + 8..=9 => format!("fs{}", real.hw_enc() - 8), + 10..=17 => format!("fa{}", real.hw_enc() - 10), + 18..=27 => format!("fs{}", real.hw_enc() - 16), + 28..=31 => format!("ft{}", real.hw_enc() - 20), + _ => unreachable!(), + }, + RegClass::Vector => format!("v{}", real.hw_enc()), + }, + None => { + format!("{reg:?}") + } + } +} + +impl Inst { + fn print_with_state(&self, _state: &mut EmitState) -> String { + let format_reg = |reg: Reg| -> String { reg_name(reg) }; + + let format_vec_amode = |amode: &VecAMode| -> String { + match amode { + VecAMode::UnitStride { base } => base.to_string(), + } + }; + + let format_mask = |mask: &VecOpMasking| -> String { + match mask { + VecOpMasking::Enabled { reg } => format!(",{}.t", format_reg(*reg)), + VecOpMasking::Disabled => format!(""), + } + }; + + let format_regs = |regs: &[Reg]| -> String { + let mut x = if regs.len() > 1 { String::from("[") } else { String::default() }; + regs.iter().for_each(|i| { + x.push_str(format_reg(*i).as_str()); + if *i != *regs.last().unwrap() { + x.push_str(","); + } + }); + if regs.len() > 1 { + x.push_str("]"); + } + x + }; + let format_labels = |labels: &[MachLabel]| -> String { + if labels.len() == 0 { + return String::from("[_]"); + } + let mut x = String::from("["); + labels.iter().for_each(|l| { + x.push_str( + format!("{:?}{}", l, if l != labels.last().unwrap() { "," } else { "" },) + .as_str(), + ); + }); + x.push_str("]"); + x + }; + + fn format_frm(rounding_mode: FRM) -> String { + format!(",{}", rounding_mode.to_static_str()) + } + + match self { + &Inst::Nop0 => { + format!("##zero length nop") + } + &Inst::Nop4 => { + format!("##fixed 4-size nop") + } + &Inst::StackProbeLoop { guard_size, probe_count, tmp } => { + let tmp = format_reg(tmp.to_reg()); + format!( + "inline_stack_probe##guard_size={guard_size} probe_count={probe_count} tmp={tmp}" + ) + } + &Inst::AtomicStore { src, ty, p } => { + let src = format_reg(src); + let p = format_reg(p); + format!("atomic_store.{ty} {src},({p})") + } + &Inst::DummyUse { reg } => { + let reg = format_reg(reg); + format!("dummy_use {reg}") + } + + &Inst::AtomicLoad { rd, ty, p } => { + let p = format_reg(p); + let rd = format_reg(rd.to_reg()); + format!("atomic_load.{ty} {rd},({p})") + } + &Inst::AtomicRmwLoop { offset, op, dst, ty, p, x, t0 } => { + let offset = format_reg(offset); + let p = format_reg(p); + let x = format_reg(x); + let t0 = format_reg(t0.to_reg()); + let dst = format_reg(dst.to_reg()); + format!("atomic_rmw.{ty} {op} {dst},{x},({p})##t0={t0} offset={offset}") + } + + &Inst::RawData { ref data } => match data.len() { + 4 => { + let mut bytes = [0; 4]; + for i in 0..bytes.len() { + bytes[i] = data[i]; + } + format!(".4byte 0x{:x}", u32::from_le_bytes(bytes)) + } + 8 => { + let mut bytes = [0; 8]; + for i in 0..bytes.len() { + bytes[i] = data[i]; + } + format!(".8byte 0x{:x}", u64::from_le_bytes(bytes)) + } + _ => { + format!(".data {data:?}") + } + }, + &Inst::Unwind 
{ ref inst } => { + format!("unwind {inst:?}") + } + &Inst::Brev8 { rs, ty, step, tmp, tmp2, rd } => { + let rs = format_reg(rs); + let step = format_reg(step.to_reg()); + let tmp = format_reg(tmp.to_reg()); + let tmp2 = format_reg(tmp2.to_reg()); + let rd = format_reg(rd.to_reg()); + format!("brev8 {rd},{rs}##tmp={tmp} tmp2={tmp2} step={step} ty={ty}") + } + &Inst::Popcnt { sum, step, rs, tmp, ty } => { + let rs = format_reg(rs); + let tmp = format_reg(tmp.to_reg()); + let step = format_reg(step.to_reg()); + let sum = format_reg(sum.to_reg()); + format!("popcnt {sum},{rs}##ty={ty} tmp={tmp} step={step}") + } + &Inst::Cltz { sum, step, rs, tmp, ty, leading } => { + let rs = format_reg(rs); + let tmp = format_reg(tmp.to_reg()); + let step = format_reg(step.to_reg()); + let sum = format_reg(sum.to_reg()); + format!( + "{} {},{}##ty={} tmp={} step={}", + if leading { "clz" } else { "ctz" }, + sum, + rs, + ty, + tmp, + step + ) + } + &Inst::AtomicCas { offset, t0, dst, e, addr, v, ty } => { + let offset = format_reg(offset); + let e = format_reg(e); + let addr = format_reg(addr); + let v = format_reg(v); + let t0 = format_reg(t0.to_reg()); + let dst = format_reg(dst.to_reg()); + format!("atomic_cas.{ty} {dst},{e},{v},({addr})##t0={t0} offset={offset}",) + } + &Inst::BrTable { index, tmp1, tmp2, ref targets } => { + format!( + "{} {},{}##tmp1={},tmp2={}", + "br_table", + format_reg(index), + format_labels(&targets[..]), + format_reg(tmp1.to_reg()), + format_reg(tmp2.to_reg()), + ) + } + &Inst::Auipc { rd, imm } => { + format!("{} {},{}", "auipc", format_reg(rd.to_reg()), imm.as_i32(),) + } + &Inst::Jalr { rd, base, offset } => { + let base = format_reg(base); + let rd = format_reg(rd.to_reg()); + format!("{} {},{}({})", "jalr", rd, offset.as_i16(), base) + } + &Inst::Lui { rd, ref imm } => { + format!("{} {},{}", "lui", format_reg(rd.to_reg()), imm.as_i32()) + } + &Inst::Fli { rd, ty, imm } => { + let rd_s = format_reg(rd.to_reg()); + let imm_s = imm.format(); + let suffix = match ty { + F32 => "s", + F64 => "d", + _ => unreachable!(), + }; + + format!("fli.{suffix} {rd_s},{imm_s}") + } + &Inst::LoadInlineConst { rd, imm, .. 
} => { + let rd = format_reg(rd.to_reg()); + let mut buf = String::new(); + write!(&mut buf, "auipc {rd},0; ").unwrap(); + write!(&mut buf, "ld {rd},12({rd}); ").unwrap(); + write!(&mut buf, "j {}; ", Inst::UNCOMPRESSED_INSTRUCTION_SIZE + 8).unwrap(); + write!(&mut buf, ".8byte 0x{imm:x}").unwrap(); + buf + } + &Inst::AluRRR { alu_op, rd, rs1, rs2 } => { + let rs1_s = format_reg(rs1); + let rs2_s = format_reg(rs2); + let rd_s = format_reg(rd.to_reg()); + match alu_op { + AluOPRRR::Adduw if rs2 == zero_reg() => { + format!("zext.w {rd_s},{rs1_s}") + } + _ => { + format!("{} {},{},{}", alu_op.op_name(), rd_s, rs1_s, rs2_s) + } + } + } + &Inst::FpuRR { alu_op, width, frm, rd, rs } => { + let rs = format_reg(rs); + let rd = format_reg(rd.to_reg()); + let frm = if alu_op.has_frm() { format_frm(frm) } else { String::new() }; + format!("{} {rd},{rs}{frm}", alu_op.op_name(width)) + } + &Inst::FpuRRR { alu_op, width, rd, rs1, rs2, frm } => { + let rs1 = format_reg(rs1); + let rs2 = format_reg(rs2); + let rd = format_reg(rd.to_reg()); + let frm = if alu_op.has_frm() { format_frm(frm) } else { String::new() }; + + let rs1_is_rs2 = rs1 == rs2; + match alu_op { + FpuOPRRR::Fsgnj if rs1_is_rs2 => format!("fmv.{width} {rd},{rs1}"), + FpuOPRRR::Fsgnjn if rs1_is_rs2 => format!("fneg.{width} {rd},{rs1}"), + FpuOPRRR::Fsgnjx if rs1_is_rs2 => format!("fabs.{width} {rd},{rs1}"), + _ => format!("{} {rd},{rs1},{rs2}{frm}", alu_op.op_name(width)), + } + } + &Inst::FpuRRRR { alu_op, rd, rs1, rs2, rs3, frm, width } => { + let rs1 = format_reg(rs1); + let rs2 = format_reg(rs2); + let rs3 = format_reg(rs3); + let rd = format_reg(rd.to_reg()); + let frm = format_frm(frm); + let op_name = alu_op.op_name(width); + format!("{op_name} {rd},{rs1},{rs2},{rs3}{frm}") + } + &Inst::AluRRImm12 { alu_op, rd, rs, ref imm12 } => { + let rs_s = format_reg(rs); + let rd = format_reg(rd.to_reg()); + + // Some of these special cases are better known as + // their pseudo-instruction version, so prefer printing those. 
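+                // For example (these are just the standard assembler
+                // pseudo-instructions, nothing specific to this backend):
+                // `addi a0,zero,42` prints as `li a0,42`, and
+                // `xori a0,a1,-1` prints as `not a0,a1`.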
+ match (alu_op, rs, imm12) { + (AluOPRRI::Addi, rs, _) if rs == zero_reg() => { + return format!("li {},{}", rd, imm12.as_i16()); + } + (AluOPRRI::Addiw, _, imm12) if imm12.as_i16() == 0 => { + return format!("sext.w {rd},{rs_s}"); + } + (AluOPRRI::Xori, _, imm12) if imm12.as_i16() == -1 => { + return format!("not {rd},{rs_s}"); + } + (AluOPRRI::SltiU, _, imm12) if imm12.as_i16() == 1 => { + return format!("seqz {rd},{rs_s}"); + } + (alu_op, _, _) if alu_op.option_funct12().is_some() => { + format!("{} {},{}", alu_op.op_name(), rd, rs_s) + } + (alu_op, _, imm12) => { + format!("{} {},{},{}", alu_op.op_name(), rd, rs_s, imm12.as_i16()) + } + } + } + &Inst::CsrReg { op, rd, rs, csr } => { + let rs_s = format_reg(rs); + let rd_s = format_reg(rd.to_reg()); + + match (op, csr, rd) { + (CsrRegOP::CsrRW, CSR::Frm, rd) if rd.to_reg() == zero_reg() => { + format!("fsrm {rs_s}") + } + _ => { + format!("{op} {rd_s},{csr},{rs_s}") + } + } + } + &Inst::CsrImm { op, rd, csr, imm } => { + let rd_s = format_reg(rd.to_reg()); + + match (op, csr, rd) { + (CsrImmOP::CsrRWI, CSR::Frm, rd) if rd.to_reg() != zero_reg() => { + format!("fsrmi {rd_s},{imm}") + } + _ => { + format!("{op} {rd_s},{csr},{imm}") + } + } + } + &Inst::Load { rd, op, from, flags: _flags } => { + let base = from.to_string(); + let rd = format_reg(rd.to_reg()); + format!("{} {},{}", op.op_name(), rd, base,) + } + &Inst::Store { to, src, op, flags: _flags } => { + let base = to.to_string(); + let src = format_reg(src); + format!("{} {},{}", op.op_name(), src, base,) + } + &Inst::Args { ref args } => { + let mut s = "args".to_string(); + for arg in args { + let preg = format_reg(arg.preg); + let def = format_reg(arg.vreg.to_reg()); + write!(&mut s, " {def}={preg}").unwrap(); + } + s + } + &Inst::Rets { ref rets } => { + let mut s = "rets".to_string(); + for ret in rets { + let preg = format_reg(ret.preg); + let vreg = format_reg(ret.vreg); + write!(&mut s, " {vreg}={preg}").unwrap(); + } + s + } + &Inst::Ret {} => "ret".to_string(), + + &MInst::Extend { rd, rn, signed, from_bits, .. } => { + let rn = format_reg(rn); + let rd = format_reg(rd.to_reg()); + return if signed == false && from_bits == 8 { + format!("andi {rd},{rn}") + } else { + let op = if signed { "srai" } else { "srli" }; + let shift_bits = (64 - from_bits) as i16; + format!("slli {rd},{rn},{shift_bits}; {op} {rd},{rd},{shift_bits}") + }; + } + &MInst::Call { ref info } => format!("call {}", info.dest.display(None)), + &MInst::CallInd { ref info } => { + let rd = format_reg(info.dest); + format!("callind {rd}") + } + &MInst::ReturnCall { ref info } => { + let mut s = format!( + "return_call {:?} new_stack_arg_size:{}", + info.dest, info.new_stack_arg_size + ); + for ret in &info.uses { + let preg = format_reg(ret.preg); + let vreg = format_reg(ret.vreg); + write!(&mut s, " {vreg}={preg}").unwrap(); + } + s + } + &MInst::ReturnCallInd { ref info } => { + let callee = format_reg(info.dest); + let mut s = format!( + "return_call_ind {callee} new_stack_arg_size:{}", + info.new_stack_arg_size + ); + for ret in &info.uses { + let preg = format_reg(ret.preg); + let vreg = format_reg(ret.vreg); + write!(&mut s, " {vreg}={preg}").unwrap(); + } + s + } + &MInst::TrapIf { rs1, rs2, cc, trap_code } => { + let rs1 = format_reg(rs1); + let rs2 = format_reg(rs2); + format!("trap_if {trap_code}##({rs1} {cc} {rs2})") + } + &MInst::Jal { label } => { + format!("j {}", label.to_string()) + } + &MInst::CondBr { taken, not_taken, kind, .. 
} => { + let rs1 = format_reg(kind.rs1); + let rs2 = format_reg(kind.rs2); + if not_taken.is_fallthrouh() && taken.as_label().is_none() { + format!("{} {},{},0", kind.op_name(), rs1, rs2) + } else { + let x = format!( + "{} {},{},taken({}),not_taken({})", + kind.op_name(), + rs1, + rs2, + taken, + not_taken + ); + x + } + } + &MInst::Atomic { op, rd, addr, src, amo } => { + let op_name = op.op_name(amo); + let addr = format_reg(addr); + let src = format_reg(src); + let rd = format_reg(rd.to_reg()); + if op.is_load() { + format!("{op_name} {rd},({addr})") + } else { + format!("{op_name} {rd},{src},({addr})") + } + } + &MInst::LoadExtName { rd, ref name, offset } => { + let rd = format_reg(rd.to_reg()); + format!("load_sym {},{}{:+}", rd, name.display(None), offset) + } + &Inst::ElfTlsGetAddr { rd, ref name } => { + let rd = format_reg(rd.to_reg()); + format!("elf_tls_get_addr {rd},{}", name.display(None)) + } + &MInst::LoadAddr { ref rd, ref mem } => { + let rs = mem.to_string(); + let rd = format_reg(rd.to_reg()); + format!("load_addr {rd},{rs}") + } + &MInst::Mov { rd, rm, ty } => { + let rm = format_reg(rm); + let rd = format_reg(rd.to_reg()); + + let op = match ty { + F16 => "fmv.h", + F32 => "fmv.s", + F64 => "fmv.d", + ty if ty.is_vector() => "vmv1r.v", + _ => "mv", + }; + + format!("{op} {rd},{rm}") + } + &MInst::MovFromPReg { rd, rm } => { + let rd = format_reg(rd.to_reg()); + debug_assert!([px_reg(2), px_reg(8)].contains(&rm)); + let rm = reg_name(Reg::from(rm)); + format!("mv {rd},{rm}") + } + &MInst::Fence { pred, succ } => { + format!( + "fence {},{}", + Inst::fence_req_to_string(pred), + Inst::fence_req_to_string(succ), + ) + } + &MInst::Select { ref dst, condition, ref x, ref y } => { + let c_rs1 = format_reg(condition.rs1); + let c_rs2 = format_reg(condition.rs2); + let x = format_regs(x.regs()); + let y = format_regs(y.regs()); + let dst = dst.map(|r| r.to_reg()); + let dst = format_regs(dst.regs()); + format!( + "select {},{},{}##condition=({} {} {})", + dst, + x, + y, + c_rs1, + condition.kind.to_static_str(), + c_rs2 + ) + } + &MInst::Udf { trap_code } => format!("udf##trap_code={trap_code}"), + &MInst::EBreak {} => String::from("ebreak"), + &Inst::VecAluRRRR { op, vd, vd_src, vs1, vs2, ref mask, ref vstate } => { + let vs1_s = format_reg(vs1); + let vs2_s = format_reg(vs2); + let vd_src_s = format_reg(vd_src); + let vd_s = format_reg(vd.to_reg()); + let mask = format_mask(mask); + + let vd_fmt = if vd_s != vd_src_s { format!("{vd_s},{vd_src_s}") } else { vd_s }; + + // Note: vs2 and vs1 here are opposite to the standard scalar ordering. + // This is noted in Section 10.1 of the RISC-V Vector spec. + format!("{op} {vd_fmt},{vs2_s},{vs1_s}{mask} {vstate}") + } + &Inst::VecAluRRRImm5 { op, vd, imm, vs2, ref mask, ref vstate, .. } => { + let vs2_s = format_reg(vs2); + let vd_s = format_reg(vd.to_reg()); + let mask = format_mask(mask); + + // Some opcodes interpret the immediate as unsigned, lets show the + // correct number here. + let imm_s = + if op.imm_is_unsigned() { format!("{}", imm.bits()) } else { format!("{imm}") }; + + format!("{op} {vd_s},{vs2_s},{imm_s}{mask} {vstate}") + } + &Inst::VecAluRRR { op, vd, vs1, vs2, ref mask, ref vstate } => { + let vs1_s = format_reg(vs1); + let vs2_s = format_reg(vs2); + let vd_s = format_reg(vd.to_reg()); + let mask = format_mask(mask); + + // Note: vs2 and vs1 here are opposite to the standard scalar ordering. + // This is noted in Section 10.1 of the RISC-V Vector spec. 
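+                // For example, `vsub.vv v1,v2,v3` computes v2 - v3, i.e. vs1 is
+                // subtracted from vs2, so printing `{vd},{vs2},{vs1}` matches the
+                // assembler operand order.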
+ match (op, vs2, vs1) { + (VecAluOpRRR::VrsubVX, _, vs1) if vs1 == zero_reg() => { + format!("vneg.v {vd_s},{vs2_s}{mask} {vstate}") + } + (VecAluOpRRR::VfsgnjnVV, vs2, vs1) if vs2 == vs1 => { + format!("vfneg.v {vd_s},{vs2_s}{mask} {vstate}") + } + (VecAluOpRRR::VfsgnjxVV, vs2, vs1) if vs2 == vs1 => { + format!("vfabs.v {vd_s},{vs2_s}{mask} {vstate}") + } + (VecAluOpRRR::VmnandMM, vs2, vs1) if vs2 == vs1 => { + format!("vmnot.m {vd_s},{vs2_s}{mask} {vstate}") + } + _ => format!("{op} {vd_s},{vs2_s},{vs1_s}{mask} {vstate}"), + } + } + &Inst::VecAluRRImm5 { op, vd, imm, vs2, ref mask, ref vstate } => { + let vs2_s = format_reg(vs2); + let vd_s = format_reg(vd.to_reg()); + let mask = format_mask(mask); + + // Some opcodes interpret the immediate as unsigned, lets show the + // correct number here. + let imm_s = + if op.imm_is_unsigned() { format!("{}", imm.bits()) } else { format!("{imm}") }; + + match (op, imm) { + (VecAluOpRRImm5::VxorVI, imm) if imm == Imm5::maybe_from_i8(-1).unwrap() => { + format!("vnot.v {vd_s},{vs2_s}{mask} {vstate}") + } + _ => format!("{op} {vd_s},{vs2_s},{imm_s}{mask} {vstate}"), + } + } + &Inst::VecAluRR { op, vd, vs, ref mask, ref vstate } => { + let vs_s = format_reg(vs); + let vd_s = format_reg(vd.to_reg()); + let mask = format_mask(mask); + + format!("{op} {vd_s},{vs_s}{mask} {vstate}") + } + &Inst::VecAluRImm5 { op, vd, imm, ref mask, ref vstate } => { + let vd_s = format_reg(vd.to_reg()); + let mask = format_mask(mask); + + format!("{op} {vd_s},{imm}{mask} {vstate}") + } + &Inst::VecSetState { rd, ref vstate } => { + let rd_s = format_reg(rd.to_reg()); + assert!(vstate.avl.is_static()); + format!("vsetivli {}, {}, {}", rd_s, vstate.avl, vstate.vtype) + } + Inst::VecLoad { eew, to, from, ref mask, ref vstate, .. } => { + let base = format_vec_amode(from); + let vd = format_reg(to.to_reg()); + let mask = format_mask(mask); + + format!("vl{eew}.v {vd},{base}{mask} {vstate}") + } + Inst::VecStore { eew, to, from, ref mask, ref vstate, .. } => { + let dst = format_vec_amode(to); + let vs3 = format_reg(*from); + let mask = format_mask(mask); + + format!("vs{eew}.v {vs3},{dst}{mask} {vstate}") + } + } + } +} + +/// Different forms of label references for different instruction formats. +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum LabelUse { + /// 20-bit branch offset (unconditional branches). PC-rel, offset is + /// imm << 1. Immediate is 20 signed bits. Use in Jal instructions. + Jal20, + + /// The unconditional jump instructions all use PC-relative + /// addressing to help support position independent code. The JALR + /// instruction was defined to enable a two-instruction sequence to + /// jump anywhere in a 32-bit absolute address range. A LUI + /// instruction can first load rs1 with the upper 20 bits of a + /// target address, then JALR can add in the lower bits. Similarly, + /// AUIPC then JALR can jump anywhere in a 32-bit pc-relative + /// address range. + PCRel32, + + /// All branch instructions use the B-type instruction format. The + /// 12-bit B-immediate encodes signed offsets in multiples of 2, and + /// is added to the current pc to give the target address. The + /// conditional branch range is ±4 KiB. + B12, + + /// Equivalent to the `R_RISCV_PCREL_HI20` relocation, Allows setting + /// the immediate field of an `auipc` instruction. + PCRelHi20, + + /// Similar to the `R_RISCV_PCREL_LO12_I` relocation but pointing to + /// the final address, instead of the `PCREL_HI20` label. 
Allows setting + /// the immediate field of I Type instructions such as `addi` or `lw`. + /// + /// Since we currently don't support offsets in labels, this relocation has + /// an implicit offset of 4. + PCRelLo12I, + + /// 11-bit PC-relative jump offset. Equivalent to the `RVC_JUMP` relocation + RVCJump, +} + +impl MachInstLabelUse for LabelUse { + /// Alignment for veneer code. Every Riscv64 instruction must be + /// 4-byte-aligned. + const ALIGN: CodeOffset = 4; + + /// Maximum PC-relative range (positive), inclusive. + fn max_pos_range(self) -> CodeOffset { + match self { + LabelUse::Jal20 => ((1 << 19) - 1) * 2, + LabelUse::PCRelLo12I | LabelUse::PCRelHi20 | LabelUse::PCRel32 => { + Inst::imm_max() as CodeOffset + } + LabelUse::B12 => ((1 << 11) - 1) * 2, + LabelUse::RVCJump => ((1 << 10) - 1) * 2, + } + } + + /// Maximum PC-relative range (negative). + fn max_neg_range(self) -> CodeOffset { + match self { + LabelUse::PCRel32 => Inst::imm_min().abs() as CodeOffset, + _ => self.max_pos_range() + 2, + } + } + + /// Size of window into code needed to do the patch. + fn patch_size(self) -> CodeOffset { + match self { + LabelUse::RVCJump => 2, + LabelUse::Jal20 | LabelUse::B12 | LabelUse::PCRelHi20 | LabelUse::PCRelLo12I => 4, + LabelUse::PCRel32 => 8, + } + } + + /// Perform the patch. + fn patch(self, buffer: &mut [u8], use_offset: CodeOffset, label_offset: CodeOffset) { + assert!(use_offset % 2 == 0); + assert!(label_offset % 2 == 0); + let offset = (label_offset as i64) - (use_offset as i64); + + // re-check range + assert!( + offset >= -(self.max_neg_range() as i64) && offset <= (self.max_pos_range() as i64), + "{self:?} offset '{offset}' use_offset:'{use_offset}' label_offset:'{label_offset}' must not exceed max range.", + ); + self.patch_raw_offset(buffer, offset); + } + + /// Is a veneer supported for this label reference type? + fn supports_veneer(self) -> bool { + match self { + Self::Jal20 | Self::B12 | Self::RVCJump => true, + _ => false, + } + } + + /// How large is the veneer, if supported? + fn veneer_size(self) -> CodeOffset { + match self { + Self::B12 | Self::Jal20 | Self::RVCJump => 8, + _ => unreachable!(), + } + } + + fn worst_case_veneer_size() -> CodeOffset { + 8 + } + + /// Generate a veneer into the buffer, given that this veneer is at `veneer_offset`, and return + /// an offset and label-use for the veneer's use of the original label. 
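+    /// A rough sketch of the 8-byte veneer emitted below (using the spill
+    /// temporary x31 that `writable_spilltmp_reg()` returns):
+    ///
+    /// ```text
+    /// auipc x31, 0      ; patched later through the returned PCRel32 label use
+    /// jalr  x0, 0(x31)
+    /// ```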
+ fn generate_veneer( + self, + buffer: &mut [u8], + veneer_offset: CodeOffset, + ) -> (CodeOffset, LabelUse) { + let base = writable_spilltmp_reg(); + { + let x = enc_auipc(base, Imm20::ZERO).to_le_bytes(); + buffer[0] = x[0]; + buffer[1] = x[1]; + buffer[2] = x[2]; + buffer[3] = x[3]; + } + { + let x = enc_jalr(writable_zero_reg(), base.to_reg(), Imm12::ZERO).to_le_bytes(); + buffer[4] = x[0]; + buffer[5] = x[1]; + buffer[6] = x[2]; + buffer[7] = x[3]; + } + (veneer_offset, Self::PCRel32) + } + + fn from_reloc(reloc: Reloc, addend: Addend) -> Option { + match (reloc, addend) { + (Reloc::RiscvCallPlt, _) => Some(Self::PCRel32), + _ => None, + } + } +} + +impl LabelUse { + #[allow(dead_code)] // in case it's needed in the future + fn offset_in_range(self, offset: i64) -> bool { + let min = -(self.max_neg_range() as i64); + let max = self.max_pos_range() as i64; + offset >= min && offset <= max + } + + fn patch_raw_offset(self, buffer: &mut [u8], offset: i64) { + let insn = match self { + LabelUse::RVCJump => u16::from_le_bytes(buffer[..2].try_into().unwrap()) as u32, + _ => u32::from_le_bytes(buffer[..4].try_into().unwrap()), + }; + + match self { + LabelUse::Jal20 => { + let offset = offset as u32; + let v = ((offset >> 12 & 0b1111_1111) << 12) + | ((offset >> 11 & 0b1) << 20) + | ((offset >> 1 & 0b11_1111_1111) << 21) + | ((offset >> 20 & 0b1) << 31); + buffer[0..4].clone_from_slice(&u32::to_le_bytes(insn | v)); + } + LabelUse::PCRel32 => { + let insn2 = u32::from_le_bytes([buffer[4], buffer[5], buffer[6], buffer[7]]); + Inst::generate_imm(offset as u64) + .map(|(imm20, imm12)| { + // Encode the OR-ed-in value with zero_reg(). The + // register parameter must be in the original + // encoded instruction and or'ing in zeroes does not + // change it. + buffer[0..4].clone_from_slice(&u32::to_le_bytes( + insn | enc_auipc(writable_zero_reg(), imm20), + )); + buffer[4..8].clone_from_slice(&u32::to_le_bytes( + insn2 | enc_jalr(writable_zero_reg(), zero_reg(), imm12), + )); + }) + // expect make sure we handled. + .expect("we have check the range before,this is a compiler error."); + } + + LabelUse::B12 => { + let offset = offset as u32; + let v = ((offset >> 11 & 0b1) << 7) + | ((offset >> 1 & 0b1111) << 8) + | ((offset >> 5 & 0b11_1111) << 25) + | ((offset >> 12 & 0b1) << 31); + buffer[0..4].clone_from_slice(&u32::to_le_bytes(insn | v)); + } + + LabelUse::PCRelHi20 => { + // See https://github.com/riscv-non-isa/riscv-elf-psabi-doc/blob/master/riscv-elf.adoc#pc-relative-symbol-addresses + // + // We need to add 0x800 to ensure that we land at the next page as soon as it goes out of range for the + // Lo12 relocation. That relocation is signed and has a maximum range of -2048..2047. So when we get an + // offset of 2048, we need to land at the next page and subtract instead. + let offset = offset as u32; + let hi20 = offset.wrapping_add(0x800) >> 12; + let insn = (insn & 0xFFF) | (hi20 << 12); + buffer[0..4].clone_from_slice(&u32::to_le_bytes(insn)); + } + + LabelUse::PCRelLo12I => { + // `offset` is the offset from the current instruction to the target address. + // + // However we are trying to compute the offset to the target address from the previous instruction. + // The previous instruction should be the one that contains the PCRelHi20 relocation and + // stores/references the program counter (`auipc` usually). 
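+                // (In the code we emit, that `auipc` is always the instruction
+                // immediately before this one, i.e. 4 bytes earlier, which is the
+                // implicit offset of 4 mentioned on `PCRelLo12I` above.)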
+ // + // Since we are trying to compute the offset from the previous instruction, we can + // represent it as offset = target_address - (current_instruction_address - 4) + // which is equivalent to offset = target_address - current_instruction_address + 4. + // + // Thus we need to add 4 to the offset here. + let lo12 = (offset + 4) as u32 & 0xFFF; + let insn = (insn & 0xFFFFF) | (lo12 << 20); + buffer[0..4].clone_from_slice(&u32::to_le_bytes(insn)); + } + LabelUse::RVCJump => { + debug_assert!(offset & 1 == 0); + + // We currently only support this for the C.J operation, so assert that is the opcode in + // the buffer. + debug_assert_eq!(insn & 0xFFFF, 0xA001); + + buffer[0..2].clone_from_slice(&u16::to_le_bytes(encode_cj_type( + CjOp::CJ, + Imm12::from_i16(i16::try_from(offset).unwrap()), + ))); + } + } + } +} + +#[cfg(test)] +mod test { + use super::*; + #[test] + fn label_use_max_range() { + assert!(LabelUse::B12.max_neg_range() == LabelUse::B12.max_pos_range() + 2); + assert!(LabelUse::Jal20.max_neg_range() == LabelUse::Jal20.max_pos_range() + 2); + assert!(LabelUse::PCRel32.max_pos_range() == (Inst::imm_max() as CodeOffset)); + assert!(LabelUse::PCRel32.max_neg_range() == (Inst::imm_min().abs() as CodeOffset)); + assert!(LabelUse::B12.max_pos_range() == ((1 << 11) - 1) * 2); + } +} diff --git a/hbcb/src/inst/regs.rs b/hbcb/src/inst/regs.rs new file mode 100644 index 0000000..ffdc484 --- /dev/null +++ b/hbcb/src/inst/regs.rs @@ -0,0 +1,168 @@ +//! Riscv64 ISA definitions: registers. +//! + +use crate::machinst::{Reg, Writable}; + +use alloc::vec; +use alloc::vec::Vec; + +use regalloc2::{PReg, RegClass, VReg}; + +// first argument of function call +#[inline] +pub fn a0() -> Reg { + x_reg(10) +} + +// second argument of function call +#[inline] +#[allow(dead_code)] +pub fn a1() -> Reg { + x_reg(11) +} + +// third argument of function call +#[inline] +#[allow(dead_code)] +pub fn a2() -> Reg { + x_reg(12) +} + +#[inline] +#[allow(dead_code)] +pub fn writable_a0() -> Writable { + Writable::from_reg(a0()) +} +#[inline] +#[allow(dead_code)] +pub fn writable_a1() -> Writable { + Writable::from_reg(a1()) +} +#[inline] +#[allow(dead_code)] +pub fn writable_a2() -> Writable { + Writable::from_reg(a2()) +} + +#[inline] +#[allow(dead_code)] +pub fn fa0() -> Reg { + f_reg(10) +} +#[inline] +#[allow(dead_code)] +pub fn writable_fa0() -> Writable { + Writable::from_reg(fa0()) +} +#[inline] +#[allow(dead_code)] +pub fn writable_fa1() -> Writable { + Writable::from_reg(fa1()) +} +#[inline] +pub fn fa1() -> Reg { + f_reg(11) +} + +/// Get a reference to the zero-register. +#[inline] +pub fn zero_reg() -> Reg { + x_reg(0) +} + +/// Get a writable reference to the zero-register (this discards a result). +#[inline] +pub fn writable_zero_reg() -> Writable { + Writable::from_reg(zero_reg()) +} +#[inline] +pub fn stack_reg() -> Reg { + x_reg(2) +} + +/// Get a writable reference to the stack-pointer register. +#[inline] +pub fn writable_stack_reg() -> Writable { + Writable::from_reg(stack_reg()) +} + +/// Get a reference to the link register (x1). +pub fn link_reg() -> Reg { + x_reg(1) +} + +/// Get a writable reference to the link register. +#[inline] +pub fn writable_link_reg() -> Writable { + Writable::from_reg(link_reg()) +} + +/// Get a reference to the frame pointer (x8). +#[inline] +pub fn fp_reg() -> Reg { + x_reg(8) +} + +/// Get a writable reference to the frame pointer. 
+#[inline] +pub fn writable_fp_reg() -> Writable { + Writable::from_reg(fp_reg()) +} + +/// Get a reference to the first temporary, sometimes "spill temporary", +/// register. This register is used in various ways as a temporary. +#[inline] +pub fn spilltmp_reg() -> Reg { + x_reg(31) +} + +/// Get a writable reference to the spilltmp reg. +#[inline] +pub fn writable_spilltmp_reg() -> Writable { + Writable::from_reg(spilltmp_reg()) +} + +///spilltmp2 +#[inline] +pub fn spilltmp_reg2() -> Reg { + x_reg(30) +} + +/// Get a writable reference to the spilltmp2 reg. +#[inline] +pub fn writable_spilltmp_reg2() -> Writable { + Writable::from_reg(spilltmp_reg2()) +} + +#[inline] +pub fn x_reg(enc: usize) -> Reg { + let p_reg = PReg::new(enc, RegClass::Int); + let v_reg = VReg::new(p_reg.index(), p_reg.class()); + Reg::from(v_reg) +} +pub const fn px_reg(enc: usize) -> PReg { + PReg::new(enc, RegClass::Int) +} + +#[inline] +pub fn f_reg(enc: usize) -> Reg { + let p_reg = PReg::new(enc, RegClass::Float); + let v_reg = VReg::new(p_reg.index(), p_reg.class()); + Reg::from(v_reg) +} +pub const fn pf_reg(enc: usize) -> PReg { + PReg::new(enc, RegClass::Float) +} + +#[allow(dead_code)] +pub(crate) fn x_reg_range(start: usize, end: usize) -> Vec> { + let mut regs = vec![]; + for i in start..=end { + regs.push(Writable::from_reg(x_reg(i))); + } + regs +} + +pub const fn pv_reg(enc: usize) -> PReg { + PReg::new(enc, RegClass::Vector) +} diff --git a/hbcb/src/inst/unwind.rs b/hbcb/src/inst/unwind.rs new file mode 100644 index 0000000..1e2bb90 --- /dev/null +++ b/hbcb/src/inst/unwind.rs @@ -0,0 +1,2 @@ +#[cfg(feature = "unwind")] +pub(crate) mod systemv; diff --git a/hbcb/src/inst/unwind/systemv.rs b/hbcb/src/inst/unwind/systemv.rs new file mode 100644 index 0000000..6cf2445 --- /dev/null +++ b/hbcb/src/inst/unwind/systemv.rs @@ -0,0 +1,170 @@ +//! Unwind information for System V ABI (Riscv64). + +use crate::inst::regs; +use crate::isa::unwind::systemv::RegisterMappingError; +use crate::machinst::Reg; +use gimli::{write::CommonInformationEntry, Encoding, Format, Register}; +use regalloc2::RegClass; + +/// Creates a new riscv64 common information entry (CIE). +pub fn create_cie() -> CommonInformationEntry { + use gimli::write::CallFrameInstruction; + + let mut entry = CommonInformationEntry::new( + Encoding { + address_size: 8, + format: Format::Dwarf32, + version: 1, + }, + 2, // Code alignment factor + -8, // Data alignment factor + Register(regs::link_reg().to_real_reg().unwrap().hw_enc() as u16), + ); + + // Every frame will start with the call frame address (CFA) at SP + let sp = Register(regs::stack_reg().to_real_reg().unwrap().hw_enc().into()); + entry.add_instruction(CallFrameInstruction::Cfa(sp, 0)); + + entry +} + +/// Map Cranelift registers to their corresponding Gimli registers. 
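+///
+/// For example, `fp` (x8) maps to DWARF register 8 and `fa0` (f10) maps to
+/// DWARF register 42: the RISC-V psABI numbers the integer registers 0-31 and
+/// the FP registers 32-63, which is what the class offsets below implement.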
+pub fn map_reg(reg: Reg) -> Result { + let reg_offset = match reg.class() { + RegClass::Int => 0, + RegClass::Float => 32, + RegClass::Vector => 64, + }; + + let reg = reg.to_real_reg().unwrap().hw_enc() as u16; + Ok(Register(reg_offset + reg)) +} + +pub(crate) struct RegisterMapper; + +impl crate::isa::unwind::systemv::RegisterMapper for RegisterMapper { + fn map(&self, reg: Reg) -> Result { + Ok(map_reg(reg)?.0) + } + fn fp(&self) -> Option { + Some(regs::fp_reg().to_real_reg().unwrap().hw_enc() as u16) + } + fn lr(&self) -> Option { + Some(regs::link_reg().to_real_reg().unwrap().hw_enc() as u16) + } + fn lr_offset(&self) -> Option { + Some(8) + } +} + +#[cfg(test)] +mod tests { + use crate::cursor::{Cursor, FuncCursor}; + + use crate::ir::{ + types, AbiParam, Function, InstBuilder, Signature, StackSlotData, StackSlotKind, + UserFuncName, + }; + use crate::isa::{lookup, CallConv}; + use crate::settings::{builder, Flags}; + use crate::Context; + use gimli::write::Address; + use target_lexicon::triple; + + #[test] + fn test_simple_func() { + let isa = lookup(triple!("riscv64")) + .expect("expect riscv64 ISA") + .finish(Flags::new(builder())) + .expect("Creating compiler backend"); + + let mut context = Context::for_function(create_function( + CallConv::SystemV, + Some(StackSlotData::new(StackSlotKind::ExplicitSlot, 64, 0)), + )); + + let code = context + .compile(&*isa, &mut Default::default()) + .expect("expected compilation"); + + let fde = match code + .create_unwind_info(isa.as_ref()) + .expect("can create unwind info") + { + Some(crate::isa::unwind::UnwindInfo::SystemV(info)) => { + info.to_fde(Address::Constant(1234)) + } + _ => panic!("expected unwind information"), + }; + + assert_eq!(format!("{fde:?}"), "FrameDescriptionEntry { address: Constant(1234), length: 40, lsda: None, instructions: [(12, CfaOffset(16)), (12, Offset(Register(8), -16)), (12, Offset(Register(1), -8)), (16, CfaRegister(Register(8)))] }"); + } + + fn create_function(call_conv: CallConv, stack_slot: Option) -> Function { + let mut func = + Function::with_name_signature(UserFuncName::user(0, 0), Signature::new(call_conv)); + + let block0 = func.dfg.make_block(); + let mut pos = FuncCursor::new(&mut func); + pos.insert_block(block0); + pos.ins().return_(&[]); + + if let Some(stack_slot) = stack_slot { + func.sized_stack_slots.push(stack_slot); + } + + func + } + + #[test] + fn test_multi_return_func() { + let isa = lookup(triple!("riscv64")) + .expect("expect riscv64 ISA") + .finish(Flags::new(builder())) + .expect("Creating compiler backend"); + + let mut context = Context::for_function(create_multi_return_function(CallConv::SystemV)); + + let code = context + .compile(&*isa, &mut Default::default()) + .expect("expected compilation"); + + let fde = match code + .create_unwind_info(isa.as_ref()) + .expect("can create unwind info") + { + Some(crate::isa::unwind::UnwindInfo::SystemV(info)) => { + info.to_fde(Address::Constant(4321)) + } + _ => panic!("expected unwind information"), + }; + + assert_eq!( + format!("{fde:?}"), + "FrameDescriptionEntry { address: Constant(4321), length: 16, lsda: None, instructions: [] }" + ); + } + + fn create_multi_return_function(call_conv: CallConv) -> Function { + let mut sig = Signature::new(call_conv); + sig.params.push(AbiParam::new(types::I32)); + let mut func = Function::with_name_signature(UserFuncName::user(0, 0), sig); + + let block0 = func.dfg.make_block(); + let v0 = func.dfg.append_block_param(block0, types::I32); + let block1 = func.dfg.make_block(); + let block2 = 
func.dfg.make_block(); + + let mut pos = FuncCursor::new(&mut func); + pos.insert_block(block0); + pos.ins().brif(v0, block2, &[], block1, &[]); + + pos.insert_block(block1); + pos.ins().return_(&[]); + + pos.insert_block(block2); + pos.ins().return_(&[]); + + func + } +} diff --git a/hbcb/src/inst/vector.rs b/hbcb/src/inst/vector.rs new file mode 100644 index 0000000..356c747 --- /dev/null +++ b/hbcb/src/inst/vector.rs @@ -0,0 +1,1150 @@ +use crate::lower::isle::generated_code::VecAluOpRRRR; +use crate::lower::isle::generated_code::{ + VecAMode, VecAluOpRImm5, VecAluOpRR, VecAluOpRRImm5, VecAluOpRRR, VecAluOpRRRImm5, VecAvl, + VecElementWidth, VecLmul, VecMaskMode, VecOpCategory, VecOpMasking, VecTailMode, +}; +use crate::machinst::{OperandVisitor, RegClass}; +use crate::Reg; +use core::fmt; + +use super::{Type, UImm5}; + +impl VecAvl { + pub fn _static(size: u32) -> Self { + VecAvl::Static { + size: UImm5::maybe_from_u8(size as u8).expect("Invalid size for AVL"), + } + } + + pub fn is_static(&self) -> bool { + match self { + VecAvl::Static { .. } => true, + } + } + + pub fn unwrap_static(&self) -> UImm5 { + match self { + VecAvl::Static { size } => *size, + } + } +} + +// TODO: Can we tell ISLE to derive this? +impl Copy for VecAvl {} + +// TODO: Can we tell ISLE to derive this? +impl PartialEq for VecAvl { + fn eq(&self, other: &Self) -> bool { + match (self, other) { + (VecAvl::Static { size: lhs }, VecAvl::Static { size: rhs }) => lhs == rhs, + } + } +} + +impl fmt::Display for VecAvl { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + VecAvl::Static { size } => write!(f, "{size}"), + } + } +} + +impl VecElementWidth { + pub fn from_type(ty: Type) -> Self { + Self::from_bits(ty.lane_bits()) + } + + pub fn from_bits(bits: u32) -> Self { + match bits { + 8 => VecElementWidth::E8, + 16 => VecElementWidth::E16, + 32 => VecElementWidth::E32, + 64 => VecElementWidth::E64, + _ => panic!("Invalid number of bits for VecElementWidth: {bits}"), + } + } + + pub fn bits(&self) -> u32 { + match self { + VecElementWidth::E8 => 8, + VecElementWidth::E16 => 16, + VecElementWidth::E32 => 32, + VecElementWidth::E64 => 64, + } + } + + pub fn encode(&self) -> u32 { + match self { + VecElementWidth::E8 => 0b000, + VecElementWidth::E16 => 0b001, + VecElementWidth::E32 => 0b010, + VecElementWidth::E64 => 0b011, + } + } +} + +impl fmt::Display for VecElementWidth { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "e{}", self.bits()) + } +} + +impl VecLmul { + pub fn encode(&self) -> u32 { + match self { + VecLmul::LmulF8 => 0b101, + VecLmul::LmulF4 => 0b110, + VecLmul::LmulF2 => 0b111, + VecLmul::Lmul1 => 0b000, + VecLmul::Lmul2 => 0b001, + VecLmul::Lmul4 => 0b010, + VecLmul::Lmul8 => 0b011, + } + } +} + +impl fmt::Display for VecLmul { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + VecLmul::LmulF8 => write!(f, "mf8"), + VecLmul::LmulF4 => write!(f, "mf4"), + VecLmul::LmulF2 => write!(f, "mf2"), + VecLmul::Lmul1 => write!(f, "m1"), + VecLmul::Lmul2 => write!(f, "m2"), + VecLmul::Lmul4 => write!(f, "m4"), + VecLmul::Lmul8 => write!(f, "m8"), + } + } +} + +impl VecTailMode { + pub fn encode(&self) -> u32 { + match self { + VecTailMode::Agnostic => 1, + VecTailMode::Undisturbed => 0, + } + } +} + +impl fmt::Display for VecTailMode { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + VecTailMode::Agnostic => write!(f, "ta"), + VecTailMode::Undisturbed => write!(f, "tu"), + } + } +} + +impl VecMaskMode { + pub fn 
encode(&self) -> u32 { + match self { + VecMaskMode::Agnostic => 1, + VecMaskMode::Undisturbed => 0, + } + } +} + +impl fmt::Display for VecMaskMode { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + VecMaskMode::Agnostic => write!(f, "ma"), + VecMaskMode::Undisturbed => write!(f, "mu"), + } + } +} + +/// Vector Type (VType) +/// +/// vtype provides the default type used to interpret the contents of the vector register file. +#[derive(Clone, Copy, Debug, PartialEq)] +pub struct VType { + pub sew: VecElementWidth, + pub lmul: VecLmul, + pub tail_mode: VecTailMode, + pub mask_mode: VecMaskMode, +} + +impl VType { + // https://github.com/riscv/riscv-v-spec/blob/master/vtype-format.adoc + pub fn encode(&self) -> u32 { + let mut bits = 0; + bits |= self.lmul.encode(); + bits |= self.sew.encode() << 3; + bits |= self.tail_mode.encode() << 6; + bits |= self.mask_mode.encode() << 7; + bits + } +} + +impl fmt::Display for VType { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!( + f, + "{}, {}, {}, {}", + self.sew, self.lmul, self.tail_mode, self.mask_mode + ) + } +} + +/// Vector State (VState) +/// +/// VState represents the state of the vector unit that each instruction expects before execution. +/// Unlike VType or any of the other types here, VState is not a part of the RISC-V ISA. It is +/// used by our instruction emission code to ensure that the vector unit is in the correct state. +#[derive(Clone, Copy, Debug, PartialEq)] +pub struct VState { + pub avl: VecAvl, + pub vtype: VType, +} + +impl VState { + pub fn from_type(ty: Type) -> Self { + VState { + avl: VecAvl::_static(ty.lane_count()), + vtype: VType { + sew: VecElementWidth::from_type(ty), + lmul: VecLmul::Lmul1, + tail_mode: VecTailMode::Agnostic, + mask_mode: VecMaskMode::Agnostic, + }, + } + } +} + +impl fmt::Display for VState { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "#avl={}, #vtype=({})", self.avl, self.vtype) + } +} + +impl VecOpCategory { + pub fn encode(&self) -> u32 { + // See: https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc#101-vector-arithmetic-instruction-encoding + match self { + VecOpCategory::OPIVV => 0b000, + VecOpCategory::OPFVV => 0b001, + VecOpCategory::OPMVV => 0b010, + VecOpCategory::OPIVI => 0b011, + VecOpCategory::OPIVX => 0b100, + VecOpCategory::OPFVF => 0b101, + VecOpCategory::OPMVX => 0b110, + VecOpCategory::OPCFG => 0b111, + } + } +} + +impl Copy for VecOpMasking {} +impl VecOpMasking { + pub fn is_enabled(&self) -> bool { + match self { + VecOpMasking::Enabled { .. } => true, + VecOpMasking::Disabled => false, + } + } + + pub fn encode(&self) -> u32 { + match self { + VecOpMasking::Enabled { .. 
} => 0, + VecOpMasking::Disabled => 1, + } + } +} + +impl VecAluOpRRRR { + pub fn opcode(&self) -> u32 { + // Vector Opcode + 0x57 + } + pub fn funct3(&self) -> u32 { + self.category().encode() + } + + pub fn funct6(&self) -> u32 { + // See: https://github.com/riscv/riscv-v-spec/blob/master/inst-table.adoc + match self { + VecAluOpRRRR::VmaccVV | VecAluOpRRRR::VmaccVX => 0b101101, + VecAluOpRRRR::VnmsacVV | VecAluOpRRRR::VnmsacVX => 0b101111, + VecAluOpRRRR::VfmaccVV | VecAluOpRRRR::VfmaccVF => 0b101100, + VecAluOpRRRR::VfnmaccVV | VecAluOpRRRR::VfnmaccVF => 0b101101, + VecAluOpRRRR::VfmsacVV | VecAluOpRRRR::VfmsacVF => 0b101110, + VecAluOpRRRR::VfnmsacVV | VecAluOpRRRR::VfnmsacVF => 0b101111, + VecAluOpRRRR::Vslide1upVX => 0b001110, + } + } + + pub fn category(&self) -> VecOpCategory { + match self { + VecAluOpRRRR::VmaccVV | VecAluOpRRRR::VnmsacVV => VecOpCategory::OPMVV, + VecAluOpRRRR::VmaccVX | VecAluOpRRRR::VnmsacVX | VecAluOpRRRR::Vslide1upVX => { + VecOpCategory::OPMVX + } + VecAluOpRRRR::VfmaccVV + | VecAluOpRRRR::VfnmaccVV + | VecAluOpRRRR::VfmsacVV + | VecAluOpRRRR::VfnmsacVV => VecOpCategory::OPFVV, + VecAluOpRRRR::VfmaccVF + | VecAluOpRRRR::VfnmaccVF + | VecAluOpRRRR::VfmsacVF + | VecAluOpRRRR::VfnmsacVF => VecOpCategory::OPFVF, + } + } + + // vs1 is the only variable source, vs2 is fixed. + pub fn vs1_regclass(&self) -> RegClass { + match self.category() { + VecOpCategory::OPMVV | VecOpCategory::OPFVV => RegClass::Vector, + VecOpCategory::OPMVX => RegClass::Int, + VecOpCategory::OPFVF => RegClass::Float, + _ => unreachable!(), + } + } +} + +impl VecInstOverlapInfo for VecAluOpRRRR { + fn forbids_src_dst_overlaps(&self) -> bool { + match self { + VecAluOpRRRR::Vslide1upVX => true, + _ => false, + } + } +} + +impl fmt::Display for VecAluOpRRRR { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + let mut s = format!("{self:?}"); + s.make_ascii_lowercase(); + let (opcode, category) = s.split_at(s.len() - 2); + f.write_str(&format!("{opcode}.{category}")) + } +} + +impl VecAluOpRRRImm5 { + pub fn opcode(&self) -> u32 { + // Vector Opcode + 0x57 + } + pub fn funct3(&self) -> u32 { + self.category().encode() + } + + pub fn funct6(&self) -> u32 { + // See: https://github.com/riscv/riscv-v-spec/blob/master/inst-table.adoc + match self { + VecAluOpRRRImm5::VslideupVI => 0b001110, + } + } + + pub fn category(&self) -> VecOpCategory { + match self { + VecAluOpRRRImm5::VslideupVI => VecOpCategory::OPIVI, + } + } + + pub fn imm_is_unsigned(&self) -> bool { + match self { + VecAluOpRRRImm5::VslideupVI => true, + } + } +} + +impl VecInstOverlapInfo for VecAluOpRRRImm5 { + fn forbids_src_dst_overlaps(&self) -> bool { + match self { + VecAluOpRRRImm5::VslideupVI => true, + } + } +} + +impl fmt::Display for VecAluOpRRRImm5 { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + let mut s = format!("{self:?}"); + s.make_ascii_lowercase(); + let (opcode, category) = s.split_at(s.len() - 2); + f.write_str(&format!("{opcode}.{category}")) + } +} + +impl VecAluOpRRR { + pub fn opcode(&self) -> u32 { + // Vector Opcode + 0x57 + } + pub fn funct3(&self) -> u32 { + self.category().encode() + } + pub fn funct6(&self) -> u32 { + // See: https://github.com/riscv/riscv-v-spec/blob/master/inst-table.adoc + match self { + VecAluOpRRR::VaddVV + | VecAluOpRRR::VaddVX + | VecAluOpRRR::VfaddVV + | VecAluOpRRR::VfaddVF => 0b000000, + VecAluOpRRR::VsubVV + | VecAluOpRRR::VsubVX + | VecAluOpRRR::VfsubVV + | VecAluOpRRR::VfsubVF => 0b000010, + VecAluOpRRR::VrsubVX => 0b000011, + VecAluOpRRR::VmulVV | 
VecAluOpRRR::VmulVX => 0b100101, + VecAluOpRRR::VmulhVV | VecAluOpRRR::VmulhVX => 0b100111, + VecAluOpRRR::VmulhuVV + | VecAluOpRRR::VmulhuVX + | VecAluOpRRR::VfmulVV + | VecAluOpRRR::VfmulVF => 0b100100, + VecAluOpRRR::VsmulVV | VecAluOpRRR::VsmulVX => 0b100111, + VecAluOpRRR::VsllVV | VecAluOpRRR::VsllVX => 0b100101, + VecAluOpRRR::VsrlVV | VecAluOpRRR::VsrlVX => 0b101000, + VecAluOpRRR::VsraVV | VecAluOpRRR::VsraVX => 0b101001, + VecAluOpRRR::VandVV | VecAluOpRRR::VandVX => 0b001001, + VecAluOpRRR::VorVV | VecAluOpRRR::VorVX => 0b001010, + VecAluOpRRR::VxorVV | VecAluOpRRR::VxorVX => 0b001011, + VecAluOpRRR::VminuVV | VecAluOpRRR::VminuVX | VecAluOpRRR::VredminuVS => 0b000100, + VecAluOpRRR::VminVV | VecAluOpRRR::VminVX => 0b000101, + VecAluOpRRR::VmaxuVV | VecAluOpRRR::VmaxuVX | VecAluOpRRR::VredmaxuVS => 0b000110, + VecAluOpRRR::VmaxVV | VecAluOpRRR::VmaxVX => 0b000111, + VecAluOpRRR::VslidedownVX => 0b001111, + VecAluOpRRR::VfrsubVF => 0b100111, + VecAluOpRRR::VmergeVVM + | VecAluOpRRR::VmergeVXM + | VecAluOpRRR::VfmergeVFM + | VecAluOpRRR::VcompressVM => 0b010111, + VecAluOpRRR::VfdivVV + | VecAluOpRRR::VfdivVF + | VecAluOpRRR::VsadduVV + | VecAluOpRRR::VsadduVX => 0b100000, + VecAluOpRRR::VfrdivVF | VecAluOpRRR::VsaddVV | VecAluOpRRR::VsaddVX => 0b100001, + VecAluOpRRR::VfminVV => 0b000100, + VecAluOpRRR::VfmaxVV => 0b000110, + VecAluOpRRR::VssubuVV | VecAluOpRRR::VssubuVX => 0b100010, + VecAluOpRRR::VssubVV | VecAluOpRRR::VssubVX => 0b100011, + VecAluOpRRR::VfsgnjVV | VecAluOpRRR::VfsgnjVF => 0b001000, + VecAluOpRRR::VfsgnjnVV => 0b001001, + VecAluOpRRR::VfsgnjxVV => 0b001010, + VecAluOpRRR::VrgatherVV | VecAluOpRRR::VrgatherVX => 0b001100, + VecAluOpRRR::VwadduVV | VecAluOpRRR::VwadduVX => 0b110000, + VecAluOpRRR::VwaddVV | VecAluOpRRR::VwaddVX => 0b110001, + VecAluOpRRR::VwsubuVV | VecAluOpRRR::VwsubuVX => 0b110010, + VecAluOpRRR::VwsubVV | VecAluOpRRR::VwsubVX => 0b110011, + VecAluOpRRR::VwadduWV | VecAluOpRRR::VwadduWX => 0b110100, + VecAluOpRRR::VwaddWV | VecAluOpRRR::VwaddWX => 0b110101, + VecAluOpRRR::VwsubuWV | VecAluOpRRR::VwsubuWX => 0b110110, + VecAluOpRRR::VwsubWV | VecAluOpRRR::VwsubWX => 0b110111, + VecAluOpRRR::VmseqVV + | VecAluOpRRR::VmseqVX + | VecAluOpRRR::VmfeqVV + | VecAluOpRRR::VmfeqVF => 0b011000, + VecAluOpRRR::VmsneVV + | VecAluOpRRR::VmsneVX + | VecAluOpRRR::VmfleVV + | VecAluOpRRR::VmfleVF + | VecAluOpRRR::VmandMM => 0b011001, + VecAluOpRRR::VmsltuVV | VecAluOpRRR::VmsltuVX | VecAluOpRRR::VmorMM => 0b011010, + VecAluOpRRR::VmsltVV + | VecAluOpRRR::VmsltVX + | VecAluOpRRR::VmfltVV + | VecAluOpRRR::VmfltVF => 0b011011, + VecAluOpRRR::VmsleuVV + | VecAluOpRRR::VmsleuVX + | VecAluOpRRR::VmfneVV + | VecAluOpRRR::VmfneVF => 0b011100, + VecAluOpRRR::VmsleVV + | VecAluOpRRR::VmsleVX + | VecAluOpRRR::VmfgtVF + | VecAluOpRRR::VmnandMM => 0b011101, + VecAluOpRRR::VmsgtuVX | VecAluOpRRR::VmnorMM => 0b011110, + VecAluOpRRR::VmsgtVX | VecAluOpRRR::VmfgeVF => 0b011111, + } + } + + pub fn category(&self) -> VecOpCategory { + match self { + VecAluOpRRR::VaddVV + | VecAluOpRRR::VsaddVV + | VecAluOpRRR::VsadduVV + | VecAluOpRRR::VsubVV + | VecAluOpRRR::VssubVV + | VecAluOpRRR::VssubuVV + | VecAluOpRRR::VsmulVV + | VecAluOpRRR::VsllVV + | VecAluOpRRR::VsrlVV + | VecAluOpRRR::VsraVV + | VecAluOpRRR::VandVV + | VecAluOpRRR::VorVV + | VecAluOpRRR::VxorVV + | VecAluOpRRR::VminuVV + | VecAluOpRRR::VminVV + | VecAluOpRRR::VmaxuVV + | VecAluOpRRR::VmaxVV + | VecAluOpRRR::VmergeVVM + | VecAluOpRRR::VrgatherVV + | VecAluOpRRR::VmseqVV + | VecAluOpRRR::VmsneVV + | 
VecAluOpRRR::VmsltuVV + | VecAluOpRRR::VmsltVV + | VecAluOpRRR::VmsleuVV + | VecAluOpRRR::VmsleVV => VecOpCategory::OPIVV, + VecAluOpRRR::VwaddVV + | VecAluOpRRR::VwaddWV + | VecAluOpRRR::VwadduVV + | VecAluOpRRR::VwadduWV + | VecAluOpRRR::VwsubVV + | VecAluOpRRR::VwsubWV + | VecAluOpRRR::VwsubuVV + | VecAluOpRRR::VwsubuWV + | VecAluOpRRR::VmulVV + | VecAluOpRRR::VmulhVV + | VecAluOpRRR::VmulhuVV + | VecAluOpRRR::VredmaxuVS + | VecAluOpRRR::VredminuVS + | VecAluOpRRR::VcompressVM + | VecAluOpRRR::VmandMM + | VecAluOpRRR::VmorMM + | VecAluOpRRR::VmnandMM + | VecAluOpRRR::VmnorMM => VecOpCategory::OPMVV, + VecAluOpRRR::VwaddVX + | VecAluOpRRR::VwadduVX + | VecAluOpRRR::VwadduWX + | VecAluOpRRR::VwaddWX + | VecAluOpRRR::VwsubVX + | VecAluOpRRR::VwsubuVX + | VecAluOpRRR::VwsubuWX + | VecAluOpRRR::VwsubWX + | VecAluOpRRR::VmulVX + | VecAluOpRRR::VmulhVX + | VecAluOpRRR::VmulhuVX => VecOpCategory::OPMVX, + VecAluOpRRR::VaddVX + | VecAluOpRRR::VsaddVX + | VecAluOpRRR::VsadduVX + | VecAluOpRRR::VsubVX + | VecAluOpRRR::VssubVX + | VecAluOpRRR::VssubuVX + | VecAluOpRRR::VrsubVX + | VecAluOpRRR::VsmulVX + | VecAluOpRRR::VsllVX + | VecAluOpRRR::VsrlVX + | VecAluOpRRR::VsraVX + | VecAluOpRRR::VandVX + | VecAluOpRRR::VorVX + | VecAluOpRRR::VxorVX + | VecAluOpRRR::VminuVX + | VecAluOpRRR::VminVX + | VecAluOpRRR::VmaxuVX + | VecAluOpRRR::VmaxVX + | VecAluOpRRR::VslidedownVX + | VecAluOpRRR::VmergeVXM + | VecAluOpRRR::VrgatherVX + | VecAluOpRRR::VmseqVX + | VecAluOpRRR::VmsneVX + | VecAluOpRRR::VmsltuVX + | VecAluOpRRR::VmsltVX + | VecAluOpRRR::VmsleuVX + | VecAluOpRRR::VmsleVX + | VecAluOpRRR::VmsgtuVX + | VecAluOpRRR::VmsgtVX => VecOpCategory::OPIVX, + VecAluOpRRR::VfaddVV + | VecAluOpRRR::VfsubVV + | VecAluOpRRR::VfmulVV + | VecAluOpRRR::VfdivVV + | VecAluOpRRR::VfmaxVV + | VecAluOpRRR::VfminVV + | VecAluOpRRR::VfsgnjVV + | VecAluOpRRR::VfsgnjnVV + | VecAluOpRRR::VfsgnjxVV + | VecAluOpRRR::VmfeqVV + | VecAluOpRRR::VmfneVV + | VecAluOpRRR::VmfltVV + | VecAluOpRRR::VmfleVV => VecOpCategory::OPFVV, + VecAluOpRRR::VfaddVF + | VecAluOpRRR::VfsubVF + | VecAluOpRRR::VfrsubVF + | VecAluOpRRR::VfmulVF + | VecAluOpRRR::VfdivVF + | VecAluOpRRR::VfrdivVF + | VecAluOpRRR::VfmergeVFM + | VecAluOpRRR::VfsgnjVF + | VecAluOpRRR::VmfeqVF + | VecAluOpRRR::VmfneVF + | VecAluOpRRR::VmfltVF + | VecAluOpRRR::VmfleVF + | VecAluOpRRR::VmfgtVF + | VecAluOpRRR::VmfgeVF => VecOpCategory::OPFVF, + } + } + + // vs1 is the only variable source, vs2 is fixed. + pub fn vs1_regclass(&self) -> RegClass { + match self.category() { + VecOpCategory::OPIVV | VecOpCategory::OPFVV | VecOpCategory::OPMVV => RegClass::Vector, + VecOpCategory::OPIVX | VecOpCategory::OPMVX => RegClass::Int, + VecOpCategory::OPFVF => RegClass::Float, + _ => unreachable!(), + } + } +} + +impl VecInstOverlapInfo for VecAluOpRRR { + fn forbids_src_dst_overlaps(&self) -> bool { + match self { + VecAluOpRRR::VrgatherVV + | VecAluOpRRR::VrgatherVX + | VecAluOpRRR::VcompressVM + | VecAluOpRRR::VwadduVV + | VecAluOpRRR::VwadduVX + | VecAluOpRRR::VwaddVV + | VecAluOpRRR::VwaddVX + | VecAluOpRRR::VwadduWV + | VecAluOpRRR::VwadduWX + | VecAluOpRRR::VwaddWV + | VecAluOpRRR::VwaddWX + | VecAluOpRRR::VwsubuVV + | VecAluOpRRR::VwsubuVX + | VecAluOpRRR::VwsubVV + | VecAluOpRRR::VwsubVX + | VecAluOpRRR::VwsubuWV + | VecAluOpRRR::VwsubuWX + | VecAluOpRRR::VwsubWV + | VecAluOpRRR::VwsubWX => true, + _ => false, + } + } + + // Only mask writing operations, and reduction operations (`vred*`) allow mask / dst overlaps. 
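+    // For example, a masked `vmseq.vx` produces a mask value and so is allowed
+    // to write v0, whereas a masked `vadd.vv` is not.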
+ fn forbids_mask_dst_overlaps(&self) -> bool { + match self { + VecAluOpRRR::VredmaxuVS + | VecAluOpRRR::VredminuVS + | VecAluOpRRR::VmandMM + | VecAluOpRRR::VmorMM + | VecAluOpRRR::VmnandMM + | VecAluOpRRR::VmnorMM + | VecAluOpRRR::VmseqVX + | VecAluOpRRR::VmsneVX + | VecAluOpRRR::VmsltuVX + | VecAluOpRRR::VmsltVX + | VecAluOpRRR::VmsleuVX + | VecAluOpRRR::VmsleVX + | VecAluOpRRR::VmsgtuVX + | VecAluOpRRR::VmsgtVX + | VecAluOpRRR::VmfeqVV + | VecAluOpRRR::VmfneVV + | VecAluOpRRR::VmfltVV + | VecAluOpRRR::VmfleVV + | VecAluOpRRR::VmfeqVF + | VecAluOpRRR::VmfneVF + | VecAluOpRRR::VmfltVF + | VecAluOpRRR::VmfleVF + | VecAluOpRRR::VmfgtVF + | VecAluOpRRR::VmfgeVF => false, + _ => true, + } + } +} + +impl fmt::Display for VecAluOpRRR { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + let suffix_length = match self { + VecAluOpRRR::VmergeVVM | VecAluOpRRR::VmergeVXM | VecAluOpRRR::VfmergeVFM => 3, + _ => 2, + }; + + let mut s = format!("{self:?}"); + s.make_ascii_lowercase(); + let (opcode, category) = s.split_at(s.len() - suffix_length); + f.write_str(&format!("{opcode}.{category}")) + } +} + +impl VecAluOpRRImm5 { + pub fn opcode(&self) -> u32 { + // Vector Opcode + 0x57 + } + pub fn funct3(&self) -> u32 { + self.category().encode() + } + + pub fn funct6(&self) -> u32 { + // See: https://github.com/riscv/riscv-v-spec/blob/master/inst-table.adoc + match self { + VecAluOpRRImm5::VaddVI => 0b000000, + VecAluOpRRImm5::VrsubVI => 0b000011, + VecAluOpRRImm5::VsllVI => 0b100101, + VecAluOpRRImm5::VsrlVI => 0b101000, + VecAluOpRRImm5::VsraVI => 0b101001, + VecAluOpRRImm5::VandVI => 0b001001, + VecAluOpRRImm5::VorVI => 0b001010, + VecAluOpRRImm5::VxorVI => 0b001011, + VecAluOpRRImm5::VslidedownVI => 0b001111, + VecAluOpRRImm5::VssrlVI => 0b101010, + VecAluOpRRImm5::VmergeVIM => 0b010111, + VecAluOpRRImm5::VsadduVI => 0b100000, + VecAluOpRRImm5::VsaddVI => 0b100001, + VecAluOpRRImm5::VrgatherVI => 0b001100, + VecAluOpRRImm5::VmvrV => 0b100111, + VecAluOpRRImm5::VnclipWI => 0b101111, + VecAluOpRRImm5::VnclipuWI => 0b101110, + VecAluOpRRImm5::VmseqVI => 0b011000, + VecAluOpRRImm5::VmsneVI => 0b011001, + VecAluOpRRImm5::VmsleuVI => 0b011100, + VecAluOpRRImm5::VmsleVI => 0b011101, + VecAluOpRRImm5::VmsgtuVI => 0b011110, + VecAluOpRRImm5::VmsgtVI => 0b011111, + } + } + + pub fn category(&self) -> VecOpCategory { + match self { + VecAluOpRRImm5::VaddVI + | VecAluOpRRImm5::VrsubVI + | VecAluOpRRImm5::VsllVI + | VecAluOpRRImm5::VsrlVI + | VecAluOpRRImm5::VsraVI + | VecAluOpRRImm5::VandVI + | VecAluOpRRImm5::VorVI + | VecAluOpRRImm5::VxorVI + | VecAluOpRRImm5::VssrlVI + | VecAluOpRRImm5::VslidedownVI + | VecAluOpRRImm5::VmergeVIM + | VecAluOpRRImm5::VsadduVI + | VecAluOpRRImm5::VsaddVI + | VecAluOpRRImm5::VrgatherVI + | VecAluOpRRImm5::VmvrV + | VecAluOpRRImm5::VnclipWI + | VecAluOpRRImm5::VnclipuWI + | VecAluOpRRImm5::VmseqVI + | VecAluOpRRImm5::VmsneVI + | VecAluOpRRImm5::VmsleuVI + | VecAluOpRRImm5::VmsleVI + | VecAluOpRRImm5::VmsgtuVI + | VecAluOpRRImm5::VmsgtVI => VecOpCategory::OPIVI, + } + } + + pub fn imm_is_unsigned(&self) -> bool { + match self { + VecAluOpRRImm5::VsllVI + | VecAluOpRRImm5::VsrlVI + | VecAluOpRRImm5::VssrlVI + | VecAluOpRRImm5::VsraVI + | VecAluOpRRImm5::VslidedownVI + | VecAluOpRRImm5::VrgatherVI + | VecAluOpRRImm5::VmvrV + | VecAluOpRRImm5::VnclipWI + | VecAluOpRRImm5::VnclipuWI => true, + VecAluOpRRImm5::VaddVI + | VecAluOpRRImm5::VrsubVI + | VecAluOpRRImm5::VandVI + | VecAluOpRRImm5::VorVI + | VecAluOpRRImm5::VxorVI + | VecAluOpRRImm5::VmergeVIM + | 
VecAluOpRRImm5::VsadduVI + | VecAluOpRRImm5::VsaddVI + | VecAluOpRRImm5::VmseqVI + | VecAluOpRRImm5::VmsneVI + | VecAluOpRRImm5::VmsleuVI + | VecAluOpRRImm5::VmsleVI + | VecAluOpRRImm5::VmsgtuVI + | VecAluOpRRImm5::VmsgtVI => false, + } + } +} + +impl VecInstOverlapInfo for VecAluOpRRImm5 { + fn forbids_src_dst_overlaps(&self) -> bool { + match self { + VecAluOpRRImm5::VrgatherVI => true, + _ => false, + } + } + + // Only mask writing operations, and reduction operations (`vred*`) allow mask / dst overlaps. + fn forbids_mask_dst_overlaps(&self) -> bool { + match self { + VecAluOpRRImm5::VmseqVI + | VecAluOpRRImm5::VmsneVI + | VecAluOpRRImm5::VmsleuVI + | VecAluOpRRImm5::VmsleVI + | VecAluOpRRImm5::VmsgtuVI + | VecAluOpRRImm5::VmsgtVI => false, + _ => true, + } + } +} + +impl fmt::Display for VecAluOpRRImm5 { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + let suffix_length = match self { + VecAluOpRRImm5::VmergeVIM => 3, + _ => 2, + }; + + let mut s = format!("{self:?}"); + s.make_ascii_lowercase(); + let (opcode, category) = s.split_at(s.len() - suffix_length); + f.write_str(&format!("{opcode}.{category}")) + } +} + +impl VecAluOpRR { + pub fn opcode(&self) -> u32 { + // Vector Opcode + 0x57 + } + + pub fn funct3(&self) -> u32 { + self.category().encode() + } + + pub fn funct6(&self) -> u32 { + // See: https://github.com/riscv/riscv-v-spec/blob/master/inst-table.adoc + match self { + VecAluOpRR::VmvSX | VecAluOpRR::VmvXS | VecAluOpRR::VfmvSF | VecAluOpRR::VfmvFS => { + 0b010000 + } + VecAluOpRR::VzextVF2 + | VecAluOpRR::VzextVF4 + | VecAluOpRR::VzextVF8 + | VecAluOpRR::VsextVF2 + | VecAluOpRR::VsextVF4 + | VecAluOpRR::VsextVF8 => 0b010010, + VecAluOpRR::VfsqrtV => 0b010011, + VecAluOpRR::VmvVV | VecAluOpRR::VmvVX | VecAluOpRR::VfmvVF => 0b010111, + VecAluOpRR::VfcvtxufV + | VecAluOpRR::VfcvtxfV + | VecAluOpRR::VfcvtrtzxufV + | VecAluOpRR::VfcvtrtzxfV + | VecAluOpRR::VfcvtfxuV + | VecAluOpRR::VfcvtfxV + | VecAluOpRR::VfwcvtffV + | VecAluOpRR::VfncvtffW => 0b010010, + } + } + + pub fn category(&self) -> VecOpCategory { + match self { + VecAluOpRR::VmvSX => VecOpCategory::OPMVX, + VecAluOpRR::VmvXS + | VecAluOpRR::VzextVF2 + | VecAluOpRR::VzextVF4 + | VecAluOpRR::VzextVF8 + | VecAluOpRR::VsextVF2 + | VecAluOpRR::VsextVF4 + | VecAluOpRR::VsextVF8 => VecOpCategory::OPMVV, + VecAluOpRR::VfmvSF | VecAluOpRR::VfmvVF => VecOpCategory::OPFVF, + VecAluOpRR::VfmvFS + | VecAluOpRR::VfsqrtV + | VecAluOpRR::VfcvtxufV + | VecAluOpRR::VfcvtxfV + | VecAluOpRR::VfcvtrtzxufV + | VecAluOpRR::VfcvtrtzxfV + | VecAluOpRR::VfcvtfxuV + | VecAluOpRR::VfcvtfxV + | VecAluOpRR::VfwcvtffV + | VecAluOpRR::VfncvtffW => VecOpCategory::OPFVV, + VecAluOpRR::VmvVV => VecOpCategory::OPIVV, + VecAluOpRR::VmvVX => VecOpCategory::OPIVX, + } + } + + /// Returns the auxiliary encoding field for the instruction, if any. 
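+    ///
+    /// For the unary opcodes (e.g. the VXUNARY0 group used by `vzext.*` /
+    /// `vsext.*`) this value occupies the otherwise-unused vs1/vs2 register
+    /// slot; `vs_is_vs2_encoded` below says which of the two that is.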
+ pub fn aux_encoding(&self) -> u32 { + match self { + // VRXUNARY0 + VecAluOpRR::VmvSX => 0b00000, + // VWXUNARY0 + VecAluOpRR::VmvXS => 0b00000, + // VRFUNARY0 + VecAluOpRR::VfmvSF => 0b00000, + // VWFUNARY0 + VecAluOpRR::VfmvFS => 0b00000, + // VFUNARY1 + VecAluOpRR::VfsqrtV => 0b00000, + // VXUNARY0 + VecAluOpRR::VzextVF8 => 0b00010, + VecAluOpRR::VsextVF8 => 0b00011, + VecAluOpRR::VzextVF4 => 0b00100, + VecAluOpRR::VsextVF4 => 0b00101, + VecAluOpRR::VzextVF2 => 0b00110, + VecAluOpRR::VsextVF2 => 0b00111, + // VFUNARY0 + // single-width converts + VecAluOpRR::VfcvtxufV => 0b00000, + VecAluOpRR::VfcvtxfV => 0b00001, + VecAluOpRR::VfcvtrtzxufV => 0b00110, + VecAluOpRR::VfcvtrtzxfV => 0b00111, + VecAluOpRR::VfcvtfxuV => 0b00010, + VecAluOpRR::VfcvtfxV => 0b00011, + // widening converts + VecAluOpRR::VfwcvtffV => 0b01100, + // narrowing converts + VecAluOpRR::VfncvtffW => 0b10100, + // These don't have a explicit encoding table, but Section 11.16 Vector Integer Move Instruction states: + // > The first operand specifier (vs2) must contain v0, and any other vector register number in vs2 is reserved. + VecAluOpRR::VmvVV | VecAluOpRR::VmvVX | VecAluOpRR::VfmvVF => 0, + } + } + + /// Most of these opcodes have the source register encoded in the VS2 field and + /// the `aux_encoding` field in VS1. However some special snowflakes have it the + /// other way around. As far as I can tell only vmv.v.* are backwards. + pub fn vs_is_vs2_encoded(&self) -> bool { + match self { + VecAluOpRR::VmvXS + | VecAluOpRR::VfmvFS + | VecAluOpRR::VfsqrtV + | VecAluOpRR::VzextVF2 + | VecAluOpRR::VzextVF4 + | VecAluOpRR::VzextVF8 + | VecAluOpRR::VsextVF2 + | VecAluOpRR::VsextVF4 + | VecAluOpRR::VsextVF8 + | VecAluOpRR::VfcvtxufV + | VecAluOpRR::VfcvtxfV + | VecAluOpRR::VfcvtrtzxufV + | VecAluOpRR::VfcvtrtzxfV + | VecAluOpRR::VfcvtfxuV + | VecAluOpRR::VfcvtfxV + | VecAluOpRR::VfwcvtffV + | VecAluOpRR::VfncvtffW => true, + VecAluOpRR::VmvSX + | VecAluOpRR::VfmvSF + | VecAluOpRR::VmvVV + | VecAluOpRR::VmvVX + | VecAluOpRR::VfmvVF => false, + } + } + + pub fn dst_regclass(&self) -> RegClass { + match self { + VecAluOpRR::VfmvSF + | VecAluOpRR::VmvSX + | VecAluOpRR::VmvVV + | VecAluOpRR::VmvVX + | VecAluOpRR::VfmvVF + | VecAluOpRR::VfsqrtV + | VecAluOpRR::VzextVF2 + | VecAluOpRR::VzextVF4 + | VecAluOpRR::VzextVF8 + | VecAluOpRR::VsextVF2 + | VecAluOpRR::VsextVF4 + | VecAluOpRR::VsextVF8 + | VecAluOpRR::VfcvtxufV + | VecAluOpRR::VfcvtxfV + | VecAluOpRR::VfcvtrtzxufV + | VecAluOpRR::VfcvtrtzxfV + | VecAluOpRR::VfcvtfxuV + | VecAluOpRR::VfcvtfxV + | VecAluOpRR::VfwcvtffV + | VecAluOpRR::VfncvtffW => RegClass::Vector, + VecAluOpRR::VmvXS => RegClass::Int, + VecAluOpRR::VfmvFS => RegClass::Float, + } + } + + pub fn src_regclass(&self) -> RegClass { + match self { + VecAluOpRR::VmvXS + | VecAluOpRR::VfmvFS + | VecAluOpRR::VmvVV + | VecAluOpRR::VfsqrtV + | VecAluOpRR::VzextVF2 + | VecAluOpRR::VzextVF4 + | VecAluOpRR::VzextVF8 + | VecAluOpRR::VsextVF2 + | VecAluOpRR::VsextVF4 + | VecAluOpRR::VsextVF8 + | VecAluOpRR::VfcvtxufV + | VecAluOpRR::VfcvtxfV + | VecAluOpRR::VfcvtrtzxufV + | VecAluOpRR::VfcvtrtzxfV + | VecAluOpRR::VfcvtfxuV + | VecAluOpRR::VfcvtfxV + | VecAluOpRR::VfwcvtffV + | VecAluOpRR::VfncvtffW => RegClass::Vector, + VecAluOpRR::VfmvSF | VecAluOpRR::VfmvVF => RegClass::Float, + VecAluOpRR::VmvSX | VecAluOpRR::VmvVX => RegClass::Int, + } + } +} + +impl VecInstOverlapInfo for VecAluOpRR { + fn forbids_src_dst_overlaps(&self) -> bool { + match self { + VecAluOpRR::VzextVF2 + | VecAluOpRR::VzextVF4 + | 
VecAluOpRR::VzextVF8 + | VecAluOpRR::VsextVF2 + | VecAluOpRR::VsextVF4 + | VecAluOpRR::VsextVF8 + | VecAluOpRR::VfwcvtffV + | VecAluOpRR::VfncvtffW => true, + _ => false, + } + } +} + +impl fmt::Display for VecAluOpRR { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + f.write_str(match self { + VecAluOpRR::VmvSX => "vmv.s.x", + VecAluOpRR::VmvXS => "vmv.x.s", + VecAluOpRR::VfmvSF => "vfmv.s.f", + VecAluOpRR::VfmvFS => "vfmv.f.s", + VecAluOpRR::VfsqrtV => "vfsqrt.v", + VecAluOpRR::VzextVF2 => "vzext.vf2", + VecAluOpRR::VzextVF4 => "vzext.vf4", + VecAluOpRR::VzextVF8 => "vzext.vf8", + VecAluOpRR::VsextVF2 => "vsext.vf2", + VecAluOpRR::VsextVF4 => "vsext.vf4", + VecAluOpRR::VsextVF8 => "vsext.vf8", + VecAluOpRR::VmvVV => "vmv.v.v", + VecAluOpRR::VmvVX => "vmv.v.x", + VecAluOpRR::VfmvVF => "vfmv.v.f", + VecAluOpRR::VfcvtxufV => "vfcvt.xu.f.v", + VecAluOpRR::VfcvtxfV => "vfcvt.x.f.v", + VecAluOpRR::VfcvtrtzxufV => "vfcvt.rtz.xu.f.v", + VecAluOpRR::VfcvtrtzxfV => "vfcvt.rtz.x.f.v", + VecAluOpRR::VfcvtfxuV => "vfcvt.f.xu.v", + VecAluOpRR::VfcvtfxV => "vfcvt.f.x.v", + VecAluOpRR::VfwcvtffV => "vfwcvt.f.f.v", + VecAluOpRR::VfncvtffW => "vfncvt.f.f.w", + }) + } +} + +impl VecAluOpRImm5 { + pub fn opcode(&self) -> u32 { + // Vector Opcode + 0x57 + } + pub fn funct3(&self) -> u32 { + self.category().encode() + } + + pub fn funct6(&self) -> u32 { + // See: https://github.com/riscv/riscv-v-spec/blob/master/inst-table.adoc + match self { + VecAluOpRImm5::VmvVI => 0b010111, + } + } + + pub fn category(&self) -> VecOpCategory { + match self { + VecAluOpRImm5::VmvVI => VecOpCategory::OPIVI, + } + } + + /// Returns the auxiliary encoding field for the instruction, if any. + pub fn aux_encoding(&self) -> u32 { + match self { + // These don't have a explicit encoding table, but Section 11.16 Vector Integer Move Instruction states: + // > The first operand specifier (vs2) must contain v0, and any other vector register number in vs2 is reserved. + VecAluOpRImm5::VmvVI => 0, + } + } +} + +impl VecInstOverlapInfo for VecAluOpRImm5 { + fn forbids_src_dst_overlaps(&self) -> bool { + match self { + VecAluOpRImm5::VmvVI => false, + } + } +} + +impl fmt::Display for VecAluOpRImm5 { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + f.write_str(match self { + VecAluOpRImm5::VmvVI => "vmv.v.i", + }) + } +} + +impl VecAMode { + pub fn get_base_register(&self) -> Option { + match self { + VecAMode::UnitStride { base, .. } => base.get_base_register(), + } + } + + pub fn get_operands(&mut self, collector: &mut impl OperandVisitor) { + match self { + VecAMode::UnitStride { base, .. } => base.get_operands(collector), + } + } + + /// `mop` field, described in Table 7 of Section 7.2. Vector Load/Store Addressing Modes + /// https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc#72-vector-loadstore-addressing-modes + pub fn mop(&self) -> u32 { + match self { + VecAMode::UnitStride { .. } => 0b00, + } + } + + /// `lumop` field, described in Table 9 of Section 7.2. Vector Load/Store Addressing Modes + /// https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc#72-vector-loadstore-addressing-modes + pub fn lumop(&self) -> u32 { + match self { + VecAMode::UnitStride { .. } => 0b00000, + } + } + + /// `sumop` field, described in Table 10 of Section 7.2. Vector Load/Store Addressing Modes + /// https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc#72-vector-loadstore-addressing-modes + pub fn sumop(&self) -> u32 { + match self { + VecAMode::UnitStride { .. 
} => 0b00000, + } + } + + /// The `nf[2:0]` field encodes the number of fields in each segment. For regular vector loads and + /// stores, nf=0, indicating that a single value is moved between a vector register group and memory + /// at each element position. Larger values in the nf field are used to access multiple contiguous + /// fields within a segment as described in Section 7.8 Vector Load/Store Segment Instructions. + /// + /// https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc#72-vector-loadstore-addressing-modes + pub fn nf(&self) -> u32 { + match self { + VecAMode::UnitStride { .. } => 0b000, + } + } +} + +pub trait VecInstOverlapInfo { + /// § 5.2 Vector Operands states: + /// + /// A destination vector register group can overlap a source vector register group + /// only if one of the following holds: + /// + /// * The destination EEW equals the source EEW. + /// + /// * The destination EEW is smaller than the source EEW and the overlap is + /// in the lowest-numbered part of the source register group (e.g., when LMUL=1, + /// vnsrl.wi v0, v0, 3 is legal, but a destination of v1 is not). + /// + /// * The destination EEW is greater than the source EEW, the source EMUL is at + /// least 1, and the overlap is in the highest-numbered part of the destination register + /// group (e.g., when LMUL=8, vzext.vf4 v0, v6 is legal, but a source of v0, v2, or v4 is not). + /// + /// For the purpose of determining register group overlap constraints, mask elements have EEW=1. + fn forbids_src_dst_overlaps(&self) -> bool; + + /// § 5.3 Vector Masking states: + /// + /// > The destination vector register group for a masked vector instruction + /// > cannot overlap the source mask register (v0), unless the destination + /// > vector register is being written with a mask value (e.g., compares) or + /// > the scalar result of a reduction. These instruction encodings are reserved. + /// + /// In almost all instructions we should not allow the mask to be re-used as + /// a destination register. + fn forbids_mask_dst_overlaps(&self) -> bool { + true + } + + /// There are two broad categories of overlaps (see above). But we can't represent such + /// fine grained overlaps to regalloc. So if any of the two come into play we forbid + /// all source and destination overlaps (including masks). + fn forbids_overlaps(&self, mask: &VecOpMasking) -> bool { + self.forbids_src_dst_overlaps() || (mask.is_enabled() && self.forbids_mask_dst_overlaps()) + } +} diff --git a/hbcb/src/inst_vector.isle b/hbcb/src/inst_vector.isle new file mode 100644 index 0000000..4b63618 --- /dev/null +++ b/hbcb/src/inst_vector.isle @@ -0,0 +1,1907 @@ +;; Represents the possible widths of an element when used in an operation. +(type VecElementWidth (enum + (E8) + (E16) + (E32) + (E64) +)) + +;; Vector Register Group Multiplier (LMUL) +;; +;; The LMUL setting specifies how we should group registers together. LMUL can +;; also be a fractional value, reducing the number of bits used in a single +;; vector register. Fractional LMUL is used to increase the number of effective +;; usable vector register groups when operating on mixed-width values. +(type VecLmul (enum + (LmulF8) + (LmulF4) + (LmulF2) + (Lmul1) + (Lmul2) + (Lmul4) + (Lmul8) +)) + +;; Tail Mode +;; +;; The tail mode specifies how the tail elements of a vector register are handled. +(type VecTailMode (enum + ;; Tail Agnostic means that the tail elements are left in an undefined state. 
+ (Agnostic) + ;; Tail Undisturbed means that the tail elements are left in their original values. + (Undisturbed) +)) + +;; Mask Mode +;; +;; The mask mode specifies how the masked elements of a vector register are handled. +(type VecMaskMode (enum + ;; Mask Agnostic means that the masked out elements are left in an undefined state. + (Agnostic) + ;; Mask Undisturbed means that the masked out elements are left in their original values. + (Undisturbed) +)) + +;; Application Vector Length (AVL) +;; +;; This setting specifies the number of elements that are going to be processed +;; in a single instruction. Note: We may end up processing fewer elements than +;; the AVL setting, if they don't fit in a single register. +(type VecAvl (enum + ;; Static AVL emits a `vsetivli` that uses a constant value + (Static (size UImm5)) + ;; TODO: Add a dynamic, register based AVL mode when we are able to properly test it +)) + +(type VType (primitive VType)) +(type VState (primitive VState)) + + +;; Vector Opcode Category +;; +;; These categories are used to determine the type of operands that are allowed in the +;; instruction. +(type VecOpCategory (enum + (OPIVV) + (OPFVV) + (OPMVV) + (OPIVI) + (OPIVX) + (OPFVF) + (OPMVX) + (OPCFG) +)) + +;; Vector Opcode Masking +;; +;; When masked, the instruction will only operate on the elements that are dictated by +;; the mask register. Currently this is always fixed to v0. +(type VecOpMasking (enum + (Enabled (reg Reg)) + (Disabled) +)) + +(decl pure masked (VReg) VecOpMasking) +(rule (masked reg) (VecOpMasking.Enabled reg)) + +(decl pure unmasked () VecOpMasking) +(rule (unmasked) (VecOpMasking.Disabled)) + +;; Register to Register ALU Ops +(type VecAluOpRRR (enum + ;; Vector-Vector Opcodes + (VaddVV) + (VsaddVV) + (VsadduVV) + (VwaddVV) + (VwaddWV) + (VwadduVV) + (VwadduWV) + (VsubVV) + (VwsubVV) + (VwsubWV) + (VwsubuVV) + (VwsubuWV) + (VssubVV) + (VssubuVV) + (VmulVV) + (VmulhVV) + (VmulhuVV) + (VsmulVV) + (VsllVV) + (VsrlVV) + (VsraVV) + (VandVV) + (VorVV) + (VxorVV) + (VmaxVV) + (VmaxuVV) + (VminVV) + (VminuVV) + (VfaddVV) + (VfsubVV) + (VfmulVV) + (VfdivVV) + (VfminVV) + (VfmaxVV) + (VfsgnjVV) + (VfsgnjnVV) + (VfsgnjxVV) + (VmergeVVM) + (VredmaxuVS) + (VredminuVS) + (VrgatherVV) + (VcompressVM) + (VmseqVV) + (VmsneVV) + (VmsltuVV) + (VmsltVV) + (VmsleuVV) + (VmsleVV) + (VmfeqVV) + (VmfneVV) + (VmfltVV) + (VmfleVV) + (VmandMM) + (VmorMM) + (VmnandMM) + (VmnorMM) + + + ;; Vector-Scalar Opcodes + (VaddVX) + (VsaddVX) + (VsadduVX) + (VwaddVX) + (VwaddWX) + (VwadduVX) + (VwadduWX) + (VsubVX) + (VrsubVX) + (VwsubVX) + (VwsubWX) + (VwsubuVX) + (VwsubuWX) + (VssubVX) + (VssubuVX) + (VmulVX) + (VmulhVX) + (VmulhuVX) + (VsmulVX) + (VsllVX) + (VsrlVX) + (VsraVX) + (VandVX) + (VorVX) + (VxorVX) + (VmaxVX) + (VmaxuVX) + (VminVX) + (VminuVX) + (VslidedownVX) + (VfaddVF) + (VfsubVF) + (VfrsubVF) + (VfmulVF) + (VfdivVF) + (VfsgnjVF) + (VfrdivVF) + (VmergeVXM) + (VfmergeVFM) + (VrgatherVX) + (VmseqVX) + (VmsneVX) + (VmsltuVX) + (VmsltVX) + (VmsleuVX) + (VmsleVX) + (VmsgtuVX) + (VmsgtVX) + (VmfeqVF) + (VmfneVF) + (VmfltVF) + (VmfleVF) + (VmfgtVF) + (VmfgeVF) +)) + + + +;; Register-Imm ALU Ops that modify the destination register +(type VecAluOpRRRImm5 (enum + (VslideupVI) +)) + +;; Register-Register ALU Ops that modify the destination register +(type VecAluOpRRRR (enum + ;; Vector-Vector Opcodes + (VmaccVV) + (VnmsacVV) + (VfmaccVV) + (VfnmaccVV) + (VfmsacVV) + (VfnmsacVV) + + ;; Vector-Scalar Opcodes + (VmaccVX) + (VnmsacVX) + (VfmaccVF) + (VfnmaccVF) + (VfmsacVF) + 
(VfnmsacVF) + (Vslide1upVX) +)) + +;; Register-Imm ALU Ops +(type VecAluOpRRImm5 (enum + ;; Regular VI Opcodes + (VaddVI) + (VsaddVI) + (VsadduVI) + (VrsubVI) + (VsllVI) + (VsrlVI) + (VsraVI) + (VandVI) + (VorVI) + (VxorVI) + (VssrlVI) + (VslidedownVI) + (VmergeVIM) + (VrgatherVI) + ;; This opcode represents multiple instructions `vmv1r`/`vmv2r`/`vmv4r`/etc... + ;; The immediate field specifies how many registers should be copied. + (VmvrV) + (VnclipWI) + (VnclipuWI) + (VmseqVI) + (VmsneVI) + (VmsleuVI) + (VmsleVI) + (VmsgtuVI) + (VmsgtVI) +)) + +;; Imm only ALU Ops +(type VecAluOpRImm5 (enum + (VmvVI) +)) + +;; These are all of the special cases that have weird encodings. They are all +;; single source, single destination instructions, and usually use one of +;; the two source registers as auxiliary encoding space. +(type VecAluOpRR (enum + (VmvSX) + (VmvXS) + (VfmvSF) + (VfmvFS) + ;; vmv.v* is special in that vs2 must be v0 (and is ignored) otherwise the instruction is illegal. + (VmvVV) + (VmvVX) + (VfmvVF) + (VfsqrtV) + (VsextVF2) + (VsextVF4) + (VsextVF8) + (VzextVF2) + (VzextVF4) + (VzextVF8) + (VfcvtxufV) + (VfcvtxfV) + (VfcvtrtzxufV) + (VfcvtrtzxfV) + (VfcvtfxuV) + (VfcvtfxV) + (VfwcvtffV) + (VfncvtffW) +)) + +;; Returns the canonical destination type for a VecAluOpRRImm5. +(decl pure vec_alu_rr_dst_type (VecAluOpRR) Type) +(extern constructor vec_alu_rr_dst_type vec_alu_rr_dst_type) + + +;; Vector Addressing Mode +(type VecAMode (enum + ;; Vector unit-stride operations access elements stored contiguously in memory + ;; starting from the base effective address. + (UnitStride + (base AMode)) + ;; TODO: Constant Stride + ;; TODO: Indexed Operations +)) + + +;; Builds a static VState matching a SIMD type. +;; The VState is guaranteed to be static with AVL set to the number of lanes. +;; Element size is set to the size of the type. +;; LMUL is set to 1. +;; Tail mode is set to agnostic. +;; Mask mode is set to agnostic. +(decl pure vstate_from_type (Type) VState) +(extern constructor vstate_from_type vstate_from_type) +(convert Type VState vstate_from_type) + +;; Alters the LMUL of a VState to mf2 +(decl pure vstate_mf2 (VState) VState) +(extern constructor vstate_mf2 vstate_mf2) + +;; Extracts an element width from a SIMD type. +(decl pure element_width_from_type (Type) VecElementWidth) +(rule (element_width_from_type ty) + (if-let $I8 (lane_type ty)) + (VecElementWidth.E8)) +(rule (element_width_from_type ty) + (if-let $I16 (lane_type ty)) + (VecElementWidth.E16)) +(rule (element_width_from_type ty) + (if-let $I32 (lane_type ty)) + (VecElementWidth.E32)) +(rule (element_width_from_type ty) + (if-let $F32 (lane_type ty)) + (VecElementWidth.E32)) +(rule (element_width_from_type ty) + (if-let $I64 (lane_type ty)) + (VecElementWidth.E64)) +(rule (element_width_from_type ty) + (if-let $F64 (lane_type ty)) + (VecElementWidth.E64)) + +(decl pure min_vec_reg_size () u64) +(extern constructor min_vec_reg_size min_vec_reg_size) + +;; An extractor that matches any type that is known to fit in a single vector +;; register. +(decl ty_vec_fits_in_register (Type) Type) +(extern extractor ty_vec_fits_in_register ty_vec_fits_in_register) + +;;;; Instruction Helpers ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; As noted in the RISC-V Vector Extension Specification, rs2 is the first +;; source register and rs1 is the second source register. This is the opposite +;; of the usual RISC-V register order. +;; See Section 10.1 of the RISC-V Vector Extension Specification. 
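+
+;; As a hedged illustration of this operand order (a sketch, not one of the
+;; lowering rules in this file; it assumes the usual `lower`, `has_type`, and
+;; `iadd` constructors from the shared prelude are in scope), a rule built on
+;; the helpers below passes the first CLIF operand through as `vs2` and the
+;; second as `vs1`:
+;;
+;;   (rule (lower (has_type (ty_vec_fits_in_register ty) (iadd x y)))
+;;         (rv_vadd_vv x y (unmasked) ty))
+;;
+;; i.e. `x` lands in the vs2 field and `y` in the vs1 field of the encoded
+;; instruction, matching the rs2-before-rs1 note above.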
+ + +;; Helper for emitting `MInst.VecAluRRRR` instructions. +;; These instructions modify the destination register. +(decl vec_alu_rrrr (VecAluOpRRRR VReg VReg Reg VecOpMasking VState) VReg) +(rule (vec_alu_rrrr op vd_src vs2 vs1 mask vstate) + (let ((vd WritableVReg (temp_writable_vreg)) + (_ Unit (emit (MInst.VecAluRRRR op vd vd_src vs2 vs1 mask vstate)))) + vd)) + +;; Helper for emitting `MInst.VecAluRRRImm5` instructions. +;; These instructions modify the destination register. +(decl vec_alu_rrr_imm5 (VecAluOpRRRImm5 VReg VReg Imm5 VecOpMasking VState) VReg) +(rule (vec_alu_rrr_imm5 op vd_src vs2 imm mask vstate) + (let ((vd WritableVReg (temp_writable_vreg)) + (_ Unit (emit (MInst.VecAluRRRImm5 op vd vd_src vs2 imm mask vstate)))) + vd)) + +;; Helper for emitting `MInst.VecAluRRRImm5` instructions where the immediate +;; is zero extended instead of sign extended. +(decl vec_alu_rrr_uimm5 (VecAluOpRRRImm5 VReg VReg UImm5 VecOpMasking VState) VReg) +(rule (vec_alu_rrr_uimm5 op vd_src vs2 imm mask vstate) + (vec_alu_rrr_imm5 op vd_src vs2 (uimm5_bitcast_to_imm5 imm) mask vstate)) + +;; Helper for emitting `MInst.VecAluRRR` instructions. +(decl vec_alu_rrr (VecAluOpRRR Reg Reg VecOpMasking VState) Reg) +(rule (vec_alu_rrr op vs2 vs1 mask vstate) + (let ((vd WritableVReg (temp_writable_vreg)) + (_ Unit (emit (MInst.VecAluRRR op vd vs2 vs1 mask vstate)))) + vd)) + +;; Helper for emitting `MInst.VecAluRRImm5` instructions. +(decl vec_alu_rr_imm5 (VecAluOpRRImm5 Reg Imm5 VecOpMasking VState) Reg) +(rule (vec_alu_rr_imm5 op vs2 imm mask vstate) + (let ((vd WritableVReg (temp_writable_vreg)) + (_ Unit (emit (MInst.VecAluRRImm5 op vd vs2 imm mask vstate)))) + vd)) + +;; Helper for emitting `MInst.VecAluRRImm5` instructions where the immediate +;; is zero extended instead of sign extended. +(decl vec_alu_rr_uimm5 (VecAluOpRRImm5 Reg UImm5 VecOpMasking VState) Reg) +(rule (vec_alu_rr_uimm5 op vs2 imm mask vstate) + (vec_alu_rr_imm5 op vs2 (uimm5_bitcast_to_imm5 imm) mask vstate)) + +;; Helper for emitting `MInst.VecAluRR` instructions. These opcodes use one of +;; the source register fields as auxiliary encoding space. +(decl vec_alu_rr (VecAluOpRR Reg VecOpMasking VState) Reg) +(rule (vec_alu_rr op vs mask vstate) + (let ((vd WritableReg (temp_writable_reg (vec_alu_rr_dst_type op))) + (_ Unit (emit (MInst.VecAluRR op vd vs mask vstate)))) + vd)) + +;; Helper for emitting `MInst.VecAluRImm5` instructions. +(decl vec_alu_r_imm5 (VecAluOpRImm5 Imm5 VecOpMasking VState) Reg) +(rule (vec_alu_r_imm5 op imm mask vstate) + (let ((vd WritableVReg (temp_writable_vreg)) + (_ Unit (emit (MInst.VecAluRImm5 op vd imm mask vstate)))) + vd)) + +;; Helper for emitting `MInst.VecLoad` instructions. +(decl vec_load (VecElementWidth VecAMode MemFlags VecOpMasking VState) Reg) +(rule (vec_load eew from flags mask vstate) + (let ((vd WritableVReg (temp_writable_vreg)) + (_ Unit (emit (MInst.VecLoad eew vd from flags mask vstate)))) + vd)) + +;; Helper for emitting `MInst.VecStore` instructions. +(decl vec_store (VecElementWidth VecAMode VReg MemFlags VecOpMasking VState) InstOutput) +(rule (vec_store eew to from flags mask vstate) + (side_effect + (SideEffectNoResult.Inst (MInst.VecStore eew to from flags mask vstate)))) + +;; Helper for emitting the `vadd.vv` instruction. +(decl rv_vadd_vv (VReg VReg VecOpMasking VState) VReg) +(rule (rv_vadd_vv vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VaddVV) vs2 vs1 mask vstate)) + +;; Helper for emitting the `vadd.vx` instruction. 
+(decl rv_vadd_vx (VReg XReg VecOpMasking VState) VReg) +(rule (rv_vadd_vx vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VaddVX) vs2 vs1 mask vstate)) + +;; Helper for emitting the `vadd.vi` instruction. +(decl rv_vadd_vi (VReg Imm5 VecOpMasking VState) VReg) +(rule (rv_vadd_vi vs2 imm mask vstate) + (vec_alu_rr_imm5 (VecAluOpRRImm5.VaddVI) vs2 imm mask vstate)) + +;; Helper for emitting the `vsadd.vv` instruction. +(decl rv_vsadd_vv (VReg VReg VecOpMasking VState) VReg) +(rule (rv_vsadd_vv vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VsaddVV) vs2 vs1 mask vstate)) + +;; Helper for emitting the `vsadd.vx` instruction. +(decl rv_vsadd_vx (VReg XReg VecOpMasking VState) VReg) +(rule (rv_vsadd_vx vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VsaddVX) vs2 vs1 mask vstate)) + +;; Helper for emitting the `vsadd.vi` instruction. +(decl rv_vsadd_vi (VReg Imm5 VecOpMasking VState) VReg) +(rule (rv_vsadd_vi vs2 imm mask vstate) + (vec_alu_rr_imm5 (VecAluOpRRImm5.VsaddVI) vs2 imm mask vstate)) + +;; Helper for emitting the `vsaddu.vv` instruction. +(decl rv_vsaddu_vv (VReg VReg VecOpMasking VState) VReg) +(rule (rv_vsaddu_vv vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VsadduVV) vs2 vs1 mask vstate)) + +;; Helper for emitting the `vsaddu.vx` instruction. +(decl rv_vsaddu_vx (VReg XReg VecOpMasking VState) VReg) +(rule (rv_vsaddu_vx vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VsadduVX) vs2 vs1 mask vstate)) + +;; Helper for emitting the `vsaddu.vi` instruction. +(decl rv_vsaddu_vi (VReg Imm5 VecOpMasking VState) VReg) +(rule (rv_vsaddu_vi vs2 imm mask vstate) + (vec_alu_rr_imm5 (VecAluOpRRImm5.VsadduVI) vs2 imm mask vstate)) + +;; Helper for emitting the `vwadd.vv` instruction. +;; +;; Widening integer add, 2*SEW = SEW + SEW +(decl rv_vwadd_vv (VReg VReg VecOpMasking VState) VReg) +(rule (rv_vwadd_vv vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VwaddVV) vs2 vs1 mask vstate)) + +;; Helper for emitting the `vwadd.vx` instruction. +;; +;; Widening integer add, 2*SEW = SEW + SEW +(decl rv_vwadd_vx (VReg XReg VecOpMasking VState) VReg) +(rule (rv_vwadd_vx vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VwaddVX) vs2 vs1 mask vstate)) + +;; Helper for emitting the `vwadd.wv` instruction. +;; +;; Widening integer add, 2*SEW = 2*SEW + SEW +(decl rv_vwadd_wv (VReg VReg VecOpMasking VState) VReg) +(rule (rv_vwadd_wv vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VwaddWV) vs2 vs1 mask vstate)) + +;; Helper for emitting the `vwadd.wx` instruction. +;; +;; Widening integer add, 2*SEW = 2*SEW + SEW +(decl rv_vwadd_wx (VReg XReg VecOpMasking VState) VReg) +(rule (rv_vwadd_wx vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VwaddWX) vs2 vs1 mask vstate)) + +;; Helper for emitting the `vwaddu.vv` instruction. +;; +;; Widening unsigned integer add, 2*SEW = SEW + SEW +(decl rv_vwaddu_vv (VReg VReg VecOpMasking VState) VReg) +(rule (rv_vwaddu_vv vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VwadduVV) vs2 vs1 mask vstate)) + +;; Helper for emitting the `vwaddu.vv` instruction. +;; +;; Widening unsigned integer add, 2*SEW = SEW + SEW +(decl rv_vwaddu_vx (VReg XReg VecOpMasking VState) VReg) +(rule (rv_vwaddu_vx vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VwadduVX) vs2 vs1 mask vstate)) + +;; Helper for emitting the `vwaddu.wv` instruction. 
+;; +;; Widening integer add, 2*SEW = 2*SEW + SEW +(decl rv_vwaddu_wv (VReg VReg VecOpMasking VState) VReg) +(rule (rv_vwaddu_wv vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VwadduWV) vs2 vs1 mask vstate)) + +;; Helper for emitting the `vwaddu.wx` instruction. +;; +;; Widening integer add, 2*SEW = 2*SEW + SEW +(decl rv_vwaddu_wx (VReg XReg VecOpMasking VState) VReg) +(rule (rv_vwaddu_wx vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VwadduWX) vs2 vs1 mask vstate)) + +;; Helper for emitting the `vsub.vv` instruction. +(decl rv_vsub_vv (VReg VReg VecOpMasking VState) VReg) +(rule (rv_vsub_vv vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VsubVV) vs2 vs1 mask vstate)) + +;; Helper for emitting the `vsub.vx` instruction. +(decl rv_vsub_vx (VReg XReg VecOpMasking VState) VReg) +(rule (rv_vsub_vx vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VsubVX) vs2 vs1 mask vstate)) + +;; Helper for emitting the `vrsub.vx` instruction. +(decl rv_vrsub_vx (VReg XReg VecOpMasking VState) VReg) +(rule (rv_vrsub_vx vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VrsubVX) vs2 vs1 mask vstate)) + +;; Helper for emitting the `vwsub.vv` instruction. +;; +;; Widening integer sub, 2*SEW = SEW + SEW +(decl rv_vwsub_vv (VReg VReg VecOpMasking VState) VReg) +(rule (rv_vwsub_vv vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VwsubVV) vs2 vs1 mask vstate)) + +;; Helper for emitting the `vwsub.vx` instruction. +;; +;; Widening integer sub, 2*SEW = SEW + SEW +(decl rv_vwsub_vx (VReg XReg VecOpMasking VState) VReg) +(rule (rv_vwsub_vx vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VwsubVX) vs2 vs1 mask vstate)) + +;; Helper for emitting the `vwsub.wv` instruction. +;; +;; Widening integer sub, 2*SEW = 2*SEW + SEW +(decl rv_vwsub_wv (VReg VReg VecOpMasking VState) VReg) +(rule (rv_vwsub_wv vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VwsubWV) vs2 vs1 mask vstate)) + +;; Helper for emitting the `vwsub.wx` instruction. +;; +;; Widening integer sub, 2*SEW = 2*SEW + SEW +(decl rv_vwsub_wx (VReg XReg VecOpMasking VState) VReg) +(rule (rv_vwsub_wx vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VwsubWX) vs2 vs1 mask vstate)) + +;; Helper for emitting the `vwsubu.vv` instruction. +;; +;; Widening unsigned integer sub, 2*SEW = SEW + SEW +(decl rv_vwsubu_vv (VReg VReg VecOpMasking VState) VReg) +(rule (rv_vwsubu_vv vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VwsubuVV) vs2 vs1 mask vstate)) + +;; Helper for emitting the `vwsubu.vv` instruction. +;; +;; Widening unsigned integer sub, 2*SEW = SEW + SEW +(decl rv_vwsubu_vx (VReg XReg VecOpMasking VState) VReg) +(rule (rv_vwsubu_vx vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VwsubuVX) vs2 vs1 mask vstate)) + +;; Helper for emitting the `vwsubu.wv` instruction. +;; +;; Widening integer sub, 2*SEW = 2*SEW + SEW +(decl rv_vwsubu_wv (VReg VReg VecOpMasking VState) VReg) +(rule (rv_vwsubu_wv vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VwsubuWV) vs2 vs1 mask vstate)) + +;; Helper for emitting the `vwsubu.wx` instruction. +;; +;; Widening integer sub, 2*SEW = 2*SEW + SEW +(decl rv_vwsubu_wx (VReg XReg VecOpMasking VState) VReg) +(rule (rv_vwsubu_wx vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VwsubuWX) vs2 vs1 mask vstate)) + +;; Helper for emitting the `vssub.vv` instruction. +(decl rv_vssub_vv (VReg VReg VecOpMasking VState) VReg) +(rule (rv_vssub_vv vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VssubVV) vs2 vs1 mask vstate)) + +;; Helper for emitting the `vssub.vx` instruction. 
+(decl rv_vssub_vx (VReg XReg VecOpMasking VState) VReg) +(rule (rv_vssub_vx vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VssubVX) vs2 vs1 mask vstate)) + +;; Helper for emitting the `vssubu.vv` instruction. +(decl rv_vssubu_vv (VReg VReg VecOpMasking VState) VReg) +(rule (rv_vssubu_vv vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VssubuVV) vs2 vs1 mask vstate)) + +;; Helper for emitting the `vssubu.vx` instruction. +(decl rv_vssubu_vx (VReg XReg VecOpMasking VState) VReg) +(rule (rv_vssubu_vx vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VssubuVX) vs2 vs1 mask vstate)) + +;; Helper for emitting the `vneg.v` pseudo-instruction. +(decl rv_vneg_v (VReg VecOpMasking VState) VReg) +(rule (rv_vneg_v vs2 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VrsubVX) vs2 (zero_reg) mask vstate)) + +;; Helper for emitting the `vrsub.vi` instruction. +(decl rv_vrsub_vi (VReg Imm5 VecOpMasking VState) VReg) +(rule (rv_vrsub_vi vs2 imm mask vstate) + (vec_alu_rr_imm5 (VecAluOpRRImm5.VrsubVI) vs2 imm mask vstate)) + +;; Helper for emitting the `vmul.vv` instruction. +(decl rv_vmul_vv (VReg VReg VecOpMasking VState) VReg) +(rule (rv_vmul_vv vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VmulVV) vs2 vs1 mask vstate)) + +;; Helper for emitting the `vmul.vx` instruction. +(decl rv_vmul_vx (VReg XReg VecOpMasking VState) VReg) +(rule (rv_vmul_vx vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VmulVX) vs2 vs1 mask vstate)) + +;; Helper for emitting the `vmulh.vv` instruction. +(decl rv_vmulh_vv (VReg VReg VecOpMasking VState) VReg) +(rule (rv_vmulh_vv vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VmulhVV) vs2 vs1 mask vstate)) + +;; Helper for emitting the `vmulh.vx` instruction. +(decl rv_vmulh_vx (VReg XReg VecOpMasking VState) VReg) +(rule (rv_vmulh_vx vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VmulhVX) vs2 vs1 mask vstate)) + +;; Helper for emitting the `vmulhu.vv` instruction. +(decl rv_vmulhu_vv (VReg VReg VecOpMasking VState) VReg) +(rule (rv_vmulhu_vv vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VmulhuVV) vs2 vs1 mask vstate)) + +;; Helper for emitting the `vmulhu.vx` instruction. +(decl rv_vmulhu_vx (VReg XReg VecOpMasking VState) VReg) +(rule (rv_vmulhu_vx vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VmulhuVX) vs2 vs1 mask vstate)) + +;; Helper for emitting the `vsmul.vv` instruction. +;; +;; Signed saturating and rounding fractional multiply +;; # vd[i] = clip(roundoff_signed(vs2[i]*vs1[i], SEW-1)) +(decl rv_vsmul_vv (VReg VReg VecOpMasking VState) VReg) +(rule (rv_vsmul_vv vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VsmulVV) vs2 vs1 mask vstate)) + +;; Helper for emitting the `vsmul.vx` instruction. +;; +;; Signed saturating and rounding fractional multiply +;; # vd[i] = clip(roundoff_signed(vs2[i]*x[rs1], SEW-1)) +(decl rv_vsmul_vx (VReg XReg VecOpMasking VState) VReg) +(rule (rv_vsmul_vx vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VsmulVX) vs2 vs1 mask vstate)) + +;; Helper for emitting the `vmacc.vv` instruction. +;; +;; Integer multiply-add, overwrite addend +;; # vd[i] = +(vs1[i] * vs2[i]) + vd[i] +(decl rv_vmacc_vv (VReg VReg VReg VecOpMasking VState) VReg) +(rule (rv_vmacc_vv vd vs2 vs1 mask vstate) + (vec_alu_rrrr (VecAluOpRRRR.VmaccVV) vd vs2 vs1 mask vstate)) + +;; Helper for emitting the `vmacc.vx` instruction. 
+;; +;; Integer multiply-add, overwrite addend +;; # vd[i] = +(x[rs1] * vs2[i]) + vd[i] +(decl rv_vmacc_vx (VReg VReg XReg VecOpMasking VState) VReg) +(rule (rv_vmacc_vx vd vs2 vs1 mask vstate) + (vec_alu_rrrr (VecAluOpRRRR.VmaccVX) vd vs2 vs1 mask vstate)) + +;; Helper for emitting the `vnmsac.vv` instruction. +;; +;; Integer multiply-sub, overwrite minuend +;; # vd[i] = -(vs1[i] * vs2[i]) + vd[i] +(decl rv_vnmsac_vv (VReg VReg VReg VecOpMasking VState) VReg) +(rule (rv_vnmsac_vv vd vs2 vs1 mask vstate) + (vec_alu_rrrr (VecAluOpRRRR.VnmsacVV) vd vs2 vs1 mask vstate)) + +;; Helper for emitting the `vnmsac.vx` instruction. +;; +;; Integer multiply-sub, overwrite minuend +;; # vd[i] = -(x[rs1] * vs2[i]) + vd[i] +(decl rv_vnmsac_vx (VReg VReg XReg VecOpMasking VState) VReg) +(rule (rv_vnmsac_vx vd vs2 vs1 mask vstate) + (vec_alu_rrrr (VecAluOpRRRR.VnmsacVX) vd vs2 vs1 mask vstate)) + +;; Helper for emitting the `sll.vv` instruction. +(decl rv_vsll_vv (VReg VReg VecOpMasking VState) VReg) +(rule (rv_vsll_vv vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VsllVV) vs2 vs1 mask vstate)) + +;; Helper for emitting the `sll.vx` instruction. +(decl rv_vsll_vx (VReg XReg VecOpMasking VState) VReg) +(rule (rv_vsll_vx vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VsllVX) vs2 vs1 mask vstate)) + +;; Helper for emitting the `vsll.vi` instruction. +(decl rv_vsll_vi (VReg UImm5 VecOpMasking VState) VReg) +(rule (rv_vsll_vi vs2 imm mask vstate) + (vec_alu_rr_uimm5 (VecAluOpRRImm5.VsllVI) vs2 imm mask vstate)) + +;; Helper for emitting the `srl.vv` instruction. +(decl rv_vsrl_vv (VReg VReg VecOpMasking VState) VReg) +(rule (rv_vsrl_vv vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VsrlVV) vs2 vs1 mask vstate)) + +;; Helper for emitting the `srl.vx` instruction. +(decl rv_vsrl_vx (VReg XReg VecOpMasking VState) VReg) +(rule (rv_vsrl_vx vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VsrlVX) vs2 vs1 mask vstate)) + +;; Helper for emitting the `vsrl.vi` instruction. +(decl rv_vsrl_vi (VReg UImm5 VecOpMasking VState) VReg) +(rule (rv_vsrl_vi vs2 imm mask vstate) + (vec_alu_rr_uimm5 (VecAluOpRRImm5.VsrlVI) vs2 imm mask vstate)) + +;; Helper for emitting the `sra.vv` instruction. +(decl rv_vsra_vv (VReg VReg VecOpMasking VState) VReg) +(rule (rv_vsra_vv vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VsraVV) vs2 vs1 mask vstate)) + +;; Helper for emitting the `sra.vx` instruction. +(decl rv_vsra_vx (VReg XReg VecOpMasking VState) VReg) +(rule (rv_vsra_vx vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VsraVX) vs2 vs1 mask vstate)) + +;; Helper for emitting the `vsra.vi` instruction. +(decl rv_vsra_vi (VReg UImm5 VecOpMasking VState) VReg) +(rule (rv_vsra_vi vs2 imm mask vstate) + (vec_alu_rr_uimm5 (VecAluOpRRImm5.VsraVI) vs2 imm mask vstate)) + +;; Helper for emitting the `vand.vv` instruction. +(decl rv_vand_vv (VReg VReg VecOpMasking VState) VReg) +(rule (rv_vand_vv vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VandVV) vs2 vs1 mask vstate)) + +;; Helper for emitting the `vand.vx` instruction. +(decl rv_vand_vx (VReg XReg VecOpMasking VState) VReg) +(rule (rv_vand_vx vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VandVX) vs2 vs1 mask vstate)) + +;; Helper for emitting the `vand.vi` instruction. +(decl rv_vand_vi (VReg Imm5 VecOpMasking VState) VReg) +(rule (rv_vand_vi vs2 imm mask vstate) + (vec_alu_rr_imm5 (VecAluOpRRImm5.VandVI) vs2 imm mask vstate)) + +;; Helper for emitting the `vor.vv` instruction. 
+(decl rv_vor_vv (VReg VReg VecOpMasking VState) VReg) +(rule (rv_vor_vv vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VorVV) vs2 vs1 mask vstate)) + +;; Helper for emitting the `vor.vx` instruction. +(decl rv_vor_vx (VReg XReg VecOpMasking VState) VReg) +(rule (rv_vor_vx vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VorVX) vs2 vs1 mask vstate)) + +;; Helper for emitting the `vor.vi` instruction. +(decl rv_vor_vi (VReg Imm5 VecOpMasking VState) VReg) +(rule (rv_vor_vi vs2 imm mask vstate) + (vec_alu_rr_imm5 (VecAluOpRRImm5.VorVI) vs2 imm mask vstate)) + +;; Helper for emitting the `vxor.vv` instruction. +(decl rv_vxor_vv (VReg VReg VecOpMasking VState) VReg) +(rule (rv_vxor_vv vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VxorVV) vs2 vs1 mask vstate)) + +;; Helper for emitting the `vxor.vx` instruction. +(decl rv_vxor_vx (VReg XReg VecOpMasking VState) VReg) +(rule (rv_vxor_vx vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VxorVX) vs2 vs1 mask vstate)) + +;; Helper for emitting the `vxor.vi` instruction. +(decl rv_vxor_vi (VReg Imm5 VecOpMasking VState) VReg) +(rule (rv_vxor_vi vs2 imm mask vstate) + (vec_alu_rr_imm5 (VecAluOpRRImm5.VxorVI) vs2 imm mask vstate)) + +;; Helper for emitting the `vssrl.vi` instruction. +;; +;; vd[i] = (unsigned(vs2[i]) >> imm) + r +;; +;; `r` here is the rounding mode currently selected. +(decl rv_vssrl_vi (VReg UImm5 VecOpMasking VState) VReg) +(rule (rv_vssrl_vi vs2 imm mask vstate) + (vec_alu_rr_uimm5 (VecAluOpRRImm5.VssrlVI) vs2 imm mask vstate)) + +;; Helper for emitting the `vnot.v` instruction. +;; This is just a mnemonic for `vxor.vi vd, vs, -1` +(decl rv_vnot_v (VReg VecOpMasking VState) VReg) +(rule (rv_vnot_v vs2 mask vstate) + (if-let neg1 (i8_to_imm5 -1)) + (rv_vxor_vi vs2 neg1 mask vstate)) + +;; Helper for emitting the `vmax.vv` instruction. +(decl rv_vmax_vv (VReg VReg VecOpMasking VState) VReg) +(rule (rv_vmax_vv vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VmaxVV) vs2 vs1 mask vstate)) + +;; Helper for emitting the `vmax.vx` instruction. +(decl rv_vmax_vx (VReg XReg VecOpMasking VState) VReg) +(rule (rv_vmax_vx vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VmaxVX) vs2 vs1 mask vstate)) + +;; Helper for emitting the `vmin.vv` instruction. +(decl rv_vmin_vv (VReg VReg VecOpMasking VState) VReg) +(rule (rv_vmin_vv vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VminVV) vs2 vs1 mask vstate)) + +;; Helper for emitting the `vmin.vx` instruction. +(decl rv_vmin_vx (VReg XReg VecOpMasking VState) VReg) +(rule (rv_vmin_vx vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VminVX) vs2 vs1 mask vstate)) + +;; Helper for emitting the `vmaxu.vv` instruction. +(decl rv_vmaxu_vv (VReg VReg VecOpMasking VState) VReg) +(rule (rv_vmaxu_vv vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VmaxuVV) vs2 vs1 mask vstate)) + +;; Helper for emitting the `vmaxu.vx` instruction. +(decl rv_vmaxu_vx (VReg XReg VecOpMasking VState) VReg) +(rule (rv_vmaxu_vx vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VmaxuVX) vs2 vs1 mask vstate)) + +;; Helper for emitting the `vminu.vv` instruction. +(decl rv_vminu_vv (VReg VReg VecOpMasking VState) VReg) +(rule (rv_vminu_vv vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VminuVV) vs2 vs1 mask vstate)) + +;; Helper for emitting the `vminu.vx` instruction. +(decl rv_vminu_vx (VReg XReg VecOpMasking VState) VReg) +(rule (rv_vminu_vx vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VminuVX) vs2 vs1 mask vstate)) + +;; Helper for emitting the `vfadd.vv` instruction. 
+(decl rv_vfadd_vv (VReg VReg VecOpMasking VState) VReg) +(rule (rv_vfadd_vv vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VfaddVV) vs2 vs1 mask vstate)) + +;; Helper for emitting the `vfadd.vf` instruction. +(decl rv_vfadd_vf (VReg FReg VecOpMasking VState) VReg) +(rule (rv_vfadd_vf vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VfaddVF) vs2 vs1 mask vstate)) + +;; Helper for emitting the `vfsub.vv` instruction. +(decl rv_vfsub_vv (VReg VReg VecOpMasking VState) VReg) +(rule (rv_vfsub_vv vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VfsubVV) vs2 vs1 mask vstate)) + +;; Helper for emitting the `vfsub.vf` instruction. +(decl rv_vfsub_vf (VReg FReg VecOpMasking VState) VReg) +(rule (rv_vfsub_vf vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VfsubVF) vs2 vs1 mask vstate)) + +;; Helper for emitting the `vfrsub.vf` instruction. +(decl rv_vfrsub_vf (VReg FReg VecOpMasking VState) VReg) +(rule (rv_vfrsub_vf vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VfrsubVF) vs2 vs1 mask vstate)) + +;; Helper for emitting the `vfmul.vv` instruction. +(decl rv_vfmul_vv (VReg VReg VecOpMasking VState) VReg) +(rule (rv_vfmul_vv vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VfmulVV) vs2 vs1 mask vstate)) + +;; Helper for emitting the `vfmul.vf` instruction. +(decl rv_vfmul_vf (VReg FReg VecOpMasking VState) VReg) +(rule (rv_vfmul_vf vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VfmulVF) vs2 vs1 mask vstate)) + +;; Helper for emitting the `vfmacc.vv` instruction. +;; +;; FP multiply-accumulate, overwrites addend +;; # vd[i] = +(vs1[i] * vs2[i]) + vd[i] +(decl rv_vfmacc_vv (VReg VReg VReg VecOpMasking VState) VReg) +(rule (rv_vfmacc_vv vd vs2 vs1 mask vstate) + (vec_alu_rrrr (VecAluOpRRRR.VfmaccVV) vd vs2 vs1 mask vstate)) + +;; Helper for emitting the `vfmacc.vf` instruction. +;; +;; FP multiply-accumulate, overwrites addend +;; # vd[i] = +(f[rs1] * vs2[i]) + vd[i] +(decl rv_vfmacc_vf (VReg VReg FReg VecOpMasking VState) VReg) +(rule (rv_vfmacc_vf vd vs2 vs1 mask vstate) + (vec_alu_rrrr (VecAluOpRRRR.VfmaccVF) vd vs2 vs1 mask vstate)) + +;; Helper for emitting the `vfnmacc.vv` instruction. +;; +;; FP negate-(multiply-accumulate), overwrites subtrahend +;; # vd[i] = -(vs1[i] * vs2[i]) - vd[i] +(decl rv_vfnmacc_vv (VReg VReg VReg VecOpMasking VState) VReg) +(rule (rv_vfnmacc_vv vd vs2 vs1 mask vstate) + (vec_alu_rrrr (VecAluOpRRRR.VfnmaccVV) vd vs2 vs1 mask vstate)) + +;; Helper for emitting the `vfnmacc.vf` instruction. +;; +;; FP negate-(multiply-accumulate), overwrites subtrahend +;; # vd[i] = -(f[rs1] * vs2[i]) - vd[i] +(decl rv_vfnmacc_vf (VReg VReg FReg VecOpMasking VState) VReg) +(rule (rv_vfnmacc_vf vd vs2 vs1 mask vstate) + (vec_alu_rrrr (VecAluOpRRRR.VfnmaccVF) vd vs2 vs1 mask vstate)) + +;; Helper for emitting the `vfmsac.vv` instruction. +;; +;; FP multiply-subtract-accumulator, overwrites subtrahend +;; # vd[i] = +(vs1[i] * vs2[i]) - vd[i] +(decl rv_vfmsac_vv (VReg VReg VReg VecOpMasking VState) VReg) +(rule (rv_vfmsac_vv vd vs2 vs1 mask vstate) + (vec_alu_rrrr (VecAluOpRRRR.VfmsacVV) vd vs2 vs1 mask vstate)) + +;; Helper for emitting the `vfmsac.vf` instruction. +;; +;; FP multiply-subtract-accumulator, overwrites subtrahend +;; # vd[i] = +(f[rs1] * vs2[i]) - vd[i] +(decl rv_vfmsac_vf (VReg VReg FReg VecOpMasking VState) VReg) +(rule (rv_vfmsac_vf vd vs2 vs1 mask vstate) + (vec_alu_rrrr (VecAluOpRRRR.VfmsacVF) vd vs2 vs1 mask vstate)) + +;; Helper for emitting the `vfnmsac.vv` instruction. 
+;; +;; FP negate-(multiply-subtract-accumulator), overwrites minuend +;; # vd[i] = -(vs1[i] * vs2[i]) + vd[i] +(decl rv_vfnmsac_vv (VReg VReg VReg VecOpMasking VState) VReg) +(rule (rv_vfnmsac_vv vd vs2 vs1 mask vstate) + (vec_alu_rrrr (VecAluOpRRRR.VfnmsacVV) vd vs2 vs1 mask vstate)) + +;; Helper for emitting the `vfnmsac.vf` instruction. +;; +;; FP negate-(multiply-subtract-accumulator), overwrites minuend +;; # vd[i] = -(f[rs1] * vs2[i]) + vd[i] +(decl rv_vfnmsac_vf (VReg VReg FReg VecOpMasking VState) VReg) +(rule (rv_vfnmsac_vf vd vs2 vs1 mask vstate) + (vec_alu_rrrr (VecAluOpRRRR.VfnmsacVF) vd vs2 vs1 mask vstate)) + +;; Helper for emitting the `vfdiv.vv` instruction. +(decl rv_vfdiv_vv (VReg VReg VecOpMasking VState) VReg) +(rule (rv_vfdiv_vv vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VfdivVV) vs2 vs1 mask vstate)) + +;; Helper for emitting the `vfdiv.vf` instruction. +(decl rv_vfdiv_vf (VReg FReg VecOpMasking VState) VReg) +(rule (rv_vfdiv_vf vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VfdivVF) vs2 vs1 mask vstate)) + +;; Helper for emitting the `vfrdiv.vf` instruction. +(decl rv_vfrdiv_vf (VReg FReg VecOpMasking VState) VReg) +(rule (rv_vfrdiv_vf vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VfrdivVF) vs2 vs1 mask vstate)) + +;; Helper for emitting the `vfmin.vv` instruction. +(decl rv_vfmin_vv (VReg VReg VecOpMasking VState) VReg) +(rule (rv_vfmin_vv vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VfminVV) vs2 vs1 mask vstate)) + +;; Helper for emitting the `vfmax.vv` instruction. +(decl rv_vfmax_vv (VReg VReg VecOpMasking VState) VReg) +(rule (rv_vfmax_vv vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VfmaxVV) vs2 vs1 mask vstate)) + +;; Helper for emitting the `vfsgnj.vv` ("Floating Point Sign Injection") instruction. +;; The output of this instruction is `vs2` with the sign bit from `vs1` +(decl rv_vfsgnj_vv (VReg VReg VecOpMasking VState) VReg) +(rule (rv_vfsgnj_vv vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VfsgnjVV) vs2 vs1 mask vstate)) + +;; Helper for emitting the `vfsgnj.vf` ("Floating Point Sign Injection") instruction. +(decl rv_vfsgnj_vf (VReg FReg VecOpMasking VState) VReg) +(rule (rv_vfsgnj_vf vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VfsgnjVF) vs2 vs1 mask vstate)) + +;; Helper for emitting the `vfsgnjn.vv` ("Floating Point Sign Injection Negated") instruction. +;; The output of this instruction is `vs2` with the negated sign bit from `vs1` +(decl rv_vfsgnjn_vv (VReg VReg VecOpMasking VState) VReg) +(rule (rv_vfsgnjn_vv vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VfsgnjnVV) vs2 vs1 mask vstate)) + +;; Helper for emitting the `vfneg.v` instruction. +;; This instruction is a mnemonic for `vfsgnjn.vv vd, vs, vs` +(decl rv_vfneg_v (VReg VecOpMasking VState) VReg) +(rule (rv_vfneg_v vs mask vstate) (rv_vfsgnjn_vv vs vs mask vstate)) + +;; Helper for emitting the `vfsgnjx.vv` ("Floating Point Sign Injection Exclusive") instruction. +;; The output of this instruction is `vs2` with the XOR of the sign bits from `vs2` and `vs1`. +;; When `vs2 == vs1` this implements `fabs` +(decl rv_vfsgnjx_vv (VReg VReg VecOpMasking VState) VReg) +(rule (rv_vfsgnjx_vv vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VfsgnjxVV) vs2 vs1 mask vstate)) + +;; Helper for emitting the `vfabs.v` instruction. +;; This instruction is a mnemonic for `vfsgnjx.vv vd, vs, vs` +(decl rv_vfabs_v (VReg VecOpMasking VState) VReg) +(rule (rv_vfabs_v vs mask vstate) (rv_vfsgnjx_vv vs vs mask vstate)) + +;; Helper for emitting the `vfsqrt.v` instruction. 
+;; This instruction computes the element-wise square root of the source vector. +(decl rv_vfsqrt_v (VReg VecOpMasking VState) VReg) +(rule (rv_vfsqrt_v vs mask vstate) + (vec_alu_rr (VecAluOpRR.VfsqrtV) vs mask vstate)) + +;; Helper for emitting the `vfcvt.xu.f.v` instruction. +;; This instruction converts a float to an unsigned integer. +(decl rv_vfcvt_xu_f_v (VReg VecOpMasking VState) VReg) +(rule (rv_vfcvt_xu_f_v vs mask vstate) + (vec_alu_rr (VecAluOpRR.VfcvtxufV) vs mask vstate)) + +;; Helper for emitting the `vfcvt.x.f.v` instruction. +;; This instruction converts a float to a signed integer. +(decl rv_vfcvt_x_f_v (VReg VecOpMasking VState) VReg) +(rule (rv_vfcvt_x_f_v vs mask vstate) + (vec_alu_rr (VecAluOpRR.VfcvtxfV) vs mask vstate)) + +;; Helper for emitting the `vfcvt.rtz.xu.f.v` instruction. +;; This instruction converts a float to an unsigned integer +;; using the Round to Zero (RTZ) rounding mode and ignoring +;; the currently set FRM rounding mode. +(decl rv_vfcvt_rtz_xu_f_v (VReg VecOpMasking VState) VReg) +(rule (rv_vfcvt_rtz_xu_f_v vs mask vstate) + (vec_alu_rr (VecAluOpRR.VfcvtrtzxufV) vs mask vstate)) + +;; Helper for emitting the `vfcvt.rtz.x.f.v` instruction. +;; This instruction converts a float to a signed integer +;; using the Round to Zero (RTZ) rounding mode and ignoring +;; the currently set FRM rounding mode. +(decl rv_vfcvt_rtz_x_f_v (VReg VecOpMasking VState) VReg) +(rule (rv_vfcvt_rtz_x_f_v vs mask vstate) + (vec_alu_rr (VecAluOpRR.VfcvtrtzxfV) vs mask vstate)) + +;; Helper for emitting the `vfcvt.f.xu.v` instruction. +;; This instruction converts an unsigned integer to a float. +(decl rv_vfcvt_f_xu_v (VReg VecOpMasking VState) VReg) +(rule (rv_vfcvt_f_xu_v vs mask vstate) + (vec_alu_rr (VecAluOpRR.VfcvtfxuV) vs mask vstate)) + +;; Helper for emitting the `vfcvt.f.x.v` instruction. +;; This instruction converts a signed integer to a float. +(decl rv_vfcvt_f_x_v (VReg VecOpMasking VState) VReg) +(rule (rv_vfcvt_f_x_v vs mask vstate) + (vec_alu_rr (VecAluOpRR.VfcvtfxV) vs mask vstate)) + +;; Helper for emitting the `vfwcvt.f.f.v` instruction. +;; Convert single-width float to double-width float. +(decl rv_vfwcvt_f_f_v (VReg VecOpMasking VState) VReg) +(rule (rv_vfwcvt_f_f_v vs mask vstate) + (vec_alu_rr (VecAluOpRR.VfwcvtffV) vs mask vstate)) + +;; Helper for emitting the `vfncvt.f.f.w` instruction. +;; Convert double-width float to single-width float. +(decl rv_vfncvt_f_f_w (VReg VecOpMasking VState) VReg) +(rule (rv_vfncvt_f_f_w vs mask vstate) + (vec_alu_rr (VecAluOpRR.VfncvtffW) vs mask vstate)) + +;; Helper for emitting the `vslidedown.vx` instruction. +;; `vslidedown` moves all elements in the vector down by n elements. +;; The topmost elements are handled according to the tail policy. +(decl rv_vslidedown_vx (VReg XReg VecOpMasking VState) VReg) +(rule (rv_vslidedown_vx vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VslidedownVX) vs2 vs1 mask vstate)) + +;; Helper for emitting the `vslidedown.vi` instruction. +;; Unlike other `vi` instructions, the immediate is zero extended. +(decl rv_vslidedown_vi (VReg UImm5 VecOpMasking VState) VReg) +(rule (rv_vslidedown_vi vs2 imm mask vstate) + (vec_alu_rr_uimm5 (VecAluOpRRImm5.VslidedownVI) vs2 imm mask vstate)) + +;; Helper for emitting the `vslideup.vi` instruction. +;; Unlike other `vi` instructions, the immediate is zero extended. +;; This is implemented as a 2 source operand instruction, since it only +;; partially modifies the destination register. 
+(decl rv_vslideup_vvi (VReg VReg UImm5 VecOpMasking VState) VReg) +(rule (rv_vslideup_vvi vd vs2 imm mask vstate) + (vec_alu_rrr_uimm5 (VecAluOpRRRImm5.VslideupVI) vd vs2 imm mask vstate)) + +;; Helper for emitting the `vslide1up.vx` instruction. +;; +;; # vd[0]=x[rs1], vd[i+1] = vs2[i] +(decl rv_vslide1up_vx (VReg VReg XReg VecOpMasking VState) VReg) +(rule (rv_vslide1up_vx vd vs2 rs1 mask vstate) + (vec_alu_rrrr (VecAluOpRRRR.Vslide1upVX) vd vs2 rs1 mask vstate)) + +;; Helper for emitting the `vmv.x.s` instruction. +;; This instruction copies the first element of the source vector to the destination X register. +;; Masked versions of this instruction are not supported. +(decl rv_vmv_xs (VReg VState) XReg) +(rule (rv_vmv_xs vs vstate) + (vec_alu_rr (VecAluOpRR.VmvXS) vs (unmasked) vstate)) + +;; Helper for emitting the `vfmv.f.s` instruction. +;; This instruction copies the first element of the source vector to the destination F register. +;; Masked versions of this instruction are not supported. +(decl rv_vfmv_fs (VReg VState) FReg) +(rule (rv_vfmv_fs vs vstate) + (vec_alu_rr (VecAluOpRR.VfmvFS) vs (unmasked) vstate)) + +;; Helper for emitting the `vmv.s.x` instruction. +;; This instruction copies the source X register into the first element of the destination vector. +;; Masked versions of this instruction are not supported. +(decl rv_vmv_sx (XReg VState) VReg) +(rule (rv_vmv_sx vs vstate) + (vec_alu_rr (VecAluOpRR.VmvSX) vs (unmasked) vstate)) + +;; Helper for emitting the `vfmv.s.f` instruction. +;; This instruction copies the source F register into the first element of the destination vector. +;; Masked versions of this instruction are not supported. +(decl rv_vfmv_sf (FReg VState) VReg) +(rule (rv_vfmv_sf vs vstate) + (vec_alu_rr (VecAluOpRR.VfmvSF) vs (unmasked) vstate)) + +;; Helper for emitting the `vmv.v.x` instruction. +;; This instruction splats the X register into all elements of the destination vector. +;; Masked versions of this instruction are called `vmerge`. +(decl rv_vmv_vx (XReg VState) VReg) +(rule (rv_vmv_vx vs vstate) + (vec_alu_rr (VecAluOpRR.VmvVX) vs (unmasked) vstate)) + +;; Helper for emitting the `vfmv.v.f` instruction. +;; This instruction splats the F register into all elements of the destination vector. +;; Masked versions of this instruction are called `vmerge`. +(decl rv_vfmv_vf (FReg VState) VReg) +(rule (rv_vfmv_vf vs vstate) + (vec_alu_rr (VecAluOpRR.VfmvVF) vs (unmasked) vstate)) + +;; Helper for emitting the `vmv.v.i` instruction. +;; This instruction splats the immediate value into all elements of the destination vector. +;; Masked versions of this instruction are called `vmerge`. +(decl rv_vmv_vi (Imm5 VState) VReg) +(rule (rv_vmv_vi imm vstate) + (vec_alu_r_imm5 (VecAluOpRImm5.VmvVI) imm (unmasked) vstate)) + +;; Helper for emitting the `vmerge.vvm` instruction. +;; This instruction merges the elements of the two source vectors into the destination vector +;; based on a mask. Elements are taken from the first source vector if the mask bit is clear, +;; and from the second source vector if the mask bit is set. This instruction is always masked. +;; +;; vd[i] = v0.mask[i] ? vs1[i] : vs2[i] +(decl rv_vmerge_vvm (VReg VReg VReg VState) VReg) +(rule (rv_vmerge_vvm vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VmergeVVM) vs2 vs1 (masked mask) vstate)) + +;; Helper for emitting the `vmerge.vxm` instruction. +;; Elements are taken from the first source vector if the mask bit is clear, and from the X +;; register if the mask bit is set. This instruction is always masked. 
+;; +;; vd[i] = v0.mask[i] ? x[rs1] : vs2[i] +(decl rv_vmerge_vxm (VReg XReg VReg VState) VReg) +(rule (rv_vmerge_vxm vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VmergeVXM) vs2 vs1 (masked mask) vstate)) + +;; Helper for emitting the `vfmerge.vfm` instruction. +;; Elements are taken from the first source vector if the mask bit is clear, and from the F +;; register if the mask bit is set. This instruction is always masked. +;; +;; vd[i] = v0.mask[i] ? f[rs1] : vs2[i] +(decl rv_vfmerge_vfm (VReg FReg VReg VState) VReg) +(rule (rv_vfmerge_vfm vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VfmergeVFM) vs2 vs1 (masked mask) vstate)) + +;; Helper for emitting the `vmerge.vim` instruction. +;; Elements are taken from the first source vector if the mask bit is clear, and from the +;; immediate value if the mask bit is set. This instruction is always masked. +;; +;; vd[i] = v0.mask[i] ? imm : vs2[i] +(decl rv_vmerge_vim (VReg Imm5 VReg VState) VReg) +(rule (rv_vmerge_vim vs2 imm mask vstate) + (vec_alu_rr_imm5 (VecAluOpRRImm5.VmergeVIM) vs2 imm (masked mask) vstate)) + + +;; Helper for emitting the `vredminu.vs` instruction. +;; +;; vd[0] = minu( vs1[0] , vs2[*] ) +(decl rv_vredminu_vs (VReg VReg VecOpMasking VState) VReg) +(rule (rv_vredminu_vs vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VredminuVS) vs2 vs1 mask vstate)) + +;; Helper for emitting the `vredmaxu.vs` instruction. +;; +;; vd[0] = maxu( vs1[0] , vs2[*] ) +(decl rv_vredmaxu_vs (VReg VReg VecOpMasking VState) VReg) +(rule (rv_vredmaxu_vs vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VredmaxuVS) vs2 vs1 mask vstate)) + +;; Helper for emitting the `vrgather.vv` instruction. +;; +;; vd[i] = (vs1[i] >= VLMAX) ? 0 : vs2[vs1[i]]; +(decl rv_vrgather_vv (VReg VReg VecOpMasking VState) VReg) +(rule (rv_vrgather_vv vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VrgatherVV) vs2 vs1 mask vstate)) + +;; Helper for emitting the `vrgather.vx` instruction. +;; +;; vd[i] = (x[rs1] >= VLMAX) ? 0 : vs2[x[rs1]] +(decl rv_vrgather_vx (VReg XReg VecOpMasking VState) VReg) +(rule (rv_vrgather_vx vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VrgatherVX) vs2 vs1 mask vstate)) + +;; Helper for emitting the `vrgather.vi` instruction. +(decl rv_vrgather_vi (VReg UImm5 VecOpMasking VState) VReg) +(rule (rv_vrgather_vi vs2 imm mask vstate) + (vec_alu_rr_uimm5 (VecAluOpRRImm5.VrgatherVI) vs2 imm mask vstate)) + +;; Helper for emitting the `vcompress.vm` instruction. +;; +;; The vector compress instruction allows elements selected by a vector mask +;; register from a source vector register group to be packed into contiguous +;; elements at the start of the destination vector register group. +;; +;; The mask register is specified through vs1 +(decl rv_vcompress_vm (VReg VReg VState) VReg) +(rule (rv_vcompress_vm vs2 vs1 vstate) + (vec_alu_rrr (VecAluOpRRR.VcompressVM) vs2 vs1 (unmasked) vstate)) + +;; Helper for emitting the `vmseq.vv` (Vector Mask Set If Equal) instruction. +(decl rv_vmseq_vv (VReg VReg VecOpMasking VState) VReg) +(rule (rv_vmseq_vv vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VmseqVV) vs2 vs1 mask vstate)) + +;; Helper for emitting the `vmseq.vx` (Vector Mask Set If Equal) instruction. +(decl rv_vmseq_vx (VReg XReg VecOpMasking VState) VReg) +(rule (rv_vmseq_vx vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VmseqVX) vs2 vs1 mask vstate)) + +;; Helper for emitting the `vmseq.vi` (Vector Mask Set If Equal) instruction. 
+(decl rv_vmseq_vi (VReg Imm5 VecOpMasking VState) VReg) +(rule (rv_vmseq_vi vs2 imm mask vstate) + (vec_alu_rr_imm5 (VecAluOpRRImm5.VmseqVI) vs2 imm mask vstate)) + +;; Helper for emitting the `vmsne.vv` (Vector Mask Set If Not Equal) instruction. +(decl rv_vmsne_vv (VReg VReg VecOpMasking VState) VReg) +(rule (rv_vmsne_vv vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VmsneVV) vs2 vs1 mask vstate)) + +;; Helper for emitting the `vmsne.vx` (Vector Mask Set If Not Equal) instruction. +(decl rv_vmsne_vx (VReg XReg VecOpMasking VState) VReg) +(rule (rv_vmsne_vx vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VmsneVX) vs2 vs1 mask vstate)) + +;; Helper for emitting the `vmsne.vi` (Vector Mask Set If Not Equal) instruction. +(decl rv_vmsne_vi (VReg Imm5 VecOpMasking VState) VReg) +(rule (rv_vmsne_vi vs2 imm mask vstate) + (vec_alu_rr_imm5 (VecAluOpRRImm5.VmsneVI) vs2 imm mask vstate)) + +;; Helper for emitting the `vmsltu.vv` (Vector Mask Set If Less Than, Unsigned) instruction. +(decl rv_vmsltu_vv (VReg VReg VecOpMasking VState) VReg) +(rule (rv_vmsltu_vv vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VmsltuVV) vs2 vs1 mask vstate)) + +;; Helper for emitting the `vmsltu.vx` (Vector Mask Set If Less Than, Unsigned) instruction. +(decl rv_vmsltu_vx (VReg XReg VecOpMasking VState) VReg) +(rule (rv_vmsltu_vx vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VmsltuVX) vs2 vs1 mask vstate)) + +;; Helper for emitting the `vmslt.vv` (Vector Mask Set If Less Than) instruction. +(decl rv_vmslt_vv (VReg VReg VecOpMasking VState) VReg) +(rule (rv_vmslt_vv vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VmsltVV) vs2 vs1 mask vstate)) + +;; Helper for emitting the `vmslt.vx` (Vector Mask Set If Less Than) instruction. +(decl rv_vmslt_vx (VReg XReg VecOpMasking VState) VReg) +(rule (rv_vmslt_vx vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VmsltVX) vs2 vs1 mask vstate)) + +;; Helper for emitting the `vmsleu.vv` (Vector Mask Set If Less Than or Equal, Unsigned) instruction. +(decl rv_vmsleu_vv (VReg VReg VecOpMasking VState) VReg) +(rule (rv_vmsleu_vv vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VmsleuVV) vs2 vs1 mask vstate)) + +;; Helper for emitting the `vmsleu.vx` (Vector Mask Set If Less Than or Equal, Unsigned) instruction. +(decl rv_vmsleu_vx (VReg XReg VecOpMasking VState) VReg) +(rule (rv_vmsleu_vx vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VmsleuVX) vs2 vs1 mask vstate)) + +;; Helper for emitting the `vmsleu.vi` (Vector Mask Set If Less Than or Equal, Unsigned) instruction. +(decl rv_vmsleu_vi (VReg Imm5 VecOpMasking VState) VReg) +(rule (rv_vmsleu_vi vs2 imm mask vstate) + (vec_alu_rr_imm5 (VecAluOpRRImm5.VmsleuVI) vs2 imm mask vstate)) + +;; Helper for emitting the `vmsle.vv` (Vector Mask Set If Less Than or Equal) instruction. +(decl rv_vmsle_vv (VReg VReg VecOpMasking VState) VReg) +(rule (rv_vmsle_vv vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VmsleVV) vs2 vs1 mask vstate)) + +;; Helper for emitting the `vmsle.vx` (Vector Mask Set If Less Than or Equal) instruction. +(decl rv_vmsle_vx (VReg XReg VecOpMasking VState) VReg) +(rule (rv_vmsle_vx vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VmsleVX) vs2 vs1 mask vstate)) + +;; Helper for emitting the `vmsle.vi` (Vector Mask Set If Less Than or Equal) instruction. +(decl rv_vmsle_vi (VReg Imm5 VecOpMasking VState) VReg) +(rule (rv_vmsle_vi vs2 imm mask vstate) + (vec_alu_rr_imm5 (VecAluOpRRImm5.VmsleVI) vs2 imm mask vstate)) + +;; Helper for emitting the `vmsgt.vv` (Vector Mask Set If Greater Than, Unsigned) instruction. 
+;; This is an alias for `vmsltu.vv` with the operands inverted. +(decl rv_vmsgtu_vv (VReg VReg VecOpMasking VState) VReg) +(rule (rv_vmsgtu_vv vs2 vs1 mask vstate) (rv_vmsltu_vv vs1 vs2 mask vstate)) + +;; Helper for emitting the `vmsgtu.vx` (Vector Mask Set If Greater Than, Unsigned) instruction. +(decl rv_vmsgtu_vx (VReg XReg VecOpMasking VState) VReg) +(rule (rv_vmsgtu_vx vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VmsgtuVX) vs2 vs1 mask vstate)) + +;; Helper for emitting the `vmsgtu.vi` (Vector Mask Set If Greater Than, Unsigned) instruction. +(decl rv_vmsgtu_vi (VReg Imm5 VecOpMasking VState) VReg) +(rule (rv_vmsgtu_vi vs2 imm mask vstate) + (vec_alu_rr_imm5 (VecAluOpRRImm5.VmsgtuVI) vs2 imm mask vstate)) + +;; Helper for emitting the `vmsgt.vv` (Vector Mask Set If Greater Than) instruction. +;; This is an alias for `vmslt.vv` with the operands inverted. +(decl rv_vmsgt_vv (VReg VReg VecOpMasking VState) VReg) +(rule (rv_vmsgt_vv vs2 vs1 mask vstate) (rv_vmslt_vv vs1 vs2 mask vstate)) + +;; Helper for emitting the `vmsgt.vx` (Vector Mask Set If Greater Than) instruction. +(decl rv_vmsgt_vx (VReg XReg VecOpMasking VState) VReg) +(rule (rv_vmsgt_vx vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VmsgtVX) vs2 vs1 mask vstate)) + +;; Helper for emitting the `vmsgt.vi` (Vector Mask Set If Greater Than) instruction. +(decl rv_vmsgt_vi (VReg Imm5 VecOpMasking VState) VReg) +(rule (rv_vmsgt_vi vs2 imm mask vstate) + (vec_alu_rr_imm5 (VecAluOpRRImm5.VmsgtVI) vs2 imm mask vstate)) + +;; Helper for emitting the `vmsgeu.vv` (Vector Mask Set If Greater Than or Equal, Unsigned) instruction. +;; This is an alias for `vmsleu.vv` with the operands inverted. +(decl rv_vmsgeu_vv (VReg VReg VecOpMasking VState) VReg) +(rule (rv_vmsgeu_vv vs2 vs1 mask vstate) (rv_vmsleu_vv vs1 vs2 mask vstate)) + +;; Helper for emitting the `vmsge.vv` (Vector Mask Set If Greater Than or Equal) instruction. +;; This is an alias for `vmsle.vv` with the operands inverted. +(decl rv_vmsge_vv (VReg VReg VecOpMasking VState) VReg) +(rule (rv_vmsge_vv vs2 vs1 mask vstate) (rv_vmsle_vv vs1 vs2 mask vstate)) + +;; Helper for emitting the `vmfeq.vv` (Vector Mask Set If Float Equal) instruction. +(decl rv_vmfeq_vv (VReg VReg VecOpMasking VState) VReg) +(rule (rv_vmfeq_vv vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VmfeqVV) vs2 vs1 mask vstate)) + +;; Helper for emitting the `vmfeq.vf` (Vector Mask Set If Float Equal) instruction. +(decl rv_vmfeq_vf (VReg FReg VecOpMasking VState) VReg) +(rule (rv_vmfeq_vf vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VmfeqVF) vs2 vs1 mask vstate)) + +;; Helper for emitting the `vmfne.vv` (Vector Mask Set If Float Not Equal) instruction. +(decl rv_vmfne_vv (VReg VReg VecOpMasking VState) VReg) +(rule (rv_vmfne_vv vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VmfneVV) vs2 vs1 mask vstate)) + +;; Helper for emitting the `vmfne.vf` (Vector Mask Set If Float Not Equal) instruction. +(decl rv_vmfne_vf (VReg FReg VecOpMasking VState) VReg) +(rule (rv_vmfne_vf vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VmfneVF) vs2 vs1 mask vstate)) + +;; Helper for emitting the `vmflt.vv` (Vector Mask Set If Float Less Than) instruction. +(decl rv_vmflt_vv (VReg VReg VecOpMasking VState) VReg) +(rule (rv_vmflt_vv vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VmfltVV) vs2 vs1 mask vstate)) + +;; Helper for emitting the `vmflt.vf` (Vector Mask Set If Float Less Than) instruction. 
+(decl rv_vmflt_vf (VReg FReg VecOpMasking VState) VReg) +(rule (rv_vmflt_vf vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VmfltVF) vs2 vs1 mask vstate)) + +;; Helper for emitting the `vmfle.vv` (Vector Mask Set If Float Less Than Or Equal) instruction. +(decl rv_vmfle_vv (VReg VReg VecOpMasking VState) VReg) +(rule (rv_vmfle_vv vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VmfleVV) vs2 vs1 mask vstate)) + +;; Helper for emitting the `vmfle.vf` (Vector Mask Set If Float Less Than Or Equal) instruction. +(decl rv_vmfle_vf (VReg FReg VecOpMasking VState) VReg) +(rule (rv_vmfle_vf vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VmfleVF) vs2 vs1 mask vstate)) + +;; Helper for emitting the `vmfgt.vv` (Vector Mask Set If Float Greater Than) instruction. +;; This is an alias for `vmflt.vv` with the operands inverted. +(decl rv_vmfgt_vv (VReg VReg VecOpMasking VState) VReg) +(rule (rv_vmfgt_vv vs2 vs1 mask vstate) (rv_vmflt_vv vs1 vs2 mask vstate)) + +;; Helper for emitting the `vmfgt.vf` (Vector Mask Set If Float Greater Than) instruction. +(decl rv_vmfgt_vf (VReg FReg VecOpMasking VState) VReg) +(rule (rv_vmfgt_vf vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VmfgtVF) vs2 vs1 mask vstate)) + +;; Helper for emitting the `vmfge.vv` (Vector Mask Set If Float Greater Than Or Equal) instruction. +;; This is an alias for `vmfle.vv` with the operands inverted. +(decl rv_vmfge_vv (VReg VReg VecOpMasking VState) VReg) +(rule (rv_vmfge_vv vs2 vs1 mask vstate) (rv_vmfle_vv vs1 vs2 mask vstate)) + +;; Helper for emitting the `vmfge.vf` (Vector Mask Set If Float Greater Than Or Equal) instruction. +(decl rv_vmfge_vf (VReg FReg VecOpMasking VState) VReg) +(rule (rv_vmfge_vf vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VmfgeVF) vs2 vs1 mask vstate)) + +;; Helper for emitting the `vzext.vf2` instruction. +;; Zero-extend SEW/2 source to SEW destination +(decl rv_vzext_vf2 (VReg VecOpMasking VState) VReg) +(rule (rv_vzext_vf2 vs mask vstate) + (vec_alu_rr (VecAluOpRR.VzextVF2) vs mask vstate)) + +;; Helper for emitting the `vzext.vf4` instruction. +;; Zero-extend SEW/4 source to SEW destination +(decl rv_vzext_vf4 (VReg VecOpMasking VState) VReg) +(rule (rv_vzext_vf4 vs mask vstate) + (vec_alu_rr (VecAluOpRR.VzextVF4) vs mask vstate)) + +;; Helper for emitting the `vzext.vf8` instruction. +;; Zero-extend SEW/8 source to SEW destination +(decl rv_vzext_vf8 (VReg VecOpMasking VState) VReg) +(rule (rv_vzext_vf8 vs mask vstate) + (vec_alu_rr (VecAluOpRR.VzextVF8) vs mask vstate)) + +;; Helper for emitting the `vsext.vf2` instruction. +;; Sign-extend SEW/2 source to SEW destination +(decl rv_vsext_vf2 (VReg VecOpMasking VState) VReg) +(rule (rv_vsext_vf2 vs mask vstate) + (vec_alu_rr (VecAluOpRR.VsextVF2) vs mask vstate)) + +;; Helper for emitting the `vsext.vf4` instruction. +;; Sign-extend SEW/4 source to SEW destination +(decl rv_vsext_vf4 (VReg VecOpMasking VState) VReg) +(rule (rv_vsext_vf4 vs mask vstate) + (vec_alu_rr (VecAluOpRR.VsextVF4) vs mask vstate)) + +;; Helper for emitting the `vsext.vf8` instruction. +;; Sign-extend SEW/8 source to SEW destination +(decl rv_vsext_vf8 (VReg VecOpMasking VState) VReg) +(rule (rv_vsext_vf8 vs mask vstate) + (vec_alu_rr (VecAluOpRR.VsextVF8) vs mask vstate)) + +;; Helper for emitting the `vnclip.wi` instruction. 
+;; +;; vd[i] = clip(roundoff_signed(vs2[i], uimm)) +(decl rv_vnclip_wi (VReg UImm5 VecOpMasking VState) VReg) +(rule (rv_vnclip_wi vs2 imm mask vstate) + (vec_alu_rr_uimm5 (VecAluOpRRImm5.VnclipWI) vs2 imm mask vstate)) + +;; Helper for emitting the `vnclipu.wi` instruction. +;; +;; vd[i] = clip(roundoff_unsigned(vs2[i], uimm)) +(decl rv_vnclipu_wi (VReg UImm5 VecOpMasking VState) VReg) +(rule (rv_vnclipu_wi vs2 imm mask vstate) + (vec_alu_rr_uimm5 (VecAluOpRRImm5.VnclipuWI) vs2 imm mask vstate)) + +;; Helper for emitting the `vmand.mm` (Mask Bitwise AND) instruction. +;; +;; vd.mask[i] = vs2.mask[i] && vs1.mask[i] +(decl rv_vmand_mm (VReg VReg VState) VReg) +(rule (rv_vmand_mm vs2 vs1 vstate) + (vec_alu_rrr (VecAluOpRRR.VmandMM) vs2 vs1 (unmasked) vstate)) + +;; Helper for emitting the `vmor.mm` (Mask Bitwise OR) instruction. +;; +;; vd.mask[i] = vs2.mask[i] || vs1.mask[i] +(decl rv_vmor_mm (VReg VReg VState) VReg) +(rule (rv_vmor_mm vs2 vs1 vstate) + (vec_alu_rrr (VecAluOpRRR.VmorMM) vs2 vs1 (unmasked) vstate)) + +;; Helper for emitting the `vmnand.mm` (Mask Bitwise NAND) instruction. +;; +;; vd.mask[i] = !(vs2.mask[i] && vs1.mask[i]) +(decl rv_vmnand_mm (VReg VReg VState) VReg) +(rule (rv_vmnand_mm vs2 vs1 vstate) + (vec_alu_rrr (VecAluOpRRR.VmnandMM) vs2 vs1 (unmasked) vstate)) + +;; Helper for emitting the `vmnot.m` (Mask Bitwise NOT) instruction. +;; This is an alias for `vmnand.mm vd, vs, vs` +;; +;; vd.mask[i] = !vs.mask[i] +(decl rv_vmnot_m (VReg VState) VReg) +(rule (rv_vmnot_m vs vstate) (rv_vmnand_mm vs vs vstate)) + +;; Helper for emitting the `vmnor.mm` (Mask Bitwise NOR) instruction. +;; +;; vd.mask[i] = !(vs2.mask[i] || vs1.mask[i]) +(decl rv_vmnor_mm (VReg VReg VState) VReg) +(rule (rv_vmnor_mm vs2 vs1 vstate) + (vec_alu_rrr (VecAluOpRRR.VmnorMM) vs2 vs1 (unmasked) vstate)) + +;;;; Multi-Instruction Helpers ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(decl gen_extractlane (Type VReg u8) Reg) + +;; When extracting lane 0 for floats, we can use `vfmv.f.s` directly. +(rule 3 (gen_extractlane (ty_vec_fits_in_register ty) src 0) + (if (ty_vector_float ty)) + (rv_vfmv_fs src ty)) + +;; When extracting lane 0 for integers, we can use `vmv.x.s` directly. +(rule 2 (gen_extractlane (ty_vec_fits_in_register ty) src 0) + (if (ty_vector_not_float ty)) + (rv_vmv_xs src ty)) + +;; In the general case, we must first use a `vslidedown` to place the correct lane +;; in index 0, and then use the appropriate `vmv` instruction. +;; If the index fits into a 5-bit immediate, we can emit a `vslidedown.vi`. +(rule 1 (gen_extractlane (ty_vec_fits_in_register ty) src (uimm5_from_u8 idx)) + (gen_extractlane ty (rv_vslidedown_vi src idx (unmasked) ty) 0)) + +;; Otherwise lower it into an X register. +(rule 0 (gen_extractlane (ty_vec_fits_in_register ty) src idx) + (gen_extractlane ty (rv_vslidedown_vx src (imm $I64 idx) (unmasked) ty) 0)) + + +;; Build a vector mask from a u64 +;; TODO(#6571): We should merge this with the `vconst` rules, and take advantage of +;; the other existing `vconst` rules. +(decl gen_vec_mask (u64) VReg) + +;; When the immediate fits in a 5-bit immediate, we can use `vmv.v.i` directly. +(rule 1 (gen_vec_mask (imm5_from_u64 imm)) + (rv_vmv_vi imm (vstate_from_type $I64X2))) + +;; Materialize the mask into an X register, and move it into the bottom of +;; the vector register. +(rule 0 (gen_vec_mask mask) + (rv_vmv_sx (imm $I64 mask) (vstate_from_type $I64X2))) + + +;; Loads a `VCodeConstant` value into a vector register. 
For some special `VCodeConstant`s +;; we can use a dedicated instruction, otherwise we load the value from the pool. +;; +;; Type is the preferred type to use when loading the constant. +(decl gen_constant (Type VCodeConstant) VReg) + +;; The fallback case is to load the constant from the pool. +(rule (gen_constant ty n) + (vec_load + (element_width_from_type ty) + (VecAMode.UnitStride (gen_const_amode n)) + (mem_flags_trusted) + (unmasked) + ty)) + + +;; Emits a vslidedown instruction that moves half the lanes down. +(decl gen_slidedown_half (Type VReg) VReg) + +;; If the lane count can fit in a 5-bit immediate, we can use `vslidedown.vi`. +(rule 1 (gen_slidedown_half (ty_vec_fits_in_register ty) src) + (if-let (uimm5_from_u64 amt) (u64_udiv (ty_lane_count ty) 2)) + (rv_vslidedown_vi src amt (unmasked) ty)) + +;; Otherwise lower it into an X register. +(rule 0 (gen_slidedown_half (ty_vec_fits_in_register ty) src) + (if-let amt (u64_udiv (ty_lane_count ty) 2)) + (rv_vslidedown_vx src (imm $I64 amt) (unmasked) ty)) + + +;; Expands a mask into SEW wide lanes. Enabled lanes are set to all ones, disabled +;; lanes are set to all zeros. +(decl gen_expand_mask (Type VReg) VReg) +(rule (gen_expand_mask ty mask) + (if-let zero (i8_to_imm5 0)) + (if-let neg1 (i8_to_imm5 -1)) + (rv_vmerge_vim (rv_vmv_vi zero ty) neg1 mask ty)) + + +;; Builds a vector mask corresponding to the IntCC operation. +;; TODO: We are still missing some rules here for immediates. See #6623 +(decl gen_icmp_mask (Type IntCC Value Value) VReg) + +;; IntCC.Equal + +(rule 0 (gen_icmp_mask (ty_vec_fits_in_register ty) (IntCC.Equal) x y) + (rv_vmseq_vv x y (unmasked) ty)) + +(rule 1 (gen_icmp_mask (ty_vec_fits_in_register ty) (IntCC.Equal) x (splat y)) + (rv_vmseq_vx x y (unmasked) ty)) + +(rule 2 (gen_icmp_mask (ty_vec_fits_in_register ty) (IntCC.Equal) (splat x) y) + (rv_vmseq_vx y x (unmasked) ty)) + +(rule 3 (gen_icmp_mask (ty_vec_fits_in_register ty) (IntCC.Equal) x y) + (if-let y_imm (replicated_imm5 y)) + (rv_vmseq_vi x y_imm (unmasked) ty)) + +(rule 4 (gen_icmp_mask (ty_vec_fits_in_register ty) (IntCC.Equal) x y) + (if-let x_imm (replicated_imm5 x)) + (rv_vmseq_vi y x_imm (unmasked) ty)) + +;; IntCC.NotEqual + +(rule 0 (gen_icmp_mask (ty_vec_fits_in_register ty) (IntCC.NotEqual) x y) + (rv_vmsne_vv x y (unmasked) ty)) + +(rule 1 (gen_icmp_mask (ty_vec_fits_in_register ty) (IntCC.NotEqual) x (splat y)) + (rv_vmsne_vx x y (unmasked) ty)) + +(rule 2 (gen_icmp_mask (ty_vec_fits_in_register ty) (IntCC.NotEqual) (splat x) y) + (rv_vmsne_vx y x (unmasked) ty)) + +(rule 3 (gen_icmp_mask (ty_vec_fits_in_register ty) (IntCC.NotEqual) x y) + (if-let y_imm (replicated_imm5 y)) + (rv_vmsne_vi x y_imm (unmasked) ty)) + +(rule 4 (gen_icmp_mask (ty_vec_fits_in_register ty) (IntCC.NotEqual) x y) + (if-let x_imm (replicated_imm5 x)) + (rv_vmsne_vi y x_imm (unmasked) ty)) + +;; IntCC.UnsignedLessThan + +(rule 0 (gen_icmp_mask (ty_vec_fits_in_register ty) (IntCC.UnsignedLessThan) x y) + (rv_vmsltu_vv x y (unmasked) ty)) + +(rule 1 (gen_icmp_mask (ty_vec_fits_in_register ty) (IntCC.UnsignedLessThan) x (splat y)) + (rv_vmsltu_vx x y (unmasked) ty)) + +(rule 2 (gen_icmp_mask (ty_vec_fits_in_register ty) (IntCC.UnsignedLessThan) (splat x) y) + (rv_vmsgtu_vx y x (unmasked) ty)) + +(rule 4 (gen_icmp_mask (ty_vec_fits_in_register ty) (IntCC.UnsignedLessThan) x y) + (if-let x_imm (replicated_imm5 x)) + (rv_vmsgtu_vi y x_imm (unmasked) ty)) + +;; IntCC.SignedLessThan + +(rule 0 (gen_icmp_mask (ty_vec_fits_in_register ty) (IntCC.SignedLessThan) x y) + 
(rv_vmslt_vv x y (unmasked) ty)) + +(rule 1 (gen_icmp_mask (ty_vec_fits_in_register ty) (IntCC.SignedLessThan) x (splat y)) + (rv_vmslt_vx x y (unmasked) ty)) + +(rule 2 (gen_icmp_mask (ty_vec_fits_in_register ty) (IntCC.SignedLessThan) (splat x) y) + (rv_vmsgt_vx y x (unmasked) ty)) + +(rule 4 (gen_icmp_mask (ty_vec_fits_in_register ty) (IntCC.SignedLessThan) x y) + (if-let x_imm (replicated_imm5 x)) + (rv_vmsgt_vi y x_imm (unmasked) ty)) + +;; IntCC.UnsignedLessThanOrEqual + +(rule 0 (gen_icmp_mask (ty_vec_fits_in_register ty) (IntCC.UnsignedLessThanOrEqual) x y) + (rv_vmsleu_vv x y (unmasked) ty)) + +(rule 1 (gen_icmp_mask (ty_vec_fits_in_register ty) (IntCC.UnsignedLessThanOrEqual) x (splat y)) + (rv_vmsleu_vx x y (unmasked) ty)) + +(rule 3 (gen_icmp_mask (ty_vec_fits_in_register ty) (IntCC.UnsignedLessThanOrEqual) x y) + (if-let y_imm (replicated_imm5 y)) + (rv_vmsleu_vi x y_imm (unmasked) ty)) + +;; IntCC.SignedLessThanOrEqual + +(rule 0 (gen_icmp_mask (ty_vec_fits_in_register ty) (IntCC.SignedLessThanOrEqual) x y) + (rv_vmsle_vv x y (unmasked) ty)) + +(rule 1 (gen_icmp_mask (ty_vec_fits_in_register ty) (IntCC.SignedLessThanOrEqual) x (splat y)) + (rv_vmsle_vx x y (unmasked) ty)) + +(rule 3 (gen_icmp_mask (ty_vec_fits_in_register ty) (IntCC.SignedLessThanOrEqual) x y) + (if-let y_imm (replicated_imm5 y)) + (rv_vmsle_vi x y_imm (unmasked) ty)) + +;; IntCC.UnsignedGreaterThan + +(rule 0 (gen_icmp_mask (ty_vec_fits_in_register ty) (IntCC.UnsignedGreaterThan) x y) + (rv_vmsgtu_vv x y (unmasked) ty)) + +(rule 1 (gen_icmp_mask (ty_vec_fits_in_register ty) (IntCC.UnsignedGreaterThan) x (splat y)) + (rv_vmsgtu_vx x y (unmasked) ty)) + +(rule 2 (gen_icmp_mask (ty_vec_fits_in_register ty) (IntCC.UnsignedGreaterThan) (splat x) y) + (rv_vmsltu_vx y x (unmasked) ty)) + +(rule 3 (gen_icmp_mask (ty_vec_fits_in_register ty) (IntCC.UnsignedGreaterThan) x y) + (if-let y_imm (replicated_imm5 y)) + (rv_vmsgtu_vi x y_imm (unmasked) ty)) + +;; IntCC.SignedGreaterThan + +(rule 0 (gen_icmp_mask (ty_vec_fits_in_register ty) (IntCC.SignedGreaterThan) x y) + (rv_vmsgt_vv x y (unmasked) ty)) + +(rule 1 (gen_icmp_mask (ty_vec_fits_in_register ty) (IntCC.SignedGreaterThan) x (splat y)) + (rv_vmsgt_vx x y (unmasked) ty)) + +(rule 2 (gen_icmp_mask (ty_vec_fits_in_register ty) (IntCC.SignedGreaterThan) (splat x) y) + (rv_vmslt_vx y x (unmasked) ty)) + +(rule 3 (gen_icmp_mask (ty_vec_fits_in_register ty) (IntCC.SignedGreaterThan) x y) + (if-let y_imm (replicated_imm5 y)) + (rv_vmsgt_vi x y_imm (unmasked) ty)) + +;; IntCC.UnsignedGreaterThanOrEqual + +(rule 0 (gen_icmp_mask (ty_vec_fits_in_register ty) (IntCC.UnsignedGreaterThanOrEqual) x y) + (rv_vmsgeu_vv x y (unmasked) ty)) + +(rule 2 (gen_icmp_mask (ty_vec_fits_in_register ty) (IntCC.UnsignedGreaterThanOrEqual) (splat x) y) + (rv_vmsleu_vx y x (unmasked) ty)) + +(rule 4 (gen_icmp_mask (ty_vec_fits_in_register ty) (IntCC.UnsignedGreaterThanOrEqual) x y) + (if-let x_imm (replicated_imm5 x)) + (rv_vmsleu_vi y x_imm (unmasked) ty)) + +;; IntCC.SignedGreaterThanOrEqual + +(rule 0 (gen_icmp_mask (ty_vec_fits_in_register ty) (IntCC.SignedGreaterThanOrEqual) x y) + (rv_vmsge_vv x y (unmasked) ty)) + +(rule 2 (gen_icmp_mask (ty_vec_fits_in_register ty) (IntCC.SignedGreaterThanOrEqual) (splat x) y) + (rv_vmsle_vx y x (unmasked) ty)) + +(rule 4 (gen_icmp_mask (ty_vec_fits_in_register ty) (IntCC.SignedGreaterThanOrEqual) x y) + (if-let x_imm (replicated_imm5 x)) + (rv_vmsle_vi y x_imm (unmasked) ty)) + + + +;; Builds a vector mask corresponding to the FloatCC operation. 
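Before the constructor itself, a scalar sketch (in Rust, illustrative only and not part of the patch) of the identities the FloatCC rules below rely on: every base RVV float comparison is false when either operand is NaN, so `Ordered` can be built as `eq(x,x) && eq(y,y)` and the `UnorderedOr*` conditions as negations of the ordered comparisons.

    fn ordered(x: f32, y: f32) -> bool {
        // A comparison involving NaN is always false, so `x == x` means "x is not NaN".
        x == x && y == y
    }

    fn unordered_or_gt(x: f32, y: f32) -> bool {
        // UnorderedOrGreaterThan is the negation of the ordered LessThanOrEqual.
        !(x <= y)
    }

    fn main() {
        assert!(!ordered(f32::NAN, 1.0));
        assert!(unordered_or_gt(f32::NAN, 1.0));
        assert!(unordered_or_gt(2.0, 1.0));
        assert!(!unordered_or_gt(1.0, 2.0));
    }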
+(decl gen_fcmp_mask (Type FloatCC Value Value) VReg) + +;; FloatCC.Equal + +(rule 0 (gen_fcmp_mask (ty_vec_fits_in_register ty) (FloatCC.Equal) x y) + (rv_vmfeq_vv x y (unmasked) ty)) + +(rule 1 (gen_fcmp_mask (ty_vec_fits_in_register ty) (FloatCC.Equal) x (splat y)) + (rv_vmfeq_vf x y (unmasked) ty)) + +(rule 2 (gen_fcmp_mask (ty_vec_fits_in_register ty) (FloatCC.Equal) (splat x) y) + (rv_vmfeq_vf y x (unmasked) ty)) + +;; FloatCC.NotEqual +;; Note: This is UnorderedNotEqual. It is the only unordered comparison that is not named as such. + +(rule 0 (gen_fcmp_mask (ty_vec_fits_in_register ty) (FloatCC.NotEqual) x y) + (rv_vmfne_vv x y (unmasked) ty)) + +(rule 1 (gen_fcmp_mask (ty_vec_fits_in_register ty) (FloatCC.NotEqual) x (splat y)) + (rv_vmfne_vf x y (unmasked) ty)) + +(rule 2 (gen_fcmp_mask (ty_vec_fits_in_register ty) (FloatCC.NotEqual) (splat x) y) + (rv_vmfne_vf y x (unmasked) ty)) + +;; FloatCC.LessThan + +(rule 0 (gen_fcmp_mask (ty_vec_fits_in_register ty) (FloatCC.LessThan) x y) + (rv_vmflt_vv x y (unmasked) ty)) + +(rule 1 (gen_fcmp_mask (ty_vec_fits_in_register ty) (FloatCC.LessThan) x (splat y)) + (rv_vmflt_vf x y (unmasked) ty)) + +(rule 2 (gen_fcmp_mask (ty_vec_fits_in_register ty) (FloatCC.LessThan) (splat x) y) + (rv_vmfgt_vf y x (unmasked) ty)) + +;; FloatCC.LessThanOrEqual + +(rule 0 (gen_fcmp_mask (ty_vec_fits_in_register ty) (FloatCC.LessThanOrEqual) x y) + (rv_vmfle_vv x y (unmasked) ty)) + +(rule 1 (gen_fcmp_mask (ty_vec_fits_in_register ty) (FloatCC.LessThanOrEqual) x (splat y)) + (rv_vmfle_vf x y (unmasked) ty)) + +(rule 2 (gen_fcmp_mask (ty_vec_fits_in_register ty) (FloatCC.LessThanOrEqual) (splat x) y) + (rv_vmfge_vf y x (unmasked) ty)) + +;; FloatCC.GreaterThan + +(rule 0 (gen_fcmp_mask (ty_vec_fits_in_register ty) (FloatCC.GreaterThan) x y) + (rv_vmfgt_vv x y (unmasked) ty)) + +(rule 1 (gen_fcmp_mask (ty_vec_fits_in_register ty) (FloatCC.GreaterThan) x (splat y)) + (rv_vmfgt_vf x y (unmasked) ty)) + +(rule 2 (gen_fcmp_mask (ty_vec_fits_in_register ty) (FloatCC.GreaterThan) (splat x) y) + (rv_vmflt_vf y x (unmasked) ty)) + +;; FloatCC.GreaterThanOrEqual + +(rule 0 (gen_fcmp_mask (ty_vec_fits_in_register ty) (FloatCC.GreaterThanOrEqual) x y) + (rv_vmfge_vv x y (unmasked) ty)) + +(rule 1 (gen_fcmp_mask (ty_vec_fits_in_register ty) (FloatCC.GreaterThanOrEqual) x (splat y)) + (rv_vmfge_vf x y (unmasked) ty)) + +(rule 2 (gen_fcmp_mask (ty_vec_fits_in_register ty) (FloatCC.GreaterThanOrEqual) (splat x) y) + (rv_vmfle_vf y x (unmasked) ty)) + +;; FloatCC.Ordered + +(rule 0 (gen_fcmp_mask (ty_vec_fits_in_register ty) (FloatCC.Ordered) x y) + (rv_vmand_mm + (gen_fcmp_mask ty (FloatCC.Equal) x x) + (gen_fcmp_mask ty (FloatCC.Equal) y y) + ty)) + +;; FloatCC.Unordered + +(rule 0 (gen_fcmp_mask (ty_vec_fits_in_register ty) (FloatCC.Unordered) x y) + (rv_vmor_mm + (gen_fcmp_mask ty (FloatCC.NotEqual) x x) + (gen_fcmp_mask ty (FloatCC.NotEqual) y y) + ty)) + +;; FloatCC.OrderedNotEqual + +(rule 0 (gen_fcmp_mask (ty_vec_fits_in_register ty) (FloatCC.OrderedNotEqual) x y) + (rv_vmor_mm + (gen_fcmp_mask ty (FloatCC.LessThan) x y) + (gen_fcmp_mask ty (FloatCC.LessThan) y x) + ty)) + +;; FloatCC.UnorderedOrEqual + +(rule 0 (gen_fcmp_mask (ty_vec_fits_in_register ty) (FloatCC.UnorderedOrEqual) x y) + (rv_vmnor_mm + (gen_fcmp_mask ty (FloatCC.LessThan) x y) + (gen_fcmp_mask ty (FloatCC.LessThan) y x) + ty)) + +;; FloatCC.UnorderedOrGreaterThan + +(rule 0 (gen_fcmp_mask (ty_vec_fits_in_register ty) (FloatCC.UnorderedOrGreaterThan) x y) + (rv_vmnot_m (gen_fcmp_mask ty 
(FloatCC.LessThanOrEqual) x y) ty)) + +;; FloatCC.UnorderedOrGreaterThanOrEqual + +(rule 0 (gen_fcmp_mask (ty_vec_fits_in_register ty) (FloatCC.UnorderedOrGreaterThanOrEqual) x y) + (rv_vmnot_m (gen_fcmp_mask ty (FloatCC.LessThan) x y) ty)) + +;; FloatCC.UnorderedOrLessThan + +(rule 0 (gen_fcmp_mask (ty_vec_fits_in_register ty) (FloatCC.UnorderedOrLessThan) x y) + (rv_vmnot_m (gen_fcmp_mask ty (FloatCC.GreaterThanOrEqual) x y) ty)) + +;; FloatCC.UnorderedOrLessThanOrEqual + +(rule 0 (gen_fcmp_mask (ty_vec_fits_in_register ty) (FloatCC.UnorderedOrLessThanOrEqual) x y) + (rv_vmnot_m (gen_fcmp_mask ty (FloatCC.GreaterThan) x y) ty)) + + +;; Emits a `vfcvt.x.f.v` instruction with the given rounding mode. +(decl gen_vfcvt_x_f (VReg FRM VState) VReg) + +;; We have a special instruction for RTZ +(rule 1 (gen_vfcvt_x_f x (FRM.RTZ) vstate) + (rv_vfcvt_rtz_x_f_v x (unmasked) vstate)) + +;; In the general case we need to first switch into the appropriate rounding mode. +(rule 0 (gen_vfcvt_x_f x frm vstate) + (let (;; Set the rounding mode and save the current mode + (saved_frm XReg (rv_fsrmi frm)) + (res VReg (rv_vfcvt_x_f_v x (unmasked) vstate)) + ;; Restore the previous rounding mode + (_ Unit (rv_fsrm saved_frm))) + res)) + + +;; Returns the maximum value integer value that can be represented by a float +(decl float_int_max (Type) u64) +(rule (float_int_max $F32) 0x4B000000) +(rule (float_int_max $F64) 0x4330000000000000) + +;; Builds the instruction sequence to round a vector register to FRM +(decl gen_vec_round (VReg FRM Type) VReg) + +;; For floating-point round operations, if the input is NaN, +/-infinity, or +/-0, the +;; same input is returned as the rounded result; this differs from behavior of +;; RISCV fcvt instructions (which round out-of-range values to the nearest +;; max or min value), therefore special handling is needed for these values. +(rule (gen_vec_round x frm (ty_vec_fits_in_register ty)) + (let ((scalar_ty Type (lane_type ty)) + ;; if x is NaN/+-Infinity/+-Zero or if the exponent is larger than # of bits + ;; in mantissa, the result is the same as src, build a mask for those cases. + ;; (There is an additional fixup for NaN's at the end) + (abs VReg (rv_vfabs_v x (unmasked) ty)) + (max FReg (imm scalar_ty (float_int_max scalar_ty))) + (exact VReg (rv_vmflt_vf abs max (unmasked) ty)) + + ;; The rounding is performed by converting from float to integer, with the + ;; desired rounding mode. And then converting back with the default rounding + ;; mode. + (int VReg (gen_vfcvt_x_f x frm ty)) + (cvt VReg (rv_vfcvt_f_x_v int (unmasked) ty)) + ;; Copy the sign bit from the original value. + (signed VReg (rv_vfsgnj_vv cvt x (unmasked) ty)) + + ;; We want to return a arithmetic nan if the input is a canonical nan. + ;; Convert them by adding 0.0 to the input. + (float_zero FReg (gen_bitcast (zero_reg) (float_int_of_same_size scalar_ty) scalar_ty)) + (corrected_nan VReg (rv_vfadd_vf x float_zero (unmasked) ty))) + ;; Merge the original value if it does not need rounding, or the rounded value + (rv_vmerge_vvm corrected_nan signed exact ty))) diff --git a/hbcb/src/lib.rs b/hbcb/src/lib.rs new file mode 100644 index 0000000..6eb5530 --- /dev/null +++ b/hbcb/src/lib.rs @@ -0,0 +1,264 @@ +//! risc-v 64-bit Instruction Set Architecture. 
+
+#![allow(clippy::all)]
+
+extern crate alloc;
+
+use {
+    crate::settings as riscv_settings,
+    alloc::{boxed::Box, vec::Vec},
+    core::fmt,
+    cranelift_codegen::{
+        dominator_tree::DominatorTree,
+        ir::{self, Function, Type},
+        isa::{Builder as IsaBuilder, FunctionAlignment, OwnedTargetIsa, TargetIsa},
+        machinst::{
+            compile, CompiledCode, CompiledCodeStencil, MachInst, MachTextSectionBuilder, Reg,
+            SigSet, TextSectionBuilder, VCode,
+        },
+        result::CodegenResult,
+        settings::{self as shared_settings, Flags},
+        CodegenError,
+    },
+    cranelift_control::ControlPlane,
+    target_lexicon::{Architecture, Triple},
+};
+mod abi;
+pub(crate) mod inst;
+mod lower;
+mod settings;
+use self::inst::EmitInfo;
+#[cfg(feature = "unwind")]
+use crate::isa::unwind::systemv;
+
+/// A riscv64 backend.
+pub struct Riscv64Backend {
+    triple: Triple,
+    flags: shared_settings::Flags,
+    isa_flags: riscv_settings::Flags,
+}
+
+impl Riscv64Backend {
+    /// Create a new riscv64 backend with the given (shared) flags.
+    pub fn new_with_flags(
+        triple: Triple,
+        flags: shared_settings::Flags,
+        isa_flags: riscv_settings::Flags,
+    ) -> Riscv64Backend {
+        Riscv64Backend { triple, flags, isa_flags }
+    }
+
+    /// This performs lowering to VCode, register-allocates the code, computes block layout and
+    /// finalizes branches. The result is ready for binary emission.
+    fn compile_vcode(
+        &self,
+        func: &Function,
+        domtree: &DominatorTree,
+        ctrl_plane: &mut ControlPlane,
+    ) -> CodegenResult<(VCode<inst::Inst>, regalloc2::Output)> {
+        let emit_info = EmitInfo::new(self.flags.clone(), self.isa_flags.clone());
+        let sigs = SigSet::new::<abi::Riscv64MachineDeps>(func, &self.flags)?;
+        let abi = abi::Riscv64Callee::new(func, self, &self.isa_flags, &sigs)?;
+        compile::compile::<Riscv64Backend>(func, domtree, self, abi, emit_info, sigs, ctrl_plane)
+    }
+}
+
+impl TargetIsa for Riscv64Backend {
+    fn compile_function(
+        &self,
+        func: &Function,
+        domtree: &DominatorTree,
+        want_disasm: bool,
+        ctrl_plane: &mut ControlPlane,
+    ) -> CodegenResult<CompiledCodeStencil> {
+        let (vcode, regalloc_result) = self.compile_vcode(func, domtree, ctrl_plane)?;
+
+        let want_disasm = want_disasm || log::log_enabled!(log::Level::Debug);
+        let emit_result = vcode.emit(&regalloc_result, want_disasm, &self.flags, ctrl_plane);
+        let frame_size = emit_result.frame_size;
+        let value_labels_ranges = emit_result.value_labels_ranges;
+        let buffer = emit_result.buffer;
+        let sized_stackslot_offsets = emit_result.sized_stackslot_offsets;
+        let dynamic_stackslot_offsets = emit_result.dynamic_stackslot_offsets;
+
+        if let Some(disasm) = emit_result.disasm.as_ref() {
+            log::debug!("disassembly:\n{}", disasm);
+        }
+
+        Ok(CompiledCodeStencil {
+            buffer,
+            frame_size,
+            vcode: emit_result.disasm,
+            value_labels_ranges,
+            sized_stackslot_offsets,
+            dynamic_stackslot_offsets,
+            bb_starts: emit_result.bb_offsets,
+            bb_edges: emit_result.bb_edges,
+        })
+    }
+
+    fn name(&self) -> &'static str {
+        "riscv64"
+    }
+
+    fn dynamic_vector_bytes(&self, _dynamic_ty: ir::Type) -> u32 {
+        16
+    }
+
+    fn triple(&self) -> &Triple {
+        &self.triple
+    }
+
+    fn flags(&self) -> &shared_settings::Flags {
+        &self.flags
+    }
+
+    fn isa_flags(&self) -> Vec<shared_settings::Value> {
+        self.isa_flags.iter().collect()
+    }
+
+    #[cfg(feature = "unwind")]
+    fn emit_unwind_info(
+        &self,
+        result: &CompiledCode,
+        kind: crate::isa::unwind::UnwindInfoKind,
+    ) -> CodegenResult<Option<crate::isa::unwind::UnwindInfo>> {
+        use crate::isa::unwind::{UnwindInfo, UnwindInfoKind};
+        Ok(match kind {
+            UnwindInfoKind::SystemV => {
+                let mapper = self::inst::unwind::systemv::RegisterMapper;
+                Some(UnwindInfo::SystemV(
+                    crate::isa::unwind::systemv::create_unwind_info_from_insts(
+                        &result.buffer.unwind_info[..],
+                        result.buffer.data().len(),
+                        &mapper,
+                    )?,
+                ))
+            }
+            UnwindInfoKind::Windows => None,
+            _ => None,
+        })
+    }
+
+    #[cfg(feature = "unwind")]
+    fn create_systemv_cie(&self) -> Option<gimli::write::CommonInformationEntry> {
+        Some(inst::unwind::systemv::create_cie())
+    }
+
+    fn text_section_builder(&self, num_funcs: usize) -> Box<dyn TextSectionBuilder> {
+        Box::new(MachTextSectionBuilder::<inst::Inst>::new(num_funcs))
+    }
+
+    #[cfg(feature = "unwind")]
+    fn map_regalloc_reg_to_dwarf(&self, reg: Reg) -> Result<u16, systemv::RegisterMappingError> {
+        inst::unwind::systemv::map_reg(reg).map(|reg| reg.0)
+    }
+
+    fn function_alignment(&self) -> FunctionAlignment {
+        inst::Inst::function_alignment()
+    }
+
+    fn page_size_align_log2(&self) -> u8 {
+        debug_assert_eq!(1 << 12, 0x1000);
+        12
+    }
+
+    #[cfg(feature = "disas")]
+    fn to_capstone(&self) -> Result<capstone::Capstone, capstone::Error> {
+        use capstone::prelude::*;
+        let mut cs_builder = Capstone::new().riscv().mode(arch::riscv::ArchMode::RiscV64);
+
+        // Enable C instruction decoding if we have compressed instructions enabled.
+        //
+        // We can't enable this unconditionally because it will cause Capstone to
+        // emit weird instructions and generally mess up when it encounters unknown
+        // instructions, such as any Zba, Zbb, Zbc or Vector instructions.
+        //
+        // This causes the default disassembly to be quite unreadable, so enable
+        // it only when we are actually going to be using them.
+        let uses_compressed = self
+            .isa_flags()
+            .iter()
+            .filter(|f| ["has_zca", "has_zcb", "has_zcd"].contains(&f.name))
+            .any(|f| f.as_bool().unwrap_or(false));
+        if uses_compressed {
+            cs_builder = cs_builder.extra_mode([arch::riscv::ArchExtraMode::RiscVC].into_iter());
+        }
+
+        let mut cs = cs_builder.build()?;
+
+        // Similar to AArch64, RISC-V uses inline constants rather than a separate
+        // constant pool. We want to skip disassembly over inline constants instead
+        // of stopping on invalid bytes.
+        cs.set_skipdata(true)?;
+        Ok(cs)
+    }
+
+    fn has_native_fma(&self) -> bool {
+        true
+    }
+
+    fn has_x86_blendv_lowering(&self, _: Type) -> bool {
+        false
+    }
+
+    fn has_x86_pshufb_lowering(&self) -> bool {
+        false
+    }
+
+    fn has_x86_pmulhrsw_lowering(&self) -> bool {
+        false
+    }
+
+    fn has_x86_pmaddubsw_lowering(&self) -> bool {
+        false
+    }
+}
+
+impl fmt::Display for Riscv64Backend {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        f.debug_struct("MachBackend")
+            .field("name", &self.name())
+            .field("triple", &self.triple())
+            .field("flags", &format!("{}", self.flags()))
+            .finish()
+    }
+}
+
+/// Create a new `isa::Builder`.
+pub fn isa_builder(triple: Triple) -> IsaBuilder {
+    match triple.architecture {
+        Architecture::Riscv64(..) => {}
+        _ => unreachable!(),
+    }
+    IsaBuilder { triple, setup: riscv_settings::builder(), constructor: isa_constructor }
+}
+
+fn isa_constructor(
+    triple: Triple,
+    shared_flags: Flags,
+    builder: &shared_settings::Builder,
+) -> CodegenResult<OwnedTargetIsa> {
+    let isa_flags = riscv_settings::Flags::new(&shared_flags, builder);
+
+    // The RISC-V backend does not work without at least the G extension enabled.
+    // The G extension is simply a combination of the following extensions:
+    // - I: Base Integer Instruction Set
+    // - M: Integer Multiplication and Division
+    // - A: Atomic Instructions
+    // - F: Single-Precision Floating-Point
+    // - D: Double-Precision Floating-Point
+    // - Zicsr: Control and Status Register Instructions
+    // - Zifencei: Instruction-Fetch Fence
+    //
+    // Ensure that this combination of features is enabled.
+ if !isa_flags.has_g() { + return Err(CodegenError::Unsupported( + "The RISC-V Backend currently requires all the features in the G Extension enabled" + .into(), + )); + } + + let backend = Riscv64Backend::new_with_flags(triple, shared_flags, isa_flags); + Ok(backend.wrapped()) +} diff --git a/hbcb/src/lower.isle b/hbcb/src/lower.isle new file mode 100644 index 0000000..fff894e --- /dev/null +++ b/hbcb/src/lower.isle @@ -0,0 +1,2966 @@ +;; riscv64 instruction selection and CLIF-to-MachInst lowering. + +;; The main lowering constructor term: takes a clif `Inst` and returns the +;; register(s) within which the lowered instruction's result values live. +(decl partial lower (Inst) InstOutput) + +;;;; Rules for `iconst` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(rule (lower (has_type ty (iconst (u64_from_imm64 n)))) + (imm ty n)) + +;; ;;;; Rules for `vconst` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(rule (lower (has_type (ty_supported_vec ty) (vconst n))) + (gen_constant ty (const_to_vconst n))) + +;;;; Rules for `f16const` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(rule (lower (f16const (u16_from_ieee16 n))) + (imm $F16 n)) + +;;;; Rules for `f32const` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(rule (lower (f32const (u32_from_ieee32 n))) + (imm $F32 n)) + +;;;; Rules for `f64const` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(rule (lower (f64const (u64_from_ieee64 n))) + (imm $F64 n)) + +;;;; Rules for `iadd` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; Base case, simply adding things in registers. +(rule -1 (lower (has_type (fits_in_32 (ty_int ty)) (iadd x y))) + (rv_addw x y)) + +(rule 0 (lower (has_type $I64 (iadd x y))) + (rv_add x y)) + +;; Special cases for when one operand is an immediate that fits in 12 bits. +(rule 1 (lower (has_type (ty_int_ref_scalar_64 ty) (iadd x (imm12_from_value y)))) + (alu_rr_imm12 (select_addi ty) x y)) + +(rule 2 (lower (has_type (ty_int_ref_scalar_64 ty) (iadd (imm12_from_value x) y))) + (alu_rr_imm12 (select_addi ty) y x)) + +;; Special case when one of the operands is uextended +;; Needs `Zba` +(rule 3 (lower (has_type $I64 (iadd x (uextend y @ (value_type $I32))))) + (if-let $true (has_zba)) + (rv_adduw y x)) + +(rule 4 (lower (has_type $I64 (iadd (uextend x @ (value_type $I32)) y))) + (if-let $true (has_zba)) + (rv_adduw x y)) + +;; Add with const shift. We have a few of these instructions with `Zba`. +(decl pure partial match_shnadd (Imm64) AluOPRRR) +(rule (match_shnadd (u64_from_imm64 1)) (AluOPRRR.Sh1add)) +(rule (match_shnadd (u64_from_imm64 2)) (AluOPRRR.Sh2add)) +(rule (match_shnadd (u64_from_imm64 3)) (AluOPRRR.Sh3add)) + +(rule 3 (lower (has_type $I64 (iadd x (ishl y (maybe_uextend (iconst n)))))) + (if-let $true (has_zba)) + (if-let shnadd (match_shnadd n)) + (alu_rrr shnadd y x)) + +(rule 4 (lower (has_type $I64 (iadd (ishl x (maybe_uextend (iconst n))) y))) + (if-let $true (has_zba)) + (if-let shnadd (match_shnadd n)) + (alu_rrr shnadd x y)) + + +;; Add with uextended const shift. We have a few of these instructions with `Zba`. +;; +;; !!! Important !!! +;; These rules only work for (ishl (uextend _) _) and not for (uextend (ishl _ _))! +;; Getting this wrong means a potential misscalculation of the shift amount. +;; Additionally we can only ensure that this is correct if the uextend is 32 to 64 bits. 
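A minimal Rust sketch (illustrative, not part of the patch) of why that ordering matters: `shNadd.uw` zero-extends the 32-bit operand first and then shifts, so only `(ishl (uextend x) n)` matches its semantics, while shifting in 32 bits before extending can silently drop the carried-out bit.

    fn sh1add_uw(x: u32, y: u64) -> u64 {
        // What the instruction computes: zero-extend, then shift, then add.
        ((x as u64) << 1).wrapping_add(y)
    }

    fn main() {
        let x: u32 = 0x8000_0000;
        // uextend-then-shift keeps the bit shifted past position 31...
        assert_eq!((x as u64) << 1, 0x1_0000_0000);
        // ...while shift-then-uextend loses it, so the two patterns are not interchangeable.
        assert_eq!((x << 1) as u64, 0);
        assert_eq!(sh1add_uw(x, 5), 0x1_0000_0005);
    }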
+(decl pure partial match_shnadd_uw (Imm64) AluOPRRR) +(rule (match_shnadd_uw (u64_from_imm64 1)) (AluOPRRR.Sh1adduw)) +(rule (match_shnadd_uw (u64_from_imm64 2)) (AluOPRRR.Sh2adduw)) +(rule (match_shnadd_uw (u64_from_imm64 3)) (AluOPRRR.Sh3adduw)) + +(rule 5 (lower (has_type $I64 (iadd x (ishl (uextend y @ (value_type $I32)) (maybe_uextend (iconst n)))))) + (if-let $true (has_zba)) + (if-let shnadd_uw (match_shnadd_uw n)) + (alu_rrr shnadd_uw y x)) + +(rule 6 (lower (has_type $I64 (iadd (ishl (uextend x @ (value_type $I32)) (maybe_uextend (iconst n))) y))) + (if-let $true (has_zba)) + (if-let shnadd_uw (match_shnadd_uw n)) + (alu_rrr shnadd_uw x y)) + +;; I128 cases +(rule 7 (lower (has_type $I128 (iadd x y))) + (let ((low XReg (rv_add (value_regs_get x 0) (value_regs_get y 0))) + ;; compute carry. + (carry XReg (rv_sltu low (value_regs_get y 0))) + ;; + (high_tmp XReg (rv_add (value_regs_get x 1) (value_regs_get y 1))) + ;; add carry. + (high XReg (rv_add high_tmp carry))) + (value_regs low high))) + +;; SIMD Vectors +(rule 8 (lower (has_type (ty_supported_vec ty) (iadd x y))) + (rv_vadd_vv x y (unmasked) ty)) + +(rule 9 (lower (has_type (ty_supported_vec ty) (iadd x (splat y)))) + (rv_vadd_vx x y (unmasked) ty)) + +(rule 10 (lower (has_type (ty_supported_vec ty) (iadd x (splat (sextend y @ (value_type sext_ty)))))) + (if-let half_ty (ty_half_width ty)) + (if-let $true (ty_equal (lane_type half_ty) sext_ty)) + (rv_vwadd_wx x y (unmasked) (vstate_mf2 half_ty))) + +(rule 10 (lower (has_type (ty_supported_vec ty) (iadd x (splat (uextend y @ (value_type uext_ty)))))) + (if-let half_ty (ty_half_width ty)) + (if-let $true (ty_equal (lane_type half_ty) uext_ty)) + (rv_vwaddu_wx x y (unmasked) (vstate_mf2 half_ty))) + +(rule 20 (lower (has_type (ty_supported_vec ty) (iadd x y))) + (if-let y_imm (replicated_imm5 y)) + (rv_vadd_vi x y_imm (unmasked) ty)) + + +(rule 12 (lower (has_type (ty_supported_vec ty) (iadd (splat x) y))) + (rv_vadd_vx y x (unmasked) ty)) + +(rule 13 (lower (has_type (ty_supported_vec ty) (iadd (splat (sextend x @ (value_type sext_ty))) y))) + (if-let half_ty (ty_half_width ty)) + (if-let $true (ty_equal (lane_type half_ty) sext_ty)) + (rv_vwadd_wx y x (unmasked) (vstate_mf2 half_ty))) + +(rule 13 (lower (has_type (ty_supported_vec ty) (iadd (splat (uextend x @ (value_type uext_ty))) y))) + (if-let half_ty (ty_half_width ty)) + (if-let $true (ty_equal (lane_type half_ty) uext_ty)) + (rv_vwaddu_wx y x (unmasked) (vstate_mf2 half_ty))) + +(rule 21 (lower (has_type (ty_supported_vec ty) (iadd x y))) + (if-let x_imm (replicated_imm5 x)) + (rv_vadd_vi y x_imm (unmasked) ty)) + +;; Signed Widening Low Additions + +(rule 9 (lower (has_type (ty_supported_vec _) (iadd x (swiden_low y @ (value_type in_ty))))) + (rv_vwadd_wv x y (unmasked) (vstate_mf2 (ty_half_lanes in_ty)))) + +(rule 12 (lower (has_type (ty_supported_vec _) (iadd (swiden_low x @ (value_type in_ty)) y))) + (rv_vwadd_wv y x (unmasked) (vstate_mf2 (ty_half_lanes in_ty)))) + +(rule 13 (lower (has_type (ty_supported_vec _) (iadd (swiden_low x @ (value_type in_ty)) + (swiden_low y)))) + (rv_vwadd_vv x y (unmasked) (vstate_mf2 (ty_half_lanes in_ty)))) + +(rule 13 (lower (has_type (ty_supported_vec _) (iadd (swiden_low x @ (value_type in_ty)) + (splat (sextend y @ (value_type sext_ty)))))) + (if-let $true (ty_equal (lane_type in_ty) sext_ty)) + (rv_vwadd_vx x y (unmasked) (vstate_mf2 (ty_half_lanes in_ty)))) + +(rule 15 (lower (has_type (ty_supported_vec _) (iadd (splat (sextend x @ (value_type sext_ty))) + (swiden_low y @ 
(value_type in_ty))))) + (if-let $true (ty_equal (lane_type in_ty) sext_ty)) + (rv_vwadd_vx y x (unmasked) (vstate_mf2 (ty_half_lanes in_ty)))) + +;; Signed Widening High Additions +;; These are the same as the low additions, but we first slide down the inputs. + +(rule 9 (lower (has_type (ty_supported_vec _) (iadd x (swiden_high y @ (value_type in_ty))))) + (rv_vwadd_wv x (gen_slidedown_half in_ty y) (unmasked) (vstate_mf2 (ty_half_lanes in_ty)))) + +(rule 12 (lower (has_type (ty_supported_vec _) (iadd (swiden_high x @ (value_type in_ty)) y))) + (rv_vwadd_wv y (gen_slidedown_half in_ty x) (unmasked) (vstate_mf2 (ty_half_lanes in_ty)))) + +(rule 13 (lower (has_type (ty_supported_vec _) (iadd (swiden_high x @ (value_type in_ty)) + (swiden_high y)))) + (rv_vwadd_vv (gen_slidedown_half in_ty x) (gen_slidedown_half in_ty y) (unmasked) (vstate_mf2 (ty_half_lanes in_ty)))) + +(rule 13 (lower (has_type (ty_supported_vec _) (iadd (swiden_high x @ (value_type in_ty)) + (splat (sextend y @ (value_type sext_ty)))))) + (if-let $true (ty_equal (lane_type in_ty) sext_ty)) + (rv_vwadd_vx (gen_slidedown_half in_ty x) y (unmasked) (vstate_mf2 (ty_half_lanes in_ty)))) + +(rule 15 (lower (has_type (ty_supported_vec _) (iadd (splat (sextend x @ (value_type sext_ty))) + (swiden_high y @ (value_type in_ty))))) + (if-let $true (ty_equal (lane_type in_ty) sext_ty)) + (rv_vwadd_vx (gen_slidedown_half in_ty y) x (unmasked) (vstate_mf2 (ty_half_lanes in_ty)))) + +;; Unsigned Widening Low Additions + +(rule 9 (lower (has_type (ty_supported_vec _) (iadd x (uwiden_low y @ (value_type in_ty))))) + (rv_vwaddu_wv x y (unmasked) (vstate_mf2 (ty_half_lanes in_ty)))) + +(rule 12 (lower (has_type (ty_supported_vec _) (iadd (uwiden_low x @ (value_type in_ty)) y))) + (rv_vwaddu_wv y x (unmasked) (vstate_mf2 (ty_half_lanes in_ty)))) + +(rule 13 (lower (has_type (ty_supported_vec _) (iadd (uwiden_low x @ (value_type in_ty)) + (uwiden_low y)))) + (rv_vwaddu_vv x y (unmasked) (vstate_mf2 (ty_half_lanes in_ty)))) + +(rule 13 (lower (has_type (ty_supported_vec _) (iadd (uwiden_low x @ (value_type in_ty)) + (splat (uextend y @ (value_type uext_ty)))))) + (if-let $true (ty_equal (lane_type in_ty) uext_ty)) + (rv_vwaddu_vx x y (unmasked) (vstate_mf2 (ty_half_lanes in_ty)))) + +(rule 15 (lower (has_type (ty_supported_vec _) (iadd (splat (uextend x @ (value_type uext_ty))) + (uwiden_low y @ (value_type in_ty))))) + (if-let $true (ty_equal (lane_type in_ty) uext_ty)) + (rv_vwaddu_vx y x (unmasked) (vstate_mf2 (ty_half_lanes in_ty)))) + +;; Unsigned Widening High Additions +;; These are the same as the low additions, but we first slide down the inputs. 
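A scalar model of that slide-down-then-widen pattern, shown here for a hypothetical I16X8-to-I32X4 unsigned case (illustrative Rust, not part of the patch): sliding the source down by half its lane count moves the upper lanes to index 0, after which the widening add behaves exactly like the low variant.

    fn uwiden_high_add(x: [u16; 8], y: [u32; 4]) -> [u32; 4] {
        // Slide down by half the lanes, then zero-extend each lane and add.
        core::array::from_fn(|i| (x[4 + i] as u32).wrapping_add(y[i]))
    }

    fn main() {
        let x = [0, 0, 0, 0, 1, 2, 3, u16::MAX];
        let y = [10, 10, 10, 10];
        assert_eq!(uwiden_high_add(x, y), [11, 12, 13, 65545]);
    }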
+ +(rule 9 (lower (has_type (ty_supported_vec _) (iadd x (uwiden_high y @ (value_type in_ty))))) + (rv_vwaddu_wv x (gen_slidedown_half in_ty y) (unmasked) (vstate_mf2 (ty_half_lanes in_ty)))) + +(rule 12 (lower (has_type (ty_supported_vec _) (iadd (uwiden_high x @ (value_type in_ty)) y))) + (rv_vwaddu_wv y (gen_slidedown_half in_ty x) (unmasked) (vstate_mf2 (ty_half_lanes in_ty)))) + +(rule 13 (lower (has_type (ty_supported_vec _) (iadd (uwiden_high x @ (value_type in_ty)) + (uwiden_high y)))) + (rv_vwaddu_vv (gen_slidedown_half in_ty x) (gen_slidedown_half in_ty y) (unmasked) (vstate_mf2 (ty_half_lanes in_ty)))) + +(rule 13 (lower (has_type (ty_supported_vec _) (iadd (uwiden_high x @ (value_type in_ty)) + (splat (uextend y @ (value_type uext_ty)))))) + (if-let $true (ty_equal (lane_type in_ty) uext_ty)) + (rv_vwaddu_vx (gen_slidedown_half in_ty x) y (unmasked) (vstate_mf2 (ty_half_lanes in_ty)))) + +(rule 15 (lower (has_type (ty_supported_vec _) (iadd (splat (uextend y @ (value_type uext_ty))) + (uwiden_high x @ (value_type in_ty))))) + (if-let $true (ty_equal (lane_type in_ty) uext_ty)) + (rv_vwaddu_vx (gen_slidedown_half in_ty x) y (unmasked) (vstate_mf2 (ty_half_lanes in_ty)))) + +;; Signed Widening Mixed High/Low Additions + +(rule 13 (lower (has_type (ty_supported_vec _) (iadd (swiden_low x @ (value_type in_ty)) + (swiden_high y)))) + (rv_vwadd_vv x (gen_slidedown_half in_ty y) (unmasked) (vstate_mf2 (ty_half_lanes in_ty)))) + +(rule 13 (lower (has_type (ty_supported_vec _) (iadd (swiden_high x @ (value_type in_ty)) + (swiden_low y)))) + (rv_vwadd_vv (gen_slidedown_half in_ty x) y (unmasked) (vstate_mf2 (ty_half_lanes in_ty)))) + +;; Unsigned Widening Mixed High/Low Additions + +(rule 13 (lower (has_type (ty_supported_vec _) (iadd (uwiden_low x @ (value_type in_ty)) + (uwiden_high y)))) + (rv_vwaddu_vv x (gen_slidedown_half in_ty y) (unmasked) (vstate_mf2 (ty_half_lanes in_ty)))) + +(rule 13 (lower (has_type (ty_supported_vec _) (iadd (uwiden_high x @ (value_type in_ty)) + (uwiden_low y)))) + (rv_vwaddu_vv (gen_slidedown_half in_ty x) y (unmasked) (vstate_mf2 (ty_half_lanes in_ty)))) + +;; Fused Multiply Accumulate Rules `vmacc` +;; +;; I dont think we can use `vmadd`/`vmnsub` here since it just modifies the multiplication +;; register instead of the addition one. The actual pattern matched seems to be +;; exactly the same. 
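A per-lane sketch of what the matched pattern computes (illustrative Rust, not part of the patch): `vmacc` overwrites the addend register with `vd + vs1 * vs2`, which is why the `iadd` operand that is not the product becomes the accumulator in the rules below.

    fn vmacc_lane(vd: i32, vs1: i32, vs2: i32) -> i32 {
        // vd[i] = vd[i] + vs1[i] * vs2[i], with wrapping lane arithmetic.
        vd.wrapping_add(vs1.wrapping_mul(vs2))
    }

    fn main() {
        assert_eq!(vmacc_lane(10, 3, 4), 22);
    }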
+ +(rule 9 (lower (has_type (ty_supported_vec ty) (iadd x (imul y z)))) + (rv_vmacc_vv x y z (unmasked) ty)) + +(rule 10 (lower (has_type (ty_supported_vec ty) (iadd x (imul y (splat z))))) + (rv_vmacc_vx x y z (unmasked) ty)) + +(rule 11 (lower (has_type (ty_supported_vec ty) (iadd x (imul (splat y) z)))) + (rv_vmacc_vx x z y (unmasked) ty)) + +(rule 12 (lower (has_type (ty_supported_vec ty) (iadd (imul x y) z))) + (rv_vmacc_vv z x y (unmasked) ty)) + +(rule 13 (lower (has_type (ty_supported_vec ty) (iadd (imul x (splat y)) z))) + (rv_vmacc_vx z x y (unmasked) ty)) + +(rule 14 (lower (has_type (ty_supported_vec ty) (iadd (imul (splat x) y) z))) + (rv_vmacc_vx z y x (unmasked) ty)) + +;; Fused Multiply Subtract Rules `vnmsac` + +(rule 9 (lower (has_type (ty_supported_vec ty) (iadd x (ineg (imul y z))))) + (rv_vnmsac_vv x y z (unmasked) ty)) + +(rule 10 (lower (has_type (ty_supported_vec ty) (iadd x (ineg (imul y (splat z)))))) + (rv_vnmsac_vx x y z (unmasked) ty)) + +(rule 11 (lower (has_type (ty_supported_vec ty) (iadd x (ineg (imul (splat y) z))))) + (rv_vnmsac_vx x z y (unmasked) ty)) + +(rule 12 (lower (has_type (ty_supported_vec ty) (iadd (ineg (imul x y)) z))) + (rv_vnmsac_vv z x y (unmasked) ty)) + +(rule 13 (lower (has_type (ty_supported_vec ty) (iadd (ineg (imul x (splat y))) z))) + (rv_vnmsac_vx z x y (unmasked) ty)) + +(rule 14 (lower (has_type (ty_supported_vec ty) (iadd (ineg (imul (splat x) y)) z))) + (rv_vnmsac_vx z y x (unmasked) ty)) + +;;; Rules for `uadd_overflow_trap` ;;;;;;;;;;;;; +(rule 0 (lower (has_type (fits_in_32 ty) (uadd_overflow_trap x y tc))) + (let ((tmp_x XReg (zext x)) + (tmp_y XReg (zext y)) + (sum XReg (rv_add tmp_x tmp_y)) + (test XReg (rv_srli sum (imm12_const (ty_bits ty)))) + (_ InstOutput (gen_trapnz test tc))) + sum)) + +(rule 1 (lower (has_type $I64 (uadd_overflow_trap x y tc))) + (let ((tmp XReg (rv_add x y)) + (_ InstOutput (gen_trapif (IntCC.UnsignedLessThan) tmp x tc))) + tmp)) + +;;;; Rules for `isub` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Base case, simply subtracting things in registers. + +(rule 0 (lower (has_type (fits_in_32 (ty_int ty)) (isub x y))) + (rv_subw x y)) + +(rule 1 (lower (has_type $I64 (isub x y))) + (rv_sub x y)) + +(rule 2 (lower (has_type $I128 (isub x y))) + (i128_sub x y)) + +;; Switch to an `addi` by a negative if we can fit the value in an `imm12`. 
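A small sketch of that rewrite before the rule itself (illustrative Rust; `imm12_from_negated_value` is the extractor that performs the real check): `x - y` becomes `addi x, -y` whenever the negated constant fits the signed 12-bit immediate range.

    fn isub_as_addi(x: i64, y: i64) -> Option<i64> {
        // addi takes a signed 12-bit immediate in [-2048, 2047].
        let neg = y.checked_neg()?;
        if (-2048..=2047).contains(&neg) { Some(x.wrapping_add(neg)) } else { None }
    }

    fn main() {
        assert_eq!(isub_as_addi(100, 7), Some(93)); // x - 7 => addi x, x, -7
        assert_eq!(isub_as_addi(100, -2048), None); // 2048 does not fit in imm12
    }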
+(rule 3 (lower (has_type (ty_int_ref_scalar_64 ty) (isub x y))) + (if-let imm12_neg (imm12_from_negated_value y)) + (alu_rr_imm12 (select_addi ty) x imm12_neg)) + +;; SIMD Vectors +(rule 4 (lower (has_type (ty_supported_vec ty) (isub x y))) + (rv_vsub_vv x y (unmasked) ty)) + +(rule 5 (lower (has_type (ty_supported_vec ty) (isub x (splat y)))) + (rv_vsub_vx x y (unmasked) ty)) + +(rule 6 (lower (has_type (ty_supported_vec ty) (isub x (splat (sextend y @ (value_type sext_ty)))))) + (if-let half_ty (ty_half_width ty)) + (if-let $true (ty_equal (lane_type half_ty) sext_ty)) + (rv_vwsub_wx x y (unmasked) (vstate_mf2 half_ty))) + +(rule 6 (lower (has_type (ty_supported_vec ty) (isub x (splat (uextend y @ (value_type uext_ty)))))) + (if-let half_ty (ty_half_width ty)) + (if-let $true (ty_equal (lane_type half_ty) uext_ty)) + (rv_vwsubu_wx x y (unmasked) (vstate_mf2 half_ty))) + +(rule 7 (lower (has_type (ty_supported_vec ty) (isub (splat x) y))) + (rv_vrsub_vx y x (unmasked) ty)) + +(rule 8 (lower (has_type (ty_supported_vec ty) (isub x y))) + (if-let imm5_neg (negated_replicated_imm5 y)) + (rv_vadd_vi x imm5_neg (unmasked) ty)) + +(rule 9 (lower (has_type (ty_supported_vec ty) (isub x y))) + (if-let x_imm (replicated_imm5 x)) + (rv_vrsub_vi y x_imm (unmasked) ty)) + + +;; Signed Widening Low Subtractions + +(rule 6 (lower (has_type (ty_supported_vec _) (isub x (swiden_low y @ (value_type in_ty))))) + (rv_vwsub_wv x y (unmasked) (vstate_mf2 (ty_half_lanes in_ty)))) + +(rule 10 (lower (has_type (ty_supported_vec _) (isub (swiden_low x @ (value_type in_ty)) + (swiden_low y)))) + (rv_vwsub_vv x y (unmasked) (vstate_mf2 (ty_half_lanes in_ty)))) + +(rule 10 (lower (has_type (ty_supported_vec _) (isub (swiden_low x @ (value_type in_ty)) + (splat (sextend y @ (value_type sext_ty)))))) + (if-let $true (ty_equal (lane_type in_ty) sext_ty)) + (rv_vwsub_vx x y (unmasked) (vstate_mf2 (ty_half_lanes in_ty)))) + +;; Signed Widening High Subtractions +;; These are the same as the low widenings, but we first slide down the inputs. + +(rule 6 (lower (has_type (ty_supported_vec _) (isub x (swiden_high y @ (value_type in_ty))))) + (rv_vwsub_wv x (gen_slidedown_half in_ty y) (unmasked) (vstate_mf2 (ty_half_lanes in_ty)))) + +(rule 10 (lower (has_type (ty_supported_vec _) (isub (swiden_high x @ (value_type in_ty)) + (swiden_high y)))) + (rv_vwsub_vv (gen_slidedown_half in_ty x) (gen_slidedown_half in_ty y) (unmasked) (vstate_mf2 (ty_half_lanes in_ty)))) + +(rule 10 (lower (has_type (ty_supported_vec _) (isub (swiden_high x @ (value_type in_ty)) + (splat (sextend y @ (value_type sext_ty)))))) + (if-let $true (ty_equal (lane_type in_ty) sext_ty)) + (rv_vwsub_vx (gen_slidedown_half in_ty x) y (unmasked) (vstate_mf2 (ty_half_lanes in_ty)))) + +;; Unsigned Widening Low Subtractions + +(rule 6 (lower (has_type (ty_supported_vec _) (isub x (uwiden_low y @ (value_type in_ty))))) + (rv_vwsubu_wv x y (unmasked) (vstate_mf2 (ty_half_lanes in_ty)))) + +(rule 10 (lower (has_type (ty_supported_vec _) (isub (uwiden_low x @ (value_type in_ty)) + (uwiden_low y)))) + (rv_vwsubu_vv x y (unmasked) (vstate_mf2 (ty_half_lanes in_ty)))) + +(rule 10 (lower (has_type (ty_supported_vec _) (isub (uwiden_low x @ (value_type in_ty)) + (splat (uextend y @ (value_type uext_ty)))))) + (if-let $true (ty_equal (lane_type in_ty) uext_ty)) + (rv_vwsubu_vx x y (unmasked) (vstate_mf2 (ty_half_lanes in_ty)))) + +;; Unsigned Widening High Subtractions +;; These are the same as the low widenings, but we first slide down the inputs. 
+ +(rule 6 (lower (has_type (ty_supported_vec _) (isub x (uwiden_high y @ (value_type in_ty))))) + (rv_vwsubu_wv x (gen_slidedown_half in_ty y) (unmasked) (vstate_mf2 (ty_half_lanes in_ty)))) + +(rule 10 (lower (has_type (ty_supported_vec _) (isub (uwiden_high x @ (value_type in_ty)) + (uwiden_high y)))) + (rv_vwsubu_vv (gen_slidedown_half in_ty x) (gen_slidedown_half in_ty y) (unmasked) (vstate_mf2 (ty_half_lanes in_ty)))) + +(rule 10 (lower (has_type (ty_supported_vec _) (isub (uwiden_high x @ (value_type in_ty)) + (splat (uextend y @ (value_type uext_ty)))))) + (if-let $true (ty_equal (lane_type in_ty) uext_ty)) + (rv_vwsubu_vx (gen_slidedown_half in_ty x) y (unmasked) (vstate_mf2 (ty_half_lanes in_ty)))) + +;; Signed Widening Mixed High/Low Subtractions + +(rule 10 (lower (has_type (ty_supported_vec _) (isub (swiden_low x @ (value_type in_ty)) + (swiden_high y)))) + (rv_vwsub_vv x (gen_slidedown_half in_ty y) (unmasked) (vstate_mf2 (ty_half_lanes in_ty)))) + +(rule 10 (lower (has_type (ty_supported_vec _) (isub (swiden_high x @ (value_type in_ty)) + (swiden_low y)))) + (rv_vwsub_vv (gen_slidedown_half in_ty x) y (unmasked) (vstate_mf2 (ty_half_lanes in_ty)))) + +;; Unsigned Widening Mixed High/Low Subtractions + +(rule 10 (lower (has_type (ty_supported_vec _) (isub (uwiden_low x @ (value_type in_ty)) + (uwiden_high y)))) + (rv_vwsubu_vv x (gen_slidedown_half in_ty y) (unmasked) (vstate_mf2 (ty_half_lanes in_ty)))) + +(rule 10 (lower (has_type (ty_supported_vec _) (isub (uwiden_high x @ (value_type in_ty)) + (uwiden_low y)))) + (rv_vwsubu_vv (gen_slidedown_half in_ty x) y (unmasked) (vstate_mf2 (ty_half_lanes in_ty)))) + + +;;;; Rules for `ineg` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(rule (lower (has_type (ty_int ty) (ineg val))) + (neg ty val)) + +(rule 1 (lower (has_type (ty_supported_vec ty) (ineg x))) + (rv_vneg_v x (unmasked) ty)) + + +;;;; Rules for `imul` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(rule 0 (lower (has_type (ty_int_ref_scalar_64 ty) (imul x y))) + (rv_mul x y)) + +(rule 1 (lower (has_type (fits_in_32 (ty_int ty)) (imul x y))) + (rv_mulw x y)) + +;; for I128 +(rule 2 (lower (has_type $I128 (imul x y))) + (let + ((x_regs ValueRegs x) + (x_lo XReg (value_regs_get x_regs 0)) + (x_hi XReg (value_regs_get x_regs 1)) + + ;; Get the high/low registers for `y`. + (y_regs ValueRegs y) + (y_lo XReg (value_regs_get y_regs 0)) + (y_hi XReg (value_regs_get y_regs 1)) + + ;; 128bit mul formula: + ;; dst_lo = x_lo * y_lo + ;; dst_hi = mulhu(x_lo, y_lo) + (x_lo * y_hi) + (x_hi * y_lo) + ;; + ;; We can convert the above formula into the following + ;; mulhu dst_hi, x_lo, y_lo + ;; madd dst_hi, x_lo, y_hi, dst_hi + ;; madd dst_hi, x_hi, y_lo, dst_hi + ;; madd dst_lo, x_lo, y_lo, zero + (dst_hi1 XReg (rv_mulhu x_lo y_lo)) + (dst_hi2 XReg (madd x_lo y_hi dst_hi1)) + (dst_hi XReg (madd x_hi y_lo dst_hi2)) + (dst_lo XReg (madd x_lo y_lo (zero_reg)))) + (value_regs dst_lo dst_hi))) + +;; Special case 128-bit multiplication where the operands are extended since +;; that maps directly to the `mulhu` and `mulh` instructions. 
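A sketch of why that works in the unsigned case (illustrative Rust, not part of the patch): when both operands are zero-extended from 64 bits, the low half of the 128-bit product is an ordinary `mul` and the high half is exactly `mulhu`.

    fn mulhu(x: u64, y: u64) -> u64 {
        (((x as u128) * (y as u128)) >> 64) as u64
    }

    fn main() {
        let (x, y) = (u64::MAX, 123);
        let wide = (x as u128) * (y as u128);
        assert_eq!(wide as u64, x.wrapping_mul(y)); // low 64 bits: mul
        assert_eq!((wide >> 64) as u64, mulhu(x, y)); // high 64 bits: mulhu
    }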
+(rule 6 (lower (has_type $I128 (imul (uextend x) (uextend y)))) + (let ((x XReg (zext x)) + (y XReg (zext y))) + (value_regs (rv_mul x y) (rv_mulhu x y)))) + +(rule 6 (lower (has_type $I128 (imul (sextend x) (sextend y)))) + (let ((x XReg (sext x)) + (y XReg (sext y))) + (value_regs (rv_mul x y) (rv_mulh x y)))) + +;; Vector multiplication + +(rule 3 (lower (has_type (ty_supported_vec ty) (imul x y))) + (rv_vmul_vv x y (unmasked) ty)) + +(rule 4 (lower (has_type (ty_supported_vec ty) (imul (splat x) y))) + (rv_vmul_vx y x (unmasked) ty)) + +(rule 5 (lower (has_type (ty_supported_vec ty) (imul x (splat y)))) + (rv_vmul_vx x y (unmasked) ty)) + +;;;; Rules for `smulhi` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +(rule 0 (lower (has_type (ty_int_ref_scalar_64 ty) (smulhi x y))) + (lower_smlhi ty (sext x) (sext y))) + +(rule 1 (lower (has_type (ty_supported_vec ty) (smulhi x y))) + (rv_vmulh_vv x y (unmasked) ty)) + +(rule 2 (lower (has_type (ty_supported_vec ty) (smulhi (splat x) y))) + (rv_vmulh_vx y x (unmasked) ty)) + +(rule 3 (lower (has_type (ty_supported_vec ty) (smulhi x (splat y)))) + (rv_vmulh_vx x y (unmasked) ty)) + +;;;; Rules for `umulhi` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +(rule 0 (lower (has_type (fits_in_32 ty) (umulhi x y))) + (let ((tmp XReg (rv_mul (zext x) (zext y)))) + (rv_srli tmp (imm12_const (ty_bits ty))))) + +(rule 1 (lower (has_type $I64 (umulhi x y))) + (rv_mulhu x y)) + +(rule 2 (lower (has_type (ty_supported_vec ty) (umulhi x y))) + (rv_vmulhu_vv x y (unmasked) ty)) + +(rule 3 (lower (has_type (ty_supported_vec ty) (umulhi (splat x) y))) + (rv_vmulhu_vx y x (unmasked) ty)) + +(rule 4 (lower (has_type (ty_supported_vec ty) (umulhi x (splat y)))) + (rv_vmulhu_vx x y (unmasked) ty)) + +;;;; Rules for `udiv` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(rule 0 (lower (has_type (fits_in_16 ty) (udiv x y))) + (if-let $true (has_m)) + (rv_divuw (zext x) (nonzero_divisor (zext y)))) + +(rule 1 (lower (has_type (fits_in_16 ty) (udiv x y @ (iconst imm)))) + (if-let $true (has_m)) + (if (safe_divisor_from_imm64 ty imm)) + (rv_divuw (zext x) (zext y))) + +(rule 2 (lower (has_type $I32 (udiv x y))) + (if-let $true (has_m)) + (rv_divuw x (nonzero_divisor (zext y)))) + +(rule 3 (lower (has_type $I32 (udiv x y @ (iconst imm)))) + (if-let $true (has_m)) + (if (safe_divisor_from_imm64 $I32 imm)) + (rv_divuw x y)) + +(rule 2 (lower (has_type $I64 (udiv x y))) + (if-let $true (has_m)) + (rv_divu x (nonzero_divisor y))) + +(rule 3 (lower (has_type $I64 (udiv x y @ (iconst imm)))) + (if-let $true (has_m)) + (if (safe_divisor_from_imm64 $I64 imm)) + (rv_divu x y)) + +;; Traps if the input register is zero, otherwise returns the same register. 
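Before the helper itself, a scalar model of the two division traps handled here and in `safe_sdiv_divisor` below (illustrative Rust, not part of the patch): division by zero always traps, and signed division additionally traps on `MIN / -1`; the `(x ^ MIN) | !y` expression used below is zero exactly in that overflow case.

    fn sdiv_traps(x: i64, y: i64) -> bool {
        // Trap when the divisor is zero, or when x == i64::MIN and y == -1.
        y == 0 || ((x ^ i64::MIN) | !y) == 0
    }

    fn main() {
        assert!(sdiv_traps(42, 0));
        assert!(sdiv_traps(i64::MIN, -1)); // i64::MIN / -1 overflows
        assert!(!sdiv_traps(i64::MIN, 2));
        assert!(!sdiv_traps(-1, -1));
    }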
+(decl nonzero_divisor (XReg) XReg) +(rule (nonzero_divisor val) + (let ((_ InstOutput (gen_trapif (IntCC.Equal) val (zero_reg) (TrapCode.IntegerDivisionByZero)))) + val)) + +;;;; Rules for `sdiv` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(rule 0 (lower (has_type (fits_in_16 ty) (sdiv x y))) + (if-let $true (has_m)) + (let ((x XReg (sext x))) + (rv_divw x (safe_sdiv_divisor ty x (sext y))))) + +(rule 1 (lower (has_type (fits_in_16 ty) (sdiv x y @ (iconst imm)))) + (if-let $true (has_m)) + (if (safe_divisor_from_imm64 ty imm)) + (rv_divw (sext x) (sext y))) + +(rule 2 (lower (has_type $I32 (sdiv x y))) + (if-let $true (has_m)) + (let ((x XReg (sext x))) + (rv_divw x (safe_sdiv_divisor $I32 x (sext y))))) + +(rule 3 (lower (has_type $I32 (sdiv x y @ (iconst imm)))) + (if-let $true (has_m)) + (if (safe_divisor_from_imm64 $I32 imm)) + (rv_divw x y)) + +(rule 2 (lower (has_type $I64 (sdiv x y))) + (if-let $true (has_m)) + (rv_div x (safe_sdiv_divisor $I64 x y))) + +(rule 3 (lower (has_type $I64 (sdiv x y @ (iconst imm)))) + (if-let $true (has_m)) + (if (safe_divisor_from_imm64 $I64 imm)) + (rv_div x y)) + +;; Check for two trapping conditions: +;; +;; * the divisor is 0, or... +;; * the divisor is -1 and the dividend is $ty::MIN +(decl safe_sdiv_divisor (Type XReg XReg) XReg) +(rule (safe_sdiv_divisor ty x y) + (let ( + (y XReg (nonzero_divisor y)) + (min XReg (imm $I64 (u64_shl 0xffffffff_ffffffff (u64_sub (ty_bits ty) 1)))) + (x_is_not_min XReg (rv_xor x min)) + (y_is_not_neg_one XReg (rv_not y)) + (no_int_overflow XReg (rv_or x_is_not_min y_is_not_neg_one)) + (_ InstOutput (gen_trapif + (IntCC.Equal) + no_int_overflow (zero_reg) + (TrapCode.IntegerOverflow)))) + y)) + +;;;; Rules for `urem` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(rule 0 (lower (has_type (fits_in_16 ty) (urem x y))) + (if-let $true (has_m)) + (rv_remuw (zext x) (nonzero_divisor (zext y)))) + +(rule 1 (lower (has_type (fits_in_16 ty) (urem x y @ (iconst imm)))) + (if-let $true (has_m)) + (if (safe_divisor_from_imm64 ty imm)) + (rv_remuw (zext x) (zext y))) + +(rule 2 (lower (has_type $I32 (urem x y))) + (if-let $true (has_m)) + (rv_remuw x (nonzero_divisor (zext y)))) + +(rule 3 (lower (has_type $I32 (urem x y @ (iconst imm)))) + (if-let $true (has_m)) + (if (safe_divisor_from_imm64 $I32 imm)) + (rv_remuw x y)) + +(rule 2 (lower (has_type $I64 (urem x y))) + (if-let $true (has_m)) + (rv_remu x (nonzero_divisor y))) + +(rule 3 (lower (has_type $I64 (urem x y @ (iconst imm)))) + (if-let $true (has_m)) + (if (safe_divisor_from_imm64 $I64 imm)) + (rv_remu x y)) + +;;;; Rules for `srem` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(rule 0 (lower (has_type (fits_in_16 ty) (srem x y))) + (if-let $true (has_m)) + (rv_remw (sext x) (nonzero_divisor (sext y)))) + +(rule 1 (lower (has_type (fits_in_16 ty) (srem x y @ (iconst imm)))) + (if-let $true (has_m)) + (if (safe_divisor_from_imm64 ty imm)) + (rv_remw (sext x) (sext y))) + +(rule 2 (lower (has_type $I32 (srem x y))) + (if-let $true (has_m)) + (rv_remw x (nonzero_divisor (sext y)))) + +(rule 3 (lower (has_type $I32 (srem x y @ (iconst imm)))) + (if-let $true (has_m)) + (if (safe_divisor_from_imm64 $I32 imm)) + (rv_remw x y)) + +(rule 2 (lower (has_type $I64 (srem x y))) + (if-let $true (has_m)) + (rv_rem x (nonzero_divisor y))) + +(rule 3 (lower (has_type $I64 (srem x y @ (iconst imm)))) + (if-let $true (has_m)) + (if (safe_divisor_from_imm64 $I64 imm)) + (rv_rem x y)) + +;;;; Rules for `and` 
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +(rule -1 (lower (has_type (fits_in_64 ty) (band x y))) + (rv_and x y)) + +(rule 0 (lower (has_type $I128 (band x y))) + (value_regs + (rv_and (value_regs_get x 0) (value_regs_get y 0)) + (rv_and (value_regs_get x 1) (value_regs_get y 1)))) + +;; Special cases for when one operand is an immediate that fits in 12 bits. +(rule 1 (lower (has_type (fits_in_64 (ty_int ty)) (band x (imm12_from_value y)))) + (rv_andi x y)) + +(rule 2 (lower (has_type (fits_in_64 (ty_int ty)) (band (imm12_from_value x) y))) + (rv_andi y x)) + +(rule 3 (lower (has_type (ty_supported_float ty) (band x y))) + (lower_float_binary (AluOPRRR.And) x y ty)) + +;; Specialized lowerings for `(band x (bnot y))` which is additionally produced +;; by Cranelift's `band_not` instruction that is legalized into the simpler +;; forms early on. + +(rule 4 (lower (has_type (fits_in_64 (ty_int ty)) (band x (bnot y)))) + (if-let $true (has_zbb)) + (rv_andn x y)) + +(rule 5 (lower (has_type (fits_in_64 (ty_int ty)) (band (bnot y) x))) + (if-let $true (has_zbb)) + (rv_andn x y)) + +(rule 6 (lower (has_type $I128 (band x (bnot y)))) + (if-let $true (has_zbb)) + (let ((low XReg (rv_andn (value_regs_get x 0) (value_regs_get y 0))) + (high XReg (rv_andn (value_regs_get x 1) (value_regs_get y 1)))) + (value_regs low high))) + +(rule 7 (lower (has_type $I128 (band (bnot y) x))) + (if-let $true (has_zbb)) + (let ((low XReg (rv_andn (value_regs_get x 0) (value_regs_get y 0))) + (high XReg (rv_andn (value_regs_get x 1) (value_regs_get y 1)))) + (value_regs low high))) + +(rule 8 (lower (has_type (ty_supported_vec ty) (band x y))) + (rv_vand_vv x y (unmasked) ty)) + +(rule 9 (lower (has_type (ty_supported_vec ty) (band x (splat y)))) + (if (ty_vector_not_float ty)) + (rv_vand_vx x y (unmasked) ty)) + +(rule 10 (lower (has_type (ty_supported_vec ty) (band (splat x) y))) + (if (ty_vector_not_float ty)) + (rv_vand_vx y x (unmasked) ty)) + +(rule 11 (lower (has_type (ty_supported_vec ty) (band x y))) + (if-let y_imm (replicated_imm5 y)) + (rv_vand_vi x y_imm (unmasked) ty)) + +(rule 12 (lower (has_type (ty_supported_vec ty) (band x y))) + (if-let x_imm (replicated_imm5 x)) + (rv_vand_vi y x_imm (unmasked) ty)) + +;; `bclr{,i}` specializations from `zbs` + +(rule 13 (lower (has_type (fits_in_32 ty) (band x (bnot (ishl (i64_from_iconst 1) y))))) + (if-let $true (has_zbs)) + (rv_bclr x (rv_andi y (imm12_const (u8_sub (ty_bits ty) 1))))) +(rule 14 (lower (has_type (fits_in_32 ty) (band (bnot (ishl (i64_from_iconst 1) y)) x))) + (if-let $true (has_zbs)) + (rv_bclr x (rv_andi y (imm12_const (u8_sub (ty_bits ty) 1))))) + +(rule 15 (lower (has_type $I64 (band x (bnot (ishl (i64_from_iconst 1) y))))) + (if-let $true (has_zbs)) + (rv_bclr x y)) +(rule 16 (lower (has_type $I64 (band (bnot (ishl (i64_from_iconst 1) y)) x))) + (if-let $true (has_zbs)) + (rv_bclr x y)) + +(rule 17 (lower (has_type (fits_in_64 ty) (band x (u64_from_iconst n)))) + (if-let $true (has_zbs)) + (if-let imm (bclr_imm ty n)) + (rv_bclri x imm)) +(rule 18 (lower (has_type (fits_in_64 ty) (band (u64_from_iconst n) x))) + (if-let $true (has_zbs)) + (if-let imm (bclr_imm ty n)) + (rv_bclri x imm)) + +(decl pure partial bclr_imm (Type u64) Imm12) +(extern constructor bclr_imm bclr_imm) + +;; `bext{,i}` specializations from `zbs` + +(rule 19 (lower (has_type $I32 (band (ushr x y) (u64_from_iconst 1)))) + (if-let $true (has_zbs)) + (rv_bext x (rv_andi y (imm12_const 31)))) +(rule 19 (lower (has_type $I32 (band (sshr x y) (u64_from_iconst 
1)))) + (if-let $true (has_zbs)) + (rv_bext x (rv_andi y (imm12_const 31)))) +(rule 19 (lower (has_type $I32 (band (u64_from_iconst 1) (ushr x y)))) + (if-let $true (has_zbs)) + (rv_bext x (rv_andi y (imm12_const 31)))) +(rule 19 (lower (has_type $I32 (band (u64_from_iconst 1) (sshr x y)))) + (if-let $true (has_zbs)) + (rv_bext x (rv_andi y (imm12_const 31)))) + +(rule 19 (lower (has_type $I64 (band (ushr x y) (u64_from_iconst 1)))) + (if-let $true (has_zbs)) + (rv_bext x y)) +(rule 19 (lower (has_type $I64 (band (sshr x y) (u64_from_iconst 1)))) + (if-let $true (has_zbs)) + (rv_bext x y)) +(rule 19 (lower (has_type $I64 (band (u64_from_iconst 1) (ushr x y)))) + (if-let $true (has_zbs)) + (rv_bext x y)) +(rule 19 (lower (has_type $I64 (band (u64_from_iconst 1) (sshr x y)))) + (if-let $true (has_zbs)) + (rv_bext x y)) + +(rule 20 (lower (has_type $I32 (band (ushr x (imm12_from_value y)) (u64_from_iconst 1)))) + (if-let $true (has_zbs)) + (rv_bexti x (imm12_and y 31))) +(rule 20 (lower (has_type $I32 (band (sshr x (imm12_from_value y)) (u64_from_iconst 1)))) + (if-let $true (has_zbs)) + (rv_bexti x (imm12_and y 31))) +(rule 20 (lower (has_type $I64 (band (ushr x (imm12_from_value y)) (u64_from_iconst 1)))) + (if-let $true (has_zbs)) + (rv_bexti x (imm12_and y 63))) +(rule 20 (lower (has_type $I64 (band (sshr x (imm12_from_value y)) (u64_from_iconst 1)))) + (if-let $true (has_zbs)) + (rv_bexti x (imm12_and y 63))) + +;;;; Rules for `or` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +(rule 0 (lower (has_type (ty_int ty) (bor x y))) + (gen_or ty x y)) + +;; Special cases for when one operand is an immediate that fits in 12 bits. +(rule 1 (lower (has_type (fits_in_64 (ty_int ty)) (bor x (imm12_from_value y)))) + (rv_ori x y)) + +(rule 2 (lower (has_type (fits_in_64 (ty_int ty)) (bor (imm12_from_value x) y))) + (rv_ori y x)) + +(rule 3 (lower (has_type (ty_supported_float ty) (bor x y))) + (lower_float_binary (AluOPRRR.Or) x y ty)) + +;; Specialized lowerings for `(bor x (bnot y))` which is additionally produced +;; by Cranelift's `bor_not` instruction that is legalized into the simpler +;; forms early on. 
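A scalar model of these bit-manipulation specializations (illustrative Rust, not part of the patch): `orn`/`andn` fold a `bnot` into one Zbb instruction, and the Zbs rules in the surrounding sections recognize the classic single-bit idioms.

    fn orn(x: u64, y: u64) -> u64 { x | !y } // bor + bnot in one instruction
    fn bclr(x: u64, k: u32) -> u64 { x & !(1u64 << k) } // clear bit k
    fn bset(x: u64, k: u32) -> u64 { x | (1u64 << k) } // set bit k
    fn binv(x: u64, k: u32) -> u64 { x ^ (1u64 << k) } // invert bit k
    fn bext(x: u64, k: u32) -> u64 { (x >> k) & 1 } // extract bit k

    fn main() {
        assert_eq!(orn(0b0011, !0b0100u64), 0b0111);
        assert_eq!(bclr(0b1111, 1), 0b1101);
        assert_eq!(bset(0b0000, 3), 0b1000);
        assert_eq!(binv(0b1000, 3), 0b0000);
        assert_eq!(bext(0b1010, 1), 1);
    }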
+ +(rule 4 (lower (has_type (fits_in_64 (ty_int ty)) (bor x (bnot y)))) + (if-let $true (has_zbb)) + (rv_orn x y)) + +(rule 5 (lower (has_type (fits_in_64 (ty_int ty)) (bor (bnot y) x))) + (if-let $true (has_zbb)) + (rv_orn x y)) + +(rule 6 (lower (has_type $I128 (bor x (bnot y)))) + (if-let $true (has_zbb)) + (let ((low XReg (rv_orn (value_regs_get x 0) (value_regs_get y 0))) + (high XReg (rv_orn (value_regs_get x 1) (value_regs_get y 1)))) + (value_regs low high))) + +(rule 7 (lower (has_type $I128 (bor (bnot y) x))) + (if-let $true (has_zbb)) + (let ((low XReg (rv_orn (value_regs_get x 0) (value_regs_get y 0))) + (high XReg (rv_orn (value_regs_get x 1) (value_regs_get y 1)))) + (value_regs low high))) + +(rule 8 (lower (has_type (ty_supported_vec ty) (bor x y))) + (rv_vor_vv x y (unmasked) ty)) + +(rule 9 (lower (has_type (ty_supported_vec ty) (bor x (splat y)))) + (if (ty_vector_not_float ty)) + (rv_vor_vx x y (unmasked) ty)) + +(rule 10 (lower (has_type (ty_supported_vec ty) (bor (splat x) y))) + (if (ty_vector_not_float ty)) + (rv_vor_vx y x (unmasked) ty)) + +(rule 11 (lower (has_type (ty_supported_vec ty) (bor x y))) + (if-let y_imm (replicated_imm5 y)) + (rv_vor_vi x y_imm (unmasked) ty)) + +(rule 12 (lower (has_type (ty_supported_vec ty) (bor x y))) + (if-let x_imm (replicated_imm5 x)) + (rv_vor_vi y x_imm (unmasked) ty)) + +;; `bset{,i}` specializations from `zbs` + +(rule 13 (lower (has_type $I32 (bor x (ishl (i64_from_iconst 1) y)))) + (if-let $true (has_zbs)) + (rv_bset x (rv_andi y (imm12_const 31)))) +(rule 14 (lower (has_type $I32 (bor (ishl (i64_from_iconst 1) y) x))) + (if-let $true (has_zbs)) + (rv_bset x (rv_andi y (imm12_const 31)))) + +(rule 13 (lower (has_type $I64 (bor x (ishl (i64_from_iconst 1) y)))) + (if-let $true (has_zbs)) + (rv_bset x y)) +(rule 14 (lower (has_type $I64 (bor (ishl (i64_from_iconst 1) y) x))) + (if-let $true (has_zbs)) + (rv_bset x y)) + +(rule 15 (lower (has_type (fits_in_64 _) (bor x (u64_from_iconst n)))) + (if-let $true (has_zbs)) + (if-let imm (bseti_imm n)) + (rv_bseti x imm)) +(rule 16 (lower (has_type (fits_in_64 _) (bor (u64_from_iconst n) x))) + (if-let $true (has_zbs)) + (if-let imm (bseti_imm n)) + (rv_bseti x imm)) + +(decl pure partial bseti_imm (u64) Imm12) +(extern constructor bseti_imm bseti_imm) + +;;;; Rules for `xor` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +(rule 0 (lower (has_type (fits_in_64 (ty_int ty)) (bxor x y))) + (rv_xor x y)) + +;; Special cases for when one operand is an immediate that fits in 12 bits. 
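The `bset{,i}` rules above follow the same pattern-decoding idea: an `or` against a shifted one sets a single bit. A tiny sketch (illustrative only), with the shift amount masked the way the 32-bit rule does with `andi`:

// `bor x (ishl 1 y)` sets bit `y`; when the mask is a constant power of two
// the immediate form `bseti` applies instead.
fn bset(x: u64, y: u32) -> u64 {
    x | (1u64 << (y & 63))
}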
+(rule 1 (lower (has_type (fits_in_64 (ty_int ty)) (bxor x (imm12_from_value y)))) + (rv_xori x y)) + +(rule 2 (lower (has_type (fits_in_64 (ty_int ty)) (bxor (imm12_from_value x) y))) + (rv_xori y x)) + +(rule 3 (lower (has_type $I128 (bxor x y))) + (lower_b128_binary (AluOPRRR.Xor) x y)) + +(rule 4 (lower (has_type (ty_supported_float ty) (bxor x y))) + (lower_float_binary (AluOPRRR.Xor) x y ty)) + +(rule 5 (lower (has_type (ty_supported_vec ty) (bxor x y))) + (rv_vxor_vv x y (unmasked) ty)) + +(rule 6 (lower (has_type (ty_supported_vec ty) (bxor x (splat y)))) + (if (ty_vector_not_float ty)) + (rv_vxor_vx x y (unmasked) ty)) + +(rule 7 (lower (has_type (ty_supported_vec ty) (bxor (splat x) y))) + (if (ty_vector_not_float ty)) + (rv_vxor_vx y x (unmasked) ty)) + +(rule 8 (lower (has_type (ty_supported_vec ty) (bxor x y))) + (if-let y_imm (replicated_imm5 y)) + (rv_vxor_vi x y_imm (unmasked) ty)) + +(rule 9 (lower (has_type (ty_supported_vec ty) (bxor x y))) + (if-let x_imm (replicated_imm5 x)) + (rv_vxor_vi y x_imm (unmasked) ty)) + +;; `binv{,i}` specializations from `zbs` + +(rule 13 (lower (has_type $I32 (bxor x (ishl (i64_from_iconst 1) y)))) + (if-let $true (has_zbs)) + (rv_binv x (rv_andi y (imm12_const 31)))) +(rule 14 (lower (has_type $I32 (bxor (ishl (i64_from_iconst 1) y) x))) + (if-let $true (has_zbs)) + (rv_binv x (rv_andi y (imm12_const 31)))) + +(rule 13 (lower (has_type $I64 (bxor x (ishl (i64_from_iconst 1) y)))) + (if-let $true (has_zbs)) + (rv_binv x y)) +(rule 14 (lower (has_type $I64 (bxor (ishl (i64_from_iconst 1) y) x))) + (if-let $true (has_zbs)) + (rv_binv x y)) + +(rule 15 (lower (has_type (fits_in_64 _) (bxor x (u64_from_iconst n)))) + (if-let $true (has_zbs)) + (if-let imm (binvi_imm n)) + (rv_binvi x imm)) +(rule 16 (lower (has_type (fits_in_64 _) (bxor (u64_from_iconst n) x))) + (if-let $true (has_zbs)) + (if-let imm (binvi_imm n)) + (rv_binvi x imm)) + +(decl pure partial binvi_imm (u64) Imm12) +(extern constructor binvi_imm binvi_imm) + +;;;; Rules for `bnot` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(rule 0 (lower (has_type (ty_int_ref_scalar_64 _) (bnot x))) + (rv_not x)) + +(rule 1 (lower (has_type (ty_supported_float ty) (bnot x))) + (move_x_to_f (rv_not (move_f_to_x x ty)) (float_int_of_same_size ty))) + +(rule 2 (lower (has_type $I128 (bnot x))) + (value_regs + (rv_not (value_regs_get x 0)) + (rv_not (value_regs_get x 1)))) + +(rule 3 (lower (has_type (ty_supported_vec ty) (bnot x))) + (rv_vnot_v x (unmasked) ty)) + +(rule 4 (lower (has_type (ty_int_ref_scalar_64 _) (bnot (bxor x y)))) + (if-let $true (has_zbb)) + (rv_xnor x y)) + +;;;; Rules for `bit_reverse` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(rule 0 (lower (has_type (ty_int_ref_scalar_64 ty) (bitrev x))) + (gen_bitrev ty x)) + +(rule 1 (lower (has_type $I128 (bitrev x))) + (value_regs + (gen_bitrev $I64 (value_regs_get x 1)) + (gen_bitrev $I64 (value_regs_get x 0)))) + + +;; Constructs a sequence of instructions that reverse all bits in `x` up to +;; the given type width. 
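A scalar sketch of the narrow-type trick used by the `gen_bitrev` constructor that follows: reverse the full 64-bit register, then shift the reversed bits back down. Here `reverse_bits` stands in for the `brev8` plus byte-swap sequence; this is an illustration, not the lowering itself.

// Bit-reverse only the low `width` bits of `x`: after reversing all 64 bits,
// the interesting bits sit at the top, so shift them down by 64 - width.
fn bitrev_narrow(x: u64, width: u32) -> u64 {
    debug_assert!(width == 16 || width == 32);
    x.reverse_bits() >> (64 - width)
}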
+(decl gen_bitrev (Type XReg) XReg) + +(rule 0 (gen_bitrev (ty_16_or_32 (ty_int ty)) x) + (if-let shift_amt (u64_to_imm12 (u64_sub 64 (ty_bits ty)))) + (rv_srli (gen_bitrev $I64 x) shift_amt)) + +(rule 1 (gen_bitrev $I8 x) + (gen_brev8 x $I8)) + +(rule 1 (gen_bitrev $I64 x) + (gen_brev8 (gen_bswap $I64 x) $I64)) + + +;;;; Rules for `bswap` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(rule 1 (lower (has_type (fits_in_64 (ty_int ty)) (bswap x))) + (gen_bswap ty x)) + +(rule 2 (lower (has_type $I128 (bswap x))) + (value_regs + (gen_bswap $I64 (value_regs_get x 1)) + (gen_bswap $I64 (value_regs_get x 0)))) + +;; Builds a sequence of instructions that swaps the bytes in `x` up to the given +;; type width. +(decl gen_bswap (Type XReg) XReg) + +;; This is only here to make the rule below work. bswap.i8 isn't valid +(rule 0 (gen_bswap $I8 x) x) +(rule 1 (gen_bswap (ty_int_ref_16_to_64 ty) x) + (if-let half_ty (ty_half_width ty)) + (if-let half_size (u64_to_imm12 (ty_bits half_ty))) + (let (;; This swaps the top bytes and zeroes the bottom bytes, so that + ;; we can or it with the bottom bytes later. + (swap_top XReg (gen_bswap half_ty x)) + (top XReg (rv_slli swap_top half_size)) + + ;; Get the top half, swap it, and zero extend it so we can `or` it + ;; with the bottom half. Note that zero extension here already knows + ;; that `zbb` isn't available and that `half_ty` is not `$I64`, so this + ;; falls back to the shift-then-shift sequence. + (shifted XReg (rv_srli x half_size)) + (swap_bot XReg (gen_bswap half_ty shifted)) + (shift Imm12 (imm_from_bits (u64_sub 64 (ty_bits half_ty)))) + (bot_shifted_left XReg (rv_slli swap_bot shift)) + (bot XReg (rv_srli bot_shifted_left shift))) + (rv_or top bot))) + +(rule 2 (gen_bswap (ty_16_or_32 (ty_int ty)) x) + (if-let $true (has_zbb)) + (if-let shift_amt (u64_to_imm12 (u64_sub 64 (ty_bits ty)))) + (rv_srli (rv_rev8 x) shift_amt)) + +(rule 3 (gen_bswap $I64 x) + (if-let $true (has_zbb)) + (rv_rev8 x)) + +;;;; Rules for `ctz` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +(rule (lower (has_type (fits_in_64 ty) (ctz x))) + (lower_ctz ty x)) + +(rule 1 (lower (has_type $I128 (ctz x))) + (let ((x_lo XReg (value_regs_get x 0)) + (x_hi XReg (value_regs_get x 1)) + ;; Count both halves + (high XReg (lower_ctz $I64 x_hi)) + (low XReg (lower_ctz $I64 x_lo)) + ;; Only add the top half if the bottom is zero + (high XReg (gen_select_xreg (cmp_eqz x_lo) high (zero_reg))) + (result XReg (rv_add low high))) + (value_regs result (imm $I64 0)))) + +;;;; Rules for `clz` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +(rule 0 (lower (has_type (fits_in_64 ty) (clz x))) + (gen_cltz $true x ty)) + +(rule 1 (lower (has_type $I128 (clz x))) + (let ((x_lo XReg (value_regs_get x 0)) + (x_hi XReg (value_regs_get x 1)) + ;; Count both halves + (high XReg (gen_clz x_hi)) + (low XReg (gen_clz x_lo)) + ;; Only add the bottom zeros if the top half is zero + (low XReg (gen_select_xreg (cmp_eqz x_hi) low (zero_reg)))) + (value_regs (rv_add high low) (imm $I64 0)))) + +(rule 2 (lower (has_type (fits_in_16 ty) (clz x))) + (if-let $true (has_zbb)) + (let ((tmp XReg (zext x)) + (count XReg (rv_clz tmp))) + ;; We always do the operation on the full 64-bit register, so subtract 64 from the result. 
+ (rv_addi count (imm12_const_add (ty_bits ty) -64)))) + +(rule 3 (lower (has_type $I32 (clz x))) + (if-let $true (has_zbb)) + (rv_clzw x)) + +(rule 3 (lower (has_type $I64 (clz x))) + (if-let $true (has_zbb)) + (rv_clz x)) + +(decl gen_clz (XReg) XReg) +(rule 0 (gen_clz rs) + (gen_cltz $true rs $I64)) +(rule 1 (gen_clz rs) + (if-let $true (has_zbb)) + (rv_clz rs)) + +;;;; Rules for `cls` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(rule (lower (has_type (fits_in_64 ty) (cls x))) + (let ((tmp XReg (sext x)) + (tmp2 XReg (gen_select_xreg (cmp_ltz tmp) (rv_not tmp) tmp)) + (tmp3 XReg (gen_clz tmp2))) + ;; clz counted the full register width, so subtract (64-$width), and then + ;; additionally subtract one more, meaning here -65+width is added. + (rv_addi tmp3 (imm12_const_add (ty_bits ty) -65)))) + +;; If the sign bit is set, we count the leading zeros of the inverted value. +;; Otherwise we can just count the leading zeros of the original value. +;; Subtract 1 since the sign bit does not count. +(rule 1 (lower (has_type $I128 (cls x))) + (let ((low XReg (value_regs_get x 0)) + (high XReg (value_regs_get x 1)) + (low XReg (gen_select_xreg (cmp_ltz high) (rv_not low) low)) + (high XReg (gen_select_xreg (cmp_ltz high) (rv_not high) high)) + + ;; Count both halves + (high_cnt XReg (gen_clz high)) + (low_cnt XReg (gen_clz low)) + ;; Only add the bottom zeros if the top half is zero + (low_cnt XReg (gen_select_xreg (cmp_eqz high) low_cnt (zero_reg))) + (count XReg (rv_add high_cnt low_cnt)) + (result XReg (rv_addi count (imm12_const -1)))) + (value_regs result (imm $I64 0)))) + + +;;;; Rules for `uextend` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +(rule 0 (lower (has_type (fits_in_64 _) (uextend val))) + (zext val)) + +(rule 1 (lower (has_type $I128 (uextend val))) + (value_regs (zext val) (imm $I64 0))) + +;; When the source of an `uextend` is a load, we can merge both ops +(rule 2 (lower (has_type (fits_in_64 _) (uextend (sinkable_load inst ty flags addr offset)))) + (gen_sunk_load inst (amode addr offset) (uextend_load_op ty) flags)) + +(decl pure uextend_load_op (Type) LoadOP) +(rule (uextend_load_op $I8) (LoadOP.Lbu)) +(rule (uextend_load_op $I16) (LoadOP.Lhu)) +(rule (uextend_load_op $I32) (LoadOP.Lwu)) + +;;;; Rules for `sextend` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +(rule 0 (lower (has_type (fits_in_64 _) (sextend val @ (value_type in_ty)))) + (sext val)) + +(rule 1 (lower (has_type $I128 (sextend val @ (value_type in_ty)))) + (let ((lo XReg (sext val))) + (value_regs lo (rv_srai lo (imm12_const 63))))) + +;; When the source of an `sextend` is a load, we can merge both ops +(rule 2 (lower (has_type (fits_in_64 _) (sextend (sinkable_load inst ty flags addr offset)))) + (gen_sunk_load inst (amode addr offset) (sextend_load_op ty) flags)) + +(decl pure sextend_load_op (Type) LoadOP) +(rule (sextend_load_op $I8) (LoadOP.Lb)) +(rule (sextend_load_op $I16) (LoadOP.Lh)) +(rule (sextend_load_op $I32) (LoadOP.Lw)) + +;;;; Rules for `popcnt` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(rule 0 (lower (has_type (fits_in_64 _) (popcnt x))) + (gen_popcnt (zext x))) + +(rule 1 (lower (has_type $I128 (popcnt x))) + (let + ((x ValueRegs x) + (low XReg (gen_popcnt (value_regs_get x 0))) + (high XReg (gen_popcnt (value_regs_get x 1))) + (result XReg (rv_add low high))) + (value_regs result (imm $I64 0)))) + +(rule 2 (lower (has_type (fits_in_64 _) (popcnt x))) + (if-let $true (has_zbb)) + (rv_cpop (zext x))) + +(rule 3 (lower (has_type $I32 
(popcnt x))) + (if-let $true (has_zbb)) + (rv_cpopw x)) + +(rule 3 (lower (has_type $I128 (popcnt x))) + (if-let $true (has_zbb)) + (let + ((x ValueRegs x) + (low XReg (rv_cpop (value_regs_get x 0))) + (high XReg (rv_cpop (value_regs_get x 1))) + (result XReg (rv_add low high))) + (value_regs result (imm $I64 0)))) + +;; Popcount using multiply. +;; This is popcount64c() from +;; http://en.wikipedia.org/wiki/Hamming_weight +;; +;; Here's the C version for 32 bits: +;; x = x - ((x>> 1) & 0x55555555); +;; x = (x & 0x33333333) + ((x >> 2) & 0x33333333); +;; x = ((x + (x >> 4)) & 0x0F0F0F0F); +;; return (x * 0x01010101) >> 24; // Here 24 is the type width - 8. +;; +;; TODO: LLVM generates a much better implementation for I8X16. See: https://godbolt.org/z/qr6vf9Gr3 +;; For the other types it seems to be largely the same. +(rule 4 (lower (has_type (ty_supported_vec ty) (popcnt x))) + (if-let one (u64_to_uimm5 1)) + (if-let two (u64_to_uimm5 2)) + (if-let four (u64_to_uimm5 4)) + + (let (;; x = x - ((x >> 1) & 0x55555555); + (mask_55 XReg (imm (lane_type ty) (u64_and 0x5555555555555555 (ty_mask (lane_type ty))))) + (count2_shr VReg (rv_vsrl_vi x one (unmasked) ty)) + (count2_and VReg (rv_vand_vx count2_shr mask_55 (unmasked) ty)) + (count2 VReg (rv_vsub_vv x count2_and (unmasked) ty)) + + ;; x = (x & 0x33333333) + ((x >> 2) & 0x33333333); + (mask_33 XReg (imm (lane_type ty) (u64_and 0x3333333333333333 (ty_mask (lane_type ty))))) + (count4_shr VReg (rv_vsrl_vi count2 two (unmasked) ty)) + (count4_and VReg (rv_vand_vx count4_shr mask_33 (unmasked) ty)) + (count4_lhs VReg (rv_vand_vx count2 mask_33 (unmasked) ty)) + (count4 VReg (rv_vadd_vv count4_lhs count4_and (unmasked) ty)) + + ;; x = (x + (x >> 4)) & 0x0F0F0F0F; + (mask_0f XReg (imm (lane_type ty) (u64_and 0x0f0f0f0f0f0f0f0f (ty_mask (lane_type ty))))) + (count8_shr VReg (rv_vsrl_vi count4 four (unmasked) ty)) + (count8_add VReg (rv_vadd_vv count4 count8_shr (unmasked) ty)) + (count8 VReg (rv_vand_vx count8_add mask_0f (unmasked) ty)) + + ;; (x * 0x01010101) >> ( - 8) + (mask_01 XReg (imm (lane_type ty) (u64_and 0x0101010101010101 (ty_mask (lane_type ty))))) + (mul VReg (rv_vmul_vx count8 mask_01 (unmasked) ty)) + (shift XReg (imm $I64 (u64_sub (ty_bits (lane_type ty)) 8))) + (res VReg (rv_vsrl_vx mul shift (unmasked) ty))) + res)) + +;;;; Rules for `ishl` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; 8/16 bit types need a mask on the shift amount +(rule 0 (lower (has_type (ty_int (ty_8_or_16 ty)) (ishl x y))) + (if-let mask (u64_to_imm12 (ty_shift_mask ty))) + (rv_sllw x (rv_andi (value_regs_get y 0) mask))) + +;; Using the 32bit version of `sll` automatically masks the shift amount. +(rule 1 (lower (has_type $I32 (ishl x y))) + (rv_sllw x (value_regs_get y 0))) + +;; Similarly, the 64bit version does the right thing. +(rule 1 (lower (has_type $I64 (ishl x y))) + (rv_sll x (value_regs_get y 0))) + +;; If the shift amount is known. We can mask it and encode it in the instruction. +(rule 2 (lower (has_type (int_fits_in_32 ty) (ishl x (maybe_uextend (imm12_from_value y))))) + (rv_slliw x (imm12_and y (ty_shift_mask ty)))) + +;; We technically don't need to mask the shift amount here. The instruction +;; does the right thing. But it's neater when pretty printing it. +(rule 3 (lower (has_type ty @ $I64 (ishl x (maybe_uextend (imm12_from_value y))))) + (rv_slli x (imm12_and y (ty_shift_mask ty)))) + +;; With `Zba` we have a shift that zero extends the LHS argument. 
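For reference, the popcount-by-multiply sequence described in the comment above, written as scalar Rust; the vector rule applies exactly these steps lane-wise, with the constants truncated to the lane width:

// popcount64c: pairwise sums, then 4-bit sums, then byte sums, then one
// multiply to accumulate all bytes into the top byte.
fn popcount64c(mut x: u64) -> u32 {
    x -= (x >> 1) & 0x5555_5555_5555_5555;
    x = (x & 0x3333_3333_3333_3333) + ((x >> 2) & 0x3333_3333_3333_3333);
    x = (x + (x >> 4)) & 0x0f0f_0f0f_0f0f_0f0f;
    (x.wrapping_mul(0x0101_0101_0101_0101) >> 56) as u32 // 56 = 64 - 8
}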
+(rule 4 (lower (has_type $I64 (ishl (uextend x @ (value_type $I32)) (maybe_uextend (imm12_from_value y))))) + (if-let $true (has_zba)) + (rv_slliuw x y)) + +;; I128 cases +(rule 4 (lower (has_type $I128 (ishl x y))) + (let ((tmp ValueRegs (gen_shamt $I128 (value_regs_get y 0))) + (shamt XReg (value_regs_get tmp 0)) + (len_sub_shamt XReg (value_regs_get tmp 1)) + ;; + (low XReg (rv_sll (value_regs_get x 0) shamt)) + ;; high part. + (high_part1 XReg (rv_srl (value_regs_get x 0) len_sub_shamt)) + (high_part2 XReg (gen_select_xreg (cmp_eqz shamt) (zero_reg) high_part1)) + ;; + (high_part3 XReg (rv_sll (value_regs_get x 1) shamt)) + (high XReg (rv_or high_part2 high_part3)) + ;; + (const64 XReg (imm $I64 64)) + (shamt_128 XReg (rv_andi (value_regs_get y 0) (imm12_const 127)))) + (gen_select_regs + (cmp_geu shamt_128 const64) + (value_regs (zero_reg) low) + (value_regs low high)))) + +;; SIMD Cases +;; We don't need to mask anything since it is done by the instruction according to SEW. + +(rule 5 (lower (has_type (ty_supported_vec ty) (ishl x y))) + (rv_vsll_vx x (value_regs_get y 0) (unmasked) ty)) + +(rule 6 (lower (has_type (ty_supported_vec ty) (ishl x (maybe_uextend (uimm5_from_value y))))) + (rv_vsll_vi x y (unmasked) ty)) + +;;;; Rules for `ushr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; 8/16 bit types need a mask on the shift amount, and the LHS needs to be +;; zero extended. +(rule 0 (lower (has_type (ty_int (fits_in_16 ty)) (ushr x y))) + (if-let mask (u64_to_imm12 (ty_shift_mask ty))) + (rv_srlw (zext x) (rv_andi (value_regs_get y 0) mask))) + +;; Using the 32bit version of `srl` automatically masks the shift amount. +(rule 1 (lower (has_type $I32 (ushr x y))) + (rv_srlw x (value_regs_get y 0))) + +;; Similarly, the 64bit version does the right thing. +(rule 1 (lower (has_type $I64 (ushr x y))) + (rv_srl x (value_regs_get y 0))) + +;; When the RHS is known we can just encode it in the instruction. +(rule 2 (lower (has_type (ty_int (fits_in_16 ty)) (ushr x (maybe_uextend (imm12_from_value y))))) + (rv_srliw (zext x) (imm12_and y (ty_shift_mask ty)))) + +(rule 3 (lower (has_type $I32 (ushr x (maybe_uextend (imm12_from_value y))))) + (rv_srliw x y)) + +(rule 3 (lower (has_type $I64 (ushr x (maybe_uextend (imm12_from_value y))))) + (rv_srli x y)) + +(rule 3 (lower (has_type $I128 (ushr x y))) + (let ((tmp ValueRegs (gen_shamt $I128 (value_regs_get y 0))) + (shamt XReg (value_regs_get tmp 0)) + (len_sub_shamt XReg (value_regs_get tmp 1)) + ;; low part. + (low_part1 XReg (rv_sll (value_regs_get x 1) len_sub_shamt)) + (low_part2 XReg (gen_select_xreg (cmp_eqz shamt) (zero_reg) low_part1)) + ;; + (low_part3 XReg (rv_srl (value_regs_get x 0) shamt)) + (low XReg (rv_or low_part2 low_part3)) + ;; + (const64 XReg (imm $I64 64)) + ;; + (high XReg (rv_srl (value_regs_get x 1) shamt)) + (shamt_128 XReg (rv_andi (value_regs_get y 0) (imm12_const 127)))) + (gen_select_regs + (cmp_geu shamt_128 const64) + (value_regs high (zero_reg)) + (value_regs low high)))) + +;; SIMD Cases +;; We don't need to mask or extend anything since it is done by the instruction according to SEW. 
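The `$I128` cases above assemble the result from two 64-bit halves. A branchy scalar sketch of the logical-right-shift recipe; the actual rules replace the branches with `cmp_eqz` and `cmp_geu` selects:

// 128-bit logical shift right built from 64-bit halves.
fn ushr_i128(lo: u64, hi: u64, amt: u32) -> (u64, u64) {
    let amt = amt & 127;
    let shamt = amt & 63;
    // Bits of `hi` that move down into the low half; must be zero when
    // shamt == 0, since a 64-bit shift by 64 is not defined.
    let carried = if shamt == 0 { 0 } else { hi << (64 - shamt) };
    let low = carried | (lo >> shamt);
    let high = hi >> shamt;
    if amt >= 64 { (high, 0) } else { (low, high) }
}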
+ +(rule 4 (lower (has_type (ty_supported_vec ty) (ushr x y))) + (rv_vsrl_vx x (value_regs_get y 0) (unmasked) ty)) + +(rule 5 (lower (has_type (ty_supported_vec ty) (ushr x (maybe_uextend (uimm5_from_value y))))) + (rv_vsrl_vi x y (unmasked) ty)) + +;;;; Rules for `sshr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; 8/16 bit types need a mask on the shift amount, and the LHS needs to be +;; zero extended. +(rule 0 (lower (has_type (ty_int (fits_in_16 ty)) (sshr x y))) + (if-let mask (u64_to_imm12 (ty_shift_mask ty))) + (rv_sraw (sext x) (rv_andi (value_regs_get y 0) mask))) + +;; Using the 32bit version of `sra` automatically masks the shift amount. +(rule 1 (lower (has_type $I32 (sshr x y))) + (rv_sraw x (value_regs_get y 0))) + +;; Similarly, the 64bit version does the right thing. +(rule 1 (lower (has_type $I64 (sshr x y))) + (rv_sra x (value_regs_get y 0))) + +;; When the RHS is known we can just encode it in the instruction. +(rule 2 (lower (has_type (ty_int (fits_in_16 ty)) (sshr x (maybe_uextend (imm12_from_value y))))) + (rv_sraiw (sext x) (imm12_and y (ty_shift_mask ty)))) + +(rule 3 (lower (has_type $I32 (sshr x (maybe_uextend (imm12_from_value y))))) + (rv_sraiw x y)) + +(rule 3 (lower (has_type $I64 (sshr x (maybe_uextend (imm12_from_value y))))) + (rv_srai x y)) + +(rule 3 (lower (has_type $I128 (sshr x y))) + (let ((tmp ValueRegs (gen_shamt $I128 (value_regs_get y 0))) + (shamt XReg (value_regs_get tmp 0)) + (len_sub_shamt XReg (value_regs_get tmp 1)) + ;; low part. + (low_part1 XReg (rv_sll (value_regs_get x 1) len_sub_shamt)) + (low_part2 XReg (gen_select_xreg (cmp_eqz shamt) (zero_reg) low_part1)) + ;; + (low_part3 XReg (rv_srl (value_regs_get x 0) shamt)) + (low XReg (rv_or low_part2 low_part3)) + ;; + (const64 XReg (imm $I64 64)) + ;; + (high XReg (rv_sra (value_regs_get x 1) shamt)) + ;; + (const_neg_1 XReg (imm $I64 (i64_as_u64 -1))) + ;; + (high_replacement XReg (gen_select_xreg (cmp_ltz (value_regs_get x 1)) const_neg_1 (zero_reg))) + (const64 XReg (imm $I64 64)) + (shamt_128 XReg (rv_andi (value_regs_get y 0) (imm12_const 127)))) + (gen_select_regs + (cmp_geu shamt_128 const64) + (value_regs high high_replacement) + (value_regs low high)))) + +;; SIMD Cases +;; We don't need to mask or extend anything since it is done by the instruction according to SEW. 
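For the arithmetic variant above, once the masked amount reaches 64 the upper half is pure sign fill and the lower half is an arithmetic shift of the old upper half; that is the `high_replacement` arm of the final select. A sketch of just that branch:

// 128-bit arithmetic shift right by 64..=127.
fn sshr_i128_ge64(hi: u64, amt: u32) -> (u64, u64) {
    debug_assert!((64..128).contains(&amt));
    let low = ((hi as i64) >> (amt - 64)) as u64;
    let high = if (hi as i64) < 0 { u64::MAX } else { 0 }; // sign fill
    (low, high)
}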
+ +(rule 4 (lower (has_type (ty_supported_vec ty) (sshr x y))) + (rv_vsra_vx x (value_regs_get y 0) (unmasked) ty)) + +(rule 5 (lower (has_type (ty_supported_vec ty) (sshr x (maybe_uextend (uimm5_from_value y))))) + (rv_vsra_vi x y (unmasked) ty)) + + +;;;; Rules for `rotl` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(rule 0 (lower (has_type (fits_in_64 ty) (rotl rs amount))) + (let + ((rs XReg (zext rs)) + (amount XReg (value_regs_get amount 0)) + (x ValueRegs (gen_shamt ty amount)) + (shamt XReg (value_regs_get x 0)) + (len_sub_shamt Reg (value_regs_get x 1)) + (part1 Reg (rv_sll rs shamt)) + (part2 Reg (rv_srl rs len_sub_shamt)) + (part3 Reg (gen_select_xreg (cmp_eqz shamt) (zero_reg) part2))) + (rv_or part1 part3))) + +(rule 1 (lower (has_type $I32 (rotl rs amount))) + (if-let $true (has_zbb)) + (rv_rolw rs (value_regs_get amount 0))) + +(rule 2 (lower (has_type $I32 (rotl rs (u64_from_iconst n)))) + (if-let $true (has_zbb)) + (if-let (imm12_from_u64 imm) (u64_sub 32 (u64_and n 31))) + (rv_roriw rs imm)) + +(rule 1 (lower (has_type $I64 (rotl rs amount))) + (if-let $true (has_zbb)) + (rv_rol rs (value_regs_get amount 0))) + +(rule 2 (lower (has_type $I64 (rotl rs (u64_from_iconst n)))) + (if-let $true (has_zbb)) + (if-let (imm12_from_u64 imm) (u64_sub 64 (u64_and n 63))) + (rv_rori rs imm)) + +(rule 1 (lower (has_type $I128 (rotl x y))) + (let + ((tmp ValueRegs (gen_shamt $I128 (value_regs_get y 0))) + (shamt XReg (value_regs_get tmp 0)) + (len_sub_shamt XReg (value_regs_get tmp 1)) + (low_part1 XReg (rv_sll (value_regs_get x 0) shamt)) + (low_part2 XReg (rv_srl (value_regs_get x 1) len_sub_shamt)) + ;;; if shamt == 0 low_part2 will overflow we should zero instead. + (low_part3 XReg (gen_select_xreg (cmp_eqz shamt) (zero_reg) low_part2)) + (low XReg (rv_or low_part1 low_part3)) + (high_part1 XReg (rv_sll (value_regs_get x 1) shamt)) + (high_part2 XReg (rv_srl (value_regs_get x 0) len_sub_shamt)) + (high_part3 XReg (gen_select_xreg (cmp_eqz shamt) (zero_reg) high_part2)) + (high XReg (rv_or high_part1 high_part3)) + (const64 XReg (imm $I64 64)) + (shamt_128 XReg (rv_andi (value_regs_get y 0) (imm12_const 127)))) + ;; right now we only rotate less than 64 bits. + ;; if shamt is greater than or equal 64 , we should switch low and high. 
+ (gen_select_regs + (cmp_geu shamt_128 const64) + (value_regs high low) + (value_regs low high) + ))) + +;;;; Rules for `rotr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(rule (lower (has_type (fits_in_64 ty) (rotr rs amount))) + (let + ((rs XReg (zext rs)) + (amount XReg (value_regs_get amount 0)) + (x ValueRegs (gen_shamt ty amount)) + (shamt XReg (value_regs_get x 0)) + (len_sub_shamt XReg (value_regs_get x 1)) + (part1 XReg (rv_srl rs shamt)) + (part2 XReg (rv_sll rs len_sub_shamt)) + (part3 XReg (gen_select_xreg (cmp_eqz shamt) (zero_reg) part2))) + (rv_or part1 part3))) + +(rule 1 (lower (has_type $I32 (rotr rs amount))) + (if-let $true (has_zbb)) + (rv_rorw rs (value_regs_get amount 0))) + +(rule 2 (lower (has_type $I32 (rotr rs (imm12_from_value n)))) + (if-let $true (has_zbb)) + (rv_roriw rs n)) + +(rule 1 (lower (has_type $I64 (rotr rs amount))) + (if-let $true (has_zbb)) + (rv_ror rs (value_regs_get amount 0))) + +(rule 2 (lower (has_type $I64 (rotr rs (imm12_from_value n)))) + (if-let $true (has_zbb)) + (rv_rori rs n)) + +(rule 1 (lower (has_type $I128 (rotr x y))) + (let + ((tmp ValueRegs (gen_shamt $I128 (value_regs_get y 0))) + (shamt XReg (value_regs_get tmp 0)) + (len_sub_shamt XReg (value_regs_get tmp 1)) + (low_part1 XReg (rv_srl (value_regs_get x 0) shamt)) + (low_part2 XReg (rv_sll (value_regs_get x 1) len_sub_shamt)) + ;;; if shamt == 0 low_part2 will overflow we should zero instead. + (low_part3 XReg (gen_select_xreg (cmp_eqz shamt) (zero_reg) low_part2)) + (low XReg (rv_or low_part1 low_part3)) + (high_part1 XReg (rv_srl (value_regs_get x 1) shamt)) + (high_part2 XReg (rv_sll (value_regs_get x 0) len_sub_shamt)) + (high_part3 XReg (gen_select_xreg (cmp_eqz shamt) (zero_reg) high_part2)) + (high XReg (rv_or high_part1 high_part3)) + (const64 XReg (imm $I64 64)) + (shamt_128 XReg (rv_andi (value_regs_get y 0) (imm12_const 127)))) + ;; right now we only rotate less than 64 bits. + ;; if shamt is greater than or equal 64 , we should switch low and high. + (gen_select_regs + (cmp_geu shamt_128 const64) + (value_regs high low) + (value_regs low high) + ))) + +;;;; Rules for `fabs` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +(rule 0 (lower (has_type (ty_supported_float ty) (fabs x))) + (rv_fabs ty x)) + +(rule 1 (lower (has_type (ty_supported_vec ty) (fabs x))) + (rv_vfabs_v x (unmasked) ty)) + +;;;; Rules for `fneg` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +(rule 0 (lower (has_type (ty_supported_float ty) (fneg x))) + (rv_fneg ty x)) + +(rule 1 (lower (has_type (ty_supported_vec ty) (fneg x))) + (rv_vfneg_v x (unmasked) ty)) + +;;;; Rules for `fcopysign` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +(rule 0 (lower (has_type (ty_supported_float ty) (fcopysign x y))) + (rv_fsgnj ty x y)) + +(rule 1 (lower (has_type (ty_supported_vec ty) (fcopysign x y))) + (rv_vfsgnj_vv x y (unmasked) ty)) + +(rule 2 (lower (has_type (ty_supported_vec ty) (fcopysign x (splat y)))) + (rv_vfsgnj_vf x y (unmasked) ty)) + +;;;; Rules for `fma` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; RISC-V has 4 FMA instructions that do a slightly different computation. +;; +;; fmadd: (rs1 * rs2) + rs3 +;; fmsub: (rs1 * rs2) - rs3 +;; fnmadd: -(rs1 * rs2) - rs3 +;; fnmsub: -(rs1 * rs2) + rs3 +;; +;; Additionally there are vector versions of these instructions with slightly different names. +;; The vector instructions also have two variants each. 
`.vv` and `.vf`, where `.vv` variants +;; take two vector operands and the `.vf` variants take a vector operand and a scalar operand. +;; +;; Due to this, variation they receive the arguments in a different order. So we need to swap +;; the arguments below. +;; +;; vfmacc: vd[i] = +(vs1[i] * vs2[i]) + vd[i] +;; vfmsac: vd[i] = +(vs1[i] * vs2[i]) - vd[i] +;; vfnmacc: vd[i] = -(vs1[i] * vs2[i]) - vd[i] +;; vfnmsac: vd[i] = -(vs1[i] * vs2[i]) + vd[i] + +(type IsFneg (enum (Result (negate u64) (value Value)))) + +(decl pure is_fneg (Value) IsFneg) +(rule 1 (is_fneg (fneg x)) (IsFneg.Result 1 x)) +(rule 0 (is_fneg x) (IsFneg.Result 0 x)) + +(decl pure is_fneg_neg (IsFneg) u64) +(rule (is_fneg_neg (IsFneg.Result n _)) n) + +(decl pure get_fneg_value (IsFneg) Value) +(rule (get_fneg_value (IsFneg.Result _ v)) v) + +(rule (lower (has_type ty (fma x_src y_src z_src))) + (let + ((x_res IsFneg (is_fneg x_src)) + (y_res IsFneg (is_fneg y_src)) + (z_res IsFneg (is_fneg z_src)) + (x Value (get_fneg_value x_res)) + (y Value (get_fneg_value y_res)) + (z Value (get_fneg_value z_res))) + (rv_fma ty (u64_xor (is_fneg_neg x_res) (is_fneg_neg y_res)) (is_fneg_neg z_res) x y z))) + +; parity arguments indicate whether to negate the x*y term or the z term, respectively +(decl rv_fma (Type u64 u64 Value Value Value) InstOutput) +(rule 0 (rv_fma (ty_supported_float ty) 0 0 x y z) (rv_fmadd ty (FRM.RNE) x y z)) +(rule 0 (rv_fma (ty_supported_float ty) 0 1 x y z) (rv_fmsub ty (FRM.RNE) x y z)) +(rule 0 (rv_fma (ty_supported_float ty) 1 0 x y z) (rv_fnmsub ty (FRM.RNE) x y z)) +(rule 0 (rv_fma (ty_supported_float ty) 1 1 x y z) (rv_fnmadd ty (FRM.RNE) x y z)) +(rule 1 (rv_fma (ty_supported_vec ty) 0 0 x y z) (rv_vfmacc_vv z y x (unmasked) ty)) +(rule 1 (rv_fma (ty_supported_vec ty) 0 1 x y z) (rv_vfmsac_vv z y x (unmasked) ty)) +(rule 1 (rv_fma (ty_supported_vec ty) 1 0 x y z) (rv_vfnmsac_vv z y x (unmasked) ty)) +(rule 1 (rv_fma (ty_supported_vec ty) 1 1 x y z) (rv_vfnmacc_vv z y x (unmasked) ty)) +(rule 2 (rv_fma (ty_supported_vec ty) 0 0 (splat x) y z) (rv_vfmacc_vf z y x (unmasked) ty)) +(rule 2 (rv_fma (ty_supported_vec ty) 0 1 (splat x) y z) (rv_vfmsac_vf z y x (unmasked) ty)) +(rule 2 (rv_fma (ty_supported_vec ty) 1 0 (splat x) y z) (rv_vfnmsac_vf z y x (unmasked) ty)) +(rule 2 (rv_fma (ty_supported_vec ty) 1 1 (splat x) y z) (rv_vfnmacc_vf z y x (unmasked) ty)) +(rule 3 (rv_fma (ty_supported_vec ty) 0 0 x (splat y) z) (rv_vfmacc_vf z x y (unmasked) ty)) +(rule 3 (rv_fma (ty_supported_vec ty) 0 1 x (splat y) z) (rv_vfmsac_vf z x y (unmasked) ty)) +(rule 3 (rv_fma (ty_supported_vec ty) 1 0 x (splat y) z) (rv_vfnmsac_vf z x y (unmasked) ty)) +(rule 3 (rv_fma (ty_supported_vec ty) 1 1 x (splat y) z) (rv_vfnmacc_vf z x y (unmasked) ty)) + +;;;; Rules for `sqrt` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +(rule 0 (lower (has_type (ty_supported_float ty) (sqrt x))) + (rv_fsqrt ty (FRM.RNE) x)) + +(rule 1 (lower (has_type (ty_supported_vec ty) (sqrt x))) + (rv_vfsqrt_v x (unmasked) ty)) + +;;;; Rules for `AtomicRMW` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +(rule -1 + ;; + (lower + (has_type (valid_atomic_transaction ty) (atomic_rmw flags op addr x))) + (gen_atomic (get_atomic_rmw_op ty op) addr x (atomic_amo))) + +;;; for I8 and I16 +(rule 1 + (lower + (has_type (valid_atomic_transaction (fits_in_16 ty)) (atomic_rmw flags op addr x))) + (gen_atomic_rmw_loop op ty addr x)) + +;;;special for I8 and I16 max min etc. +;;;because I need uextend or sextend the value. 
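The two rules below differ only in how the narrow operand is widened, and that distinction matters: signed comparisons on sub-word values are only correct after sign extension, unsigned ones after zero extension. A small illustration of why:

// For signed 8-bit max, comparing zero-extended bit patterns goes wrong as
// soon as the sign bit is set; sign extension preserves the ordering.
fn widen_for_compare() {
    let a: i8 = -1; // bit pattern 0xff
    let b: i8 = 1;
    assert!((a as u8 as u64) > (b as u8 as u64)); // zero-extended: -1 looks larger
    assert!((a as i64) < (b as i64));             // sign-extended: correct
}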
+(rule 2 + (lower + (has_type (valid_atomic_transaction (fits_in_16 ty)) (atomic_rmw flags (is_atomic_rmw_max_etc op $true) addr x))) + (gen_atomic_rmw_loop op ty addr (sext x))) + + +(rule 2 + ;; + (lower + (has_type (valid_atomic_transaction (fits_in_16 ty)) (atomic_rmw flags (is_atomic_rmw_max_etc op $false) addr x))) + ;; + (gen_atomic_rmw_loop op ty addr (zext x))) + +;;;;; Rules for `AtomicRmwOp.Sub` +(rule + (lower + (has_type (valid_atomic_transaction ty) (atomic_rmw flags (AtomicRmwOp.Sub) addr x))) + (let + ((tmp WritableReg (temp_writable_reg ty)) + (x2 Reg (rv_neg x))) + (gen_atomic (get_atomic_rmw_op ty (AtomicRmwOp.Add)) addr x2 (atomic_amo)))) + +(decl gen_atomic_rmw_loop (AtomicRmwOp Type XReg XReg) XReg) +(rule + (gen_atomic_rmw_loop op ty addr x) + (let + ((dst WritableXReg (temp_writable_xreg)) + (t0 WritableXReg (temp_writable_xreg)) + (_ Unit (emit (MInst.AtomicRmwLoop (gen_atomic_offset addr ty) op dst ty (gen_atomic_p addr ty) x t0)))) + (writable_reg_to_reg dst))) + +;;;;; Rules for `AtomicRmwOp.Nand` +(rule + (lower + (has_type (valid_atomic_transaction ty) (atomic_rmw flags (AtomicRmwOp.Nand) addr x))) + (gen_atomic_rmw_loop (AtomicRmwOp.Nand) ty addr x)) + +(decl is_atomic_rmw_max_etc (AtomicRmwOp bool) AtomicRmwOp) +(extern extractor is_atomic_rmw_max_etc is_atomic_rmw_max_etc) + +;;;;; Rules for `atomic load`;;;;;;;;;;;;;;;;; +(rule + (lower (has_type (valid_atomic_transaction ty) (atomic_load flags p))) + (gen_atomic_load p ty)) + + +;;;;; Rules for `atomic store`;;;;;;;;;;;;;;;;; +(rule + (lower (atomic_store flags src @ (value_type (valid_atomic_transaction ty)) p)) + (gen_atomic_store p ty src)) + +(decl gen_atomic_offset (XReg Type) XReg) +(rule 1 (gen_atomic_offset p (fits_in_16 ty)) + (rv_slli (rv_andi p (imm12_const 3)) (imm12_const 3))) + +(rule (gen_atomic_offset p _) + (zero_reg)) + +(decl gen_atomic_p (XReg Type) XReg) +(rule 1 (gen_atomic_p p (fits_in_16 ty)) + (rv_andi p (imm12_const -4))) + +(rule (gen_atomic_p p _) + p) + + +;;;;; Rules for `atomic cas`;;;;;;;;;;;;;;;;; +(rule + (lower (has_type (valid_atomic_transaction ty) (atomic_cas flags p e x))) + (let + ((t0 WritableReg (temp_writable_reg ty)) + (dst WritableReg (temp_writable_reg ty)) + (_ Unit (emit (MInst.AtomicCas (gen_atomic_offset p ty) t0 dst (zext e) (gen_atomic_p p ty) x ty)))) + (writable_reg_to_reg dst))) + +;;;;; Rules for `ireduce`;;;;;;;;;;;;;;;;; +(rule + (lower (has_type ty (ireduce x))) + (value_regs_get x 0)) + +;;;;; Rules for `fpromote`;;;;;;;;;;;;;;;;; +(rule (lower (fpromote x)) + (rv_fcvtds x)) + +;;;;; Rules for `fvpromote_low`;;;;;;;;;;;; + +(rule (lower (has_type (ty_supported_vec ty) (fvpromote_low x))) + (if-let half_ty (ty_half_width ty)) + (rv_vfwcvt_f_f_v x (unmasked) (vstate_mf2 half_ty))) + +;;;;; Rules for `fdemote`;;;;;;;;;;;;;;;;;; +(rule (lower (fdemote x)) + (rv_fcvtsd (FRM.RNE) x)) + +;;;;; Rules for `fvdemote`;;;;;;;;;;;;;;;;; + +;; `vfncvt...` leaves the upper bits of the register undefined so +;; we need to zero them out. 
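The `gen_atomic_p`/`gen_atomic_offset` helpers above split a byte or halfword address into the aligned word the LR/SC loop operates on and the bit offset of the value inside that word; a scalar sketch of that split:

// gen_atomic_p:      p & !3        (andi p, -4)
// gen_atomic_offset: (p & 3) << 3  (slli(andi(p, 3), 3))
fn atomic_addr_parts(p: u64) -> (u64, u64) {
    let aligned_word = p & !3;
    let bit_offset = (p & 3) << 3;
    (aligned_word, bit_offset)
}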
+(rule (lower (has_type (ty_supported_vec ty @ $F32X4) (fvdemote x))) + (if-let zero (i8_to_imm5 0)) + (let ((narrow VReg (rv_vfncvt_f_f_w x (unmasked) (vstate_mf2 ty))) + (mask VReg (gen_vec_mask 0xC))) + (rv_vmerge_vim narrow zero mask ty))) + + +;;;;; Rules for for float arithmetic + + +;;;; Rules for `fadd` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(rule 0 (lower (has_type (ty_supported_float ty) (fadd x y))) + (rv_fadd ty (FRM.RNE) x y)) + +(rule 1 (lower (has_type (ty_supported_vec ty) (fadd x y))) + (rv_vfadd_vv x y (unmasked) ty)) + +(rule 2 (lower (has_type (ty_supported_vec ty) (fadd x (splat y)))) + (rv_vfadd_vf x y (unmasked) ty)) + +(rule 3 (lower (has_type (ty_supported_vec ty) (fadd (splat x) y))) + (rv_vfadd_vf y x (unmasked) ty)) + + +;;;; Rules for `fsub` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +(rule 0 (lower (has_type (ty_supported_float ty) (fsub x y))) + (rv_fsub ty (FRM.RNE) x y)) + +(rule 1 (lower (has_type (ty_supported_vec ty) (fsub x y))) + (rv_vfsub_vv x y (unmasked) ty)) + +(rule 2 (lower (has_type (ty_supported_vec ty) (fsub x (splat y)))) + (rv_vfsub_vf x y (unmasked) ty)) + +(rule 3 (lower (has_type (ty_supported_vec ty) (fsub (splat x) y))) + (rv_vfrsub_vf y x (unmasked) ty)) + +;;;; Rules for `fmul` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +(rule 0 (lower (has_type (ty_supported_float ty) (fmul x y))) + (rv_fmul ty (FRM.RNE) x y)) + +(rule 1 (lower (has_type (ty_supported_vec ty) (fmul x y))) + (rv_vfmul_vv x y (unmasked) ty)) + +(rule 2 (lower (has_type (ty_supported_vec ty) (fmul x (splat y)))) + (rv_vfmul_vf x y (unmasked) ty)) + +(rule 3 (lower (has_type (ty_supported_vec ty) (fmul (splat x) y))) + (rv_vfmul_vf y x (unmasked) ty)) + + +;;;; Rules for `fdiv` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +(rule 0 (lower (has_type (ty_supported_float ty) (fdiv x y))) + (rv_fdiv ty (FRM.RNE) x y)) + +(rule 1 (lower (has_type (ty_supported_vec ty) (fdiv x y))) + (rv_vfdiv_vv x y (unmasked) ty)) + +(rule 2 (lower (has_type (ty_supported_vec ty) (fdiv x (splat y)))) + (rv_vfdiv_vf x y (unmasked) ty)) + +(rule 3 (lower (has_type (ty_supported_vec ty) (fdiv (splat x) y))) + (rv_vfrdiv_vf y x (unmasked) ty)) + +;;;; Rules for `fmin` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; RISC-V's `fmin` instruction returns the number input if one of inputs is a +;; NaN. We handle this by manually checking if one of the inputs is a NaN +;; and selecting based on that result. +(rule 0 (lower (has_type (ty_supported_float ty) (fmin x y))) + (let (;; Check if both inputs are not nan. + (is_ordered FloatCompare (fcmp_to_float_compare (FloatCC.Ordered) ty x y)) + ;; `fadd` returns a nan if any of the inputs is a NaN. + (nan FReg (rv_fadd ty (FRM.RNE) x y)) + (min FReg (rv_fmin ty x y))) + (gen_select_freg is_ordered min nan))) + +;; With Zfa we can use the special `fminm` that precisely matches the expected +;; NaN behavior. +(rule 1 (lower (has_type (ty_supported_float ty) (fmin x y))) + (if-let $true (has_zfa)) + (rv_fminm ty x y)) + +;; vfmin does almost the right thing, but it does not handle NaN's correctly. +;; We should return a NaN if any of the inputs is a NaN, but vfmin returns the +;; number input instead. +;; +;; TODO: We can improve this by using a masked `fmin` instruction that modifies +;; the canonical nan register. That way we could avoid the `vmerge.vv` instruction. 
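A scalar restatement of the `fmin` strategy above, assuming ordinary IEEE `f64` operations; which particular NaN the `fadd` produces is a hardware detail, the point is only that a NaN is selected whenever either input is one:

// Cranelift fmin semantics on top of RISC-V-style fmin/fadd: RISC-V fmin
// returns the non-NaN operand, so select on the "both ordered" condition.
fn fmin_cranelift(x: f64, y: f64) -> f64 {
    let is_ordered = !x.is_nan() && !y.is_nan(); // fcmp Ordered
    let nan = x + y;    // fadd propagates a NaN if either input is NaN
    let min = x.min(y); // like RISC-V fmin, f64::min ignores a single NaN
    if is_ordered { min } else { nan }
}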
+(rule 2 (lower (has_type (ty_supported_vec ty) (fmin x y))) + (let ((is_not_nan VReg (gen_fcmp_mask ty (FloatCC.Ordered) x y)) + (nan XReg (imm $I64 (canonical_nan_u64 (lane_type ty)))) + (vec_nan VReg (rv_vmv_vx nan ty)) + (min VReg (rv_vfmin_vv x y (unmasked) ty))) + (rv_vmerge_vvm vec_nan min is_not_nan ty))) + +;;;; Rules for `fmax` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; RISC-V's `fmax` instruction returns the number input if one of inputs is a +;; NaN. We handle this by manually checking if one of the inputs is a NaN +;; and selecting based on that result. +(rule 0 (lower (has_type (ty_supported_float ty) (fmax x y))) + (let (;; Check if both inputs are not nan. + (is_ordered FloatCompare (fcmp_to_float_compare (FloatCC.Ordered) ty x y)) + ;; `fadd` returns a NaN if any of the inputs is a NaN. + (nan FReg (rv_fadd ty (FRM.RNE) x y)) + (max FReg (rv_fmax ty x y))) + (gen_select_freg is_ordered max nan))) + +;; With Zfa we can use the special `fmaxm` that precisely matches the expected +;; NaN behavior. +(rule 1 (lower (has_type (ty_supported_float ty) (fmax x y))) + (if-let $true (has_zfa)) + (rv_fmaxm ty x y)) + +;; vfmax does almost the right thing, but it does not handle NaN's correctly. +;; We should return a NaN if any of the inputs is a NaN, but vfmax returns the +;; number input instead. +;; +;; TODO: We can improve this by using a masked `fmax` instruction that modifies +;; the canonical nan register. That way we could avoid the `vmerge.vv` instruction. +(rule 2 (lower (has_type (ty_supported_vec ty) (fmax x y))) + (let ((is_not_nan VReg (gen_fcmp_mask ty (FloatCC.Ordered) x y)) + (nan XReg (imm $I64 (canonical_nan_u64 (lane_type ty)))) + (vec_nan VReg (rv_vmv_vx nan ty)) + (max VReg (rv_vfmax_vv x y (unmasked) ty))) + (rv_vmerge_vvm vec_nan max is_not_nan ty))) + +;;;;; Rules for `stack_addr`;;;;;;;;; +(rule + (lower (stack_addr ss offset)) + (gen_stack_addr ss offset)) + +;;;;; Rules for `select`;;;;;;;;; + +;; Manually matching (iconst 0) here is a bit of a hack. We can't do that as part +;; of the iconst rule because that runs into regalloc issues. gen_select_xreg +;; has some optimizations based on the use of the zero register so we have to +;; manually match it here. +(rule 5 (lower (has_type (ty_int_ref_scalar_64 _) (select c (i64_from_iconst 0) y))) + (gen_select_xreg (is_nonzero_cmp c) (zero_reg) y)) + +(rule 4 (lower (has_type (ty_int_ref_scalar_64 _) (select c x (i64_from_iconst 0)))) + (gen_select_xreg (is_nonzero_cmp c) x (zero_reg))) + +(rule 3 (lower (has_type (ty_int_ref_scalar_64 _) (select c x y))) + (gen_select_xreg (is_nonzero_cmp c) x y)) + +(rule 2 (lower (has_type $I128 (select c x y))) + (gen_select_regs (is_nonzero_cmp c) x y)) + +(rule 1 (lower (has_type (ty_supported_vec _) (select c x y))) + (gen_select_vreg (is_nonzero_cmp c) x y)) + +(rule 0 (lower (has_type (ty_supported_float _) (select c x y))) + (gen_select_freg (is_nonzero_cmp c) x y)) + +;;;;; Rules for `bitselect`;;;;;;;;; + +;; Do a (c & x) | (~c & y) operation. +(rule 0 (lower (has_type (ty_int_ref_scalar_64 ty) (bitselect c x y))) + (let ((tmp_x XReg (rv_and c x)) + (c_inverse XReg (rv_not c)) + (tmp_y XReg (rv_and c_inverse y))) + (rv_or tmp_x tmp_y))) + +;; For vectors, we also do the same operation. +;; We can technically use any type in the bitwise operations, but prefer +;; using the type of the inputs so that we avoid emitting unnecessary +;; `vsetvl` instructions. it's likely that the vector unit is already +;; configured for that type. 
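The same select-on-a-mask idea in scalar form: `bitselect` takes bits from `x` where the condition mask has ones and from `y` where it has zeros, which is exactly the three-instruction sequence in the rule above:

fn bitselect(c: u64, x: u64, y: u64) -> u64 {
    (c & x) | (!c & y)
}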
+(rule 1 (lower (has_type (ty_supported_vec ty) (bitselect c x y))) + (let ((tmp_x VReg (rv_vand_vv c x (unmasked) ty)) + (c_inverse VReg (rv_vnot_v c (unmasked) ty)) + (tmp_y VReg (rv_vand_vv c_inverse y (unmasked) ty))) + (rv_vor_vv tmp_x tmp_y (unmasked) ty))) + +;; Special case for bitselects with cmp's as an input. +;; +;; This allows us to skip the mask expansion step and use the more efficient +;; vmerge.vvm instruction. +;; +;; We should be careful to ensure that the mask and the vmerge have the +;; same type. So that we don't generate a mask with length 16 (i.e. for i8x16), and then +;; only copy the first few lanes of the result to the destination register because +;; the bitselect has a different length (i.e. i64x2). +;; +;; See: https://github.com/bytecodealliance/wasmtime/issues/8131 + +(rule 2 (lower (has_type (ty_supported_vec _ty) (bitselect (icmp cc a @ (value_type (ty_supported_vec cmp_ty)) b) x y))) + (let ((mask VReg (gen_icmp_mask cmp_ty cc a b))) + (rv_vmerge_vvm y x mask cmp_ty))) + +(rule 2 (lower (has_type (ty_supported_vec _ty) (bitselect (fcmp cc a @ (value_type (ty_supported_vec cmp_ty)) b) x y))) + (let ((mask VReg (gen_fcmp_mask cmp_ty cc a b))) + (rv_vmerge_vvm y x mask cmp_ty))) + +(rule 2 (lower (has_type (ty_supported_vec _ty) (bitselect (bitcast _ (fcmp cc a @ (value_type (ty_supported_vec cmp_ty)) b)) x y))) + (let ((mask VReg (gen_fcmp_mask cmp_ty cc a b))) + (rv_vmerge_vvm y x mask cmp_ty))) + +(rule 2 (lower (has_type (ty_supported_vec _ty) (bitselect (bitcast _ (icmp cc a @ (value_type (ty_supported_vec cmp_ty)) b)) x y))) + (let ((mask VReg (gen_icmp_mask cmp_ty cc a b))) + (rv_vmerge_vvm y x mask cmp_ty))) + + +;;;;; Rules for `isplit`;;;;;;;;; +(rule + (lower (isplit x)) + (let + ((t1 XReg (value_regs_get x 0)) + (t2 XReg (value_regs_get x 1))) + (output_pair t1 t2))) + +;;;;; Rules for `iconcat`;;;;;;;;; +(rule + (lower (has_type $I128 (iconcat x y))) + (let + ((t1 XReg x) + (t2 XReg y)) + (value_regs t1 t2))) + +;; Special-case the lowering of an `isplit` of a 128-bit multiply where the +;; lower bits of the result are discarded and the operands are sign or zero +;; extended. This maps directly to `umulh` and `smulh`. 
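The pattern described above is a "high multiply": a 128-bit product of extended 64-bit operands whose low half is never used. A scalar sketch of what the two rules that follow compute:

fn umulh(x: u64, y: u64) -> u64 {
    ((x as u128 * y as u128) >> 64) as u64 // maps to `mulhu`
}

fn smulh(x: i64, y: i64) -> i64 {
    ((x as i128 * y as i128) >> 64) as i64 // maps to `mulh`
}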
+(rule 1 (lower i @ (isplit (has_type $I128 (imul (uextend x) (uextend y))))) + (if-let (first_result lo) i) + (if-let $true (value_is_unused lo)) + (output_pair (invalid_reg) (rv_mulhu (zext x) (zext y)))) + +(rule 1 (lower i @ (isplit (has_type $I128 (imul (sextend x) (sextend y))))) + (if-let (first_result lo) i) + (if-let $true (value_is_unused lo)) + (output_pair (invalid_reg) (rv_mulh (sext x) (sext y)))) + +;;;;; Rules for `smax`;;;;;;;;; + +(rule 0 (lower (has_type (fits_in_64 ty) (smax x y))) + (let ((x XReg (sext x)) + (y XReg (sext y))) + (gen_select_xreg (cmp_gt x y) x y))) + +(rule 1 (lower (has_type $I128 (smax x y))) + (gen_select_regs (icmp_to_int_compare (IntCC.SignedGreaterThan) x y) x y)) + +(rule 2 (lower (has_type (ty_supported_vec ty) (smax x y))) + (rv_vmax_vv x y (unmasked) ty)) + +(rule 3 (lower (has_type (ty_supported_vec ty) (smax x (splat y)))) + (rv_vmax_vx x y (unmasked) ty)) + +(rule 4 (lower (has_type (ty_supported_vec ty) (smax (splat x) y))) + (rv_vmax_vx y x (unmasked) ty)) + +;;;;; Rules for `smin`;;;;;;;;; + +(rule 0 (lower (has_type (fits_in_64 ty) (smin x y))) + (let ((x XReg (sext x)) + (y XReg (sext y))) + (gen_select_xreg (cmp_lt x y) x y))) + +(rule 1 (lower (has_type $I128 (smin x y))) + (gen_select_regs (icmp_to_int_compare (IntCC.SignedLessThan) x y) x y)) + +(rule 2 (lower (has_type (ty_supported_vec ty) (smin x y))) + (rv_vmin_vv x y (unmasked) ty)) + +(rule 3 (lower (has_type (ty_supported_vec ty) (smin x (splat y)))) + (rv_vmin_vx x y (unmasked) ty)) + +(rule 4 (lower (has_type (ty_supported_vec ty) (smin (splat x) y))) + (rv_vmin_vx y x (unmasked) ty)) + +;;;;; Rules for `umax`;;;;;;;;; + +(rule 0 (lower (has_type (fits_in_64 ty) (umax x y))) + (let ((x XReg (zext x)) + (y XReg (zext y))) + (gen_select_xreg (cmp_gtu x y) x y))) + +(rule 1 (lower (has_type $I128 (umax x y))) + (gen_select_regs (icmp_to_int_compare (IntCC.UnsignedGreaterThan) x y) x y)) + +(rule 2 (lower (has_type (ty_supported_vec ty) (umax x y))) + (rv_vmaxu_vv x y (unmasked) ty)) + +(rule 3 (lower (has_type (ty_supported_vec ty) (umax x (splat y)))) + (rv_vmaxu_vx x y (unmasked) ty)) + +(rule 4 (lower (has_type (ty_supported_vec ty) (umax (splat x) y))) + (rv_vmaxu_vx y x (unmasked) ty)) + +;;;;; Rules for `umin`;;;;;;;;; + +(rule 0 (lower (has_type (fits_in_64 ty) (umin x y))) + (let ((x XReg (zext x)) + (y XReg (zext y))) + (gen_select_xreg (cmp_ltu x y) x y))) + +(rule 1 (lower (has_type $I128 (umin x y))) + (gen_select_regs (icmp_to_int_compare (IntCC.UnsignedLessThan) x y) x y)) + +(rule 2 (lower (has_type (ty_supported_vec ty) (umin x y))) + (rv_vminu_vv x y (unmasked) ty)) + +(rule 3 (lower (has_type (ty_supported_vec ty) (umin x (splat y)))) + (rv_vminu_vx x y (unmasked) ty)) + +(rule 4 (lower (has_type (ty_supported_vec ty) (umin (splat x) y))) + (rv_vminu_vx y x (unmasked) ty)) + + +;;;;; Rules for `debugtrap`;;;;;;;;; +(rule + (lower (debugtrap)) + (side_effect (SideEffectNoResult.Inst (MInst.EBreak)))) + +;;;;; Rules for `fence`;;;;;;;;; +(rule + (lower (fence)) + (side_effect (SideEffectNoResult.Inst (MInst.Fence 15 15)))) + +;;;;; Rules for `trap`;;;;;;;;; +(rule + (lower (trap code)) + (udf code)) + +;;;;; Rules for `uload8`;;;;;;;;; +(rule (lower (uload8 flags addr offset)) + (gen_load (amode addr offset) (LoadOP.Lbu) flags)) + +;;;;; Rules for `sload8`;;;;;;;;; +(rule (lower (sload8 flags addr offset)) + (gen_load (amode addr offset) (LoadOP.Lb) flags)) + +;;;;; Rules for `uload16`;;;;;;;;; +(rule (lower (uload16 flags addr offset)) + (gen_load (amode 
addr offset) (LoadOP.Lhu) flags)) + +;;;;; Rules for `iload16`;;;;;;;;; +(rule (lower (sload16 flags addr offset)) + (gen_load (amode addr offset) (LoadOP.Lh) flags)) + +;;;;; Rules for `uload32`;;;;;;;;; +(rule (lower (uload32 flags addr offset)) + (gen_load (amode addr offset) (LoadOP.Lwu) flags)) + +;;;;; Rules for `sload32`;;;;;;;;; +(rule (lower (sload32 flags addr offset)) + (gen_load (amode addr offset) (LoadOP.Lw) flags)) + +;;;;; Rules for `load`;;;;;;;;; +(rule (lower (has_type ty (load flags addr offset))) + (gen_load (amode addr offset) (load_op ty) flags)) + +(rule 1 (lower (has_type $I128 (load flags addr offset))) + (if-let offset_plus_8 (s32_add_fallible offset 8)) + (let ((lo XReg (gen_load (amode addr offset) (LoadOP.Ld) flags)) + (hi XReg (gen_load (amode addr offset_plus_8) (LoadOP.Ld) flags))) + (value_regs lo hi))) + +(rule 2 (lower (has_type (ty_supported_vec ty) (load flags addr offset))) + (let ((eew VecElementWidth (element_width_from_type ty)) + (amode AMode (amode addr offset))) + (vec_load eew (VecAMode.UnitStride amode) flags (unmasked) ty))) + +;;;;; Rules for Load + Extend Combos ;;;;;;;;; + +;; These rules cover the special loads that load a 64bit value and do some sort of extension. +;; We don't have any special instructions to do this, so just load the 64 bits as a vector, and +;; do a SEW/2 extension. This only reads half width elements from the source vector register +;; extends it, and writes the back the full register. + +(decl gen_load64_extend (Type ExtendOp MemFlags AMode) VReg) + +(rule (gen_load64_extend ty (ExtendOp.Signed) flags amode) + (let ((eew VecElementWidth (element_width_from_type $I64)) + (load_state VState (vstate_from_type $I64)) + (loaded VReg (vec_load eew (VecAMode.UnitStride amode) flags (unmasked) load_state))) + (rv_vsext_vf2 loaded (unmasked) ty))) + +(rule (gen_load64_extend ty (ExtendOp.Zero) flags amode) + (let ((eew VecElementWidth (element_width_from_type $I64)) + (load_state VState (vstate_from_type $I64)) + (loaded VReg (vec_load eew (VecAMode.UnitStride amode) flags (unmasked) load_state))) + (rv_vzext_vf2 loaded (unmasked) ty))) + +;;;;; Rules for `uload8x8`;;;;;;;;;; +(rule (lower (has_type (ty_supported_vec ty @ $I16X8) (uload8x8 flags addr offset))) + (gen_load64_extend ty (ExtendOp.Zero) flags (amode addr offset))) + +;;;;; Rules for `uload16x4`;;;;;;;;; +(rule (lower (has_type (ty_supported_vec ty @ $I32X4) (uload16x4 flags addr offset))) + (gen_load64_extend ty (ExtendOp.Zero) flags (amode addr offset))) + +;;;;; Rules for `uload32x2`;;;;;;;;; +(rule (lower (has_type (ty_supported_vec ty @ $I64X2) (uload32x2 flags addr offset))) + (gen_load64_extend ty (ExtendOp.Zero) flags (amode addr offset))) + +;;;;; Rules for `sload8x8`;;;;;;;;;; +(rule (lower (has_type (ty_supported_vec ty @ $I16X8) (sload8x8 flags addr offset))) + (gen_load64_extend ty (ExtendOp.Signed) flags (amode addr offset))) + +;;;;; Rules for `sload16x4`;;;;;;;;; +(rule (lower (has_type (ty_supported_vec ty @ $I32X4) (sload16x4 flags addr offset))) + (gen_load64_extend ty (ExtendOp.Signed) flags (amode addr offset))) + +;;;;; Rules for `sload32x2`;;;;;;;;; +(rule (lower (has_type (ty_supported_vec ty @ $I64X2) (sload32x2 flags addr offset))) + (gen_load64_extend ty (ExtendOp.Signed) flags (amode addr offset))) + +;;;;; Rules for `istore8`;;;;;;;;; +(rule (lower (istore8 flags src addr offset)) + (rv_store (amode addr offset) (StoreOP.Sb) flags src)) + +;;;;; Rules for `istore16`;;;;;;;;; +(rule (lower (istore16 flags src addr offset)) + (rv_store 
(amode addr offset) (StoreOP.Sh) flags src)) + +;;;;; Rules for `istore32`;;;;;;;;; +(rule (lower (istore32 flags src addr offset)) + (rv_store (amode addr offset) (StoreOP.Sw) flags src)) + +;;;;; Rules for `store`;;;;;;;;; +(rule (lower (store flags src @ (value_type ty) addr offset)) + (gen_store (amode addr offset) flags src)) + +(rule 1 (lower (store flags src @ (value_type $I128) addr offset)) + (if-let offset_plus_8 (s32_add_fallible offset 8)) + (let ((_ InstOutput (rv_store (amode addr offset) (StoreOP.Sd) flags (value_regs_get src 0)))) + (rv_store (amode addr offset_plus_8) (StoreOP.Sd) flags (value_regs_get src 1)))) + +(rule 2 (lower (store flags src @ (value_type (ty_supported_vec ty)) addr offset)) + (let ((eew VecElementWidth (element_width_from_type ty)) + (amode AMode (amode addr offset))) + (vec_store eew (VecAMode.UnitStride amode) src flags (unmasked) ty))) + + +;;;;; Rules for `icmp`;;;;;;;;; + +;; 8-64 bit comparisons. Mostly fall back onto `IntegerCompare` and then +;; materializing that, but before that happens try to match some +;; constant-related patterns + +(rule 0 (lower (icmp cc x @ (value_type (fits_in_64 ty)) y)) + (lower_icmp cc x y)) + +(decl lower_icmp (IntCC Value Value) XReg) +(rule 0 (lower_icmp cc x y) + (lower_int_compare (icmp_to_int_compare cc x y))) + +;; a == $imm => seqz(xori(..)) +(rule 1 (lower_icmp (IntCC.Equal) x y) + (if-let (i64_from_iconst (i64_nonzero (imm12_from_i64 imm))) y) + (rv_seqz (rv_xori (sext x) imm))) +(rule 2 (lower_icmp (IntCC.Equal) x y) + (if-let (i64_from_iconst (i64_nonzero (imm12_from_i64 imm))) x) + (rv_seqz (rv_xori (sext y) imm))) + +;; a != $imm => snez(xori(..)) +(rule 1 (lower_icmp (IntCC.NotEqual) x y) + (if-let (i64_from_iconst (i64_nonzero (imm12_from_i64 imm))) y) + (rv_snez (rv_xori (sext x) imm))) +(rule 2 (lower_icmp (IntCC.NotEqual) x y) + (if-let (i64_from_iconst (i64_nonzero (imm12_from_i64 imm))) x) + (rv_snez (rv_xori (sext y) imm))) + +;; a < $imm => slti(..) +(rule 1 (lower_icmp (IntCC.SignedLessThan) x y) + (if-let (i64_from_iconst (i64_nonzero (imm12_from_i64 imm))) y) + (rv_slti (sext x) imm)) +(rule 1 (lower_icmp (IntCC.SignedGreaterThan) x y) + (if-let (i64_from_iconst (i64_nonzero (imm12_from_i64 imm))) x) + (rv_slti (sext y) imm)) +(rule 1 (lower_icmp (IntCC.UnsignedLessThan) x y) + (if-let (u64_from_iconst (u64_nonzero (imm12_from_u64 imm))) y) + (rv_sltiu (zext x) imm)) +(rule 1 (lower_icmp (IntCC.UnsignedGreaterThan) x y) + (if-let (u64_from_iconst (u64_nonzero (imm12_from_u64 imm))) x) + (rv_sltiu (zext y) imm)) + +;; a >= $imm => !(a < $imm) +(rule 2 (lower_icmp cc @ (IntCC.SignedGreaterThanOrEqual) x y) + (if-let (i64_from_iconst (i64_nonzero (imm12_from_i64 _))) y) + (rv_xori (lower_icmp (intcc_complement cc) x y) (imm12_const 1))) +(rule 2 (lower_icmp cc @ (IntCC.UnsignedGreaterThanOrEqual) x y) + (if-let (u64_from_iconst (u64_nonzero (imm12_from_u64 _))) y) + (rv_xori (lower_icmp (intcc_complement cc) x y) (imm12_const 1))) + +;; Materializes an `IntegerCompare` bundle directly into an `XReg` with a 0 +;; or 1 value. 
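A scalar paraphrase of the constant-comparison tricks above; the register-register forms are handled by the `lower_int_compare` helper declared next:

// a == imm  =>  seqz(xori(a, imm));  a >= imm  =>  !(a < imm), where the
// "not" of a 0/1 value is an xori with 1.
fn icmp_eq_imm(a: i64, imm: i64) -> u64 {
    ((a ^ imm) == 0) as u64
}

fn icmp_sge_imm(a: i64, imm: i64) -> u64 {
    ((a < imm) as u64) ^ 1
}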
+(decl lower_int_compare (IntegerCompare) XReg) + +;; x == y => x ^ y == 0 +(rule 0 (lower_int_compare (int_compare_decompose (IntCC.Equal) x y)) + (rv_seqz (rv_xor x y))) +(rule 1 (lower_int_compare (int_compare_decompose (IntCC.Equal) x (zero_reg))) + (rv_seqz x)) +(rule 2 (lower_int_compare (int_compare_decompose (IntCC.Equal) (zero_reg) y)) + (rv_seqz y)) +;; x != y => x ^ y != 0 +(rule 0 (lower_int_compare (int_compare_decompose (IntCC.NotEqual) x y)) + (rv_snez (rv_xor x y))) +(rule 1 (lower_int_compare (int_compare_decompose (IntCC.NotEqual) x (zero_reg))) + (rv_snez x)) +(rule 2 (lower_int_compare (int_compare_decompose (IntCC.NotEqual) (zero_reg) x)) + (rv_snez x)) +;; x < y => x < y +(rule (lower_int_compare (int_compare_decompose (IntCC.SignedLessThan) x y)) + (rv_slt x y)) +(rule (lower_int_compare (int_compare_decompose (IntCC.UnsignedLessThan) x y)) + (rv_sltu x y)) +;; x > y => y < x +(rule (lower_int_compare (int_compare_decompose (IntCC.SignedGreaterThan) x y)) + (rv_slt y x)) +(rule (lower_int_compare (int_compare_decompose (IntCC.UnsignedGreaterThan) x y)) + (rv_sltu y x)) +;; x <= y => !(y < x) +(rule (lower_int_compare (int_compare_decompose (IntCC.SignedLessThanOrEqual) x y)) + (rv_xori (rv_slt y x) (imm12_const 1))) +(rule (lower_int_compare (int_compare_decompose (IntCC.UnsignedLessThanOrEqual) x y)) + (rv_xori (rv_sltu y x) (imm12_const 1))) +;; x >= y => !(x < y) +(rule (lower_int_compare (int_compare_decompose (IntCC.SignedGreaterThanOrEqual) x y)) + (rv_xori (rv_slt x y) (imm12_const 1))) +(rule (lower_int_compare (int_compare_decompose (IntCC.UnsignedGreaterThanOrEqual) x y)) + (rv_xori (rv_sltu x y) (imm12_const 1))) + +;; 128-bit comparisons. +;; +;; Currently only `==`, `!=`, and `<` are implemented, and everything else +;; delegates to one of those. + +(rule 20 (lower (icmp cc x @ (value_type $I128) y)) + (lower_icmp_i128 cc x y)) + +(decl lower_icmp_i128 (IntCC ValueRegs ValueRegs) XReg) +(rule 0 (lower_icmp_i128 (IntCC.Equal) x y) + (let ((lo XReg (rv_xor (value_regs_get x 0) (value_regs_get y 0))) + (hi XReg (rv_xor (value_regs_get x 1) (value_regs_get y 1)))) + (rv_seqz (rv_or lo hi)))) +(rule 0 (lower_icmp_i128 (IntCC.NotEqual) x y) + (let ((lo XReg (rv_xor (value_regs_get x 0) (value_regs_get y 0))) + (hi XReg (rv_xor (value_regs_get x 1) (value_regs_get y 1)))) + (rv_snez (rv_or lo hi)))) + +;; swap args for `>` to use `<` instead +(rule 0 (lower_icmp_i128 cc @ (IntCC.SignedGreaterThan) x y) + (lower_icmp_i128 (intcc_swap_args cc) y x)) +(rule 0 (lower_icmp_i128 cc @ (IntCC.UnsignedGreaterThan) x y) + (lower_icmp_i128 (intcc_swap_args cc) y x)) + +;; complement `=`-related conditions to get ones that don't use `=`. +(rule 0 (lower_icmp_i128 cc @ (IntCC.SignedLessThanOrEqual) x y) + (rv_xori (lower_icmp_i128 (intcc_complement cc) x y) (imm12_const 1))) +(rule 0 (lower_icmp_i128 cc @ (IntCC.SignedGreaterThanOrEqual) x y) + (rv_xori (lower_icmp_i128 (intcc_complement cc) x y) (imm12_const 1))) +(rule 0 (lower_icmp_i128 cc @ (IntCC.UnsignedLessThanOrEqual) x y) + (rv_xori (lower_icmp_i128 (intcc_complement cc) x y) (imm12_const 1))) +(rule 0 (lower_icmp_i128 cc @ (IntCC.UnsignedGreaterThanOrEqual) x y) + (rv_xori (lower_icmp_i128 (intcc_complement cc) x y) (imm12_const 1))) + +;; Compare both the bottom and upper halves of the 128-bit values. If +;; the top half is equal use the bottom comparison, otherwise use the upper +;; comparison. 
Note that the lower comparison is always unsigned since if it's +;; used the top halves are all zeros and the semantic values are positive. +(rule 1 (lower_icmp_i128 cc x y) + (if-let (IntCC.UnsignedLessThan) (intcc_unsigned cc)) + (let ((x_lo Reg (value_regs_get x 0)) + (x_hi Reg (value_regs_get x 1)) + (y_lo Reg (value_regs_get y 0)) + (y_hi Reg (value_regs_get y 1)) + (top_cmp XReg (lower_int_compare (int_compare cc x_hi y_hi))) + (bottom_cmp XReg (rv_sltu x_lo y_lo))) + (gen_select_xreg (cmp_eqz (rv_xor x_hi y_hi)) bottom_cmp top_cmp))) + +;; vector icmp comparisons + +(rule 30 (lower (icmp cc x @ (value_type (ty_supported_vec ty)) y)) + (gen_expand_mask ty (gen_icmp_mask ty cc x y))) + +;;;;; Rules for `fcmp`;;;;;;;;; +(rule 0 (lower (fcmp cc x @ (value_type (ty_supported_float ty)) y)) + (lower_float_compare (fcmp_to_float_compare cc ty x y))) + +(decl lower_float_compare (FloatCompare) XReg) +(rule (lower_float_compare (FloatCompare.One r)) r) +(rule (lower_float_compare (FloatCompare.Zero r)) (rv_seqz r)) + +(rule 1 (lower (fcmp cc x @ (value_type (ty_supported_vec ty)) y)) + (gen_expand_mask ty (gen_fcmp_mask ty cc x y))) + +;;;;; Rules for `func_addr`;;;;;;;;; +(rule + (lower (func_addr (func_ref_data _ name _))) + (load_ext_name name 0)) + +;;;;; Rules for `fcvt_to_uint`;;;;;;;;; + +;; RISC-V float-to-integer conversion does not trap, but Cranelift semantics are +;; to trap. This manually performs checks for NaN and out-of-bounds values and +;; traps in such cases. +;; +;; TODO: could this perhaps be more optimal through inspection of the `fcsr`? +;; Unsure whether that needs to be preserved across function calls and/or would +;; cause other problems. Also unsure whether it's actually more performant. +(rule (lower (has_type ity (fcvt_to_uint v @ (value_type fty)))) + (let ((_ InstOutput (gen_trapz (rv_feq fty v v) (TrapCode.BadConversionToInteger))) + (min FReg (imm fty (fcvt_umin_bound fty $false))) + (_ InstOutput (gen_trapnz (rv_fle fty v min) (TrapCode.IntegerOverflow))) + (max FReg (imm fty (fcvt_umax_bound fty ity $false))) + (_ InstOutput (gen_trapnz (rv_fge fty v max) (TrapCode.IntegerOverflow)))) + (lower_inbounds_fcvt_to_uint ity fty v))) + +(decl lower_inbounds_fcvt_to_uint (Type Type FReg) XReg) +(rule 0 (lower_inbounds_fcvt_to_uint (fits_in_32 _) fty v) + (rv_fcvtwu fty (FRM.RTZ) v)) +(rule 1 (lower_inbounds_fcvt_to_uint $I64 fty v) + (rv_fcvtlu fty (FRM.RTZ) v)) + +;;;;; Rules for `fcvt_to_sint`;;;;;;;;; + +;; NB: see above with `fcvt_to_uint` as this is similar +(rule (lower (has_type ity (fcvt_to_sint v @ (value_type fty)))) + (let ((_ InstOutput (gen_trapz (rv_feq fty v v) (TrapCode.BadConversionToInteger))) + (min FReg (imm fty (fcvt_smin_bound fty ity $false))) + (_ InstOutput (gen_trapnz (rv_fle fty v min) (TrapCode.IntegerOverflow))) + (max FReg (imm fty (fcvt_smax_bound fty ity $false))) + (_ InstOutput (gen_trapnz (rv_fge fty v max) (TrapCode.IntegerOverflow)))) + (lower_inbounds_fcvt_to_sint ity fty v))) + +(decl lower_inbounds_fcvt_to_sint (Type Type FReg) XReg) +(rule 0 (lower_inbounds_fcvt_to_sint (fits_in_32 _) fty v) + (rv_fcvtw fty (FRM.RTZ) v)) +(rule 1 (lower_inbounds_fcvt_to_sint $I64 fty v) + (rv_fcvtl fty (FRM.RTZ) v)) + +;;;;; Rules for `fcvt_to_sint_sat`;;;;;;;;; + +(rule 0 (lower (has_type to (fcvt_to_sint_sat v @ (value_type (ty_supported_float from))))) + (handle_fcvt_to_int_nan from v (lower_fcvt_to_sint_sat from to v))) + +;; Lowers to a `rv_fcvt*` instruction but handles 8/16-bit cases where the +;; float is clamped before the 
conversion. +(decl lower_fcvt_to_sint_sat (Type Type FReg) XReg) +(rule 0 (lower_fcvt_to_sint_sat ty (fits_in_16 out_ty) v) + (let ((max FReg (imm ty (fcvt_smax_bound ty out_ty $true))) + (min FReg (imm ty (fcvt_smin_bound ty out_ty $true))) + (clamped FReg (rv_fmin ty max (rv_fmax ty min v)))) + (rv_fcvtw ty (FRM.RTZ) clamped))) +(rule 1 (lower_fcvt_to_sint_sat ty $I32 v) (rv_fcvtw ty (FRM.RTZ) v)) +(rule 1 (lower_fcvt_to_sint_sat ty $I64 v) (rv_fcvtl ty (FRM.RTZ) v)) + +(decl fcvt_smax_bound (Type Type bool) u64) +(extern constructor fcvt_smax_bound fcvt_smax_bound) +(decl fcvt_smin_bound (Type Type bool) u64) +(extern constructor fcvt_smin_bound fcvt_smin_bound) + +;; RISC-V float-to-int conversions generate the same output for NaN and +Inf, +;; but Cranelift semantics are to produce 0 for NaN instead. This helper +;; translates these semantics by taking the float being converted (with the type +;; specified) and the native RISC-V output as an `XReg`. The returned `XReg` +;; will be zeroed out if the float is NaN. +;; +;; This is done by comparing the float to itself, generating 0 if it's NaN. This +;; bit is then negated to become either all-ones or all-zeros which is then +;; and-ed against the native output. That'll produce all zeros if the input is +;; NaN or the native output otherwise. +(decl handle_fcvt_to_int_nan (Type FReg XReg) XReg) +(rule (handle_fcvt_to_int_nan ty freg xreg) + (let ((is_not_nan XReg (rv_feq ty freg freg)) + (not_nan_mask XReg (rv_neg is_not_nan))) + (rv_and xreg not_nan_mask))) + +(rule 1 (lower (has_type (ty_supported_vec _) (fcvt_to_sint_sat v @ (value_type from_ty)))) + (if-let zero (i8_to_imm5 0)) + (let ((is_nan VReg (rv_vmfne_vv v v (unmasked) from_ty)) + (cvt VReg (rv_vfcvt_rtz_x_f_v v (unmasked) from_ty))) + (rv_vmerge_vim cvt zero is_nan from_ty))) + +;;;;; Rules for `fcvt_to_uint_sat`;;;;;;;;; + +(rule 0 (lower (has_type to (fcvt_to_uint_sat v @ (value_type (ty_supported_float from))))) + (handle_fcvt_to_int_nan from v (lower_fcvt_to_uint_sat from to v))) + +;; Lowers to a `rv_fcvt*` instruction but handles 8/16-bit cases where the +;; float is clamped before the conversion. 
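+;; A rough scalar sketch of that 8/16-bit saturating path, assuming an f32 ->
+;; u8 conversion (illustrative only; the rules below use `fmin`/`fmax` plus a
+;; truncating `fcvt`, and NaN is zeroed separately by `handle_fcvt_to_int_nan`):
+;;
+;;     fn f32_to_u8_sat(v: f32) -> u8 {
+;;         if v.is_nan() {
+;;             return 0;
+;;         }
+;;         v.clamp(0.0, 255.0) as u8 // `as` truncates toward zero, like FRM.RTZ
+;;     }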
+(decl lower_fcvt_to_uint_sat (Type Type FReg) XReg) +(rule 0 (lower_fcvt_to_uint_sat ty (fits_in_16 out_ty) v) + (let ((max FReg (imm ty (fcvt_umax_bound ty out_ty $true))) + (min FReg (rv_fmvdx (zero_reg))) + (clamped FReg (rv_fmin ty max (rv_fmax ty min v)))) + (rv_fcvtwu ty (FRM.RTZ) clamped))) +(rule 1 (lower_fcvt_to_uint_sat ty $I32 v) (rv_fcvtwu ty (FRM.RTZ) v)) +(rule 1 (lower_fcvt_to_uint_sat ty $I64 v) (rv_fcvtlu ty (FRM.RTZ) v)) + +(decl fcvt_umax_bound (Type Type bool) u64) +(extern constructor fcvt_umax_bound fcvt_umax_bound) +(decl fcvt_umin_bound (Type bool) u64) +(extern constructor fcvt_umin_bound fcvt_umin_bound) + +(rule 1 (lower (has_type (ty_supported_vec _) (fcvt_to_uint_sat v @ (value_type from_ty)))) + (if-let zero (i8_to_imm5 0)) + (let ((is_nan VReg (rv_vmfne_vv v v (unmasked) from_ty)) + (cvt VReg (rv_vfcvt_rtz_xu_f_v v (unmasked) from_ty))) + (rv_vmerge_vim cvt zero is_nan from_ty))) + +;;;;; Rules for `fcvt_from_sint`;;;;;;;;; +(rule 0 (lower (has_type $F32 (fcvt_from_sint v @ (value_type (fits_in_16 ty))))) + (rv_fcvtsl (FRM.RNE) (sext v))) + +(rule 1 (lower (has_type $F32 (fcvt_from_sint v @ (value_type $I32)))) + (rv_fcvtsw (FRM.RNE) v)) + +(rule 1 (lower (has_type $F32 (fcvt_from_sint v @ (value_type $I64)))) + (rv_fcvtsl (FRM.RNE) v)) + +(rule 0 (lower (has_type $F64 (fcvt_from_sint v @ (value_type (fits_in_16 ty))))) + (rv_fcvtdl (FRM.RNE) (sext v))) + +(rule 1 (lower (has_type $F64 (fcvt_from_sint v @ (value_type $I32)))) + (rv_fcvtdw v)) + +(rule 1 (lower (has_type $F64 (fcvt_from_sint v @ (value_type $I64)))) + (rv_fcvtdl (FRM.RNE) v)) + +(rule 2 (lower (has_type (ty_supported_vec _) (fcvt_from_sint v @ (value_type from_ty)))) + (rv_vfcvt_f_x_v v (unmasked) from_ty)) + +;;;;; Rules for `fcvt_from_uint`;;;;;;;;; +(rule 0 (lower (has_type $F32 (fcvt_from_uint v @ (value_type (fits_in_16 ty))))) + (rv_fcvtslu (FRM.RNE) (zext v))) + +(rule 1 (lower (has_type $F32 (fcvt_from_uint v @ (value_type $I32)))) + (rv_fcvtswu (FRM.RNE) v)) + +(rule 1 (lower (has_type $F32 (fcvt_from_uint v @ (value_type $I64)))) + (rv_fcvtslu (FRM.RNE) v)) + +(rule 0 (lower (has_type $F64 (fcvt_from_uint v @ (value_type (fits_in_16 ty))))) + (rv_fcvtdlu (FRM.RNE) (zext v))) + +(rule 1 (lower (has_type $F64 (fcvt_from_uint v @ (value_type $I32)))) + (rv_fcvtdwu v)) + +(rule 1 (lower (has_type $F64 (fcvt_from_uint v @ (value_type $I64)))) + (rv_fcvtdlu (FRM.RNE) v)) + +(rule 2 (lower (has_type (ty_supported_vec _) (fcvt_from_uint v @ (value_type from_ty)))) + (rv_vfcvt_f_xu_v v (unmasked) from_ty)) + +;;;;; Rules for `symbol_value`;;;;;;;;; +(rule + (lower (symbol_value (symbol_value_data name _ offset))) + (load_ext_name name offset)) + +;;;;; Rules for `tls_value` ;;;;;;;;;;;;;; + +(rule (lower (has_type (tls_model (TlsModel.ElfGd)) (tls_value (symbol_value_data name _ _)))) + (elf_tls_get_addr name)) + +;;;;; Rules for `bitcast`;;;;;;;;; + +;; These rules should probably be handled in `gen_bitcast`, but it's convenient to have that return +;; a single register, instead of a `ValueRegs` +(rule 3 (lower (has_type $I128 (bitcast _ v @ (value_type (ty_supported_vec _))))) + (value_regs + (gen_extractlane $I64X2 v 0) + (gen_extractlane $I64X2 v 1))) + +;; Move the high half into a vector register, and then use vslide1up to move it up and +;; insert the lower half in one instruction. 
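+;; Roughly, writing lane 0 on the left (illustrative only):
+;;
+;;     after `vmv.s.x` of the high half:        [ hi, ?  ]
+;;     after `vslide1up.vx` with the low half:  [ lo, hi ]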
+(rule 2 (lower (has_type (ty_supported_vec _) (bitcast _ v @ (value_type $I128)))) + (let ((lo XReg (value_regs_get v 0)) + (hi XReg (value_regs_get v 1)) + (vstate VState (vstate_from_type $I64X2)) + (vec VReg (rv_vmv_sx hi vstate))) + (rv_vslide1up_vx vec vec lo (unmasked) vstate))) + +;; `gen_bitcast` below only works with single register values, so handle I128 +;; specially here. +(rule 1 (lower (has_type $I128 (bitcast _ v @ (value_type $I128)))) + v) + +(rule 0 (lower (has_type out_ty (bitcast _ v @ (value_type in_ty)))) + (gen_bitcast v in_ty out_ty)) + +;;;;; Rules for `ceil`;;;;;;;;; +(rule 0 (lower (has_type (ty_supported_float ty) (ceil x))) + (gen_float_round (FRM.RUP) x ty)) + +(rule 1 (lower (has_type (ty_supported_vec ty) (ceil x))) + (gen_vec_round x (FRM.RUP) ty)) + +;;;;; Rules for `floor`;;;;;;;;; +(rule 0 (lower (has_type (ty_supported_float ty) (floor x))) + (gen_float_round (FRM.RDN) x ty)) + +(rule 1 (lower (has_type (ty_supported_vec ty) (floor x))) + (gen_vec_round x (FRM.RDN) ty)) + +;;;;; Rules for `trunc`;;;;;;;;; +(rule 0 (lower (has_type (ty_supported_float ty) (trunc x))) + (gen_float_round (FRM.RTZ) x ty)) + +(rule 1 (lower (has_type (ty_supported_vec ty) (trunc x))) + (gen_vec_round x (FRM.RTZ) ty)) + +;;;;; Rules for `nearest`;;;;;;;;; +(rule 0 (lower (has_type (ty_supported_float ty) (nearest x))) + (gen_float_round (FRM.RNE) x ty)) + +(rule 1 (lower (has_type (ty_supported_vec ty) (nearest x))) + (gen_vec_round x (FRM.RNE) ty)) + + +;;;;; Rules for `select_spectre_guard`;;;;;;;;; + +;; SelectSpectreGuard is equivalent to Select, but we should not use a branch based +;; lowering for it. Instead we use a conditional move based lowering. +;; +;; We don't have cmov's in RISC-V either, but we can emulate those using bitwise +;; operations, which is what we do below. + +;; Base case: use `gen_bmask` to generate a 0 mask or -1 mask from the value of +;; `cmp`. This is then used with some bit twiddling to produce the final result. +(rule 0 (lower (has_type (fits_in_64 _) (select_spectre_guard cmp x y))) + (let ((mask XReg (gen_bmask cmp))) + (rv_or (rv_and mask x) (rv_andn y mask)))) +(rule 1 (lower (has_type $I128 (select_spectre_guard cmp x y))) + (let ((mask XReg (gen_bmask cmp))) + (value_regs + (rv_or (rv_and mask (value_regs_get x 0)) (rv_andn (value_regs_get y 0) mask)) + (rv_or (rv_and mask (value_regs_get x 1)) (rv_andn (value_regs_get y 1) mask))))) + +;; Special case when an argument is the constant zero as some ands and ors +;; can be folded away. +(rule 2 (lower (has_type (fits_in_64 _) (select_spectre_guard cmp (i64_from_iconst 0) y))) + (rv_andn y (gen_bmask cmp))) +(rule 3 (lower (has_type (fits_in_64 _) (select_spectre_guard cmp x (i64_from_iconst 0)))) + (rv_and x (gen_bmask cmp))) + +;;;;; Rules for `bmask`;;;;;;;;; +(rule + (lower (has_type oty (bmask x))) + (lower_bmask x oty)) + +;; N.B.: the Ret itself is generated by the ABI. 
+(rule (lower (return args)) + (lower_return args)) + +;;; Rules for `get_{frame,stack}_pointer` and `get_return_address` ;;;;;;;;;;;;; + +(rule (lower (get_frame_pointer)) + (gen_mov_from_preg (fp_reg))) + +(rule (lower (get_stack_pointer)) + (gen_mov_from_preg (sp_reg))) + +(rule (lower (get_return_address)) + (load_ra)) + +;;; Rules for `iabs` ;;;;;;;;;;;;; + +;; I64 and lower +;; Generate the following code: +;; sext.{b,h,w} a0, a0 +;; neg a1, a0 +;; max a0, a0, a1 +(rule 0 (lower (has_type (ty_int_ref_scalar_64 ty) (iabs x))) + (let ((extended XReg (sext x)) + (negated XReg (rv_neg extended))) + (gen_select_xreg (cmp_gt extended negated) extended negated))) + +;; For vectors we generate the same code, but with vector instructions +;; we can skip the sign extension, since the vector unit will only process +;; Element Sized chunks. +(rule 1 (lower (has_type (ty_supported_vec ty) (iabs x))) + (let ((negated VReg (rv_vneg_v x (unmasked) ty))) + (rv_vmax_vv x negated (unmasked) ty))) + +;;;; Rules for calls ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(rule (lower (call (func_ref_data sig_ref extname dist) inputs)) + (gen_call sig_ref extname dist inputs)) + +(rule (lower (call_indirect sig_ref val inputs)) + (gen_call_indirect sig_ref val inputs)) + +;;;; Rules for `return_call` and `return_call_indirect` ;;;;;;;;;;;;;;;;;;;;;;;; + +(rule (lower (return_call (func_ref_data sig_ref extname dist) args)) + (gen_return_call sig_ref extname dist args)) + +(rule (lower (return_call_indirect sig_ref callee args)) + (gen_return_call_indirect sig_ref callee args)) + + +;;;; Rules for `extractlane` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(rule (lower (extractlane x @ (value_type ty) (u8_from_uimm8 idx))) + (gen_extractlane ty x idx)) + +;;;; Rules for `insertlane` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; We can insert a lane by using a masked splat from an X register. +;; Build a mask that is only enabled in the lane we want to insert. +;; Then use a masked splat (vmerge) to insert the value. +(rule 0 (lower (insertlane vec @ (value_type (ty_supported_vec ty)) + val @ (value_type (ty_int _)) + (u8_from_uimm8 lane))) + (let ((mask VReg (gen_vec_mask (u64_shl 1 lane)))) + (rv_vmerge_vxm vec val mask ty))) + +;; Similar to above, but using the float variants of the instructions. +(rule 1 (lower (insertlane vec @ (value_type (ty_supported_vec ty)) + val @ (value_type (ty_supported_float _)) + (u8_from_uimm8 lane))) + (let ((mask VReg (gen_vec_mask (u64_shl 1 lane)))) + (rv_vfmerge_vfm vec val mask ty))) + +;; If we are inserting from an Imm5 const we can use the immediate +;; variant of vmerge. +(rule 2 (lower (insertlane vec @ (value_type (ty_supported_vec ty)) + (i64_from_iconst (imm5_from_i64 imm)) + (u8_from_uimm8 lane))) + (let ((mask VReg (gen_vec_mask (u64_shl 1 lane)))) + (rv_vmerge_vim vec imm mask ty))) + +;;;; Rules for `splat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(rule 0 (lower (has_type ty (splat n @ (value_type (ty_supported_float _))))) + (rv_vfmv_vf n ty)) + +(rule 1 (lower (has_type ty (splat n @ (value_type (ty_int_ref_scalar_64 _))))) + (rv_vmv_vx n ty)) + +(rule 2 (lower (has_type ty (splat (iconst (u64_from_imm64 (imm5_from_u64 imm)))))) + (rv_vmv_vi imm ty)) + +;; TODO: We can splat out more patterns by using for example a vmv.v.i i8x16 for +;; a i64x2 const with a compatible bit pattern. The AArch64 Backend does something +;; similar in its splat rules. 
+;; TODO: Look through bitcasts when splatting out registers. We can use +;; `vmv.v.x` in a `(splat.f32x4 (bitcast.f32 val))`. And vice versa for integers. + +;;;; Rules for `uadd_sat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(rule 0 (lower (has_type (ty_supported_vec ty) (uadd_sat x y))) + (rv_vsaddu_vv x y (unmasked) ty)) + +(rule 1 (lower (has_type (ty_supported_vec ty) (uadd_sat x (splat y)))) + (rv_vsaddu_vx x y (unmasked) ty)) + +(rule 2 (lower (has_type (ty_supported_vec ty) (uadd_sat (splat x) y))) + (rv_vsaddu_vx y x (unmasked) ty)) + +(rule 3 (lower (has_type (ty_supported_vec ty) (uadd_sat x y))) + (if-let y_imm (replicated_imm5 y)) + (rv_vsaddu_vi x y_imm (unmasked) ty)) + +(rule 4 (lower (has_type (ty_supported_vec ty) (uadd_sat x y))) + (if-let x_imm (replicated_imm5 x)) + (rv_vsaddu_vi y x_imm (unmasked) ty)) + +;;;; Rules for `sadd_sat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(rule 0 (lower (has_type (ty_supported_vec ty) (sadd_sat x y))) + (rv_vsadd_vv x y (unmasked) ty)) + +(rule 1 (lower (has_type (ty_supported_vec ty) (sadd_sat x (splat y)))) + (rv_vsadd_vx x y (unmasked) ty)) + +(rule 2 (lower (has_type (ty_supported_vec ty) (sadd_sat (splat x) y))) + (rv_vsadd_vx y x (unmasked) ty)) + +(rule 3 (lower (has_type (ty_supported_vec ty) (sadd_sat x y))) + (if-let y_imm (replicated_imm5 y)) + (rv_vsadd_vi x y_imm (unmasked) ty)) + +(rule 4 (lower (has_type (ty_supported_vec ty) (sadd_sat x y))) + (if-let x_imm (replicated_imm5 x)) + (rv_vsadd_vi y x_imm (unmasked) ty)) + +;;;; Rules for `usub_sat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(rule 0 (lower (has_type (ty_supported_vec ty) (usub_sat x y))) + (rv_vssubu_vv x y (unmasked) ty)) + +(rule 1 (lower (has_type (ty_supported_vec ty) (usub_sat x (splat y)))) + (rv_vssubu_vx x y (unmasked) ty)) + +;;;; Rules for `ssub_sat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(rule 0 (lower (has_type (ty_supported_vec ty) (ssub_sat x y))) + (rv_vssub_vv x y (unmasked) ty)) + +(rule 1 (lower (has_type (ty_supported_vec ty) (ssub_sat x (splat y)))) + (rv_vssub_vx x y (unmasked) ty)) + +;;;; Rules for `vall_true` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; Here we do a Vector Reduce operation. Get the unsigned minimum value of any +;; lane in the vector. The fixed input to the reduce operation is a 1. +;; This way, if any lane is 0, the result will be 0. Otherwise, the result will +;; be a 1. +;; The reduce operation leaves the result in the lowest lane, we then move it +;; into the destination X register. +(rule (lower (vall_true x @ (value_type (ty_supported_vec ty)))) + (if-let one (i8_to_imm5 1)) + ;; We don't need to broadcast the immediate into all lanes, only into lane 0. + ;; I did it this way since it uses one less instruction than with a vmv.s.x. + (let ((fixed VReg (rv_vmv_vi one ty)) + (min VReg (rv_vredminu_vs x fixed (unmasked) ty))) + (rv_vmv_xs min ty))) + + +;;;; Rules for `vany_true` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; Here we do a Vector Reduce operation. Get the unsigned maximum value of the +;; input vector register. Move the max to an X register, and do a `snez` on it +;; to ensure its either 1 or 0. 
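+;; Roughly, as a scalar model of the two reductions (illustrative only):
+;;
+;;     vall_true(x) = umin(1, x[0], ..., x[n-1])        ;; already 0 or 1
+;;     vany_true(x) = (umax(x[0], ..., x[n-1]) != 0)    ;; hence the `snez`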
+(rule (lower (vany_true x @ (value_type (ty_supported_vec ty)))) + (let ((max VReg (rv_vredmaxu_vs x x (unmasked) ty)) + (x_max XReg (rv_vmv_xs max ty))) + (rv_snez x_max))) + + +;;;; Rules for `vhigh_bits` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; To check if the MSB of a lane is set, we do a `vmslt` with zero, this sets +;; the mask bit to 1 if the value is negative (MSB 1) and 0 if not. We can then +;; just move that mask to an X Register. +;; +;; We must ensure that the move to the X register has a SEW with enough bits +;; to hold the full mask. Additionally, in some cases (e.g. i64x2) we are going +;; to read some tail bits. These are undefined, so we need to further mask them +;; off. +(rule (lower (vhigh_bits x @ (value_type (ty_supported_vec ty)))) + (let ((mask VReg (rv_vmslt_vx x (zero_reg) (unmasked) ty)) + ;; Here we only need I64X1, but emit an AVL of 2 since it + ;; saves one vector state change in the case of I64X2. + ;; + ;; TODO: For types that have more lanes than element bits, we can + ;; use the original type as a VState and avoid a state change. + (x_mask XReg (rv_vmv_xs mask (vstate_from_type $I64X2)))) + (gen_andi x_mask (ty_lane_mask ty)))) + +;;;; Rules for `swizzle` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(rule 0 (lower (has_type (ty_supported_vec ty) (swizzle x y))) + (rv_vrgather_vv x y (unmasked) ty)) + +(rule 1 (lower (has_type (ty_supported_vec ty) (swizzle x (splat y)))) + (rv_vrgather_vx x y (unmasked) ty)) + +(rule 2 (lower (has_type (ty_supported_vec ty) (swizzle x y))) + (if-let y_imm (replicated_uimm5 y)) + (rv_vrgather_vi x y_imm (unmasked) ty)) + +;;;; Rules for `shuffle` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; Use a vrgather to load all 0-15 lanes from x. And then modify the mask to load all +;; 16-31 lanes from y. Finally, use a vor to combine the two vectors. +;; +;; vrgather will insert a 0 for lanes that are out of bounds, so we can let it load +;; negative and out of bounds indexes. +(rule (lower (has_type (ty_supported_vec ty @ $I8X16) (shuffle x y (vconst_from_immediate mask)))) + (if-let neg16 (i8_to_imm5 -16)) + (let ((x_mask VReg (gen_constant ty mask)) + (x_lanes VReg (rv_vrgather_vv x x_mask (unmasked) ty)) + (y_mask VReg (rv_vadd_vi x_mask neg16 (unmasked) ty)) + (y_lanes VReg (rv_vrgather_vv y y_mask (unmasked) ty))) + (rv_vor_vv x_lanes y_lanes (unmasked) ty))) + +;;;; Rules for `swiden_high` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; Slide down half the vector, and do a signed extension. +(rule 0 (lower (has_type (ty_supported_vec out_ty) (swiden_high x @ (value_type in_ty)))) + (rv_vsext_vf2 (gen_slidedown_half in_ty x) (unmasked) out_ty)) + +(rule 1 (lower (has_type (ty_supported_vec out_ty) (swiden_high (swiden_high x @ (value_type in_ty))))) + (if-let (uimm5_from_u64 amt) (u64_sub (ty_lane_count in_ty) (ty_lane_count out_ty))) + (rv_vsext_vf4 (rv_vslidedown_vi x amt (unmasked) in_ty) (unmasked) out_ty)) + +(rule 2 (lower (has_type (ty_supported_vec out_ty) (swiden_high (swiden_high (swiden_high x @ (value_type in_ty)))))) + (if-let (uimm5_from_u64 amt) (u64_sub (ty_lane_count in_ty) (ty_lane_count out_ty))) + (rv_vsext_vf8 (rv_vslidedown_vi x amt (unmasked) in_ty) (unmasked) out_ty)) + +;;;; Rules for `uwiden_high` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; Slide down half the vector, and do a zero extension. 
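+;; For example (illustrative only): `uwiden_high` from `i16x8` to `i32x4` slides
+;; the source down by 4 lanes, so lanes 4..7 land in lanes 0..3, and then
+;; `vzext.vf2` doubles each lane from 16 to 32 bits.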
+(rule 0 (lower (has_type (ty_supported_vec out_ty) (uwiden_high x @ (value_type in_ty)))) + (rv_vzext_vf2 (gen_slidedown_half in_ty x) (unmasked) out_ty)) + +(rule 1 (lower (has_type (ty_supported_vec out_ty) (uwiden_high (uwiden_high x @ (value_type in_ty))))) + (if-let (uimm5_from_u64 amt) (u64_sub (ty_lane_count in_ty) (ty_lane_count out_ty))) + (rv_vzext_vf4 (rv_vslidedown_vi x amt (unmasked) in_ty) (unmasked) out_ty)) + +(rule 2 (lower (has_type (ty_supported_vec out_ty) (uwiden_high (uwiden_high (uwiden_high x @ (value_type in_ty)))))) + (if-let (uimm5_from_u64 amt) (u64_sub (ty_lane_count in_ty) (ty_lane_count out_ty))) + (rv_vzext_vf8 (rv_vslidedown_vi x amt (unmasked) in_ty) (unmasked) out_ty)) + +;;;; Rules for `swiden_low` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(rule 0 (lower (has_type (ty_supported_vec out_ty) (swiden_low x))) + (rv_vsext_vf2 x (unmasked) out_ty)) + +(rule 1 (lower (has_type (ty_supported_vec out_ty) (swiden_low (swiden_low x)))) + (rv_vsext_vf4 x (unmasked) out_ty)) + +(rule 2 (lower (has_type (ty_supported_vec out_ty) (swiden_low (swiden_low (swiden_low x))))) + (rv_vsext_vf8 x (unmasked) out_ty)) + +;;;; Rules for `uwiden_low` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(rule 0 (lower (has_type (ty_supported_vec out_ty) (uwiden_low x))) + (rv_vzext_vf2 x (unmasked) out_ty)) + +(rule 1 (lower (has_type (ty_supported_vec out_ty) (uwiden_low (uwiden_low x)))) + (rv_vzext_vf4 x (unmasked) out_ty)) + +(rule 2 (lower (has_type (ty_supported_vec out_ty) (uwiden_low (uwiden_low (uwiden_low x))))) + (rv_vzext_vf8 x (unmasked) out_ty)) + +;;;; Rules for `iadd_pairwise` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; We don't have a dedicated instruction for this, rearrange the register elements +;; and use a vadd. +;; +;; We do this by building two masks, one for the even elements and one for the odd +;; elements. Using vcompress we can extract the elements and group them together. +;; +;; This is likely not the optimal way of doing this. LLVM does this using a bunch +;; of vrgathers (See: https://godbolt.org/z/jq8Wj8WG4), that doesn't seem to be +;; too much better than this. +;; +;; However V8 does something better. They use 2 vcompresses using LMUL2, that means +;; that they can do the whole thing in 3 instructions (2 vcompress + vadd). We don't +;; support LMUL > 1, so we can't do that. +(rule (lower (has_type (ty_supported_vec ty) (iadd_pairwise x y))) + (if-let half_size (u64_to_uimm5 (u64_udiv (ty_lane_count ty) 2))) + (let ((odd_mask VReg (gen_vec_mask 0x5555555555555555)) + (lhs_lo VReg (rv_vcompress_vm x odd_mask ty)) + (lhs_hi VReg (rv_vcompress_vm y odd_mask ty)) + (lhs VReg (rv_vslideup_vvi lhs_lo lhs_hi half_size (unmasked) ty)) + + (even_mask VReg (gen_vec_mask 0xAAAAAAAAAAAAAAAA)) + (rhs_lo VReg (rv_vcompress_vm x even_mask ty)) + (rhs_hi VReg (rv_vcompress_vm y even_mask ty)) + (rhs VReg (rv_vslideup_vvi rhs_lo rhs_hi half_size (unmasked) ty))) + (rv_vadd_vv lhs rhs (unmasked) ty))) + +;;;; Rules for `avg_round` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; `avg_round` computes the unsigned average with rounding: a := (x + y + 1) // 2 +;; +;; See Section "2–5 Average of Two Integers" of the Hacker's Delight book +;; +;; The floor average of two integers without overflow can be computed as: +;; t = (x & y) + ((x ^ y) >> 1) +;; +;; The right shift should be a logical shift if the integers are unsigned. +;; +;; We are however interested in the ceiling average (x + y + 1). 
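+;; (Worked example: for x = 5, y = 6 the floor form gives
+;; (5 & 6) + ((5 ^ 6) >> 1) = 4 + 1 = 5, whereas the rounded average we want is
+;; (5 + 6 + 1) >> 1 = 6; rounding the shifted term up instead gives 4 + 2 = 6.)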
For that +;; we use a special rounding mode in the right shift instruction. +;; +;; For the right shift instruction we use `vssrl` which is a Scaling Shift +;; Right Logical instruction using the `vxrm` fixed-point rounding mode. The +;; default rounding mode is `rnu` (round-to-nearest-up (add +0.5 LSB)). +;; Which is coincidentally the rounding mode we want for `avg_round`. +(rule (lower (has_type (ty_supported_vec ty) (avg_round x y))) + (if-let one (u64_to_uimm5 1)) + (let ((lhs VReg (rv_vand_vv x y (unmasked) ty)) + (xor VReg (rv_vxor_vv x y (unmasked) ty)) + (rhs VReg (rv_vssrl_vi xor one (unmasked) ty))) + (rv_vadd_vv lhs rhs (unmasked) ty))) + +;;;; Rules for `scalar_to_vector` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(rule 0 (lower (has_type (ty_supported_vec ty) (scalar_to_vector x))) + (if (ty_vector_float ty)) + (let ((zero VReg (rv_vmv_vx (zero_reg) ty)) + (elem VReg (rv_vfmv_sf x ty)) + (mask VReg (gen_vec_mask 1))) + (rv_vmerge_vvm zero elem mask ty))) + +(rule 1 (lower (has_type (ty_supported_vec ty) (scalar_to_vector x))) + (if (ty_vector_not_float ty)) + (let ((zero VReg (rv_vmv_vx (zero_reg) ty)) + (mask VReg (gen_vec_mask 1))) + (rv_vmerge_vxm zero x mask ty))) + +(rule 2 (lower (has_type (ty_supported_vec ty) (scalar_to_vector (imm5_from_value x)))) + (let ((zero VReg (rv_vmv_vx (zero_reg) ty)) + (mask VReg (gen_vec_mask 1))) + (rv_vmerge_vim zero x mask ty))) + +;;;; Rules for `sqmul_round_sat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(rule 0 (lower (has_type (ty_supported_vec ty) (sqmul_round_sat x y))) + (rv_vsmul_vv x y (unmasked) ty)) + +(rule 1 (lower (has_type (ty_supported_vec ty) (sqmul_round_sat x (splat y)))) + (rv_vsmul_vx x y (unmasked) ty)) + +(rule 2 (lower (has_type (ty_supported_vec ty) (sqmul_round_sat (splat x) y))) + (rv_vsmul_vx y x (unmasked) ty)) + +;;;; Rules for `snarrow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(rule (lower (has_type (ty_supported_vec out_ty) (snarrow x @ (value_type in_ty) y))) + (if-let lane_diff (u64_to_uimm5 (u64_udiv (ty_lane_count out_ty) 2))) + (if-let zero (u64_to_uimm5 0)) + (let ((x_clip VReg (rv_vnclip_wi x zero (unmasked) (vstate_mf2 (ty_half_lanes out_ty)))) + (y_clip VReg (rv_vnclip_wi y zero (unmasked) (vstate_mf2 (ty_half_lanes out_ty))))) + (rv_vslideup_vvi x_clip y_clip lane_diff (unmasked) out_ty))) + +;;;; Rules for `uunarrow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(rule (lower (has_type (ty_supported_vec out_ty) (uunarrow x @ (value_type in_ty) y))) + (if-let lane_diff (u64_to_uimm5 (u64_udiv (ty_lane_count out_ty) 2))) + (if-let zero (u64_to_uimm5 0)) + (let ((x_clip VReg (rv_vnclipu_wi x zero (unmasked) (vstate_mf2 (ty_half_lanes out_ty)))) + (y_clip VReg (rv_vnclipu_wi y zero (unmasked) (vstate_mf2 (ty_half_lanes out_ty))))) + (rv_vslideup_vvi x_clip y_clip lane_diff (unmasked) out_ty))) + +;;;; Rules for `unarrow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; We don't have a instruction that saturates a signed source into an unsigned destination. +;; To correct for this we just remove negative values using `vmax` and then use the normal +;; unsigned to unsigned narrowing instruction. 
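+;; Per lane this is roughly (illustrative, e.g. narrowing i16 lanes to u8 lanes):
+;;
+;;     unarrow(x) = umin(smax(x, 0), 255)
+;;
+;; where the `smax` with zero is the `vmax` below, and the upper clamp plus the
+;; truncation is performed by the saturating `vnclipu`.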
+ +(rule (lower (has_type (ty_supported_vec out_ty) (unarrow x @ (value_type in_ty) y))) + (if-let lane_diff (u64_to_uimm5 (u64_udiv (ty_lane_count out_ty) 2))) + (if-let zero (u64_to_uimm5 0)) + (let ((x_pos VReg (rv_vmax_vx x (zero_reg) (unmasked) in_ty)) + (y_pos VReg (rv_vmax_vx y (zero_reg) (unmasked) in_ty)) + (x_clip VReg (rv_vnclipu_wi x_pos zero (unmasked) (vstate_mf2 (ty_half_lanes out_ty)))) + (y_clip VReg (rv_vnclipu_wi y_pos zero (unmasked) (vstate_mf2 (ty_half_lanes out_ty))))) + (rv_vslideup_vvi x_clip y_clip lane_diff (unmasked) out_ty))) diff --git a/hbcb/src/lower.rs b/hbcb/src/lower.rs new file mode 100644 index 0000000..6e12183 --- /dev/null +++ b/hbcb/src/lower.rs @@ -0,0 +1,36 @@ +//! Lowering rules for Riscv64. +use { + crate::{inst::*, Riscv64Backend}, + cranelift_codegen::{ + ir::Inst as IRInst, + machinst::{lower::*, *}, + }, +}; +pub mod isle; + +//============================================================================= +// Lowering-backend trait implementation. + +impl LowerBackend for Riscv64Backend { + type FactFlowState = (); + type MInst = Inst; + + fn lower(&self, ctx: &mut Lower, ir_inst: IRInst) -> Option { + isle::lower(ctx, self, ir_inst) + } + + fn lower_branch( + &self, + ctx: &mut Lower, + ir_inst: IRInst, + targets: &[MachLabel], + ) -> Option<()> { + isle::lower_branch(ctx, self, ir_inst, targets) + } + + fn maybe_pinned_reg(&self) -> Option { + // pinned register is a register that you want put anything in it. + // right now riscv64 not support this feature. + None + } +} diff --git a/hbcb/src/lower/isle.rs b/hbcb/src/lower/isle.rs new file mode 100644 index 0000000..8075156 --- /dev/null +++ b/hbcb/src/lower/isle.rs @@ -0,0 +1,730 @@ +//! ISLE integration glue code for riscv64 lowering. + +// Pull in the ISLE generated code. +#[allow(unused)] +pub mod generated_code; +// Types that the generated ISLE code uses via `use super::*`. +use { + self::generated_code::{FpuOPWidth, VecAluOpRR, VecLmul}, + crate::{inst::*, Riscv64Backend}, + cranelift_codegen::{ + abi::Riscv64ABICallSite, + ir::{ + immediates::*, types::*, AtomicRmwOp, BlockCall, ExternalName, Inst, InstructionData, + MemFlags, Opcode, TrapCode, Value, ValueList, + }, + isa::{self}, + lower::args::{FReg, VReg, WritableFReg, WritableVReg, WritableXReg, XReg}, + machinst::{ + isle::*, ArgPair, CallInfo, InstOutput, IsTailCall, MachInst, Reg, VCodeConstant, + VCodeConstantData, + }, + }, + generated_code::MInst, + regalloc2::PReg, + std::{boxed::Box, vec::Vec}, +}; + +type BoxCallInfo = Box>; +type BoxCallIndInfo = Box>; +type BoxReturnCallInfo = Box>; +type BoxReturnCallIndInfo = Box>; +type BoxExternalName = Box; +type VecMachLabel = Vec; +type VecArgPair = Vec; + +pub(crate) struct RV64IsleContext<'a, 'b, I, B> +where + I: VCodeInst, + B: LowerBackend, +{ + pub lower_ctx: &'a mut Lower<'b, I>, + pub backend: &'a B, + /// Precalucated value for the minimum vector register size. Will be 0 if + /// vectors are not supported. 
+ min_vec_reg_size: u64, +} + +impl<'a, 'b> RV64IsleContext<'a, 'b, MInst, Riscv64Backend> { + fn new(lower_ctx: &'a mut Lower<'b, MInst>, backend: &'a Riscv64Backend) -> Self { + Self { lower_ctx, backend, min_vec_reg_size: backend.isa_flags.min_vec_reg_size() } + } +} + +impl generated_code::Context for RV64IsleContext<'_, '_, MInst, Riscv64Backend> { + isle_lower_prelude_methods!(); + + isle_prelude_caller_methods!(Riscv64MachineDeps, Riscv64ABICallSite); + + fn gen_return_call( + &mut self, + callee_sig: SigRef, + callee: ExternalName, + distance: RelocDistance, + args: ValueSlice, + ) -> InstOutput { + let caller_conv = isa::CallConv::Tail; + debug_assert_eq!( + self.lower_ctx.abi().call_conv(self.lower_ctx.sigs()), + caller_conv, + "Can only do `return_call`s from within a `tail` calling convention function" + ); + + let call_site = Riscv64ABICallSite::from_func( + self.lower_ctx.sigs(), + callee_sig, + &callee, + IsTailCall::Yes, + distance, + caller_conv, + self.backend.flags().clone(), + ); + call_site.emit_return_call(self.lower_ctx, args); + + InstOutput::new() + } + + fn gen_return_call_indirect( + &mut self, + callee_sig: SigRef, + callee: Value, + args: ValueSlice, + ) -> InstOutput { + let caller_conv = isa::CallConv::Tail; + debug_assert_eq!( + self.lower_ctx.abi().call_conv(self.lower_ctx.sigs()), + caller_conv, + "Can only do `return_call`s from within a `tail` calling convention function" + ); + + let callee = self.put_in_reg(callee); + + let call_site = Riscv64ABICallSite::from_ptr( + self.lower_ctx.sigs(), + callee_sig, + callee, + IsTailCall::Yes, + caller_conv, + self.backend.flags().clone(), + ); + call_site.emit_return_call(self.lower_ctx, args); + + InstOutput::new() + } + + fn fpu_op_width_from_ty(&mut self, ty: Type) -> FpuOPWidth { + match ty { + F16 => FpuOPWidth::H, + F32 => FpuOPWidth::S, + F64 => FpuOPWidth::D, + F128 => FpuOPWidth::Q, + _ => unimplemented!("Unimplemented FPU Op Width: {ty}"), + } + } + + fn vreg_new(&mut self, r: Reg) -> VReg { + VReg::new(r).unwrap() + } + + fn writable_vreg_new(&mut self, r: WritableReg) -> WritableVReg { + r.map(|wr| VReg::new(wr).unwrap()) + } + + fn writable_vreg_to_vreg(&mut self, arg0: WritableVReg) -> VReg { + arg0.to_reg() + } + + fn writable_vreg_to_writable_reg(&mut self, arg0: WritableVReg) -> WritableReg { + arg0.map(|vr| vr.to_reg()) + } + + fn vreg_to_reg(&mut self, arg0: VReg) -> Reg { + *arg0 + } + + fn xreg_new(&mut self, r: Reg) -> XReg { + XReg::new(r).unwrap() + } + + fn writable_xreg_new(&mut self, r: WritableReg) -> WritableXReg { + r.map(|wr| XReg::new(wr).unwrap()) + } + + fn writable_xreg_to_xreg(&mut self, arg0: WritableXReg) -> XReg { + arg0.to_reg() + } + + fn writable_xreg_to_writable_reg(&mut self, arg0: WritableXReg) -> WritableReg { + arg0.map(|xr| xr.to_reg()) + } + + fn xreg_to_reg(&mut self, arg0: XReg) -> Reg { + *arg0 + } + + fn freg_new(&mut self, r: Reg) -> FReg { + FReg::new(r).unwrap() + } + + fn writable_freg_new(&mut self, r: WritableReg) -> WritableFReg { + r.map(|wr| FReg::new(wr).unwrap()) + } + + fn writable_freg_to_freg(&mut self, arg0: WritableFReg) -> FReg { + arg0.to_reg() + } + + fn writable_freg_to_writable_reg(&mut self, arg0: WritableFReg) -> WritableReg { + arg0.map(|fr| fr.to_reg()) + } + + fn freg_to_reg(&mut self, arg0: FReg) -> Reg { + *arg0 + } + + fn min_vec_reg_size(&mut self) -> u64 { + self.min_vec_reg_size + } + + #[inline] + fn ty_vec_fits_in_register(&mut self, ty: Type) -> Option { + if ty.is_vector() && (ty.bits() as u64) <= self.min_vec_reg_size() { + 
Some(ty) + } else { + None + } + } + + fn ty_supported(&mut self, ty: Type) -> Option { + let lane_type = ty.lane_type(); + let supported = match ty { + // Scalar integers are always supported + ty if ty.is_int() => true, + // Floating point types depend on certain extensions + F16 => self.backend.isa_flags.has_zfh(), + // F32 depends on the F extension + F32 => self.backend.isa_flags.has_f(), + // F64 depends on the D extension + F64 => self.backend.isa_flags.has_d(), + + // The base vector extension supports all integer types, up to 64 bits + // as long as they fit in a register + ty if self.ty_vec_fits_in_register(ty).is_some() + && lane_type.is_int() + && lane_type.bits() <= 64 => + { + true + } + + // If the vector type has floating point lanes then the spec states: + // + // Vector instructions where any floating-point vector operand’s EEW is not a + // supported floating-point type width (which includes when FLEN < SEW) are reserved. + // + // So we also have to check if we support the scalar version of the type. + ty if self.ty_vec_fits_in_register(ty).is_some() + && lane_type.is_float() + && self.ty_supported(lane_type).is_some() + // Additionally the base V spec only supports 32 and 64 bit floating point types. + && (lane_type.bits() == 32 || lane_type.bits() == 64) => + { + true + } + + // Otherwise do not match + _ => false, + }; + + if supported { + Some(ty) + } else { + None + } + } + + fn ty_supported_float(&mut self, ty: Type) -> Option { + self.ty_supported(ty).filter(|ty| ty.is_float()) + } + + fn ty_supported_vec(&mut self, ty: Type) -> Option { + self.ty_supported(ty).filter(|ty| ty.is_vector()) + } + + fn load_ra(&mut self) -> Reg { + if self.backend.flags.preserve_frame_pointers() { + let tmp = self.temp_writable_reg(I64); + self.emit(&MInst::Load { + rd: tmp, + op: LoadOP::Ld, + flags: MemFlags::trusted(), + from: AMode::FPOffset(8), + }); + tmp.to_reg() + } else { + link_reg() + } + } + + fn label_to_br_target(&mut self, label: MachLabel) -> CondBrTarget { + CondBrTarget::Label(label) + } + + fn imm12_and(&mut self, imm: Imm12, x: u64) -> Imm12 { + Imm12::from_i16(imm.as_i16() & (x as i16)) + } + + fn fli_constant_from_u64(&mut self, ty: Type, imm: u64) -> Option { + FliConstant::maybe_from_u64(ty, imm) + } + + fn fli_constant_from_negated_u64(&mut self, ty: Type, imm: u64) -> Option { + let negated_imm = match ty { + F64 => imm ^ 0x8000000000000000, + F32 => imm ^ 0x80000000, + _ => unimplemented!(), + }; + + FliConstant::maybe_from_u64(ty, negated_imm) + } + + fn i64_generate_imm(&mut self, imm: i64) -> Option<(Imm20, Imm12)> { + MInst::generate_imm(imm as u64) + } + + fn i64_shift_for_lui(&mut self, imm: i64) -> Option<(u64, Imm12)> { + let trailing = imm.trailing_zeros(); + if trailing < 12 { + return None; + } + + let shift = Imm12::from_i16(trailing as i16 - 12); + let base = (imm as u64) >> trailing; + Some((base, shift)) + } + + fn i64_shift(&mut self, imm: i64) -> Option<(i64, Imm12)> { + let trailing = imm.trailing_zeros(); + // We can do without this condition but in this case there is no need to go further + if trailing == 0 { + return None; + } + + let shift = Imm12::from_i16(trailing as i16); + let base = imm >> trailing; + Some((base, shift)) + } + + #[inline] + fn emit(&mut self, arg0: &MInst) -> Unit { + self.lower_ctx.emit(arg0.clone()); + } + + #[inline] + fn imm12_from_u64(&mut self, arg0: u64) -> Option { + Imm12::maybe_from_u64(arg0) + } + + #[inline] + fn imm12_from_i64(&mut self, arg0: i64) -> Option { + Imm12::maybe_from_i64(arg0) + } + + 
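+    // Note: RISC-V I-type immediates are 12 bits, sign extended, so the
+    // `imm12_*` helpers above only match values in -2048..=2047 (assuming the
+    // usual behavior of `Imm12::maybe_from_u64`: 2047 matches, 2048 does not).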
#[inline] + fn imm12_is_zero(&mut self, imm: Imm12) -> Option<()> { + if imm.as_i16() == 0 { + Some(()) + } else { + None + } + } + + #[inline] + fn imm20_from_u64(&mut self, arg0: u64) -> Option { + Imm20::maybe_from_u64(arg0) + } + + #[inline] + fn imm20_from_i64(&mut self, arg0: i64) -> Option { + Imm20::maybe_from_i64(arg0) + } + + #[inline] + fn imm20_is_zero(&mut self, imm: Imm20) -> Option<()> { + if imm.as_i32() == 0 { + Some(()) + } else { + None + } + } + + #[inline] + fn imm5_from_u64(&mut self, arg0: u64) -> Option { + Imm5::maybe_from_i8(i8::try_from(arg0 as i64).ok()?) + } + + #[inline] + fn imm5_from_i64(&mut self, arg0: i64) -> Option { + Imm5::maybe_from_i8(i8::try_from(arg0).ok()?) + } + + #[inline] + fn i8_to_imm5(&mut self, arg0: i8) -> Option { + Imm5::maybe_from_i8(arg0) + } + + #[inline] + fn uimm5_bitcast_to_imm5(&mut self, arg0: UImm5) -> Imm5 { + Imm5::from_bits(arg0.bits() as u8) + } + + #[inline] + fn uimm5_from_u8(&mut self, arg0: u8) -> Option { + UImm5::maybe_from_u8(arg0) + } + + #[inline] + fn uimm5_from_u64(&mut self, arg0: u64) -> Option { + arg0.try_into().ok().and_then(UImm5::maybe_from_u8) + } + + #[inline] + fn writable_zero_reg(&mut self) -> WritableReg { + writable_zero_reg() + } + + #[inline] + fn zero_reg(&mut self) -> XReg { + XReg::new(zero_reg()).unwrap() + } + + fn is_non_zero_reg(&mut self, reg: XReg) -> Option<()> { + if reg != self.zero_reg() { + Some(()) + } else { + None + } + } + + fn is_zero_reg(&mut self, reg: XReg) -> Option<()> { + if reg == self.zero_reg() { + Some(()) + } else { + None + } + } + + #[inline] + fn imm_from_bits(&mut self, val: u64) -> Imm12 { + Imm12::maybe_from_u64(val).unwrap() + } + + #[inline] + fn imm_from_neg_bits(&mut self, val: i64) -> Imm12 { + Imm12::maybe_from_i64(val).unwrap() + } + + fn frm_bits(&mut self, frm: &FRM) -> UImm5 { + UImm5::maybe_from_u8(frm.bits()).unwrap() + } + + fn u8_as_i32(&mut self, x: u8) -> i32 { + x as i32 + } + + fn imm12_const(&mut self, val: i32) -> Imm12 { + if let Some(res) = Imm12::maybe_from_i64(val as i64) { + res + } else { + panic!("Unable to make an Imm12 value from {val}") + } + } + + fn imm12_const_add(&mut self, val: i32, add: i32) -> Imm12 { + Imm12::maybe_from_i64((val + add) as i64).unwrap() + } + + fn imm12_add(&mut self, val: Imm12, add: i32) -> Option { + Imm12::maybe_from_i64((i32::from(val.as_i16()) + add).into()) + } + + // + fn gen_shamt(&mut self, ty: Type, shamt: XReg) -> ValueRegs { + let ty_bits = if ty.bits() > 64 { 64 } else { ty.bits() }; + let ty_bits = i16::try_from(ty_bits).unwrap(); + let shamt = { + let tmp = self.temp_writable_reg(I64); + self.emit(&MInst::AluRRImm12 { + alu_op: AluOPRRI::Andi, + rd: tmp, + rs: shamt.to_reg(), + imm12: Imm12::from_i16(ty_bits - 1), + }); + tmp.to_reg() + }; + let len_sub_shamt = { + let tmp = self.temp_writable_reg(I64); + self.emit(&MInst::load_imm12(tmp, Imm12::from_i16(ty_bits))); + let len_sub_shamt = self.temp_writable_reg(I64); + self.emit(&MInst::AluRRR { + alu_op: AluOPRRR::Sub, + rd: len_sub_shamt, + rs1: tmp.to_reg(), + rs2: shamt, + }); + len_sub_shamt.to_reg() + }; + ValueRegs::two(shamt, len_sub_shamt) + } + + fn has_v(&mut self) -> bool { + self.backend.isa_flags.has_v() + } + + fn has_m(&mut self) -> bool { + self.backend.isa_flags.has_m() + } + + fn has_zfa(&mut self) -> bool { + self.backend.isa_flags.has_zfa() + } + + fn has_zfh(&mut self) -> bool { + self.backend.isa_flags.has_zfh() + } + + fn has_zbkb(&mut self) -> bool { + self.backend.isa_flags.has_zbkb() + } + + fn has_zba(&mut self) -> 
bool { + self.backend.isa_flags.has_zba() + } + + fn has_zbb(&mut self) -> bool { + self.backend.isa_flags.has_zbb() + } + + fn has_zbc(&mut self) -> bool { + self.backend.isa_flags.has_zbc() + } + + fn has_zbs(&mut self) -> bool { + self.backend.isa_flags.has_zbs() + } + + fn has_zicond(&mut self) -> bool { + self.backend.isa_flags.has_zicond() + } + + fn gen_reg_offset_amode(&mut self, base: Reg, offset: i64) -> AMode { + AMode::RegOffset(base, offset) + } + + fn gen_sp_offset_amode(&mut self, offset: i64) -> AMode { + AMode::SPOffset(offset) + } + + fn gen_fp_offset_amode(&mut self, offset: i64) -> AMode { + AMode::FPOffset(offset) + } + + fn gen_stack_slot_amode(&mut self, ss: StackSlot, offset: i64) -> AMode { + // Offset from beginning of stackslot area. + let stack_off = self.lower_ctx.abi().sized_stackslot_offsets()[ss] as i64; + let sp_off: i64 = stack_off + offset; + AMode::SlotOffset(sp_off) + } + + fn gen_const_amode(&mut self, c: VCodeConstant) -> AMode { + AMode::Const(c) + } + + fn valid_atomic_transaction(&mut self, ty: Type) -> Option { + if ty.is_int() && ty.bits() <= 64 { + Some(ty) + } else { + None + } + } + + fn is_atomic_rmw_max_etc(&mut self, op: &AtomicRmwOp) -> Option<(AtomicRmwOp, bool)> { + let op = *op; + match op { + crate::ir::AtomicRmwOp::Umin => Some((op, false)), + crate::ir::AtomicRmwOp::Umax => Some((op, false)), + crate::ir::AtomicRmwOp::Smin => Some((op, true)), + crate::ir::AtomicRmwOp::Smax => Some((op, true)), + _ => None, + } + } + + fn sinkable_inst(&mut self, val: Value) -> Option { + self.is_sinkable_inst(val) + } + + fn load_op(&mut self, ty: Type) -> LoadOP { + LoadOP::from_type(ty) + } + + fn store_op(&mut self, ty: Type) -> StoreOP { + StoreOP::from_type(ty) + } + + fn load_ext_name(&mut self, name: ExternalName, offset: i64) -> Reg { + let tmp = self.temp_writable_reg(I64); + self.emit(&MInst::LoadExtName { rd: tmp, name: Box::new(name), offset }); + tmp.to_reg() + } + + fn gen_stack_addr(&mut self, slot: StackSlot, offset: Offset32) -> Reg { + let result = self.temp_writable_reg(I64); + let i = self.lower_ctx.abi().sized_stackslot_addr(slot, i64::from(offset) as u32, result); + self.emit(&i); + result.to_reg() + } + + fn atomic_amo(&mut self) -> AMO { + AMO::SeqCst + } + + fn lower_br_table(&mut self, index: Reg, targets: &[MachLabel]) -> Unit { + let tmp1 = self.temp_writable_reg(I64); + let tmp2 = self.temp_writable_reg(I64); + self.emit(&MInst::BrTable { index, tmp1, tmp2, targets: targets.to_vec() }); + } + + fn fp_reg(&mut self) -> PReg { + px_reg(8) + } + + fn sp_reg(&mut self) -> PReg { + px_reg(2) + } + + #[inline] + fn int_compare(&mut self, kind: &IntCC, rs1: XReg, rs2: XReg) -> IntegerCompare { + IntegerCompare { kind: *kind, rs1: rs1.to_reg(), rs2: rs2.to_reg() } + } + + #[inline] + fn int_compare_decompose(&mut self, cmp: IntegerCompare) -> (IntCC, XReg, XReg) { + (cmp.kind, self.xreg_new(cmp.rs1), self.xreg_new(cmp.rs2)) + } + + #[inline] + fn vstate_from_type(&mut self, ty: Type) -> VState { + VState::from_type(ty) + } + + #[inline] + fn vstate_mf2(&mut self, vs: VState) -> VState { + VState { vtype: VType { lmul: VecLmul::LmulF2, ..vs.vtype }, ..vs } + } + + fn vec_alu_rr_dst_type(&mut self, op: &VecAluOpRR) -> Type { + MInst::canonical_type_for_rc(op.dst_regclass()) + } + + fn bclr_imm(&mut self, ty: Type, i: u64) -> Option { + // Only consider those bits in the immediate which are up to the width + // of `ty`. 
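+        // For example (illustrative): with `ty` = I32 and `i` = 0xFFFF_FFFB
+        // (all low 32 bits set except bit 2), `neg` is 0x4, which has exactly
+        // one bit set, so this returns the bit index 2 as an `Imm12`.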
+ let neg = !i & (u64::MAX >> (64 - ty.bits())); + if neg.count_ones() != 1 { + return None; + } + Imm12::maybe_from_u64(neg.trailing_zeros().into()) + } + + fn binvi_imm(&mut self, i: u64) -> Option { + if i.count_ones() != 1 { + return None; + } + Imm12::maybe_from_u64(i.trailing_zeros().into()) + } + + fn bseti_imm(&mut self, i: u64) -> Option { + self.binvi_imm(i) + } + + fn fcvt_smin_bound(&mut self, float: Type, int: Type, saturating: bool) -> u64 { + match (int, float) { + // Saturating cases for larger integers are handled using the + // `fcvt.{w,d}.{s,d}` instruction directly, that automatically + // saturates up/down to the correct limit. + // + // NB: i32/i64 don't use this function because the native RISC-V + // instruction does everything we already need, so only cases for + // i8/i16 are listed here. + (I8, F32) if saturating => f32::from(i8::MIN).to_bits().into(), + (I8, F64) if saturating => f64::from(i8::MIN).to_bits(), + (I16, F32) if saturating => f32::from(i16::MIN).to_bits().into(), + (I16, F64) if saturating => f64::from(i16::MIN).to_bits(), + + (_, F32) if !saturating => f32_cvt_to_int_bounds(true, int.bits()).0.to_bits().into(), + (_, F64) if !saturating => f64_cvt_to_int_bounds(true, int.bits()).0.to_bits(), + _ => unimplemented!(), + } + } + + fn fcvt_smax_bound(&mut self, float: Type, int: Type, saturating: bool) -> u64 { + // NB: see `fcvt_smin_bound` for some more comments + match (int, float) { + (I8, F32) if saturating => f32::from(i8::MAX).to_bits().into(), + (I8, F64) if saturating => f64::from(i8::MAX).to_bits(), + (I16, F32) if saturating => f32::from(i16::MAX).to_bits().into(), + (I16, F64) if saturating => f64::from(i16::MAX).to_bits(), + + (_, F32) if !saturating => f32_cvt_to_int_bounds(true, int.bits()).1.to_bits().into(), + (_, F64) if !saturating => f64_cvt_to_int_bounds(true, int.bits()).1.to_bits(), + _ => unimplemented!(), + } + } + + fn fcvt_umax_bound(&mut self, float: Type, int: Type, saturating: bool) -> u64 { + // NB: see `fcvt_smin_bound` for some more comments + match (int, float) { + (I8, F32) if saturating => f32::from(u8::MAX).to_bits().into(), + (I8, F64) if saturating => f64::from(u8::MAX).to_bits(), + (I16, F32) if saturating => f32::from(u16::MAX).to_bits().into(), + (I16, F64) if saturating => f64::from(u16::MAX).to_bits(), + + (_, F32) if !saturating => f32_cvt_to_int_bounds(false, int.bits()).1.to_bits().into(), + (_, F64) if !saturating => f64_cvt_to_int_bounds(false, int.bits()).1.to_bits(), + _ => unimplemented!(), + } + } + + fn fcvt_umin_bound(&mut self, float: Type, saturating: bool) -> u64 { + assert!(!saturating); + match float { + F32 => (-1.0f32).to_bits().into(), + F64 => (-1.0f64).to_bits(), + _ => unimplemented!(), + } + } +} + +/// The main entry point for lowering with ISLE. +pub(crate) fn lower( + lower_ctx: &mut Lower, + backend: &Riscv64Backend, + inst: Inst, +) -> Option { + // TODO: reuse the ISLE context across lowerings so we can reuse its + // internal heap allocations. + let mut isle_ctx = RV64IsleContext::new(lower_ctx, backend); + generated_code::constructor_lower(&mut isle_ctx, inst) +} + +/// The main entry point for branch lowering with ISLE. +pub(crate) fn lower_branch( + lower_ctx: &mut Lower, + backend: &Riscv64Backend, + branch: Inst, + targets: &[MachLabel], +) -> Option<()> { + // TODO: reuse the ISLE context across lowerings so we can reuse its + // internal heap allocations. 
+ let mut isle_ctx = RV64IsleContext::new(lower_ctx, backend); + generated_code::constructor_lower_branch(&mut isle_ctx, branch, targets) +} diff --git a/hbcb/src/lower/isle/generated_code.rs b/hbcb/src/lower/isle/generated_code.rs new file mode 100644 index 0000000..d5d1fea --- /dev/null +++ b/hbcb/src/lower/isle/generated_code.rs @@ -0,0 +1,9 @@ +// See https://github.com/rust-lang/rust/issues/47995: we cannot use `#![...]` attributes inside of +// the generated ISLE source below because we include!() it. We must include!() it because its path +// depends on an environment variable; and also because of this, we can't do the `#[path = "..."] +// mod generated_code;` trick either. +#![allow(dead_code, unreachable_code, unreachable_patterns)] +#![allow(unused_imports, unused_variables, non_snake_case, unused_mut)] +#![allow(irrefutable_let_patterns, clippy::clone_on_copy)] + +include!(concat!(env!("ISLE_DIR"), "/isle_riscv64.rs")); diff --git a/hbcb/src/prelude.isle b/hbcb/src/prelude.isle new file mode 100644 index 0000000..413ff00 --- /dev/null +++ b/hbcb/src/prelude.isle @@ -0,0 +1,752 @@ +;; This is a prelude of standard definitions for ISLE, the instruction-selector +;; DSL, as we use it bound to our interfaces. +;; +;; Note that all `extern` functions here are typically defined in the +;; `isle_prelude_methods` macro defined in `src/isa/isle.rs` + +;;;; Primitive and External Types ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; `()` +(type Unit (primitive Unit)) + +(decl pure unit () Unit) +(extern constructor unit unit) + +(type bool (primitive bool)) +(extern const $true bool) +(extern const $false bool) + +(type u8 (primitive u8)) +(type u16 (primitive u16)) +(type u32 (primitive u32)) +(type u64 (primitive u64)) +(type u128 (primitive u128)) +(type usize (primitive usize)) + +(type i8 (primitive i8)) +(type i16 (primitive i16)) +(type i32 (primitive i32)) +(type i64 (primitive i64)) +(type i128 (primitive i128)) +(type isize (primitive isize)) + +;; `cranelift-entity`-based identifiers. +(type Type (primitive Type)) +(type Value (primitive Value)) +(type ValueList (primitive ValueList)) +(type BlockCall (primitive BlockCall)) + +;; ISLE representation of `&[Value]`. +(type ValueSlice (primitive ValueSlice)) + +;; Extract the type of a `Value`. +(decl value_type (Type) Value) +(extern extractor infallible value_type value_type) + +;; Extractor that matches a `u32` only if non-negative. +(decl u32_nonnegative (u32) u32) +(extern extractor u32_nonnegative u32_nonnegative) + +;; Extractor that pulls apart an Offset32 into a i32 with the raw +;; signed-32-bit twos-complement bits. +(decl offset32 (i32) Offset32) +(extern extractor infallible offset32 offset32) + +;; Pure/fallible constructor that tests if one u32 is less than or +;; equal to another. +(decl pure partial u32_lteq (u32 u32) Unit) +(extern constructor u32_lteq u32_lteq) + +;; Pure/fallible constructor that tests if one u8 is less than or +;; equal to another. +(decl pure partial u8_lteq (u8 u8) Unit) +(extern constructor u8_lteq u8_lteq) + +;; Pure/fallible constructor that tests if one u8 is strictly less +;; than another. 
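+;; (On the Rust side a `pure partial` constructor like this one is roughly a
+;; side-effect-free method returning `Option`, e.g.
+;; `fn u8_lt(&mut self, a: u8, b: u8) -> Option<Unit>`, where `None` means the
+;; pattern fails to match.)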
+(decl pure partial u8_lt (u8 u8) Unit) +(extern constructor u8_lt u8_lt) + +;;;; Primitive Type Conversions ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(decl pure u8_as_i8 (u8) i8) +(extern constructor u8_as_i8 u8_as_i8) + +(decl pure u8_as_u32 (u8) u32) +(extern constructor u8_as_u32 u8_as_u32) +(convert u8 u32 u8_as_u32) + +(decl pure u8_as_u64 (u8) u64) +(extern constructor u8_as_u64 u8_as_u64) +(convert u8 u64 u8_as_u64) + +(decl pure u16_as_i16 (u16) i16) +(extern constructor u16_as_i16 u16_as_i16) + +(decl pure u16_as_u32 (u16) u32) +(extern constructor u16_as_u32 u16_as_u32) +(convert u16 u32 u16_as_u32) + +(decl pure u16_as_u64 (u16) u64) +(extern constructor u16_as_u64 u16_as_u64) +(convert u16 u64 u16_as_u64) + +(decl pure u64_as_u8 (u64) u8) +(extern constructor u64_as_u8 u64_as_u8) + +(decl pure u64_as_u16 (u64) u16) +(extern constructor u64_as_u16 u64_as_u16) + +(decl pure u64_as_i64 (u64) i64) +(extern constructor u64_as_i64 u64_as_i64) + +(decl pure partial u16_try_from_u64 (u64) u16) +(extern constructor u16_try_from_u64 u16_try_from_u64) + +(decl pure partial u32_try_from_u64 (u64) u32) +(extern constructor u32_try_from_u64 u32_try_from_u64) + +(decl pure partial i8_try_from_u64 (u64) i8) +(extern constructor i8_try_from_u64 i8_try_from_u64) + +(decl pure partial i16_try_from_u64 (u64) i16) +(extern constructor i16_try_from_u64 i16_try_from_u64) + +(decl pure partial i32_try_from_u64 (u64) i32) +(extern constructor i32_try_from_u64 i32_try_from_u64) + +(decl pure u32_as_u64 (u32) u64) +(extern constructor u32_as_u64 u32_as_u64) +(convert u32 u64 u32_as_u64) + +(decl pure i32_as_i64 (i32) i64) +(extern constructor i32_as_i64 i32_as_i64) +(convert i32 i64 i32_as_i64) + +(decl pure i64_as_u64 (i64) u64) +(extern constructor i64_as_u64 i64_as_u64) + +(decl pure i64_neg (i64) i64) +(extern constructor i64_neg i64_neg) + +(decl pure i8_neg (i8) i8) +(extern constructor i8_neg i8_neg) + +(decl u128_as_u64 (u64) u128) +(extern extractor u128_as_u64 u128_as_u64) + +(decl u64_as_u32 (u32) u64) +(extern extractor u64_as_u32 u64_as_u32) + +(decl u32_as_u16 (u16) u32) +(extern extractor u32_as_u16 u32_as_u16) + +(decl pure u64_as_i32 (u64) i32) +(extern constructor u64_as_i32 u64_as_i32) + +;;;; Primitive Arithmetic ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(decl pure u8_and (u8 u8) u8) +(extern constructor u8_and u8_and) + +(decl pure u8_shl (u8 u8) u8) +(extern constructor u8_shl u8_shl) + +(decl pure u8_shr (u8 u8) u8) +(extern constructor u8_shr u8_shr) + +(decl pure u8_sub (u8 u8) u8) +(extern constructor u8_sub u8_sub) + +(decl pure u32_add (u32 u32) u32) +(extern constructor u32_add u32_add) + +(decl pure u32_sub (u32 u32) u32) +(extern constructor u32_sub u32_sub) + +(decl pure u32_and (u32 u32) u32) +(extern constructor u32_and u32_and) + +(decl pure u32_shl (u32 u32) u32) +(extern constructor u32_shl u32_shl) + +;; Pure/fallible constructor that tries to add two `u32`s, interpreted +;; as signed values, and fails to match on overflow. 
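+;; For example, adding 0x7fff_ffff and 1 overflows `i32` and therefore fails to
+;; match, while adding 1 and 2 matches and produces 3.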
+(decl pure partial s32_add_fallible (i32 i32) i32) +(extern constructor s32_add_fallible s32_add_fallible) + +(decl pure u64_add (u64 u64) u64) +(extern constructor u64_add u64_add) + +(decl pure u64_sub (u64 u64) u64) +(extern constructor u64_sub u64_sub) + +(decl pure u64_mul (u64 u64) u64) +(extern constructor u64_mul u64_mul) + +(decl pure partial u64_sdiv (u64 u64) u64) +(extern constructor u64_sdiv u64_sdiv) + +(decl pure partial u64_udiv (u64 u64) u64) +(extern constructor u64_udiv u64_udiv) + +(decl pure u64_and (u64 u64) u64) +(extern constructor u64_and u64_and) + +(decl pure u64_or (u64 u64) u64) +(extern constructor u64_or u64_or) + +(decl pure u64_xor (u64 u64) u64) +(extern constructor u64_xor u64_xor) + +(decl pure u64_shl (u64 u64) u64) +(extern constructor u64_shl u64_shl) + +(decl pure imm64_shl (Type Imm64 Imm64) Imm64) +(extern constructor imm64_shl imm64_shl) + +(decl pure imm64_ushr (Type Imm64 Imm64) Imm64) +(extern constructor imm64_ushr imm64_ushr) + +(decl pure imm64_sshr (Type Imm64 Imm64) Imm64) +(extern constructor imm64_sshr imm64_sshr) + +(decl pure u64_not (u64) u64) +(extern constructor u64_not u64_not) + +(decl pure u64_eq (u64 u64) bool) +(extern constructor u64_eq u64_eq) + +(decl pure u64_le (u64 u64) bool) +(extern constructor u64_le u64_le) + +(decl pure u64_lt (u64 u64) bool) +(extern constructor u64_lt u64_lt) + +(decl pure i64_shr (i64 i64) i64) +(extern constructor i64_shr i64_shr) + +(decl pure i64_ctz (i64) i64) +(extern constructor i64_ctz i64_ctz) + +;; Sign extends a u64 from ty bits up to 64bits +(decl pure i64_sextend_u64 (Type u64) i64) +(extern constructor i64_sextend_u64 i64_sextend_u64) + +(decl pure i64_sextend_imm64 (Type Imm64) i64) +(extern constructor i64_sextend_imm64 i64_sextend_imm64) + +(decl pure u64_uextend_imm64 (Type Imm64) u64) +(extern constructor u64_uextend_imm64 u64_uextend_imm64) + +(decl pure imm64_icmp (Type IntCC Imm64 Imm64) Imm64) +(extern constructor imm64_icmp imm64_icmp) + +(decl u64_is_zero (bool) u64) +(extern extractor infallible u64_is_zero u64_is_zero) + +(decl i64_is_zero (bool) i64) +(extern extractor infallible i64_is_zero i64_is_zero) + +(decl u64_zero () u64) +(extractor (u64_zero) (u64_is_zero $true)) + +(decl u64_nonzero (u64) u64) +(extractor (u64_nonzero x) (and (u64_is_zero $false) x)) + +(decl i64_nonzero (i64) i64) +(extractor (i64_nonzero x) (and (i64_is_zero $false) x)) + +(decl pure u64_is_odd (u64) bool) +(extern constructor u64_is_odd u64_is_odd) + +;; Each of these extractors tests whether the upper half of the input equals the +;; lower half of the input +(decl u128_replicated_u64 (u64) u128) +(extern extractor u128_replicated_u64 u128_replicated_u64) +(decl u64_replicated_u32 (u64) u64) +(extern extractor u64_replicated_u32 u64_replicated_u32) +(decl u32_replicated_u16 (u64) u64) +(extern extractor u32_replicated_u16 u32_replicated_u16) +(decl u16_replicated_u8 (u8) u64) +(extern extractor u16_replicated_u8 u16_replicated_u8) + +;; Floating point operations + +(decl pure partial f16_min (Ieee16 Ieee16) Ieee16) +(extern constructor f16_min f16_min) +(decl pure partial f16_max (Ieee16 Ieee16) Ieee16) +(extern constructor f16_max f16_max) +(decl pure f16_neg (Ieee16) Ieee16) +(extern constructor f16_neg f16_neg) +(decl pure f16_abs (Ieee16) Ieee16) +(extern constructor f16_abs f16_abs) +(decl pure f16_copysign (Ieee16 Ieee16) Ieee16) +(extern constructor f16_copysign f16_copysign) +(decl pure partial f32_add (Ieee32 Ieee32) Ieee32) +(extern constructor f32_add f32_add) +(decl pure 
partial f32_sub (Ieee32 Ieee32) Ieee32) +(extern constructor f32_sub f32_sub) +(decl pure partial f32_mul (Ieee32 Ieee32) Ieee32) +(extern constructor f32_mul f32_mul) +(decl pure partial f32_div (Ieee32 Ieee32) Ieee32) +(extern constructor f32_div f32_div) +(decl pure partial f32_sqrt (Ieee32) Ieee32) +(extern constructor f32_sqrt f32_sqrt) +(decl pure partial f32_ceil (Ieee32) Ieee32) +(extern constructor f32_ceil f32_ceil) +(decl pure partial f32_floor (Ieee32) Ieee32) +(extern constructor f32_floor f32_floor) +(decl pure partial f32_trunc (Ieee32) Ieee32) +(extern constructor f32_trunc f32_trunc) +(decl pure partial f32_nearest (Ieee32) Ieee32) +(extern constructor f32_nearest f32_nearest) +(decl pure partial f32_min (Ieee32 Ieee32) Ieee32) +(extern constructor f32_min f32_min) +(decl pure partial f32_max (Ieee32 Ieee32) Ieee32) +(extern constructor f32_max f32_max) +(decl pure f32_neg (Ieee32) Ieee32) +(extern constructor f32_neg f32_neg) +(decl pure f32_abs (Ieee32) Ieee32) +(extern constructor f32_abs f32_abs) +(decl pure f32_copysign (Ieee32 Ieee32) Ieee32) +(extern constructor f32_copysign f32_copysign) +(decl pure partial f64_add (Ieee64 Ieee64) Ieee64) +(extern constructor f64_add f64_add) +(decl pure partial f64_sub (Ieee64 Ieee64) Ieee64) +(extern constructor f64_sub f64_sub) +(decl pure partial f64_mul (Ieee64 Ieee64) Ieee64) +(extern constructor f64_mul f64_mul) +(decl pure partial f64_div (Ieee64 Ieee64) Ieee64) +(extern constructor f64_div f64_div) +(decl pure partial f64_sqrt (Ieee64) Ieee64) +(extern constructor f64_sqrt f64_sqrt) +(decl pure partial f64_ceil (Ieee64) Ieee64) +(extern constructor f64_ceil f64_ceil) +(decl pure partial f64_floor (Ieee64) Ieee64) +(extern constructor f64_floor f64_floor) +(decl pure partial f64_trunc (Ieee64) Ieee64) +(extern constructor f64_trunc f64_trunc) +(decl pure partial f64_nearest (Ieee64) Ieee64) +(extern constructor f64_nearest f64_nearest) +(decl pure partial f64_min (Ieee64 Ieee64) Ieee64) +(extern constructor f64_min f64_min) +(decl pure partial f64_max (Ieee64 Ieee64) Ieee64) +(extern constructor f64_max f64_max) +(decl pure f64_neg (Ieee64) Ieee64) +(extern constructor f64_neg f64_neg) +(decl pure f64_abs (Ieee64) Ieee64) +(extern constructor f64_abs f64_abs) +(decl pure f64_copysign (Ieee64 Ieee64) Ieee64) +(extern constructor f64_copysign f64_copysign) +(decl pure partial f128_min (Ieee128 Ieee128) Ieee128) +(extern constructor f128_min f128_min) +(decl pure partial f128_max (Ieee128 Ieee128) Ieee128) +(extern constructor f128_max f128_max) +(decl pure f128_neg (Ieee128) Ieee128) +(extern constructor f128_neg f128_neg) +(decl pure f128_abs (Ieee128) Ieee128) +(extern constructor f128_abs f128_abs) +(decl pure f128_copysign (Ieee128 Ieee128) Ieee128) +(extern constructor f128_copysign f128_copysign) +(type Ieee128 (primitive Ieee128)) + +;;;; `cranelift_codegen::ir::Type` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(extern const $I8 Type) +(extern const $I16 Type) +(extern const $I32 Type) +(extern const $I64 Type) +(extern const $I128 Type) + +(extern const $F16 Type) +(extern const $F32 Type) +(extern const $F64 Type) +(extern const $F128 Type) + +(extern const $I8X8 Type) +(extern const $I8X16 Type) +(extern const $I16X4 Type) +(extern const $I16X8 Type) +(extern const $I32X2 Type) +(extern const $I32X4 Type) +(extern const $I64X2 Type) + +(extern const $F32X4 Type) +(extern const $F64X2 Type) + +(extern const $I32X4XN Type) + +;; Get the unsigned minimum value for a given type. 
+;; This always zero, but is included for completeness. +(decl pure ty_umin (Type) u64) +(extern constructor ty_umin ty_umin) + +;; Get the unsigned maximum value for a given type. +(decl pure ty_umax (Type) u64) +(extern constructor ty_umax ty_umax) + +;; Get the signed minimum value for a given type. +(decl pure ty_smin (Type) u64) +(extern constructor ty_smin ty_smin) + +;; Get the signed maximum value for a given type. +(decl pure ty_smax (Type) u64) +(extern constructor ty_smax ty_smax) + +;; Get the bit width of a given type. +(decl pure ty_bits (Type) u8) +(extern constructor ty_bits ty_bits) + +;; Get the bit width of a given type. +(decl pure ty_bits_u16 (Type) u16) +(extern constructor ty_bits_u16 ty_bits_u16) + +;; Get the bit width of a given type. +(decl pure ty_bits_u64 (Type) u64) +(extern constructor ty_bits_u64 ty_bits_u64) + +;; Get a mask for the width of a given type. +(decl pure ty_mask (Type) u64) +(extern constructor ty_mask ty_mask) + +;; Get a mask that is set for each lane in a given type. +(decl pure ty_lane_mask (Type) u64) +(extern constructor ty_lane_mask ty_lane_mask) + +;; Get the number of lanes for a given type. +(decl pure ty_lane_count (Type) u64) +(extern constructor ty_lane_count ty_lane_count) + +;; Get the byte width of a given type. +(decl pure ty_bytes (Type) u16) +(extern constructor ty_bytes ty_bytes) + +;; Get the type of each lane in the given type. +(decl pure lane_type (Type) Type) +(extern constructor lane_type lane_type) + +;; Get a type with the same element type, but half the number of lanes. +(decl pure partial ty_half_lanes (Type) Type) +(extern constructor ty_half_lanes ty_half_lanes) + +;; Get a type with the same number of lanes but a lane type that is half as small. +(decl pure partial ty_half_width (Type) Type) +(extern constructor ty_half_width ty_half_width) + +;; Generate a mask for the maximum shift amount for a given type. i.e 31 for I32. +(decl pure ty_shift_mask (Type) u64) +(rule (ty_shift_mask ty) (u64_sub (ty_bits (lane_type ty)) 1)) + +;; Compare two types for equality. +(decl pure ty_equal (Type Type) bool) +(extern constructor ty_equal ty_equal) + +;;;; `cranelift_codegen::ir::MemFlags ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; `MemFlags::trusted` +(decl pure mem_flags_trusted () MemFlags) +(extern constructor mem_flags_trusted mem_flags_trusted) + +;;;; Helpers for Working with Flags ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; Swap args of an IntCC flag. +(decl intcc_swap_args (IntCC) IntCC) +(extern constructor intcc_swap_args intcc_swap_args) + +;; Complement an IntCC flag. +(decl intcc_complement (IntCC) IntCC) +(extern constructor intcc_complement intcc_complement) + +;; This is a direct import of `IntCC::without_equal`. +;; Get the corresponding IntCC with the equal component removed. +;; For conditions without a zero component, this is a no-op. +(decl pure intcc_without_eq (IntCC) IntCC) +(extern constructor intcc_without_eq intcc_without_eq) + +;; Swap args of a FloatCC flag. +(decl floatcc_swap_args (FloatCC) FloatCC) +(extern constructor floatcc_swap_args floatcc_swap_args) + +;; Complement a FloatCC flag. +(decl floatcc_complement (FloatCC) FloatCC) +(extern constructor floatcc_complement floatcc_complement) + +;; True when this FloatCC involves an unordered comparison. 
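+;; (That is, a condition that is satisfied when either operand is NaN; for
+;; example, `FloatCC.Unordered` and `FloatCC.UnorderedOrLessThan` are
+;; unordered, while `FloatCC.LessThan` and `FloatCC.Ordered` are not.)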
+(decl pure floatcc_unordered (FloatCC) bool) +(extern constructor floatcc_unordered floatcc_unordered) + +;;;; Helper Clif Extractors ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(decl eq (Type Value Value) Value) +(extractor (eq ty x y) (icmp ty (IntCC.Equal) x y)) + +(decl ne (Type Value Value) Value) +(extractor (ne ty x y) (icmp ty (IntCC.NotEqual) x y)) + +(decl ult (Type Value Value) Value) +(extractor (ult ty x y) (icmp ty (IntCC.UnsignedLessThan) x y)) + +(decl ule (Type Value Value) Value) +(extractor (ule ty x y) (icmp ty (IntCC.UnsignedLessThanOrEqual) x y)) + +(decl ugt (Type Value Value) Value) +(extractor (ugt ty x y) (icmp ty (IntCC.UnsignedGreaterThan) x y)) + +(decl uge (Type Value Value) Value) +(extractor (uge ty x y) (icmp ty (IntCC.UnsignedGreaterThanOrEqual) x y)) + +(decl slt (Type Value Value) Value) +(extractor (slt ty x y) (icmp ty (IntCC.SignedLessThan) x y)) + +(decl sle (Type Value Value) Value) +(extractor (sle ty x y) (icmp ty (IntCC.SignedLessThanOrEqual) x y)) + +(decl sgt (Type Value Value) Value) +(extractor (sgt ty x y) (icmp ty (IntCC.SignedGreaterThan) x y)) + +(decl sge (Type Value Value) Value) +(extractor (sge ty x y) (icmp ty (IntCC.SignedGreaterThanOrEqual) x y)) + +;; An extractor that only matches types that can fit in 16 bits. +(decl fits_in_16 (Type) Type) +(extern extractor fits_in_16 fits_in_16) + +;; An extractor that only matches types that can fit in 32 bits. +(decl fits_in_32 (Type) Type) +(extern extractor fits_in_32 fits_in_32) + +;; An extractor that only matches types that can fit in 32 bits. +(decl lane_fits_in_32 (Type) Type) +(extern extractor lane_fits_in_32 lane_fits_in_32) + +;; An extractor that only matches types that can fit in 64 bits. +(decl fits_in_64 (Type) Type) +(extern extractor fits_in_64 fits_in_64) + +;; An extractor that only matches types that fit in exactly 32 bits. +(decl ty_32 (Type) Type) +(extern extractor ty_32 ty_32) + +;; An extractor that only matches types that fit in exactly 64 bits. +(decl ty_64 (Type) Type) +(extern extractor ty_64 ty_64) + +;; A pure constructor/extractor that only matches scalar integers, and +;; references that can fit in 64 bits. +(decl pure partial ty_int_ref_scalar_64 (Type) Type) +(extern constructor ty_int_ref_scalar_64 ty_int_ref_scalar_64) +(extern extractor ty_int_ref_scalar_64 ty_int_ref_scalar_64_extract) + +;; An extractor that matches 32- and 64-bit types only. +(decl ty_32_or_64 (Type) Type) +(extern extractor ty_32_or_64 ty_32_or_64) + +;; An extractor that matches 8- and 16-bit types only. +(decl ty_8_or_16 (Type) Type) +(extern extractor ty_8_or_16 ty_8_or_16) + +;; An extractor that matches 16- and 32-bit types only. +(decl ty_16_or_32 (Type) Type) +(extern extractor ty_16_or_32 ty_16_or_32) + +;; An extractor that matches int types that fit in 32 bits. +(decl int_fits_in_32 (Type) Type) +(extern extractor int_fits_in_32 int_fits_in_32) + +;; An extractor that matches I64. +(decl ty_int_ref_64 (Type) Type) +(extern extractor ty_int_ref_64 ty_int_ref_64) + +;; An extractor that matches int or reference types bigger than 16 bits but at most 64 bits. +(decl ty_int_ref_16_to_64 (Type) Type) +(extern extractor ty_int_ref_16_to_64 ty_int_ref_16_to_64) + +;; An extractor that only matches integers. +(decl ty_int (Type) Type) +(extern extractor ty_int ty_int) + +;; An extractor that only matches scalar types, float or int or ref's. 
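+;; (For example, it matches $I64 and $F32, but not a vector type such as
+;; $I32X4.)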
+(decl ty_scalar (Type) Type) +(extern extractor ty_scalar ty_scalar) + +;; An extractor that only matches scalar floating-point types--F32 or F64. +(decl ty_scalar_float (Type) Type) +(extern extractor ty_scalar_float ty_scalar_float) + +;; An extractor that matches scalar floating-point types or vector types. +(decl ty_float_or_vec (Type) Type) +(extern extractor ty_float_or_vec ty_float_or_vec) + +;; A pure constructor that only matches vector floating-point types. +(decl pure partial ty_vector_float (Type) Type) +(extern constructor ty_vector_float ty_vector_float) + +;; A pure constructor that only matches vector types with lanes which +;; are not floating-point. +(decl pure partial ty_vector_not_float (Type) Type) +(extern constructor ty_vector_not_float ty_vector_not_float) + +;; A pure constructor/extractor that only matches 64-bit vector types. +(decl pure partial ty_vec64 (Type) Type) +(extern constructor ty_vec64 ty_vec64_ctor) +(extern extractor ty_vec64 ty_vec64) + +;; An extractor that only matches 128-bit vector types. +(decl ty_vec128 (Type) Type) +(extern extractor ty_vec128 ty_vec128) + +;; An extractor that only matches dynamic vector types with a 64-bit +;; base type. +(decl ty_dyn_vec64 (Type) Type) +(extern extractor ty_dyn_vec64 ty_dyn_vec64) + +;; An extractor that only matches dynamic vector types with a 128-bit +;; base type. +(decl ty_dyn_vec128 (Type) Type) +(extern extractor ty_dyn_vec128 ty_dyn_vec128) + +;; An extractor that only matches 64-bit vector types with integer +;; lanes (I8X8, I16X4, I32X2) +(decl ty_vec64_int (Type) Type) +(extern extractor ty_vec64_int ty_vec64_int) + +;; An extractor that only matches 128-bit vector types with integer +;; lanes (I8X16, I16X8, I32X4, I64X2). +(decl ty_vec128_int (Type) Type) +(extern extractor ty_vec128_int ty_vec128_int) + +;; An extractor that only matches types that can be a 64-bit address. +(decl ty_addr64 (Type) Type) +(extern extractor ty_addr64 ty_addr64) + +;; A pure constructor that matches everything except vectors with size 32X2. +(decl pure partial not_vec32x2 (Type) Type) +(extern constructor not_vec32x2 not_vec32x2) + +;; An extractor that matches everything except I64X2 +(decl not_i64x2 () Type) +(extern extractor not_i64x2 not_i64x2) + +;; Extract a `u8` from an `Uimm8`. +(decl u8_from_uimm8 (u8) Uimm8) +(extern extractor infallible u8_from_uimm8 u8_from_uimm8) + +;; Extract a `u64` from a `bool`. +(decl u64_from_bool (u64) bool) +(extern extractor infallible u64_from_bool u64_from_bool) + +;; Extract a `u64` from an `Imm64`. +(decl u64_from_imm64 (u64) Imm64) +(extern extractor infallible u64_from_imm64 u64_from_imm64) + +;; Extract a `u64` from an `Imm64` which is not zero. +(decl nonzero_u64_from_imm64 (u64) Imm64) +(extern extractor nonzero_u64_from_imm64 nonzero_u64_from_imm64) + +;; If the given `Imm64` is a power-of-two, extract its log2 value. +(decl imm64_power_of_two (u64) Imm64) +(extern extractor imm64_power_of_two imm64_power_of_two) + +;; Create a new Imm64. +(decl pure imm64 (u64) Imm64) +(extern constructor imm64 imm64) + +;; Create a new Imm64, masked to the width of the given type. +(decl pure imm64_masked (Type u64) Imm64) +(extern constructor imm64_masked imm64_masked) + +;; Extract a `u16` from an `Ieee16`. +(decl u16_from_ieee16 (u16) Ieee16) +(extern extractor infallible u16_from_ieee16 u16_from_ieee16) + +;; Extract a `u32` from an `Ieee32`. 
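+;; (The extracted value is the raw IEEE 754 bit pattern; for example, the
+;; constant 1.0 extracts as 0x3f800000.)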
+(decl u32_from_ieee32 (u32) Ieee32) +(extern extractor infallible u32_from_ieee32 u32_from_ieee32) + +;; Extract a `u64` from an `Ieee64`. +(decl u64_from_ieee64 (u64) Ieee64) +(extern extractor infallible u64_from_ieee64 u64_from_ieee64) + +;; Match a multi-lane type, extracting (# bits per lane, # lanes) from the given +;; type. Will only match when there is more than one lane. +(decl multi_lane (u32 u32) Type) +(extern extractor multi_lane multi_lane) + +;; Match a dynamic-lane type, extracting (# bits per lane) from the given +;; type. +(decl dynamic_lane (u32 u32) Type) +(extern extractor dynamic_lane dynamic_lane) + +;; An extractor that only matches 64-bit dynamic vector types with integer +;; lanes (I8X8XN, I16X4XN, I32X2XN) +(decl ty_dyn64_int (Type) Type) +(extern extractor ty_dyn64_int ty_dyn64_int) + +;; An extractor that only matches 128-bit dynamic vector types with integer +;; lanes (I8X16XN, I16X8XN, I32X4XN, I64X2XN). +(decl ty_dyn128_int (Type) Type) +(extern extractor ty_dyn128_int ty_dyn128_int) + +;; Convert an `Offset32` to a primitive number. +(decl pure offset32_to_i32 (Offset32) i32) +(extern constructor offset32_to_i32 offset32_to_i32) + +;; Convert a number to an `Offset32` +(decl pure i32_to_offset32 (i32) Offset32) +(extern constructor i32_to_offset32 i32_to_offset32) + +;; This is a direct import of `IntCC::unsigned`. +;; Get the corresponding IntCC with the signed component removed. +;; For conditions without a signed component, this is a no-op. +(decl pure intcc_unsigned (IntCC) IntCC) +(extern constructor intcc_unsigned intcc_unsigned) + +;; Pure constructor that only matches signed integer cond codes. +(decl pure partial signed_cond_code (IntCC) IntCC) +(extern constructor signed_cond_code signed_cond_code) + +;;;; Helpers for Working with TrapCode ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(decl pure trap_code_division_by_zero () TrapCode) +(extern constructor trap_code_division_by_zero trap_code_division_by_zero) + +(decl pure trap_code_integer_overflow () TrapCode) +(extern constructor trap_code_integer_overflow trap_code_integer_overflow) + +(decl pure trap_code_bad_conversion_to_integer () TrapCode) +(extern constructor trap_code_bad_conversion_to_integer trap_code_bad_conversion_to_integer) + +;;;; Helpers for tail recursion loops ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; A range of integers to loop through. +(type Range (primitive Range)) + +;; Create a new range from `start` through `end` (exclusive). +(decl pure range (usize usize) Range) +(extern constructor range range) + +;; A view on the current state of the range. +(type RangeView extern + (enum + (Empty) + (NonEmpty (index usize) (rest Range)))) + +;; View the current state of the range. +(decl range_view (RangeView) Range) +(extern extractor infallible range_view range_view) + +;; Extractor to test whether a range is empty. +(decl range_empty () Range) +(extractor (range_empty) (range_view (RangeView.Empty))) + +;; Extractor to return the first value in the range, and a sub-range +;; containing the remaining values. 
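+;; For example, a tail-recursive loop over a range could be written like this
+;; (illustrative sketch only, not part of this prelude; `visit_lanes` and
+;; `visit_one` are hypothetical helpers, and `unit` is assumed from the main
+;; prelude):
+;;
+;;   (decl visit_lanes (Range) Unit)
+;;   (rule (visit_lanes (range_empty)) (unit))
+;;   (rule (visit_lanes (range_unwrap idx rest))
+;;         (let ((_ Unit (visit_one idx)))
+;;           (visit_lanes rest)))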
+(decl range_unwrap (usize Range) Range) +(extractor (range_unwrap index rest) (range_view (RangeView.NonEmpty index rest))) + +;;;; Automatic conversions ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(convert Offset32 i32 offset32_to_i32) +(convert i32 Offset32 i32_to_offset32) + diff --git a/hbcb/src/prelude_lower.isle b/hbcb/src/prelude_lower.isle new file mode 100644 index 0000000..ec34312 --- /dev/null +++ b/hbcb/src/prelude_lower.isle @@ -0,0 +1,1082 @@ +;; Prelude definitions specific to lowering environments (backends) in +;; ISLE. + +;;;; Primitive and External Types ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; `cranelift-entity`-based identifiers. +(type Inst (primitive Inst)) + +;; ISLE representation of `Vec` +(type VecMask extern (enum)) + +(type ValueRegs (primitive ValueRegs)) +(type WritableValueRegs (primitive WritableValueRegs)) + +;; Instruction lowering result: a vector of `ValueRegs`. +(type InstOutput (primitive InstOutput)) +;; (Mutable) builder to incrementally construct an `InstOutput`. +(type InstOutputBuilder extern (enum)) + +;; Type to hold multiple Regs +(type MultiReg + (enum + (Empty) + (One (a Reg)) + (Two (a Reg) (b Reg)) + (Three (a Reg) (b Reg) (c Reg)) + (Four (a Reg) (b Reg) (c Reg) (d Reg)) + )) + +;;;; Registers ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(type Reg (primitive Reg)) +(type WritableReg (primitive WritableReg)) +(type OptionWritableReg (primitive OptionWritableReg)) +(type VecReg extern (enum)) +(type VecWritableReg extern (enum)) +(type PReg (primitive PReg)) + +;; Construct a `ValueRegs` of one register. +(decl value_reg (Reg) ValueRegs) +(extern constructor value_reg value_reg) + +;; Construct a `WritableValueRegs` of one register. +(decl writable_value_reg (WritableReg) WritableValueRegs) +(extern constructor writable_value_reg writable_value_reg) + +;; Construct a `ValueRegs` of two registers. +(decl value_regs (Reg Reg) ValueRegs) +(extern constructor value_regs value_regs) + +;; Construct a `WritableValueRegs` of two registers. +(decl writable_value_regs (WritableReg WritableReg) WritableValueRegs) +(extern constructor writable_value_regs writable_value_regs) + +;; Construct an empty `ValueRegs` containing only invalid register sentinels. +(decl value_regs_invalid () ValueRegs) +(extern constructor value_regs_invalid value_regs_invalid) + +;; Construct an empty `InstOutput`. +(decl output_none () InstOutput) +(extern constructor output_none output_none) + +;; Construct a single-element `InstOutput`. +(decl output (ValueRegs) InstOutput) +(extern constructor output output) + +;; Construct a two-element `InstOutput`. +(decl output_pair (ValueRegs ValueRegs) InstOutput) +(extern constructor output_pair output_pair) + +;; Construct a single-element `InstOutput` from a single register. +(decl output_reg (Reg) InstOutput) +(rule (output_reg reg) (output (value_reg reg))) + +;; Construct a single-element `InstOutput` from a value. +(decl output_value (Value) InstOutput) +(rule (output_value val) (output (put_in_regs val))) + +;; Initially empty `InstOutput` builder. +(decl output_builder_new () InstOutputBuilder) +(extern constructor output_builder_new output_builder_new) + +;; Append a `ValueRegs` to an `InstOutput` under construction. +(decl output_builder_push (InstOutputBuilder ValueRegs) Unit) +(extern constructor output_builder_push output_builder_push) + +;; Finish building an `InstOutput` incrementally. 
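+;; For example, a lowering helper could assemble a multi-result output like
+;; this (illustrative sketch only, not part of this prelude; `lo` and `hi` are
+;; hypothetical `ValueRegs` bindings):
+;;
+;;   (let ((b InstOutputBuilder (output_builder_new))
+;;         (_ Unit (output_builder_push b lo))
+;;         (_ Unit (output_builder_push b hi)))
+;;     (output_builder_finish b))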
+(decl output_builder_finish (InstOutputBuilder) InstOutput) +(extern constructor output_builder_finish output_builder_finish) + +;; Get a temporary register for writing. +(decl temp_writable_reg (Type) WritableReg) +(extern constructor temp_writable_reg temp_writable_reg) + +;; Get a temporary register for reading. +(decl temp_reg (Type) Reg) +(rule (temp_reg ty) + (writable_reg_to_reg (temp_writable_reg ty))) + +(decl is_valid_reg (bool) Reg) +(extern extractor infallible is_valid_reg is_valid_reg) + +;; Get or match the invalid register. +(decl invalid_reg () Reg) +(extern constructor invalid_reg invalid_reg) +(extractor (invalid_reg) (is_valid_reg $false)) + +;; Match any register but the invalid register. +(decl valid_reg (Reg) Reg) +(extractor (valid_reg reg) (and (is_valid_reg $true) reg)) + +;; Mark this value as used, to ensure that it gets lowered. +(decl mark_value_used (Value) Unit) +(extern constructor mark_value_used mark_value_used) + +;; Put the given value into a register. +;; +;; Asserts that the value fits into a single register, and doesn't require +;; multiple registers for its representation (like `i128` on x64 for example). +;; +;; As a side effect, this marks the value as used. +(decl put_in_reg (Value) Reg) +(extern constructor put_in_reg put_in_reg) + +;; Put the given value into one or more registers. +;; +;; As a side effect, this marks the value as used. +(decl put_in_regs (Value) ValueRegs) +(extern constructor put_in_regs put_in_regs) + +;; If the given reg is a real register, cause the value in reg to be in a virtual +;; reg, by copying it into a new virtual reg. +(decl ensure_in_vreg (Reg Type) Reg) +(extern constructor ensure_in_vreg ensure_in_vreg) + +;; Get the `n`th register inside a `ValueRegs`. +(decl value_regs_get (ValueRegs usize) Reg) +(extern constructor value_regs_get value_regs_get) + +;; Get the number of registers in a `ValueRegs`. +(decl pure value_regs_len (ValueRegs) usize) +(extern constructor value_regs_len value_regs_len) + +;; Get a range for the number of regs in a `ValueRegs`. +(decl value_regs_range (ValueRegs) Range) +(rule (value_regs_range regs) (range 0 (value_regs_len regs))) + +;; Put the value into one or more registers and return the first register. +;; +;; Unlike `put_in_reg`, this does not assert that the value fits in a single +;; register. This is useful for things like a `i128` shift amount, where we mask +;; the shift amount to the bit width of the value being shifted, and so the high +;; half of the `i128` won't ever be used. +;; +;; As a side effect, this marks that value as used. +(decl lo_reg (Value) Reg) +(rule (lo_reg val) + (let ((regs ValueRegs (put_in_regs val))) + (value_regs_get regs 0))) + +;; Convert a `PReg` into a `Reg`. 
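+;; (Also registered as an automatic conversion at the bottom of this file, so
+;; a `PReg` may be passed directly where a `Reg` is expected.)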
+(decl preg_to_reg (PReg) Reg) +(extern constructor preg_to_reg preg_to_reg) + +;; Convert a MultiReg with three registers into an InstOutput containing +;; one ValueRegs containing the first two regs and one containing the third reg +(decl multi_reg_to_pair_and_single (MultiReg) InstOutput) +(rule (multi_reg_to_pair_and_single (MultiReg.Three a b c)) + (output_pair (value_regs a b) c)) + +;; Convert a MultiReg with two registers into an InstOutput containing one ValueRegs with both regs +(decl multi_reg_to_pair (MultiReg) InstOutput) +(rule (multi_reg_to_pair (MultiReg.Two a b)) + (value_regs a b)) + +;; Convert a MultiReg with one register into an InstOutput containing one ValueRegs with the register +(decl multi_reg_to_single (MultiReg) InstOutput) +(rule (multi_reg_to_single (MultiReg.One a)) + (value_reg a)) + +;; Add a range fact to a register, when compiling with +;; proof-carrying-code enabled. +(decl add_range_fact (Reg u16 u64 u64) Reg) +(extern constructor add_range_fact add_range_fact) + +;;;; Common Mach Types ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(type MachLabel (primitive MachLabel)) +(type ValueLabel (primitive ValueLabel)) +(type UnwindInst (primitive UnwindInst)) +(type ExternalName (primitive ExternalName)) +(type BoxExternalName (primitive BoxExternalName)) +(type RelocDistance (primitive RelocDistance)) +(type VecArgPair extern (enum)) +(type VecRetPair extern (enum)) +(type CallArgList extern (enum)) +(type MachLabelSlice extern (enum)) +(type BoxVecMachLabel extern (enum)) + +;; Extract a the target from a MachLabelSlice with exactly one target. +(decl single_target (MachLabel) MachLabelSlice) +(extern extractor single_target single_target) + +;; Extract a the targets from a MachLabelSlice with exactly two targets. +(decl two_targets (MachLabel MachLabel) MachLabelSlice) +(extern extractor two_targets two_targets) + +;; Extract the default target and jump table from a MachLabelSlice. +(decl jump_table_targets (MachLabel BoxVecMachLabel) MachLabelSlice) +(extern extractor jump_table_targets jump_table_targets) + +;; The size of the jump table. +(decl jump_table_size (BoxVecMachLabel) u32) +(extern constructor jump_table_size jump_table_size) + +;;;; Helper Clif Extractors ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; Extractor to get a `ValueSlice` out of a `ValueList`. +(decl value_list_slice (ValueSlice) ValueList) +(extern extractor infallible value_list_slice value_list_slice) + +;; Extractor to test whether a `ValueSlice` is empty. +(decl value_slice_empty () ValueSlice) +(extern extractor value_slice_empty value_slice_empty) + +;; Extractor to split a `ValueSlice` into its first element plus a tail. +(decl value_slice_unwrap (Value ValueSlice) ValueSlice) +(extern extractor value_slice_unwrap value_slice_unwrap) + +;; Return the length of a `ValueSlice`. +(decl value_slice_len (ValueSlice) usize) +(extern constructor value_slice_len value_slice_len) + +;; Return any element of a `ValueSlice`. +(decl value_slice_get (ValueSlice usize) Value) +(extern constructor value_slice_get value_slice_get) + +;; Extractor to get the first element from a value list, along with its tail as +;; a `ValueSlice`. +(decl unwrap_head_value_list_1 (Value ValueSlice) ValueList) +(extractor (unwrap_head_value_list_1 head tail) + (value_list_slice (value_slice_unwrap head tail))) + +;; Extractor to get the first two elements from a value list, along with its +;; tail as a `ValueSlice`. 
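+;; (For example, matching a call's argument list binds the first two arguments
+;; individually and leaves any remaining arguments in the `ValueSlice` tail;
+;; illustrative note.)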
+(decl unwrap_head_value_list_2 (Value Value ValueSlice) ValueList) +(extractor (unwrap_head_value_list_2 head1 head2 tail) + (value_list_slice (value_slice_unwrap head1 (value_slice_unwrap head2 tail)))) + +;; Turn a `Writable` into a `Reg` via `Writable::to_reg`. +(decl pure writable_reg_to_reg (WritableReg) Reg) +(extern constructor writable_reg_to_reg writable_reg_to_reg) + +;; Extract the result values for the given instruction. +(decl inst_results (ValueSlice) Inst) +(extern extractor infallible inst_results inst_results) + +;; Returns whether the given value is unused in this function and is a dead +;; result. +(decl pure value_is_unused (Value) bool) +(extern constructor value_is_unused value_is_unused) + +;; Extract the first result value of the given instruction. +(decl first_result (Value) Inst) +(extern extractor first_result first_result) + +;; Extract the `InstructionData` for an `Inst`. +(decl inst_data (InstructionData) Inst) +(extern extractor infallible inst_data inst_data) + +;; Extract the type of the instruction's first result. +(decl result_type (Type) Inst) +(extractor (result_type ty) + (first_result (value_type ty))) + +;; Extract the type of the instruction's first result and pass along the +;; instruction as well. +(decl has_type (Type Inst) Inst) +(extractor (has_type ty inst) + (and (result_type ty) + inst)) + +;; Match the instruction that defines the given value, if any. +(decl def_inst (Inst) Value) +(extern extractor def_inst def_inst) + +;; Extract a constant `u64` from a value defined by an `iconst`. +(decl u64_from_iconst (u64) Value) +(extractor (u64_from_iconst x) + (def_inst (iconst (u64_from_imm64 x)))) + +;; Extract a constant `i32` from a value defined by an `iconst`. +;; The value is sign extended to 32 bits. +(decl i32_from_iconst (i32) Value) +(extern extractor i32_from_iconst i32_from_iconst) + +;; Extract a constant `i64` from a value defined by an `iconst`. +;; The value is sign extended to 64 bits. +(decl i64_from_iconst (i64) Value) +(extern extractor i64_from_iconst i64_from_iconst) + +;; Match any zero value for iconst, fconst32, fconst64, vconst and splat. +(decl pure partial zero_value (Value) Value) +(extern constructor zero_value zero_value) + +;; Match a sinkable instruction from a value operand. +(decl pure partial is_sinkable_inst (Value) Inst) +(extern constructor is_sinkable_inst is_sinkable_inst) + +;; Match a uextend or any other instruction, "seeing through" the uextend if +;; present. +(decl maybe_uextend (Value) Value) +(extern extractor maybe_uextend maybe_uextend) + +;; Get an unsigned 8-bit immediate in a u8 from an Imm64, if possible. +(decl uimm8 (u8) Imm64) +(extern extractor uimm8 uimm8) + +;; Instruction creation helpers ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; Emit an instruction. +;; +;; This is low-level and side-effectful; it should only be used as an +;; implementation detail by helpers that preserve the SSA facade themselves. + +(decl emit (MInst) Unit) +(extern constructor emit emit) + +;; Sink an instruction. +;; +;; This is a side-effectful operation that notifies the context that the +;; instruction has been sunk into another instruction, and no longer needs to +;; be lowered. +(decl sink_inst (Inst) Unit) +(extern constructor sink_inst sink_inst) + +;; Constant pool emission. + +(type VCodeConstant (primitive VCodeConstant)) + +;; Add a u64 little-endian constant to the in-memory constant pool and +;; return a VCodeConstant index that refers to it. 
This is +;; side-effecting but idempotent (constants are deduplicated). +(decl emit_u64_le_const (u64) VCodeConstant) +(extern constructor emit_u64_le_const emit_u64_le_const) + +;; Add a u128 little-endian constant to the in-memory constant pool and +;; return a VCodeConstant index that refers to it. This is +;; side-effecting but idempotent (constants are deduplicated). +(decl emit_u128_le_const (u128) VCodeConstant) +(extern constructor emit_u128_le_const emit_u128_le_const) + +;; Fetch the VCodeConstant associated with a Constant. +(decl const_to_vconst (Constant) VCodeConstant) +(extern constructor const_to_vconst const_to_vconst) + +;;;; Helpers for Side-Effectful Instructions Without Results ;;;;;;;;;;;;;;;;;;; + +(type SideEffectNoResult (enum + (Inst (inst MInst)) + (Inst2 (inst1 MInst) + (inst2 MInst)) + (Inst3 (inst1 MInst) + (inst2 MInst) + (inst3 MInst)))) + +;; Emit given side-effectful instruction. +(decl emit_side_effect (SideEffectNoResult) Unit) +(rule (emit_side_effect (SideEffectNoResult.Inst inst)) + (emit inst)) +(rule (emit_side_effect (SideEffectNoResult.Inst2 inst1 inst2)) + (let ((_ Unit (emit inst1))) + (emit inst2))) +(rule (emit_side_effect (SideEffectNoResult.Inst3 inst1 inst2 inst3)) + (let ((_ Unit (emit inst1)) + (_ Unit (emit inst2))) + (emit inst3))) + +;; Create an empty `InstOutput`, but do emit the given side-effectful +;; instruction. +(decl side_effect (SideEffectNoResult) InstOutput) +(rule (side_effect inst) + (let ((_ Unit (emit_side_effect inst))) + (output_none))) + +(decl side_effect_concat (SideEffectNoResult SideEffectNoResult) SideEffectNoResult) +(rule (side_effect_concat (SideEffectNoResult.Inst inst1) (SideEffectNoResult.Inst inst2)) + (SideEffectNoResult.Inst2 inst1 inst2)) +(rule (side_effect_concat (SideEffectNoResult.Inst inst1) (SideEffectNoResult.Inst2 inst2 inst3)) + (SideEffectNoResult.Inst3 inst1 inst2 inst3)) +(rule (side_effect_concat (SideEffectNoResult.Inst2 inst1 inst2) (SideEffectNoResult.Inst inst3)) + (SideEffectNoResult.Inst3 inst1 inst2 inst3)) + +;;;; Helpers for Working with Flags ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; Newtype wrapper around `MInst` for instructions that are used for their +;; effect on flags. +;; +;; Variant determines how result is given when combined with a +;; ConsumesFlags. See `with_flags` below for more. +(type ProducesFlags (enum + ;; For cases where the flags have been produced by another + ;; instruction, and we have out-of-band reasons to know + ;; that they won't be clobbered by the time we depend on + ;; them. + (AlreadyExistingFlags) + (ProducesFlagsSideEffect (inst MInst)) + (ProducesFlagsTwiceSideEffect (inst1 MInst) (inst2 MInst)) + ;; Not directly combinable with a ConsumesFlags; + ;; used in s390x and unwrapped directly by `trapif`. + (ProducesFlagsReturnsReg (inst MInst) (result Reg)) + (ProducesFlagsReturnsResultWithConsumer (inst MInst) (result Reg)))) + +;; Chain another producer to a `ProducesFlags`. +(decl produces_flags_concat (ProducesFlags ProducesFlags) ProducesFlags) +(rule (produces_flags_concat (ProducesFlags.ProducesFlagsSideEffect inst1) (ProducesFlags.ProducesFlagsSideEffect inst2)) + (ProducesFlags.ProducesFlagsTwiceSideEffect inst1 inst2)) + +;; Newtype wrapper around `MInst` for instructions that consume and produce flags +(type ConsumesAndProducesFlags (enum + (SideEffect (inst MInst)) + (ReturnsReg (inst MInst) (result Reg)))) + +;; Newtype wrapper around `MInst` for instructions that consume flags. 
+;; +;; Variant determines how result is given when combined with a +;; ProducesFlags. See `with_flags` below for more. +(type ConsumesFlags (enum + (ConsumesFlagsSideEffect (inst MInst)) + (ConsumesFlagsSideEffect2 (inst1 MInst) (inst2 MInst)) + (ConsumesFlagsReturnsResultWithProducer (inst MInst) (result Reg)) + (ConsumesFlagsReturnsReg (inst MInst) (result Reg)) + (ConsumesFlagsTwiceReturnsValueRegs (inst1 MInst) + (inst2 MInst) + (result ValueRegs)) + (ConsumesFlagsFourTimesReturnsValueRegs (inst1 MInst) + (inst2 MInst) + (inst3 MInst) + (inst4 MInst) + (result ValueRegs)))) + + + +;; Get the produced register out of a ProducesFlags. +(decl produces_flags_get_reg (ProducesFlags) Reg) +(rule (produces_flags_get_reg (ProducesFlags.ProducesFlagsReturnsReg _ reg)) reg) +(rule (produces_flags_get_reg (ProducesFlags.ProducesFlagsReturnsResultWithConsumer _ reg)) reg) + +;; Modify a ProducesFlags to use it only for its side-effect, ignoring +;; its result. +(decl produces_flags_ignore (ProducesFlags) ProducesFlags) +(rule (produces_flags_ignore (ProducesFlags.ProducesFlagsReturnsReg inst _)) + (ProducesFlags.ProducesFlagsSideEffect inst)) +(rule (produces_flags_ignore (ProducesFlags.ProducesFlagsReturnsResultWithConsumer inst _)) + (ProducesFlags.ProducesFlagsSideEffect inst)) + +;; Helper for combining two flags-consumer instructions that return a +;; single Reg, giving a ConsumesFlags that returns both values in a +;; ValueRegs. +(decl consumes_flags_concat (ConsumesFlags ConsumesFlags) ConsumesFlags) +(rule (consumes_flags_concat (ConsumesFlags.ConsumesFlagsReturnsReg inst1 reg1) + (ConsumesFlags.ConsumesFlagsReturnsReg inst2 reg2)) + (ConsumesFlags.ConsumesFlagsTwiceReturnsValueRegs + inst1 + inst2 + (value_regs reg1 reg2))) +(rule (consumes_flags_concat + (ConsumesFlags.ConsumesFlagsSideEffect inst1) + (ConsumesFlags.ConsumesFlagsSideEffect inst2)) + (ConsumesFlags.ConsumesFlagsSideEffect2 inst1 inst2)) + +;; Combine flags-producing and -consuming instructions together, ensuring that +;; they are emitted back-to-back and no other instructions can be emitted +;; between them and potentially clobber the flags. +;; +;; Returns a `ValueRegs` according to the specific combination of ProducesFlags and ConsumesFlags modes: +;; - SideEffect + ReturnsReg --> ValueReg with one Reg from consumer +;; - SideEffect + ReturnsValueRegs --> ValueReg as given from consumer +;; - ReturnsResultWithProducer + ReturnsResultWithConsumer --> ValueReg with low part from producer, high part from consumer +;; +;; See `with_flags_reg` below for a variant that extracts out just the lower Reg. +(decl with_flags (ProducesFlags ConsumesFlags) ValueRegs) + +(rule (with_flags (ProducesFlags.ProducesFlagsReturnsResultWithConsumer producer_inst producer_result) + (ConsumesFlags.ConsumesFlagsReturnsResultWithProducer consumer_inst consumer_result)) + (let ((_x Unit (emit producer_inst)) + (_y Unit (emit consumer_inst))) + (value_regs producer_result consumer_result))) + +;; A flag-producer that also produces a result, paired with a consumer that has +;; no results. 
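+;; (Only the producer's result is returned, as a single-register `ValueRegs`.)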
+(rule (with_flags (ProducesFlags.ProducesFlagsReturnsResultWithConsumer producer_inst producer_result) + (ConsumesFlags.ConsumesFlagsSideEffect consumer_inst)) + (let ((_ Unit (emit producer_inst)) + (_ Unit (emit consumer_inst))) + (value_reg producer_result))) + +(rule (with_flags (ProducesFlags.ProducesFlagsSideEffect producer_inst) + (ConsumesFlags.ConsumesFlagsReturnsReg consumer_inst consumer_result)) + (let ((_x Unit (emit producer_inst)) + (_y Unit (emit consumer_inst))) + (value_reg consumer_result))) + +(rule (with_flags (ProducesFlags.ProducesFlagsSideEffect producer_inst) + (ConsumesFlags.ConsumesFlagsTwiceReturnsValueRegs consumer_inst_1 + consumer_inst_2 + consumer_result)) + ;; We must emit these instructions in order as the creator of + ;; the ConsumesFlags may be relying on dataflow dependencies + ;; amongst them. + (let ((_x Unit (emit producer_inst)) + (_y Unit (emit consumer_inst_1)) + (_z Unit (emit consumer_inst_2))) + consumer_result)) + +(rule (with_flags (ProducesFlags.ProducesFlagsSideEffect producer_inst) + (ConsumesFlags.ConsumesFlagsFourTimesReturnsValueRegs consumer_inst_1 + consumer_inst_2 + consumer_inst_3 + consumer_inst_4 + consumer_result)) + ;; We must emit these instructions in order as the creator of + ;; the ConsumesFlags may be relying on dataflow dependencies + ;; amongst them. + (let ((_x Unit (emit producer_inst)) + (_y Unit (emit consumer_inst_1)) + (_z Unit (emit consumer_inst_2)) + (_w Unit (emit consumer_inst_3)) + (_v Unit (emit consumer_inst_4))) + consumer_result)) + +(rule (with_flags (ProducesFlags.ProducesFlagsTwiceSideEffect producer_inst1 producer_inst2) + (ConsumesFlags.ConsumesFlagsReturnsReg consumer_inst consumer_result)) + (let ((_ Unit (emit producer_inst1)) + (_ Unit (emit producer_inst2)) + (_ Unit (emit consumer_inst))) + (value_reg consumer_result))) + +(rule (with_flags (ProducesFlags.ProducesFlagsTwiceSideEffect producer_inst1 producer_inst2) + (ConsumesFlags.ConsumesFlagsTwiceReturnsValueRegs consumer_inst_1 + consumer_inst_2 + consumer_result)) + ;; We must emit these instructions in order as the creator of + ;; the ConsumesFlags may be relying on dataflow dependencies + ;; amongst them. + (let ((_ Unit (emit producer_inst1)) + (_ Unit (emit producer_inst2)) + (_ Unit (emit consumer_inst_1)) + (_ Unit (emit consumer_inst_2))) + consumer_result)) + +(rule (with_flags (ProducesFlags.ProducesFlagsTwiceSideEffect producer_inst1 producer_inst2) + (ConsumesFlags.ConsumesFlagsFourTimesReturnsValueRegs consumer_inst_1 + consumer_inst_2 + consumer_inst_3 + consumer_inst_4 + consumer_result)) + ;; We must emit these instructions in order as the creator of + ;; the ConsumesFlags may be relying on dataflow dependencies + ;; amongst them. + (let ((_ Unit (emit producer_inst1)) + (_ Unit (emit producer_inst2)) + (_ Unit (emit consumer_inst_1)) + (_ Unit (emit consumer_inst_2)) + (_ Unit (emit consumer_inst_3)) + (_ Unit (emit consumer_inst_4))) + consumer_result)) + +(decl with_flags_reg (ProducesFlags ConsumesFlags) Reg) +(rule (with_flags_reg p c) + (let ((v ValueRegs (with_flags p c))) + (value_regs_get v 0))) + +;; Indicate that the current state of the flags register from the instruction +;; that produces this Value is relied on. +(decl flags_to_producesflags (Value) ProducesFlags) +(rule (flags_to_producesflags val) + (let ((_ Unit (mark_value_used val))) + (ProducesFlags.AlreadyExistingFlags))) + +;; Combine a flags-producing instruction and a flags-consuming instruction that +;; produces no results. 
+;; +;; This function handles the following case only: +;; - ProducesFlagsSideEffect + ConsumesFlagsSideEffect +(decl with_flags_side_effect (ProducesFlags ConsumesFlags) SideEffectNoResult) + +(rule (with_flags_side_effect + (ProducesFlags.AlreadyExistingFlags) + (ConsumesFlags.ConsumesFlagsSideEffect c)) + (SideEffectNoResult.Inst c)) + +(rule (with_flags_side_effect + (ProducesFlags.AlreadyExistingFlags) + (ConsumesFlags.ConsumesFlagsSideEffect2 c1 c2)) + (SideEffectNoResult.Inst2 c1 c2)) + +(rule (with_flags_side_effect + (ProducesFlags.ProducesFlagsSideEffect p) + (ConsumesFlags.ConsumesFlagsSideEffect c)) + (SideEffectNoResult.Inst2 p c)) + +(rule (with_flags_side_effect + (ProducesFlags.ProducesFlagsSideEffect p) + (ConsumesFlags.ConsumesFlagsSideEffect2 c1 c2)) + (SideEffectNoResult.Inst3 p c1 c2)) + +(rule (with_flags_side_effect + (ProducesFlags.ProducesFlagsTwiceSideEffect p1 p2) + (ConsumesFlags.ConsumesFlagsSideEffect c)) + (SideEffectNoResult.Inst3 p1 p2 c)) + +;; Combine flag-producing and -consuming instruction that allows more than two results to be returned +(decl with_flags_chained (ProducesFlags ConsumesAndProducesFlags ConsumesFlags) MultiReg) + +;; ProducesFlags.SideEffect + ConsumesAndProducesFlags.SideEffect with all possible ConsumeFlags options +(rule (with_flags_chained (ProducesFlags.ProducesFlagsSideEffect prod_inst) + (ConsumesAndProducesFlags.SideEffect middle_inst) + (ConsumesFlags.ConsumesFlagsSideEffect consume_inst)) + (let ((_ Unit (emit prod_inst)) + (_ Unit (emit middle_inst)) + (_ Unit (emit consume_inst))) + (MultiReg.Empty))) + +(rule (with_flags_chained (ProducesFlags.ProducesFlagsSideEffect prod_inst) + (ConsumesAndProducesFlags.SideEffect middle_inst) + (ConsumesFlags.ConsumesFlagsSideEffect2 consume_inst1 consume_inst2)) + (let ((_ Unit (emit prod_inst)) + (_ Unit (emit middle_inst)) + (_ Unit (emit consume_inst1)) + (_ Unit (emit consume_inst2))) + (MultiReg.Empty))) + +(rule (with_flags_chained (ProducesFlags.ProducesFlagsSideEffect prod_inst) + (ConsumesAndProducesFlags.SideEffect middle_inst) + (ConsumesFlags.ConsumesFlagsReturnsReg consume_inst reg)) + (let ((_ Unit (emit prod_inst)) + (_ Unit (emit middle_inst)) + (_ Unit (emit consume_inst))) + (MultiReg.One reg))) + +(rule (with_flags_chained (ProducesFlags.ProducesFlagsSideEffect prod_inst) + (ConsumesAndProducesFlags.SideEffect middle_inst) + (ConsumesFlags.ConsumesFlagsTwiceReturnsValueRegs consume_inst1 consume_inst2 consume_result)) + (let ((_ Unit (emit prod_inst)) + (_ Unit (emit middle_inst)) + (_ Unit (emit consume_inst1)) + (_ Unit (emit consume_inst2))) + (MultiReg.Two (value_regs_get consume_result 0) (value_regs_get consume_result 1)))) + +(rule (with_flags_chained (ProducesFlags.ProducesFlagsSideEffect prod_inst) + (ConsumesAndProducesFlags.SideEffect middle_inst) + (ConsumesFlags.ConsumesFlagsFourTimesReturnsValueRegs consume_inst1 consume_inst2 consume_inst3 consume_inst4 consume_result)) + (let ((_ Unit (emit prod_inst)) + (_ Unit (emit middle_inst)) + (_ Unit (emit consume_inst1)) + (_ Unit (emit consume_inst2)) + (_ Unit (emit consume_inst3)) + (_ Unit (emit consume_inst4))) + (MultiReg.Two (value_regs_get consume_result 0) (value_regs_get consume_result 1)))) + + +;; ProducesFlags.ReturnsReg + ConsumesAndProducesFlags.SideEffect with all possible ConsumeFlags options +(rule (with_flags_chained (ProducesFlags.ProducesFlagsReturnsReg prod_inst prod_result) + (ConsumesAndProducesFlags.SideEffect middle_inst) + (ConsumesFlags.ConsumesFlagsSideEffect consume_inst)) + (let 
((_ Unit (emit prod_inst)) + (_ Unit (emit middle_inst)) + (_ Unit (emit consume_inst))) + (MultiReg.One prod_result))) + +(rule (with_flags_chained (ProducesFlags.ProducesFlagsReturnsReg prod_inst prod_result) + (ConsumesAndProducesFlags.SideEffect middle_inst) + (ConsumesFlags.ConsumesFlagsSideEffect2 consume_inst1 consume_inst2)) + (let ((_ Unit (emit prod_inst)) + (_ Unit (emit middle_inst)) + (_ Unit (emit consume_inst1)) + (_ Unit (emit consume_inst2))) + (MultiReg.One prod_result))) + +(rule (with_flags_chained (ProducesFlags.ProducesFlagsReturnsReg prod_inst prod_result) + (ConsumesAndProducesFlags.SideEffect middle_inst) + (ConsumesFlags.ConsumesFlagsReturnsReg consume_inst consume_result)) + (let ((_ Unit (emit prod_inst)) + (_ Unit (emit middle_inst)) + (_ Unit (emit consume_inst))) + (MultiReg.Two prod_result consume_result))) + +(rule (with_flags_chained (ProducesFlags.ProducesFlagsReturnsReg prod_inst prod_result) + (ConsumesAndProducesFlags.SideEffect middle_inst) + (ConsumesFlags.ConsumesFlagsTwiceReturnsValueRegs consume_inst1 consume_inst2 consume_result)) + (let ((_ Unit (emit prod_inst)) + (_ Unit (emit middle_inst)) + (_ Unit (emit consume_inst1)) + (_ Unit (emit consume_inst2))) + (MultiReg.Three prod_result (value_regs_get consume_result 0) (value_regs_get consume_result 1)))) + +(rule (with_flags_chained (ProducesFlags.ProducesFlagsReturnsReg prod_inst prod_result) + (ConsumesAndProducesFlags.SideEffect middle_inst) + (ConsumesFlags.ConsumesFlagsFourTimesReturnsValueRegs consume_inst1 consume_inst2 consume_inst3 consume_inst4 consume_result)) + (let ((_ Unit (emit prod_inst)) + (_ Unit (emit middle_inst)) + (_ Unit (emit consume_inst1)) + (_ Unit (emit consume_inst2)) + (_ Unit (emit consume_inst3)) + (_ Unit (emit consume_inst4))) + (MultiReg.Three prod_result (value_regs_get consume_result 0) (value_regs_get consume_result 1)))) + + +;; ProducesFlags.SideEffect + ConsumesAndProducesFlags.ReturnsReg with all possible ConsumeFlags options +(rule (with_flags_chained (ProducesFlags.ProducesFlagsSideEffect prod_inst) + (ConsumesAndProducesFlags.ReturnsReg middle_inst middle_result) + (ConsumesFlags.ConsumesFlagsSideEffect consume_inst)) + (let ((_ Unit (emit prod_inst)) + (_ Unit (emit middle_inst)) + (_ Unit (emit consume_inst))) + (MultiReg.One middle_result))) + +(rule (with_flags_chained (ProducesFlags.ProducesFlagsSideEffect prod_inst) + (ConsumesAndProducesFlags.ReturnsReg middle_inst middle_result) + (ConsumesFlags.ConsumesFlagsSideEffect2 consume_inst1 consume_inst2)) + (let ((_ Unit (emit prod_inst)) + (_ Unit (emit middle_inst)) + (_ Unit (emit consume_inst1)) + (_ Unit (emit consume_inst2))) + (MultiReg.One middle_result))) + +(rule (with_flags_chained (ProducesFlags.ProducesFlagsSideEffect prod_inst) + (ConsumesAndProducesFlags.ReturnsReg middle_inst middle_result) + (ConsumesFlags.ConsumesFlagsReturnsReg consume_inst consume_result)) + (let ((_ Unit (emit prod_inst)) + (_ Unit (emit middle_inst)) + (_ Unit (emit consume_inst))) + (MultiReg.Two middle_result consume_result))) + +(rule (with_flags_chained (ProducesFlags.ProducesFlagsSideEffect prod_inst) + (ConsumesAndProducesFlags.ReturnsReg middle_inst middle_result) + (ConsumesFlags.ConsumesFlagsTwiceReturnsValueRegs consume_inst1 consume_inst2 consume_result)) + (let ((_ Unit (emit prod_inst)) + (_ Unit (emit middle_inst)) + (_ Unit (emit consume_inst1)) + (_ Unit (emit consume_inst2))) + (MultiReg.Three middle_result (value_regs_get consume_result 0) (value_regs_get consume_result 1)))) + +(rule 
(with_flags_chained (ProducesFlags.ProducesFlagsSideEffect prod_inst) + (ConsumesAndProducesFlags.ReturnsReg middle_inst middle_result) + (ConsumesFlags.ConsumesFlagsFourTimesReturnsValueRegs consume_inst1 consume_inst2 consume_inst3 consume_inst4 consume_result)) + (let ((_ Unit (emit prod_inst)) + (_ Unit (emit middle_inst)) + (_ Unit (emit consume_inst1)) + (_ Unit (emit consume_inst2)) + (_ Unit (emit consume_inst3)) + (_ Unit (emit consume_inst4))) + (MultiReg.Three middle_result (value_regs_get consume_result 0) (value_regs_get consume_result 1)))) + + +;; ProducesFlags.ReturnsReg + ConsumesAndProducesFlags.ReturnsReg with all possible ConsumeFlags options +(rule (with_flags_chained (ProducesFlags.ProducesFlagsReturnsReg prod_inst prod_result) + (ConsumesAndProducesFlags.ReturnsReg middle_inst middle_result) + (ConsumesFlags.ConsumesFlagsSideEffect consume_inst)) + (let ((_ Unit (emit prod_inst)) + (_ Unit (emit middle_inst)) + (_ Unit (emit consume_inst))) + (MultiReg.Two prod_result middle_result))) + +(rule (with_flags_chained (ProducesFlags.ProducesFlagsReturnsReg prod_inst prod_result) + (ConsumesAndProducesFlags.ReturnsReg middle_inst middle_result) + (ConsumesFlags.ConsumesFlagsSideEffect2 consume_inst1 consume_inst2)) + (let ((_ Unit (emit prod_inst)) + (_ Unit (emit middle_inst)) + (_ Unit (emit consume_inst1)) + (_ Unit (emit consume_inst2))) + (MultiReg.Two prod_result middle_result))) + +(rule (with_flags_chained (ProducesFlags.ProducesFlagsReturnsReg prod_inst prod_result) + (ConsumesAndProducesFlags.ReturnsReg middle_inst middle_result) + (ConsumesFlags.ConsumesFlagsReturnsReg consume_inst consume_result)) + (let ((_ Unit (emit prod_inst)) + (_ Unit (emit middle_inst)) + (_ Unit (emit consume_inst))) + (MultiReg.Three prod_result middle_result consume_result))) + +(rule (with_flags_chained (ProducesFlags.ProducesFlagsReturnsReg prod_inst prod_result) + (ConsumesAndProducesFlags.ReturnsReg middle_inst middle_result) + (ConsumesFlags.ConsumesFlagsTwiceReturnsValueRegs consume_inst1 consume_inst2 consume_result)) + (let ((_ Unit (emit prod_inst)) + (_ Unit (emit middle_inst)) + (_ Unit (emit consume_inst1)) + (_ Unit (emit consume_inst2))) + (MultiReg.Four prod_result middle_result (value_regs_get consume_result 0) (value_regs_get consume_result 1)))) + +(rule (with_flags_chained (ProducesFlags.ProducesFlagsReturnsReg prod_inst prod_result) + (ConsumesAndProducesFlags.ReturnsReg middle_inst middle_result) + (ConsumesFlags.ConsumesFlagsFourTimesReturnsValueRegs consume_inst1 consume_inst2 consume_inst3 consume_inst4 consume_result)) + (let ((_ Unit (emit prod_inst)) + (_ Unit (emit middle_inst)) + (_ Unit (emit consume_inst1)) + (_ Unit (emit consume_inst2)) + (_ Unit (emit consume_inst3)) + (_ Unit (emit consume_inst4))) + (MultiReg.Four prod_result middle_result (value_regs_get consume_result 0) (value_regs_get consume_result 1)))) + +;; ProducesFlags.ReturnsResultWithConsumer + ConsumesAndProducesFlags.ReturnsReg with all possible ConsumeFlags options +(rule (with_flags_chained (ProducesFlags.ProducesFlagsReturnsResultWithConsumer prod_inst prod_result) + (ConsumesAndProducesFlags.ReturnsReg middle_inst middle_result) + (ConsumesFlags.ConsumesFlagsSideEffect consume_inst)) + (let ((_ Unit (emit prod_inst)) + (_ Unit (emit middle_inst)) + (_ Unit (emit consume_inst))) + (MultiReg.Two prod_result middle_result))) + +(rule (with_flags_chained (ProducesFlags.ProducesFlagsReturnsResultWithConsumer prod_inst prod_result) + (ConsumesAndProducesFlags.ReturnsReg middle_inst 
middle_result) + (ConsumesFlags.ConsumesFlagsSideEffect2 consume_inst1 consume_inst2)) + (let ((_ Unit (emit prod_inst)) + (_ Unit (emit middle_inst)) + (_ Unit (emit consume_inst1)) + (_ Unit (emit consume_inst2))) + (MultiReg.Two prod_result middle_result))) + +(rule (with_flags_chained (ProducesFlags.ProducesFlagsReturnsResultWithConsumer prod_inst prod_result) + (ConsumesAndProducesFlags.ReturnsReg middle_inst middle_result) + (ConsumesFlags.ConsumesFlagsReturnsReg consume_inst consume_result)) + (let ((_ Unit (emit prod_inst)) + (_ Unit (emit middle_inst)) + (_ Unit (emit consume_inst))) + (MultiReg.Three prod_result middle_result consume_result))) + +(rule (with_flags_chained (ProducesFlags.ProducesFlagsReturnsResultWithConsumer prod_inst prod_result) + (ConsumesAndProducesFlags.ReturnsReg middle_inst middle_result) + (ConsumesFlags.ConsumesFlagsReturnsResultWithProducer consume_inst consume_result)) + (let ((_ Unit (emit prod_inst)) + (_ Unit (emit middle_inst)) + (_ Unit (emit consume_inst))) + (MultiReg.Three prod_result middle_result consume_result))) + +(rule (with_flags_chained (ProducesFlags.ProducesFlagsReturnsResultWithConsumer prod_inst prod_result) + (ConsumesAndProducesFlags.ReturnsReg middle_inst middle_result) + (ConsumesFlags.ConsumesFlagsTwiceReturnsValueRegs consume_inst1 consume_inst2 consume_result)) + (let ((_ Unit (emit prod_inst)) + (_ Unit (emit middle_inst)) + (_ Unit (emit consume_inst1)) + (_ Unit (emit consume_inst2))) + (MultiReg.Four prod_result middle_result (value_regs_get consume_result 0) (value_regs_get consume_result 1)))) + +(rule (with_flags_chained (ProducesFlags.ProducesFlagsReturnsResultWithConsumer prod_inst prod_result) + (ConsumesAndProducesFlags.ReturnsReg middle_inst middle_result) + (ConsumesFlags.ConsumesFlagsFourTimesReturnsValueRegs consume_inst1 consume_inst2 consume_inst3 consume_inst4 consume_result)) + (let ((_ Unit (emit prod_inst)) + (_ Unit (emit middle_inst)) + (_ Unit (emit consume_inst1)) + (_ Unit (emit consume_inst2)) + (_ Unit (emit consume_inst3)) + (_ Unit (emit consume_inst4))) + (MultiReg.Four prod_result middle_result (value_regs_get consume_result 0) (value_regs_get consume_result 1)))) + +;;;; Helpers for accessing compilation flags ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; This definition should be kept up to date with the values defined in +;; cranelift/codegen/meta/src/shared/settings.rs +(type TlsModel extern (enum (None) (ElfGd) (Macho) (Coff))) + +(decl tls_model (TlsModel) Type) +(extern extractor infallible tls_model tls_model) + +(decl pure partial tls_model_is_elf_gd () Unit) +(extern constructor tls_model_is_elf_gd tls_model_is_elf_gd) + +(decl pure partial tls_model_is_macho () Unit) +(extern constructor tls_model_is_macho tls_model_is_macho) + +(decl pure partial tls_model_is_coff () Unit) +(extern constructor tls_model_is_coff tls_model_is_coff) + +(decl pure partial preserve_frame_pointers () Unit) +(extern constructor preserve_frame_pointers preserve_frame_pointers) + +;; This definition should be kept up to date with the values defined in +;; cranelift/codegen/meta/src/shared/settings.rs +(type StackSwitchModel extern (enum (None) (Basic) (UpdateWindowsTib))) + +(decl pure partial stack_switch_model () StackSwitchModel) +(extern constructor stack_switch_model stack_switch_model) + +;;;; Helpers for accessing instruction data ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(decl box_external_name (ExternalName) BoxExternalName) +(extern constructor box_external_name box_external_name) + +;; Accessor for 
`FuncRef`. + +(decl func_ref_data (SigRef ExternalName RelocDistance) FuncRef) +(extern extractor infallible func_ref_data func_ref_data) + +;; Accessor for `GlobalValue`. + +(decl symbol_value_data (ExternalName RelocDistance i64) GlobalValue) +(extern extractor symbol_value_data symbol_value_data) + +;; Accessor for `RelocDistance`. + +(decl reloc_distance_near () RelocDistance) +(extern extractor reloc_distance_near reloc_distance_near) + +;; Accessor for `Immediate` as a vector of u8 values. + +(decl vec_mask_from_immediate (VecMask) Immediate) +(extern extractor vec_mask_from_immediate vec_mask_from_immediate) + +;; Accessor for `Immediate` as u128. + +(decl u128_from_immediate (u128) Immediate) +(extern extractor u128_from_immediate u128_from_immediate) + +;; Extracts an `Immediate` as a `VCodeConstant`. + +(decl vconst_from_immediate (VCodeConstant) Immediate) +(extern extractor vconst_from_immediate vconst_from_immediate) + +;; Accessor for `Constant` as u128. + +(decl u128_from_constant (u128) Constant) +(extern extractor u128_from_constant u128_from_constant) + +;; Accessor for `Constant` as u64. + +(decl u64_from_constant (u64) Constant) +(extern extractor u64_from_constant u64_from_constant) + +;; Extracts lane indices, represented as u8's, if the immediate for a +;; `shuffle` instruction represents shuffling N-bit values. The u8 values +;; returned will be in the range of 0 to (256/N)-1, inclusive, and index the +;; N-bit chunks of two concatenated 128-bit vectors starting from the +;; least-significant bits. +(decl shuffle64_from_imm (u8 u8) Immediate) +(extern extractor shuffle64_from_imm shuffle64_from_imm) +(decl shuffle32_from_imm (u8 u8 u8 u8) Immediate) +(extern extractor shuffle32_from_imm shuffle32_from_imm) +(decl shuffle16_from_imm (u8 u8 u8 u8 u8 u8 u8 u8) Immediate) +(extern extractor shuffle16_from_imm shuffle16_from_imm) + +;;;; Helpers for generating returns ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; Extractor to check for the special case that a `WritableValueRegs` +;; contains only a single register. +(decl only_writable_reg (WritableReg) WritableValueRegs) +(extern extractor only_writable_reg only_writable_reg) + +;; Get the `n`th register inside a `WritableValueRegs`. +(decl writable_regs_get (WritableValueRegs usize) WritableReg) +(extern constructor writable_regs_get writable_regs_get) + +;;;; Helpers for generating calls ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; Type to hold information about a function call signature. +(type Sig (primitive Sig)) + +;; Information how to pass one argument or return value. +(type ABIArg extern (enum)) + +;; Information how to pass a single slot of one argument or return value. +(type ABIArgSlot extern + (enum + (Reg + (reg RealReg) + (ty Type) + (extension ArgumentExtension)) + (Stack + (offset i64) + (ty Type) + (extension ArgumentExtension)))) + +;; Physical register that may hold an argument or return value. +(type RealReg (primitive RealReg)) + +;; Instruction on whether and how to extend an argument value. +(type ArgumentExtension extern + (enum + (None) + (Uext) + (Sext))) + +;; Get the number of arguments expected. +(decl abi_num_args (Sig) usize) +(extern constructor abi_num_args abi_num_args) + +;; Get information specifying how to pass one argument. +(decl abi_get_arg (Sig usize) ABIArg) +(extern constructor abi_get_arg abi_get_arg) + +;; Get the number of return values expected. 
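+;; (Together with `range` from the main prelude, this lets a backend loop over
+;; every return slot, e.g. `(range 0 (abi_num_rets sig))`; illustrative note,
+;; with `sig` a bound `Sig` value.)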
+(decl abi_num_rets (Sig) usize) +(extern constructor abi_num_rets abi_num_rets) + +;; Get information specifying how to pass one return value. +(decl abi_get_ret (Sig usize) ABIArg) +(extern constructor abi_get_ret abi_get_ret) + +;; Get information specifying how to pass the implicit pointer +;; to the return-value area on the stack, if required. +(decl abi_ret_arg (ABIArg) Sig) +(extern extractor abi_ret_arg abi_ret_arg) + +;; Succeeds if no implicit return-value area pointer is required. +(decl abi_no_ret_arg () Sig) +(extern extractor abi_no_ret_arg abi_no_ret_arg) + +;; Size of the argument area. +(decl abi_sized_stack_arg_space (Sig) i64) +(extern constructor abi_sized_stack_arg_space abi_sized_stack_arg_space) + +;; Size of the return-value area. +(decl abi_sized_stack_ret_space (Sig) i64) +(extern constructor abi_sized_stack_ret_space abi_sized_stack_ret_space) + +;; Incoming return area pointer (must be present). +(decl abi_unwrap_ret_area_ptr () Reg) +(extern constructor abi_unwrap_ret_area_ptr abi_unwrap_ret_area_ptr) + +;; StackSlot addr +(decl abi_stackslot_addr (WritableReg StackSlot Offset32) MInst) +(extern constructor abi_stackslot_addr abi_stackslot_addr) + +;; DynamicStackSlot addr +(decl abi_dynamic_stackslot_addr (WritableReg DynamicStackSlot) MInst) +(extern constructor abi_dynamic_stackslot_addr abi_dynamic_stackslot_addr) + +;; Extractor to detect the special case where an argument or +;; return value only requires a single slot to be passed. +(decl abi_arg_only_slot (ABIArgSlot) ABIArg) +(extern extractor abi_arg_only_slot abi_arg_only_slot) + +;; Extractor to detect the special case where a non-struct argument +;; is implicitly passed by reference using a hidden pointer. +(decl abi_arg_implicit_pointer (ABIArgSlot i64 Type) ABIArg) +(extern extractor abi_arg_implicit_pointer abi_arg_implicit_pointer) + +;; Convert a real register number into a virtual register. +(decl real_reg_to_reg (RealReg) Reg) +(extern constructor real_reg_to_reg real_reg_to_reg) + +;; Convert a real register number into a writable virtual register. +(decl real_reg_to_writable_reg (RealReg) WritableReg) +(extern constructor real_reg_to_writable_reg real_reg_to_writable_reg) + +;; Generate a move between two registers. +(decl gen_move (Type WritableReg Reg) MInst) +(extern constructor gen_move gen_move) + +;; Generate a return instruction +(decl lower_return (ValueSlice) InstOutput) +(rule (lower_return vals) + (let ((_ Unit (gen_return vals))) + (output_none))) + +(decl gen_return (ValueSlice) Unit) +(extern constructor gen_return gen_return) + +(decl gen_return_call (SigRef ExternalName RelocDistance ValueSlice) InstOutput) +(extern constructor gen_return_call gen_return_call) + +(decl gen_return_call_indirect (SigRef Value ValueSlice) InstOutput) +(extern constructor gen_return_call_indirect gen_return_call_indirect) + +;; Helper for extracting an immediate that's not 0 and not -1 from an imm64. 
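+;; (Zero and minus one are excluded because they are exactly the divisors that
+;; can trap or overflow a signed division: dividing by zero, or dividing the
+;; minimum signed value by -1.)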
+
+;; Helper for extracting an immediate that's not 0 and not -1 from an imm64.
+(decl pure partial safe_divisor_from_imm64 (Type Imm64) u64)
+(extern constructor safe_divisor_from_imm64 safe_divisor_from_imm64)
+
+;;;; Automatic conversions ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(convert Inst Value def_inst)
+(convert Reg ValueRegs value_reg)
+(convert WritableReg WritableValueRegs writable_value_reg)
+(convert Value Reg put_in_reg)
+(convert Value ValueRegs put_in_regs)
+(convert WritableReg Reg writable_reg_to_reg)
+(convert ValueRegs InstOutput output)
+(convert Reg InstOutput output_reg)
+(convert Value InstOutput output_value)
+(convert ExternalName BoxExternalName box_external_name)
+(convert PReg Reg preg_to_reg)
+
diff --git a/hbcb/src/prelude_opt.isle b/hbcb/src/prelude_opt.isle
new file mode 100644
index 0000000..b8b9fc4
--- /dev/null
+++ b/hbcb/src/prelude_opt.isle
@@ -0,0 +1,123 @@
+;; Prelude definitions specific to the mid-end.
+
+;; Any `extern` definitions here are generally implemented in `src/opts.rs`.
+
+;;;;; eclass and enode access ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; Extract any node(s) for the given eclass ID.
+(decl multi inst_data (Type InstructionData) Value)
+(extern extractor inst_data inst_data_etor)
+
+;; Identical to `inst_data`, just with a different ISLE type.
+;; This is basically a manual version of `curry`/`uncurry` in Haskell:
+;; to compose extractors the outer one needs to be single-parameter,
+;; so this combines the two parameters of `inst_data` into one.
+(type TypeAndInstructionData (primitive TypeAndInstructionData))
+(decl multi inst_data_tupled (TypeAndInstructionData) Value)
+(extern extractor inst_data_tupled inst_data_tupled_etor)
+
+;; Construct a pure node, returning a new (or deduplicated
+;; already-existing) eclass ID.
+(decl make_inst (Type InstructionData) Value)
+(extern constructor make_inst make_inst_ctor)
+
+;; Constructors for value arrays.
+(decl value_array_2_ctor (Value Value) ValueArray2)
+(extern constructor value_array_2_ctor value_array_2_ctor)
+(decl value_array_3_ctor (Value Value Value) ValueArray3)
+(extern constructor value_array_3_ctor value_array_3_ctor)
+
+(rule (eq ty x y) (icmp ty (IntCC.Equal) x y))
+(rule (ne ty x y) (icmp ty (IntCC.NotEqual) x y))
+(rule (ult ty x y) (icmp ty (IntCC.UnsignedLessThan) x y))
+(rule (ule ty x y) (icmp ty (IntCC.UnsignedLessThanOrEqual) x y))
+(rule (ugt ty x y) (icmp ty (IntCC.UnsignedGreaterThan) x y))
+(rule (uge ty x y) (icmp ty (IntCC.UnsignedGreaterThanOrEqual) x y))
+(rule (slt ty x y) (icmp ty (IntCC.SignedLessThan) x y))
+(rule (sle ty x y) (icmp ty (IntCC.SignedLessThanOrEqual) x y))
+(rule (sgt ty x y) (icmp ty (IntCC.SignedGreaterThan) x y))
+(rule (sge ty x y) (icmp ty (IntCC.SignedGreaterThanOrEqual) x y))
+
+;; 3-way comparison, returning -1/0/+1 in I8
+(decl spaceship_s (Type Value Value) Value)
+(rule (spaceship_s ty x y) (isub $I8 (sgt ty x y) (slt ty x y)))
+(extractor (spaceship_s ty x y) (isub $I8 (sgt ty x y) (slt ty x y)))
+(decl spaceship_u (Type Value Value) Value)
+(rule (spaceship_u ty x y) (isub $I8 (ugt ty x y) (ult ty x y)))
+(extractor (spaceship_u ty x y) (isub $I8 (ugt ty x y) (ult ty x y)))
+
+;;;;; optimization toplevel ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; The main matcher rule invoked by the toplevel driver.
+(decl multi simplify (Value) Value)
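For orientation, a mid-end rewrite registered under `simplify` typically looks like the sketch below (illustrative, not part of this patch); it cancels a double negation and uses the `subsume` constructor introduced just after this point.

    (rule (simplify (ineg ty (ineg ty x)))
          (subsume x))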
+
+;; Mark a node as requiring remat when used in a different block.
+(decl remat (Value) Value)
+(extern constructor remat remat)
+
+;; Mark a node as subsuming whatever else it's rewritten from -- this
+;; is definitely preferable, not just a possible option. Useful for,
+;; e.g., constant propagation where we arrive at a definite "final
+;; answer".
+(decl subsume (Value) Value)
+(extern constructor subsume subsume)
+
+;;;;; constructors ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(decl iconst_sextend_etor (Type i64) TypeAndInstructionData)
+(extern extractor iconst_sextend_etor iconst_sextend_etor)
+
+;; Construct an `iconst` from an `i64`, or extract an `i64` from an `iconst`,
+;; by treating the constant as signed.
+;; When extracting, smaller types get their value sign-extended to 64 bits,
+;; so that `iconst.i8 255` will give you a `-1_i64`.
+;; When constructing, the rule will fail if the value cannot be represented in
+;; the target type. If it fits, it'll be masked accordingly in the constant.
+(decl iconst_s (Type i64) Value)
+(extractor (iconst_s ty c) (inst_data_tupled (iconst_sextend_etor ty c)))
+(rule 0 (iconst_s ty c)
+      (if-let c_masked (u64_and (i64_as_u64 c) (ty_umax ty)))
+      (if-let c_reextended (i64_sextend_u64 ty c_masked))
+      (if-let $true (u64_eq (i64_as_u64 c) (i64_as_u64 c_reextended)))
+      (iconst ty (imm64 c_masked)))
+(rule 1 (iconst_s $I128 c) (sextend $I128 (iconst_s $I64 c)))
+
+;; Construct an `iconst` from a `u64`, or extract a `u64` from an `iconst`,
+;; by treating the constant as unsigned.
+;; When extracting, smaller types get their value zero-extended to 64 bits,
+;; so that `iconst.i8 255` will give you a `255_u64`.
+;; When constructing, the rule will fail if the value cannot be represented in
+;; the target type.
+(decl iconst_u (Type u64) Value)
+(extractor (iconst_u ty c) (iconst ty (u64_from_imm64 c)))
+(rule 0 (iconst_u ty c)
+      (if-let $true (u64_le c (ty_umax ty)))
+      (iconst ty (imm64 c)))
+(rule 1 (iconst_u $I128 c) (uextend $I128 (iconst_u $I64 c)))
+
+;; These take `Value`, rather than going through `inst_data_tupled`, because
+;; most of the time they want to return the original `Value`, and it would be
+;; a waste to need to re-GVN the instruction data in those cases.
+(decl multi sextend_maybe_etor (Type Value) Value)
+(extern extractor infallible sextend_maybe_etor sextend_maybe_etor)
+(decl multi uextend_maybe_etor (Type Value) Value)
+(extern extractor infallible uextend_maybe_etor uextend_maybe_etor)
+
+;; Match or construct a possibly-`uextend`ed value.
+;; Gives the extended-to type and inner value when matching something that was
+;; extended, or the input value and its type when the value isn't an extension.
+;; Useful to write a single pattern that can match things that may or may not
+;; have undergone C's "usual arithmetic conversions".
+;; When generating values, extending to the same type is invalid CLIF,
+;; so this avoids doing that where there's no extension actually needed.
+(decl uextend_maybe (Type Value) Value)
+(extractor (uextend_maybe ty val) (uextend_maybe_etor ty val))
+(rule 0 (uextend_maybe ty val) (uextend ty val))
+(rule 1 (uextend_maybe ty val@(value_type ty)) val)
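A brief sketch of `iconst_u` in pattern position (again illustrative, not part of this patch): the same `ty` binding on the constant ensures the zero has the type of the addition, and `subsume` marks the rewrite as strictly preferable.

    (rule (simplify (iadd ty x (iconst_u ty 0)))
          (subsume x))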
+
+;; Same as `uextend_maybe` above, just for `sextend`.
+(decl sextend_maybe (Type Value) Value)
+(extractor (sextend_maybe ty val) (sextend_maybe_etor ty val))
+(rule 0 (sextend_maybe ty val) (sextend ty val))
+(rule 1 (sextend_maybe ty val@(value_type ty)) val)
+
diff --git a/hbcb/src/settings.rs b/hbcb/src/settings.rs
new file mode 100644
index 0000000..5cd68e3
--- /dev/null
+++ b/hbcb/src/settings.rs
@@ -0,0 +1,10 @@
+//! riscv64 Settings.
+
+use {
+    core::fmt,
+    cranelift_codegen::settings::{self, detail, Builder, Value},
+};
+
+// Include code generated by `cranelift-codegen/meta/src/gen_settings.rs:`. This file contains a
+// public `Flags` struct with an impl for all of the settings defined in
+include!(concat!(env!("OUT_DIR"), "/settings-riscv64.rs"));
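Assuming the generated file exposes the items normally produced by Cranelift's gen_settings.rs (a module-level `builder()` function and a `Flags` struct whose `new` takes the shared flags and the ISA-specific builder), it could be consumed roughly as in this hypothetical sketch, which would sit alongside the `include!` and reuse this module's existing `settings` import.

    // Hypothetical helper, not part of this patch: build default shared flags
    // and layer the riscv64-specific flags from the generated code on top.
    fn example_isa_flags() -> Flags {
        let shared_flags = settings::Flags::new(settings::builder());
        let isa_builder = builder(); // provided by the include! above
        Flags::new(&shared_flags, &isa_builder)
    }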