Skip to content

Commit d0c0611

Browse files
author
Gunter Schmidt
committed
feat: u64 for --bytes and --ignore-initial
fix: bumped up tempfile to "3.26.0". The variables for --bytes, --ignore-initial and the line count were of type 'usize', thus limiting the readable bytes on 32-bit systems. GNU cmp is compiled with LFS (Large File Support) and allows i64 values. These are now all u64, which also works on 32-bit systems with Rust. There is no reason to implement a 32-bit barrier for 32-bit machines. Additionally, the --bytes limit can be set to 'u128' using the feature "cmp_bytes_limit_128_bit". The performance impact would be negligible, as there are only a few calculations each time a full block is read from the file.
1 parent 6f082c6 commit d0c0611

2 files changed

Lines changed: 53 additions & 32 deletions

File tree

Cargo.toml

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ unicode-width = "0.2.0"
2626
pretty_assertions = "1.4.0"
2727
assert_cmd = "2.0.14"
2828
predicates = "3.1.0"
29-
tempfile = "3.10.1"
29+
tempfile = "3.26.0"
3030

3131
[profile.release]
3232
lto = "thin"
@@ -40,3 +40,9 @@ panic = "abort"
4040
[profile.dist]
4141
inherits = "release"
4242
lto = "thin"
43+
44+
[features]
45+
# default = ["cmp_bytes_limit_128_bit"]
46+
# Widens the --bytes limit from u64 to u128, in case limits beyond the exabyte range are required.
47+
cmp_bytes_limit_128_bit = []
48+

src/cmp.rs

Lines changed: 46 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -20,15 +20,23 @@ use std::os::unix::fs::MetadataExt;
2020
#[cfg(target_os = "windows")]
2121
use std::os::windows::fs::MetadataExt;
2222

23+
/// for --bytes, so really large number limits can be expressed, like 1Y.
24+
#[cfg(not(feature = "cmp_bytes_limit_128_bit"))]
25+
pub type Bytes = u64;
26+
#[cfg(feature = "cmp_bytes_limit_128_bit")]
27+
pub type Bytes = u128;
28+
// ignore initial is currently limited to u64, as take(skip) is used.
29+
pub type IgnInit = u64;
30+
2331
#[derive(Clone, Debug, Default, Eq, PartialEq)]
2432
pub struct Params {
2533
executable: OsString,
2634
from: OsString,
2735
to: OsString,
2836
print_bytes: bool,
29-
skip_a: Option<usize>,
30-
skip_b: Option<usize>,
31-
max_bytes: Option<usize>,
37+
skip_a: Option<IgnInit>,
38+
skip_b: Option<IgnInit>,
39+
max_bytes: Option<Bytes>,
3240
verbose: bool,
3341
quiet: bool,
3442
}
@@ -66,13 +74,13 @@ pub fn parse_params<I: Iterator<Item = OsString>>(mut opts: Peekable<I>) -> Resu
6674
};
6775
let executable_str = executable.to_string_lossy().to_string();
6876

69-
let parse_skip = |param: &str, skip_desc: &str| -> Result<usize, String> {
77+
let parse_skip = |param: &str, skip_desc: &str| -> Result<IgnInit, String> {
7078
let suffix_start = param
7179
.find(|b: char| !b.is_ascii_digit())
7280
.unwrap_or(param.len());
73-
let mut num = match param[..suffix_start].parse::<usize>() {
81+
let mut num = match param[..suffix_start].parse::<IgnInit>() {
7482
Ok(num) => num,
75-
Err(e) if *e.kind() == std::num::IntErrorKind::PosOverflow => usize::MAX,
83+
Err(e) if *e.kind() == std::num::IntErrorKind::PosOverflow => IgnInit::MAX,
7684
Err(_) => {
7785
return Err(format!(
7886
"{executable_str}: invalid --ignore-initial value '{skip_desc}'"
@@ -83,7 +91,7 @@ pub fn parse_params<I: Iterator<Item = OsString>>(mut opts: Peekable<I>) -> Resu
8391
if suffix_start != param.len() {
8492
// Note that GNU cmp advertises supporting up to Y, but fails if you try
8593
// to actually use anything beyond E.
86-
let multiplier: usize = match &param[suffix_start..] {
94+
let multiplier: IgnInit = match &param[suffix_start..] {
8795
"kB" => 1_000,
8896
"K" => 1_024,
8997
"MB" => 1_000_000,
@@ -106,10 +114,11 @@ pub fn parse_params<I: Iterator<Item = OsString>>(mut opts: Peekable<I>) -> Resu
106114
#[cfg(not(target_pointer_width = "64"))]
107115
usize::MAX
108116
}
109-
"ZB" => usize::MAX, // 1_000_000_000_000_000_000_000,
110-
"Z" => usize::MAX, // 1_180_591_620_717_411_303_424,
111-
"YB" => usize::MAX, // 1_000_000_000_000_000_000_000_000,
112-
"Y" => usize::MAX, // 1_208_925_819_614_629_174_706_176,
117+
// TODO: saturating to IgnInit::MAX does not mimic GNU cmp behavior; it should be an error.
118+
"ZB" => IgnInit::MAX, // 1_000_000_000_000_000_000_000,
119+
"Z" => IgnInit::MAX, // 1_180_591_620_717_411_303_424,
120+
"YB" => IgnInit::MAX, // 1_000_000_000_000_000_000_000_000,
121+
"Y" => IgnInit::MAX, // 1_208_925_819_614_629_174_706_176,
113122
_ => {
114123
return Err(format!(
115124
"{executable_str}: invalid --ignore-initial value '{skip_desc}'"
@@ -119,7 +128,7 @@ pub fn parse_params<I: Iterator<Item = OsString>>(mut opts: Peekable<I>) -> Resu
119128

120129
num = match num.overflowing_mul(multiplier) {
121130
(n, false) => n,
122-
_ => usize::MAX,
131+
_ => IgnInit::MAX,
123132
}
124133
}
125134

@@ -173,9 +182,10 @@ pub fn parse_params<I: Iterator<Item = OsString>>(mut opts: Peekable<I>) -> Resu
173182
let (_, arg) = param_str.split_once('=').unwrap();
174183
arg.to_string()
175184
};
176-
let max_bytes = match max_bytes.parse::<usize>() {
185+
let max_bytes = match max_bytes.parse::<Bytes>() {
177186
Ok(num) => num,
178-
Err(e) if *e.kind() == std::num::IntErrorKind::PosOverflow => usize::MAX,
187+
// TODO limit to MAX is dangerous, this should become an error like in GNU cmp.
188+
Err(e) if *e.kind() == std::num::IntErrorKind::PosOverflow => Bytes::MAX,
179189
Err(_) => {
180190
return Err(format!(
181191
"{executable_str}: invalid --bytes value '{max_bytes}'"
@@ -233,7 +243,7 @@ pub fn parse_params<I: Iterator<Item = OsString>>(mut opts: Peekable<I>) -> Resu
233243
}
234244

235245
// Do as GNU cmp, and completely disable printing if we are
236-
// outputing to /dev/null.
246+
// outputting to /dev/null.
237247
#[cfg(not(target_os = "windows"))]
238248
if is_stdout_dev_null() {
239249
params.quiet = true;
@@ -285,7 +295,7 @@ pub fn parse_params<I: Iterator<Item = OsString>>(mut opts: Peekable<I>) -> Resu
285295

286296
fn prepare_reader(
287297
path: &OsString,
288-
skip: &Option<usize>,
298+
skip: &Option<u64>,
289299
params: &Params,
290300
) -> Result<Box<dyn BufRead>, String> {
291301
let mut reader: Box<dyn BufRead> = if path == "-" {
@@ -326,7 +336,7 @@ pub fn cmp(params: &Params) -> Result<Cmp, String> {
326336
let mut from = prepare_reader(&params.from, &params.skip_a, params)?;
327337
let mut to = prepare_reader(&params.to, &params.skip_b, params)?;
328338

329-
let mut offset_width = params.max_bytes.unwrap_or(usize::MAX);
339+
let mut offset_width = params.max_bytes.unwrap_or(Bytes::MAX);
330340

331341
if let (Ok(a_meta), Ok(b_meta)) = (fs::metadata(&params.from), fs::metadata(&params.to)) {
332342
#[cfg(not(target_os = "windows"))]
@@ -341,7 +351,7 @@ pub fn cmp(params: &Params) -> Result<Cmp, String> {
341351
return Ok(Cmp::Different);
342352
}
343353

344-
let smaller = cmp::min(a_size, b_size) as usize;
354+
let smaller = cmp::min(a_size, b_size) as Bytes;
345355
offset_width = cmp::min(smaller, offset_width);
346356
}
347357

@@ -350,8 +360,8 @@ pub fn cmp(params: &Params) -> Result<Cmp, String> {
350360
// Capacity calc: at_byte width + 2 x 3-byte octal numbers + 2 x 4-byte value + 4 spaces
351361
let mut output = Vec::<u8>::with_capacity(offset_width + 3 * 2 + 4 * 2 + 4);
352362

353-
let mut at_byte = 1;
354-
let mut at_line = 1;
363+
let mut at_byte: Bytes = 1;
364+
let mut at_line: u64 = 1;
355365
let mut start_of_line = true;
356366
let mut stdout = BufWriter::new(io::stdout().lock());
357367
let mut compare = Cmp::Equal;
@@ -401,8 +411,8 @@ pub fn cmp(params: &Params) -> Result<Cmp, String> {
401411
if from_buf[..consumed] == to_buf[..consumed] {
402412
let last = from_buf[..consumed].last().unwrap();
403413

404-
at_byte += consumed;
405-
at_line += from_buf[..consumed].iter().filter(|&c| *c == b'\n').count();
414+
at_byte += consumed as Bytes;
415+
at_line += (from_buf[..consumed].iter().filter(|&c| *c == b'\n').count()) as u64;
406416

407417
start_of_line = *last == b'\n';
408418

@@ -590,7 +600,7 @@ fn format_visible_byte(byte: u8) -> String {
590600
fn format_verbose_difference(
591601
from_byte: u8,
592602
to_byte: u8,
593-
at_byte: usize,
603+
at_byte: Bytes,
594604
offset_width: usize,
595605
output: &mut Vec<u8>,
596606
params: &Params,
@@ -655,7 +665,7 @@ fn format_verbose_difference(
655665
}
656666

657667
#[inline]
658-
fn report_eof(at_byte: usize, at_line: usize, start_of_line: bool, eof_on: &str, params: &Params) {
668+
fn report_eof(at_byte: Bytes, at_line: u64, start_of_line: bool, eof_on: &str, params: &Params) {
659669
if params.quiet {
660670
return;
661671
}
@@ -707,7 +717,7 @@ fn is_posix_locale() -> bool {
707717
}
708718

709719
#[inline]
710-
fn report_difference(from_byte: u8, to_byte: u8, at_byte: usize, at_line: usize, params: &Params) {
720+
fn report_difference(from_byte: u8, to_byte: u8, at_byte: Bytes, at_line: u64, params: &Params) {
711721
if params.quiet {
712722
return;
713723
}
@@ -804,7 +814,7 @@ mod tests {
804814
from: os("foo"),
805815
to: os("bar"),
806816
skip_a: Some(1),
807-
skip_b: Some(usize::MAX),
817+
skip_b: Some(IgnInit::MAX),
808818
..Default::default()
809819
}),
810820
parse_params(
@@ -982,7 +992,7 @@ mod tests {
982992
executable: os("cmp"),
983993
from: os("foo"),
984994
to: os("bar"),
985-
max_bytes: Some(usize::MAX),
995+
max_bytes: Some(Bytes::MAX),
986996
..Default::default()
987997
}),
988998
parse_params(
@@ -999,6 +1009,7 @@ mod tests {
9991009
);
10001010

10011011
// Failure case
1012+
// TODO This is actually fine in GNU cmp. --bytes does not have a unit parser yet.
10021013
assert_eq!(
10031014
Err("cmp: invalid --bytes value '1K'".to_string()),
10041015
parse_params(
@@ -1044,8 +1055,8 @@ mod tests {
10441055
executable: os("cmp"),
10451056
from: os("foo"),
10461057
to: os("bar"),
1047-
skip_a: Some(usize::MAX),
1048-
skip_b: Some(usize::MAX),
1058+
skip_a: Some(IgnInit::MAX),
1059+
skip_b: Some(IgnInit::MAX),
10491060
..Default::default()
10501061
}),
10511062
parse_params(
@@ -1116,8 +1127,12 @@ mod tests {
11161127
.enumerate()
11171128
{
11181129
let values = [
1119-
1_000usize.checked_pow((i + 1) as u32).unwrap_or(usize::MAX),
1120-
1024usize.checked_pow((i + 1) as u32).unwrap_or(usize::MAX),
1130+
(1_000 as IgnInit)
1131+
.checked_pow((i + 1) as u32)
1132+
.unwrap_or(IgnInit::MAX),
1133+
(1024 as IgnInit)
1134+
.checked_pow((i + 1) as u32)
1135+
.unwrap_or(IgnInit::MAX),
11211136
];
11221137
for (j, v) in values.iter().enumerate() {
11231138
assert_eq!(

0 commit comments

Comments
 (0)