weezl/
encode.rs

Help
1//! A module for all encoding needs.
2use crate::error::{BufferResult, LzwError, LzwStatus, VectorResult};
3use crate::{BitOrder, Code, StreamBuf, MAX_CODESIZE, MAX_ENTRIES, STREAM_BUF_SIZE};
4
5use crate::alloc::{boxed::Box, vec::Vec};
6#[cfg(feature = "std")]
7use crate::error::StreamResult;
8#[cfg(feature = "std")]
9use std::io::{self, BufRead, Write};
10
11/// The state for encoding data with an LZW algorithm.
12///
13/// The same structure can be utilized with streams as well as your own buffers and driver logic.
14/// It may even be possible to mix them if you are sufficiently careful not to lose any written
15/// data in the process.
16///
17/// This is a sans-IO implementation, meaning that it only contains the state of the encoder and
18/// the caller will provide buffers for input and output data when calling the basic
19/// [`encode_bytes`] method. Nevertheless, a number of _adapters_ are provided in the `into_*`
20/// methods for enoding with a particular style of common IO.
21///
22/// * [`encode`] for encoding once without any IO-loop.
23/// * [`into_async`] for encoding with the `futures` traits for asynchronous IO.
24/// * [`into_stream`] for encoding with the standard `io` traits.
25/// * [`into_vec`] for in-memory encoding.
26///
27/// [`encode_bytes`]: #method.encode_bytes
28/// [`encode`]: #method.encode
29/// [`into_async`]: #method.into_async
30/// [`into_stream`]: #method.into_stream
31/// [`into_vec`]: #method.into_vec
32pub struct Encoder {
33    /// Internally dispatch via a dynamic trait object. This did not have any significant
34    /// performance impact as we batch data internally and this pointer does not change after
35    /// creation!
36    state: Box<dyn Stateful + Send + 'static>,
37}
38
39/// A encoding stream sink.
40///
41/// See [`Encoder::into_stream`] on how to create this type.
42///
43/// [`Encoder::into_stream`]: struct.Encoder.html#method.into_stream
44#[cfg_attr(
45    not(feature = "std"),
46    deprecated = "This type is only useful with the `std` feature."
47)]
48#[cfg_attr(not(feature = "std"), allow(dead_code))]
49pub struct IntoStream<'d, W> {
50    encoder: &'d mut Encoder,
51    writer: W,
52    buffer: Option<StreamBuf<'d>>,
53    default_size: usize,
54}
55
56/// An async decoding sink.
57///
58/// See [`Encoder::into_async`] on how to create this type.
59///
60/// [`Encoder::into_async`]: struct.Encoder.html#method.into_async
61#[cfg(feature = "async")]
62pub struct IntoAsync<'d, W> {
63    encoder: &'d mut Encoder,
64    writer: W,
65    buffer: Option<StreamBuf<'d>>,
66    default_size: usize,
67}
68
69/// A encoding sink into a vector.
70///
71/// See [`Encoder::into_vec`] on how to create this type.
72///
73/// [`Encoder::into_vec`]: struct.Encoder.html#method.into_vec
74pub struct IntoVec<'d> {
75    encoder: &'d mut Encoder,
76    vector: &'d mut Vec<u8>,
77}
78
79trait Stateful {
80    fn advance(&mut self, inp: &[u8], out: &mut [u8]) -> BufferResult;
81    fn mark_ended(&mut self) -> bool;
82    /// Reset the state tracking if end code has been written.
83    fn restart(&mut self);
84    /// Reset the encoder to the beginning, dropping all buffers etc.
85    fn reset(&mut self);
86}
87
88struct EncodeState<B: Buffer> {
89    /// The configured minimal code size.
90    min_size: u8,
91    /// The current encoding symbol tree.
92    tree: Tree,
93    /// If we have pushed the end code.
94    has_ended: bool,
95    /// If tiff then bumps are a single code sooner.
96    is_tiff: bool,
97    /// The code corresponding to the currently read characters.
98    current_code: Code,
99    /// The clear code for resetting the dictionary.
100    clear_code: Code,
101    /// The bit buffer for encoding.
102    buffer: B,
103}
104
105struct MsbBuffer {
106    /// The current code length.
107    code_size: u8,
108    /// The buffer bits.
109    buffer: u64,
110    /// The number of valid buffer bits.
111    bits_in_buffer: u8,
112}
113
114struct LsbBuffer {
115    /// The current code length.
116    code_size: u8,
117    /// The buffer bits.
118    buffer: u64,
119    /// The number of valid buffer bits.
120    bits_in_buffer: u8,
121}
122
123trait Buffer {
124    fn new(size: u8) -> Self;
125    /// Reset the code size in the buffer.
126    fn reset(&mut self, min_size: u8);
127    /// Apply effects of a Clear Code.
128    fn clear(&mut self, min_size: u8);
129    /// Insert a code into the buffer.
130    fn buffer_code(&mut self, code: Code);
131    /// Push bytes if the buffer space is getting small.
132    fn push_out(&mut self, out: &mut &mut [u8]) -> bool;
133    /// Flush all full bytes, returning if at least one more byte remains.
134    fn flush_out(&mut self, out: &mut &mut [u8]) -> bool;
135    /// Pad the buffer to a full byte.
136    fn buffer_pad(&mut self);
137    /// Increase the maximum code size.
138    fn bump_code_size(&mut self);
139    /// Return the maximum code with the current code size.
140    fn max_code(&self) -> Code;
141    /// Return the current code size in bits.
142    fn code_size(&self) -> u8;
143}
144
145/// One tree node for at most each code.
146/// To avoid using too much memory we keep nodes with few successors in optimized form. This form
147/// doesn't offer lookup by indexing but instead does a linear search.
148#[derive(Default)]
149struct Tree {
150    simples: Vec<Simple>,
151    complex: Vec<Full>,
152    keys: Vec<CompressedKey>,
153}
154
155#[derive(Clone, Copy)]
156enum FullKey {
157    NoSuccessor,
158    Simple(u16),
159    Full(u16),
160}
161
162#[derive(Clone, Copy)]
163struct CompressedKey(u16);
164
165const SHORT: usize = 16;
166
167#[derive(Clone, Copy)]
168struct Simple {
169    codes: [Code; SHORT],
170    chars: [u8; SHORT],
171    count: u8,
172}
173
174#[derive(Clone, Copy)]
175struct Full {
176    char_continuation: [Code; 256],
177}
178
179/// Describes the static parameters for creating a decoder.
180#[derive(Clone, Debug)]
181pub struct Configuration {
182    order: BitOrder,
183    size: u8,
184    tiff: bool,
185}
186
187impl Configuration {
188    /// Create a configuration to decode with the specified bit order and symbol size.
189    ///
190    /// # Panics
191    ///
192    /// The `size` needs to be in the interval `2..=12`.
193    pub fn new(order: BitOrder, size: u8) -> Self {
194        super::assert_encode_size(size);
195        Configuration {
196            order,
197            size,
198            tiff: false,
199        }
200    }
201
202    /// Create a configuration for a TIFF compatible decoder.
203    ///
204    /// # Panics
205    ///
206    /// The `size` needs to be in the interval `2..=12`.
207    pub fn with_tiff_size_switch(order: BitOrder, size: u8) -> Self {
208        super::assert_encode_size(size);
209        Configuration {
210            order,
211            size,
212            tiff: true,
213        }
214    }
215
216    /// Create a new decoder with the define configuration.
217    pub fn build(self) -> Encoder {
218        Encoder {
219            state: Encoder::from_configuration(&self),
220        }
221    }
222}
223
224impl Encoder {
225    /// Create a new encoder with the specified bit order and symbol size.
226    ///
227    /// The algorithm for dynamically increasing the code symbol bit width is compatible with the
228    /// original specification. In particular you will need to specify an `Lsb` bit oder to encode
229    /// the data portion of a compressed `gif` image.
230    ///
231    /// # Panics
232    ///
233    /// The `size` needs to be in the interval `2..=12`.
234    pub fn new(order: BitOrder, size: u8) -> Self {
235        Configuration::new(order, size).build()
236    }
237
238    /// Create a TIFF compatible encoder with the specified bit order and symbol size.
239    ///
240    /// The algorithm for dynamically increasing the code symbol bit width is compatible with the
241    /// TIFF specification, which is a misinterpretation of the original algorithm for increasing
242    /// the code size. It switches one symbol sooner.
243    ///
244    /// # Panics
245    ///
246    /// The `size` needs to be in the interval `2..=12`.
247    pub fn with_tiff_size_switch(order: BitOrder, size: u8) -> Self {
248        Configuration::with_tiff_size_switch(order, size).build()
249    }
250
251    fn from_configuration(cfg: &Configuration) -> Box<dyn Stateful + Send + 'static> {
252        match cfg.order {
253            BitOrder::Lsb => {
254                let mut state = EncodeState::<LsbBuffer>::new(cfg.size);
255                state.is_tiff = cfg.tiff;
256                Box::new(state)
257            }
258            BitOrder::Msb => {
259                let mut state = EncodeState::<MsbBuffer>::new(cfg.size);
260                state.is_tiff = cfg.tiff;
261                Box::new(state)
262            }
263        }
264    }
265
266    /// Encode some bytes from `inp` into `out`.
267    ///
268    /// See [`into_stream`] for high-level functions (this interface is only available with the
269    /// `std` feature) and [`finish`] for marking the input data as complete.
270    ///
271    /// When some input byte is invalid, i.e. is not smaller than `1 << size`, then that byte and
272    /// all following ones will _not_ be consumed and the `status` of the result will signal an
273    /// error. The result will also indicate that all bytes up to but not including the offending
274    /// byte have been consumed. You may try again with a fixed byte.
275    ///
276    /// [`into_stream`]: #method.into_stream
277    /// [`finish`]: #method.finish
278    pub fn encode_bytes(&mut self, inp: &[u8], out: &mut [u8]) -> BufferResult {
279        self.state.advance(inp, out)
280    }
281
282    /// Encode a single chunk of data.
283    ///
284    /// This method will add an end marker to the encoded chunk.
285    ///
286    /// This is a convenience wrapper around [`into_vec`]. Use the `into_vec` adapter to customize
287    /// buffer size, to supply an existing vector, to control whether an end marker is required, or
288    /// to preserve partial data in the case of a decoding error.
289    ///
290    /// [`into_vec`]: #into_vec
291    ///
292    /// # Example
293    ///
294    /// ```
295    /// use weezl::{BitOrder, encode::Encoder};
296    ///
297    /// let data = b"Hello, world";
298    /// let encoded = Encoder::new(BitOrder::Msb, 9)
299    ///     .encode(data)
300    ///     .expect("All bytes valid for code size");
301    /// ```
302    pub fn encode(&mut self, data: &[u8]) -> Result<Vec<u8>, LzwError> {
303        let mut output = Vec::new();
304        self.into_vec(&mut output).encode_all(data).status?;
305        Ok(output)
306    }
307
308    /// Construct a encoder into a writer.
309    #[cfg(feature = "std")]
310    pub fn into_stream<W: Write>(&mut self, writer: W) -> IntoStream<'_, W> {
311        IntoStream {
312            encoder: self,
313            writer,
314            buffer: None,
315            default_size: STREAM_BUF_SIZE,
316        }
317    }
318
319    /// Construct a encoder into an async writer.
320    #[cfg(feature = "async")]
321    pub fn into_async<W: futures::io::AsyncWrite>(&mut self, writer: W) -> IntoAsync<'_, W> {
322        IntoAsync {
323            encoder: self,
324            writer,
325            buffer: None,
326            default_size: STREAM_BUF_SIZE,
327        }
328    }
329
330    /// Construct an encoder into a vector.
331    ///
332    /// All encoded data is appended and the vector is __not__ cleared.
333    ///
334    /// Compared to `into_stream` this interface allows a high-level access to encoding without
335    /// requires the `std`-feature. Also, it can make full use of the extra buffer control that the
336    /// special target exposes.
337    pub fn into_vec<'lt>(&'lt mut self, vec: &'lt mut Vec<u8>) -> IntoVec<'lt> {
338        IntoVec {
339            encoder: self,
340            vector: vec,
341        }
342    }
343
344    /// Mark the encoding as in the process of finishing.
345    ///
346    /// The next following call to `encode_bytes` which is able to consume the complete input will
347    /// also try to emit an end code. It's not recommended, but also not unsound, to use different
348    /// byte slices in different calls from this point forward and thus to 'delay' the actual end
349    /// of the data stream. The behaviour after the end marker has been written is unspecified but
350    /// sound.
351    pub fn finish(&mut self) {
352        self.state.mark_ended();
353    }
354
355    /// Undo marking this data stream as ending.
356    /// FIXME: clarify how this interacts with padding introduced after end code.
357    #[allow(dead_code)]
358    pub(crate) fn restart(&mut self) {
359        self.state.restart()
360    }
361
362    /// Reset all internal state.
363    ///
364    /// This produce an encoder as if just constructed with `new` but taking slightly less work. In
365    /// particular it will not deallocate any internal allocations. It will also avoid some
366    /// duplicate setup work.
367    pub fn reset(&mut self) {
368        self.state.reset()
369    }
370}
371
372#[cfg(feature = "std")]
373impl<'d, W: Write> IntoStream<'d, W> {
374    /// Encode data from a reader.
375    ///
376    /// This will drain the supplied reader. It will not encode an end marker after all data has
377    /// been processed.
378    pub fn encode(&mut self, read: impl BufRead) -> StreamResult {
379        self.encode_part(read, false)
380    }
381
382    /// Encode data from a reader and an end marker.
383    pub fn encode_all(mut self, read: impl BufRead) -> StreamResult {
384        self.encode_part(read, true)
385    }
386
387    /// Set the size of the intermediate encode buffer.
388    ///
389    /// A buffer of this size is allocated to hold one part of the encoded stream when no buffer is
390    /// available and any encoding method is called. No buffer is allocated if `set_buffer` has
391    /// been called. The buffer is reused.
392    ///
393    /// # Panics
394    /// This method panics if `size` is `0`.
395    pub fn set_buffer_size(&mut self, size: usize) {
396        assert_ne!(size, 0, "Attempted to set empty buffer");
397        self.default_size = size;
398    }
399
400    /// Use a particular buffer as an intermediate encode buffer.
401    ///
402    /// Calling this sets or replaces the buffer. When a buffer has been set then it is used
403    /// instead of a dynamically allocating a buffer. Note that the size of the buffer is relevant
404    /// for efficient encoding as there is additional overhead from `write` calls each time the
405    /// buffer has been filled.
406    ///
407    /// # Panics
408    /// This method panics if the `buffer` is empty.
409    pub fn set_buffer(&mut self, buffer: &'d mut [u8]) {
410        assert_ne!(buffer.len(), 0, "Attempted to set empty buffer");
411        self.buffer = Some(StreamBuf::Borrowed(buffer));
412    }
413
414    fn encode_part(&mut self, mut read: impl BufRead, finish: bool) -> StreamResult {
415        let IntoStream {
416            encoder,
417            writer,
418            buffer,
419            default_size,
420        } = self;
421        enum Progress {
422            Ok,
423            Done,
424        }
425
426        let mut bytes_read = 0;
427        let mut bytes_written = 0;
428
429        let read_bytes = &mut bytes_read;
430        let write_bytes = &mut bytes_written;
431
432        let outbuf: &mut [u8] =
433            match { buffer.get_or_insert_with(|| StreamBuf::Owned(vec![0u8; *default_size])) } {
434                StreamBuf::Borrowed(slice) => &mut *slice,
435                StreamBuf::Owned(vec) => &mut *vec,
436            };
437        assert!(!outbuf.is_empty());
438
439        let once = move || {
440            let data = read.fill_buf()?;
441
442            if data.is_empty() {
443                if finish {
444                    encoder.finish();
445                } else {
446                    return Ok(Progress::Done);
447                }
448            }
449
450            let result = encoder.encode_bytes(data, &mut outbuf[..]);
451            *read_bytes += result.consumed_in;
452            *write_bytes += result.consumed_out;
453            read.consume(result.consumed_in);
454
455            let done = result.status.map_err(|err| {
456                io::Error::new(io::ErrorKind::InvalidData, &*format!("{:?}", err))
457            })?;
458
459            if let LzwStatus::Done = done {
460                writer.write_all(&outbuf[..result.consumed_out])?;
461                return Ok(Progress::Done);
462            }
463
464            if let LzwStatus::NoProgress = done {
465                return Err(io::Error::new(
466                    io::ErrorKind::UnexpectedEof,
467                    "No more data but no end marker detected",
468                ));
469            }
470
471            writer.write_all(&outbuf[..result.consumed_out])?;
472            Ok(Progress::Ok)
473        };
474
475        let status = core::iter::repeat_with(once)
476            // scan+fuse can be replaced with map_while
477            .scan((), |(), result| match result {
478                Ok(Progress::Ok) => Some(Ok(())),
479                Err(err) => Some(Err(err)),
480                Ok(Progress::Done) => None,
481            })
482            .fuse()
483            .collect();
484
485        StreamResult {
486            bytes_read,
487            bytes_written,
488            status,
489        }
490    }
491}
492
493impl IntoVec<'_> {
494    /// Encode data from a slice.
495    pub fn encode(&mut self, read: &[u8]) -> VectorResult {
496        self.encode_part(read, false)
497    }
498
499    /// Decode data from a reader, adding an end marker.
500    pub fn encode_all(mut self, read: &[u8]) -> VectorResult {
501        self.encode_part(read, true)
502    }
503
504    fn grab_buffer(&mut self) -> (&mut [u8], &mut Encoder) {
505        const CHUNK_SIZE: usize = 1 << 12;
506        let decoder = &mut self.encoder;
507        let length = self.vector.len();
508
509        // Use the vector to do overflow checks and w/e.
510        self.vector.reserve(CHUNK_SIZE);
511        // FIXME: encoding into uninit buffer?
512        self.vector.resize(length + CHUNK_SIZE, 0u8);
513
514        (&mut self.vector[length..], decoder)
515    }
516
517    fn encode_part(&mut self, part: &[u8], finish: bool) -> VectorResult {
518        let mut result = VectorResult {
519            consumed_in: 0,
520            consumed_out: 0,
521            status: Ok(LzwStatus::Ok),
522        };
523
524        enum Progress {
525            Ok,
526            Done,
527        }
528
529        // Converting to mutable refs to move into the `once` closure.
530        let read_bytes = &mut result.consumed_in;
531        let write_bytes = &mut result.consumed_out;
532        let mut data = part;
533
534        // A 64 MB buffer is quite large but should get alloc_zeroed.
535        // Note that the decoded size can be up to quadratic in code block.
536        let once = move || {
537            // Grab a new output buffer.
538            let (outbuf, encoder) = self.grab_buffer();
539
540            if finish {
541                encoder.finish();
542            }
543
544            // Decode as much of the buffer as fits.
545            let result = encoder.encode_bytes(data, &mut outbuf[..]);
546            // Do the bookkeeping and consume the buffer.
547            *read_bytes += result.consumed_in;
548            *write_bytes += result.consumed_out;
549            data = &data[result.consumed_in..];
550
551            let unfilled = outbuf.len() - result.consumed_out;
552            let filled = self.vector.len() - unfilled;
553            self.vector.truncate(filled);
554
555            // Handle the status in the result.
556            let done = result.status?;
557            if let LzwStatus::Done = done {
558                Ok(Progress::Done)
559            } else {
560                Ok(Progress::Ok)
561            }
562        };
563
564        // Decode chunks of input data until we're done.
565        let status: Result<(), _> = core::iter::repeat_with(once)
566            // scan+fuse can be replaced with map_while
567            .scan((), |(), result| match result {
568                Ok(Progress::Ok) => Some(Ok(())),
569                Err(err) => Some(Err(err)),
570                Ok(Progress::Done) => None,
571            })
572            .fuse()
573            .collect();
574
575        if let Err(err) = status {
576            result.status = Err(err);
577        }
578
579        result
580    }
581}
582
583// This is implemented in a separate file, so that 1.34.2 does not parse it. Otherwise, it would
584// trip over the usage of await, which is a reserved keyword in that edition/version. It only
585// contains an impl block.
586#[cfg(feature = "async")]
587#[path = "encode_into_async.rs"]
588mod impl_encode_into_async;
589
590impl<B: Buffer> EncodeState<B> {
591    fn new(min_size: u8) -> Self {
592        let clear_code = 1 << min_size;
593        let mut tree = Tree::default();
594        tree.init(min_size);
595        let mut state = EncodeState {
596            min_size,
597            tree,
598            has_ended: false,
599            is_tiff: false,
600            current_code: clear_code,
601            clear_code,
602            buffer: B::new(min_size),
603        };
604        state.buffer_code(clear_code);
605        state
606    }
607}
608
609impl<B: Buffer> Stateful for EncodeState<B> {
610    fn advance(&mut self, mut inp: &[u8], mut out: &mut [u8]) -> BufferResult {
611        let c_in = inp.len();
612        let c_out = out.len();
613        let mut status = Ok(LzwStatus::Ok);
614
615        'encoding: loop {
616            if self.push_out(&mut out) {
617                break;
618            }
619
620            if inp.is_empty() && self.has_ended {
621                let end = self.end_code();
622                if self.current_code != end {
623                    if self.current_code != self.clear_code {
624                        self.buffer_code(self.current_code);
625
626                        // When reading this code, the decoder will add an extra entry to its table
627                        // before reading th end code. Thusly, it may increase its code size based
628                        // on this additional entry.
629                        if self.tree.keys.len() + usize::from(self.is_tiff)
630                            > usize::from(self.buffer.max_code())
631                            && self.buffer.code_size() < MAX_CODESIZE
632                        {
633                            self.buffer.bump_code_size();
634                        }
635                    }
636                    self.buffer_code(end);
637                    self.current_code = end;
638                    self.buffer_pad();
639                }
640
641                break;
642            }
643
644            let mut next_code = None;
645            let mut bytes = inp.iter();
646            while let Some(&byte) = bytes.next() {
647                if self.min_size < 8 && byte >= 1 << self.min_size {
648                    status = Err(LzwError::InvalidCode);
649                    break 'encoding;
650                }
651
652                inp = bytes.as_slice();
653                match self.tree.iterate(self.current_code, byte) {
654                    Ok(code) => self.current_code = code,
655                    Err(_) => {
656                        next_code = Some(self.current_code);
657
658                        self.current_code = u16::from(byte);
659                        break;
660                    }
661                }
662            }
663
664            match next_code {
665                // No more bytes, no code produced.
666                None => break,
667                Some(code) => {
668                    self.buffer_code(code);
669
670                    if self.tree.keys.len() + usize::from(self.is_tiff)
671                        > usize::from(self.buffer.max_code()) + 1
672                        && self.buffer.code_size() < MAX_CODESIZE
673                    {
674                        self.buffer.bump_code_size();
675                    }
676
677                    if self.tree.keys.len() > MAX_ENTRIES {
678                        self.buffer_code(self.clear_code);
679                        self.tree.reset(self.min_size);
680                        self.buffer.clear(self.min_size);
681                    }
682                }
683            }
684        }
685
686        if inp.is_empty() && self.current_code == self.end_code() {
687            if !self.flush_out(&mut out) {
688                status = Ok(LzwStatus::Done);
689            }
690        }
691
692        BufferResult {
693            consumed_in: c_in - inp.len(),
694            consumed_out: c_out - out.len(),
695            status,
696        }
697    }
698
699    fn mark_ended(&mut self) -> bool {
700        core::mem::replace(&mut self.has_ended, true)
701    }
702
703    fn restart(&mut self) {
704        self.has_ended = false;
705    }
706
707    fn reset(&mut self) {
708        self.restart();
709        self.current_code = self.clear_code;
710        self.tree.reset(self.min_size);
711        self.buffer.reset(self.min_size);
712        self.buffer_code(self.clear_code);
713    }
714}
715
716impl<B: Buffer> EncodeState<B> {
717    fn push_out(&mut self, out: &mut &mut [u8]) -> bool {
718        self.buffer.push_out(out)
719    }
720
721    fn flush_out(&mut self, out: &mut &mut [u8]) -> bool {
722        self.buffer.flush_out(out)
723    }
724
725    fn end_code(&self) -> Code {
726        self.clear_code + 1
727    }
728
729    fn buffer_pad(&mut self) {
730        self.buffer.buffer_pad();
731    }
732
733    fn buffer_code(&mut self, code: Code) {
734        self.buffer.buffer_code(code);
735    }
736}
737
738impl Buffer for MsbBuffer {
739    fn new(min_size: u8) -> Self {
740        MsbBuffer {
741            code_size: min_size + 1,
742            buffer: 0,
743            bits_in_buffer: 0,
744        }
745    }
746
747    fn reset(&mut self, min_size: u8) {
748        self.code_size = min_size + 1;
749        self.buffer = 0;
750        self.bits_in_buffer = 0;
751    }
752
753    fn clear(&mut self, min_size: u8) {
754        self.code_size = min_size + 1;
755    }
756
757    fn buffer_code(&mut self, code: Code) {
758        let shift = 64 - self.bits_in_buffer - self.code_size;
759        self.buffer |= u64::from(code) << shift;
760        self.bits_in_buffer += self.code_size;
761    }
762
763    fn push_out(&mut self, out: &mut &mut [u8]) -> bool {
764        if self.bits_in_buffer + 2 * self.code_size < 64 {
765            return false;
766        }
767
768        self.flush_out(out)
769    }
770
771    fn flush_out(&mut self, out: &mut &mut [u8]) -> bool {
772        let want = usize::from(self.bits_in_buffer / 8);
773        let count = want.min((*out).len());
774        let (bytes, tail) = core::mem::replace(out, &mut []).split_at_mut(count);
775        *out = tail;
776
777        for b in bytes {
778            *b = ((self.buffer & 0xff00_0000_0000_0000) >> 56) as u8;
779            self.buffer <<= 8;
780            self.bits_in_buffer -= 8;
781        }
782
783        count < want
784    }
785
786    fn buffer_pad(&mut self) {
787        let to_byte = self.bits_in_buffer.wrapping_neg() & 0x7;
788        self.bits_in_buffer += to_byte;
789    }
790
791    fn bump_code_size(&mut self) {
792        self.code_size += 1;
793    }
794
795    fn max_code(&self) -> Code {
796        (1 << self.code_size) - 1
797    }
798
799    fn code_size(&self) -> u8 {
800        self.code_size
801    }
802}
803
804impl Buffer for LsbBuffer {
805    fn new(min_size: u8) -> Self {
806        LsbBuffer {
807            code_size: min_size + 1,
808            buffer: 0,
809            bits_in_buffer: 0,
810        }
811    }
812
813    fn reset(&mut self, min_size: u8) {
814        self.code_size = min_size + 1;
815        self.buffer = 0;
816        self.bits_in_buffer = 0;
817    }
818
819    fn clear(&mut self, min_size: u8) {
820        self.code_size = min_size + 1;
821    }
822
823    fn buffer_code(&mut self, code: Code) {
824        self.buffer |= u64::from(code) << self.bits_in_buffer;
825        self.bits_in_buffer += self.code_size;
826    }
827
828    fn push_out(&mut self, out: &mut &mut [u8]) -> bool {
829        if self.bits_in_buffer + 2 * self.code_size < 64 {
830            return false;
831        }
832
833        self.flush_out(out)
834    }
835
836    fn flush_out(&mut self, out: &mut &mut [u8]) -> bool {
837        let want = usize::from(self.bits_in_buffer / 8);
838        let count = want.min((*out).len());
839        let (bytes, tail) = core::mem::replace(out, &mut []).split_at_mut(count);
840        *out = tail;
841
842        for b in bytes {
843            *b = (self.buffer & 0x0000_0000_0000_00ff) as u8;
844            self.buffer >>= 8;
845            self.bits_in_buffer -= 8;
846        }
847
848        count < want
849    }
850
851    fn buffer_pad(&mut self) {
852        let to_byte = self.bits_in_buffer.wrapping_neg() & 0x7;
853        self.bits_in_buffer += to_byte;
854    }
855
856    fn bump_code_size(&mut self) {
857        self.code_size += 1;
858    }
859
860    fn max_code(&self) -> Code {
861        (1 << self.code_size) - 1
862    }
863
864    fn code_size(&self) -> u8 {
865        self.code_size
866    }
867}
868
869impl Tree {
870    fn init(&mut self, min_size: u8) {
871        // We need a way to represent the state of a currently empty buffer. We use the clear code
872        // for this, thus create one complex mapping that leads to the one-char base codes.
873        self.keys
874            .resize((1 << min_size) + 2, FullKey::NoSuccessor.into());
875        self.complex.push(Full {
876            char_continuation: [0; 256],
877        });
878        let map_of_begin = self.complex.last_mut().unwrap();
879        for ch in 0u16..256 {
880            map_of_begin.char_continuation[usize::from(ch)] = ch;
881        }
882        self.keys[1 << min_size] = FullKey::Full(0).into();
883    }
884
885    fn reset(&mut self, min_size: u8) {
886        self.simples.clear();
887        self.keys.truncate((1 << min_size) + 2);
888        // Keep entry for clear code.
889        self.complex.truncate(1);
890        // The first complex is not changed..
891        for k in self.keys[..(1 << min_size) + 2].iter_mut() {
892            *k = FullKey::NoSuccessor.into();
893        }
894        self.keys[1 << min_size] = FullKey::Full(0).into();
895    }
896
897    fn at_key(&self, code: Code, ch: u8) -> Option<Code> {
898        let key = self.keys[usize::from(code)];
899        match FullKey::from(key) {
900            FullKey::NoSuccessor => None,
901            FullKey::Simple(idx) => {
902                let nexts = &self.simples[usize::from(idx)];
903                let successors = nexts
904                    .codes
905                    .iter()
906                    .zip(nexts.chars.iter())
907                    .take(usize::from(nexts.count));
908                for (&scode, &sch) in successors {
909                    if sch == ch {
910                        return Some(scode);
911                    }
912                }
913
914                None
915            }
916            FullKey::Full(idx) => {
917                let full = &self.complex[usize::from(idx)];
918                let precode = full.char_continuation[usize::from(ch)];
919                if usize::from(precode) < MAX_ENTRIES {
920                    Some(precode)
921                } else {
922                    None
923                }
924            }
925        }
926    }
927
928    /// Iterate to the next char.
929    /// Return Ok when it was already in the tree or creates a new entry for it and returns Err.
930    fn iterate(&mut self, code: Code, ch: u8) -> Result<Code, Code> {
931        if let Some(next) = self.at_key(code, ch) {
932            Ok(next)
933        } else {
934            Err(self.append(code, ch))
935        }
936    }
937
938    fn append(&mut self, code: Code, ch: u8) -> Code {
939        let next: Code = self.keys.len() as u16;
940        let key = self.keys[usize::from(code)];
941        // TODO: with debug assertions, check for non-existence
942        match FullKey::from(key) {
943            FullKey::NoSuccessor => {
944                let new_key = FullKey::Simple(self.simples.len() as u16);
945                self.simples.push(Simple::default());
946                let simples = self.simples.last_mut().unwrap();
947                simples.codes[0] = next;
948                simples.chars[0] = ch;
949                simples.count = 1;
950                self.keys[usize::from(code)] = new_key.into();
951            }
952            FullKey::Simple(idx) if usize::from(self.simples[usize::from(idx)].count) < SHORT => {
953                let nexts = &mut self.simples[usize::from(idx)];
954                let nidx = usize::from(nexts.count);
955                nexts.chars[nidx] = ch;
956                nexts.codes[nidx] = next;
957                nexts.count += 1;
958            }
959            FullKey::Simple(idx) => {
960                let new_key = FullKey::Full(self.complex.len() as u16);
961                let simples = &self.simples[usize::from(idx)];
962                self.complex.push(Full {
963                    char_continuation: [Code::max_value(); 256],
964                });
965                let full = self.complex.last_mut().unwrap();
966                for (&pch, &pcont) in simples.chars.iter().zip(simples.codes.iter()) {
967                    full.char_continuation[usize::from(pch)] = pcont;
968                }
969                self.keys[usize::from(code)] = new_key.into();
970            }
971            FullKey::Full(idx) => {
972                let full = &mut self.complex[usize::from(idx)];
973                full.char_continuation[usize::from(ch)] = next;
974            }
975        }
976        self.keys.push(FullKey::NoSuccessor.into());
977        next
978    }
979}
980
981impl Default for FullKey {
982    fn default() -> Self {
983        FullKey::NoSuccessor
984    }
985}
986
987impl Default for Simple {
988    fn default() -> Self {
989        Simple {
990            codes: [0; SHORT],
991            chars: [0; SHORT],
992            count: 0,
993        }
994    }
995}
996
997impl From<CompressedKey> for FullKey {
998    fn from(CompressedKey(key): CompressedKey) -> Self {
999        match (key >> MAX_CODESIZE) & 0xf {
1000            0 => FullKey::Full(key & 0xfff),
1001            1 => FullKey::Simple(key & 0xfff),
1002            _ => FullKey::NoSuccessor,
1003        }
1004    }
1005}
1006
1007impl From<FullKey> for CompressedKey {
1008    fn from(full: FullKey) -> Self {
1009        CompressedKey(match full {
1010            FullKey::NoSuccessor => 0x2000,
1011            FullKey::Simple(code) => 0x1000 | code,
1012            FullKey::Full(code) => code,
1013        })
1014    }
1015}
1016
1017#[cfg(test)]
1018mod tests {
1019    use super::{BitOrder, Encoder, LzwError, LzwStatus};
1020    use crate::alloc::vec::Vec;
1021    use crate::decode::Decoder;
1022    #[cfg(feature = "std")]
1023    use crate::StreamBuf;
1024
1025    #[test]
1026    fn invalid_input_rejected() {
1027        const BIT_LEN: u8 = 2;
1028        let ref input = [0, 1 << BIT_LEN /* invalid */, 0];
1029        let ref mut target = [0u8; 128];
1030        let mut encoder = Encoder::new(BitOrder::Msb, BIT_LEN);
1031
1032        encoder.finish();
1033        // We require simulation of normality, that is byte-for-byte compression.
1034        let result = encoder.encode_bytes(input, target);
1035        assert!(if let Err(LzwError::InvalidCode) = result.status {
1036            true
1037        } else {
1038            false
1039        });
1040        assert_eq!(result.consumed_in, 1);
1041
1042        let fixed = encoder.encode_bytes(&[1, 0], &mut target[result.consumed_out..]);
1043        assert!(if let Ok(LzwStatus::Done) = fixed.status {
1044            true
1045        } else {
1046            false
1047        });
1048        assert_eq!(fixed.consumed_in, 2);
1049
1050        // Okay, now test we actually fixed it.
1051        let ref mut compare = [0u8; 4];
1052        let mut todo = &target[..result.consumed_out + fixed.consumed_out];
1053        let mut free = &mut compare[..];
1054        let mut decoder = Decoder::new(BitOrder::Msb, BIT_LEN);
1055
1056        // Decode with up to 16 rounds, far too much but inconsequential.
1057        for _ in 0..16 {
1058            if decoder.has_ended() {
1059                break;
1060            }
1061
1062            let result = decoder.decode_bytes(todo, free);
1063            assert!(result.status.is_ok());
1064            todo = &todo[result.consumed_in..];
1065            free = &mut free[result.consumed_out..];
1066        }
1067
1068        let remaining = { free }.len();
1069        let len = compare.len() - remaining;
1070        assert_eq!(todo, &[]);
1071        assert_eq!(compare[..len], [0, 1, 0]);
1072    }
1073
1074    #[test]
1075    #[should_panic]
1076    fn invalid_code_size_low() {
1077        let _ = Encoder::new(BitOrder::Msb, 1);
1078    }
1079
1080    #[test]
1081    #[should_panic]
1082    fn invalid_code_size_high() {
1083        let _ = Encoder::new(BitOrder::Msb, 14);
1084    }
1085
1086    fn make_decoded() -> Vec<u8> {
1087        const FILE: &'static [u8] =
1088            include_bytes!(concat!(env!("CARGO_MANIFEST_DIR"), "/Cargo.lock"));
1089        return Vec::from(FILE);
1090    }
1091
1092    #[test]
1093    #[cfg(feature = "std")]
1094    fn into_stream_buffer_no_alloc() {
1095        let encoded = make_decoded();
1096        let mut encoder = Encoder::new(BitOrder::Msb, 8);
1097
1098        let mut output = vec![];
1099        let mut buffer = [0; 512];
1100        let mut istream = encoder.into_stream(&mut output);
1101        istream.set_buffer(&mut buffer[..]);
1102        istream.encode(&encoded[..]).status.unwrap();
1103
1104        match istream.buffer {
1105            Some(StreamBuf::Borrowed(_)) => {}
1106            None => panic!("Decoded without buffer??"),
1107            Some(StreamBuf::Owned(_)) => panic!("Unexpected buffer allocation"),
1108        }
1109    }
1110
1111    #[test]
1112    #[cfg(feature = "std")]
1113    fn into_stream_buffer_small_alloc() {
1114        struct WriteTap<W: std::io::Write>(W);
1115        const BUF_SIZE: usize = 512;
1116
1117        impl<W: std::io::Write> std::io::Write for WriteTap<W> {
1118            fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
1119                assert!(buf.len() <= BUF_SIZE);
1120                self.0.write(buf)
1121            }
1122            fn flush(&mut self) -> std::io::Result<()> {
1123                self.0.flush()
1124            }
1125        }
1126
1127        let encoded = make_decoded();
1128        let mut encoder = Encoder::new(BitOrder::Msb, 8);
1129
1130        let mut output = vec![];
1131        let mut istream = encoder.into_stream(WriteTap(&mut output));
1132        istream.set_buffer_size(512);
1133        istream.encode(&encoded[..]).status.unwrap();
1134
1135        match istream.buffer {
1136            Some(StreamBuf::Owned(vec)) => assert!(vec.len() <= BUF_SIZE),
1137            Some(StreamBuf::Borrowed(_)) => panic!("Unexpected borrowed buffer, where from?"),
1138            None => panic!("Decoded without buffer??"),
1139        }
1140    }
1141
1142    #[test]
1143    #[cfg(feature = "std")]
1144    fn reset() {
1145        let encoded = make_decoded();
1146        let mut encoder = Encoder::new(BitOrder::Msb, 8);
1147        let mut reference = None;
1148
1149        for _ in 0..2 {
1150            let mut output = vec![];
1151            let mut buffer = [0; 512];
1152            let mut istream = encoder.into_stream(&mut output);
1153            istream.set_buffer(&mut buffer[..]);
1154            istream.encode_all(&encoded[..]).status.unwrap();
1155
1156            encoder.reset();
1157            if let Some(reference) = &reference {
1158                assert_eq!(output, *reference);
1159            } else {
1160                reference = Some(output);
1161            }
1162        }
1163    }
1164}
weezl/encode.rs

weezl/
encode.rs