zlib-ng

2026-06-22 06:26:26 +00:00 · 2023-06-27 11:45:54 +02:00
parent c9604efc4a
commit e4847c6b03
297 changed files with 89428 additions and 5 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -38,7 +38,7 @@ option(USE_GSSAPI              "Link with libgssapi for SPNEGO auth"      OFF)
   set(USE_HTTP_PARSER         "" CACHE STRING "Specifies the HTTP Parser implementation; either system or builtin.")
 #  set(USE_XDIFF               "" CACHE STRING "Specifies the xdiff implementation; either system or builtin.")
   set(REGEX_BACKEND           "" CACHE STRING "Regular expression implementation. One of regcomp_l, pcre2, pcre, regcomp, or builtin.")
-option(USE_BUNDLED_ZLIB        "Use the bundled version of zlib. Can be set to one of Bundled(ON)/Chromium. The Chromium option requires a x86_64 processor with SSE4.2 and CLMUL" OFF)
+   set(USE_BUNDLED_ZLIB        "" CACHE STRING "Use the bundled version of zlib. Can be set to one of Bundled(ON)/Chromium/zlibg-ng. The Chromium option requires a x86_64 processor with SSE4.2 and CLMUL")

 # Debugging options
 option(USE_LEAK_CHECKER        "Run tests with leak checker"                           OFF)
--- a/cmake/SelectZlib.cmake
+++ b/cmake/SelectZlib.cmake
@@ -2,11 +2,11 @@
 include(SanitizeBool)

 SanitizeBool(USE_BUNDLED_ZLIB)
-if(USE_BUNDLED_ZLIB STREQUAL ON)
+if(USE_BUNDLED_ZLIB STREQUAL "ON")
 	set(USE_BUNDLED_ZLIB "Bundled")
 endif()

-if(USE_BUNDLED_ZLIB STREQUAL "OFF")
+if(USE_BUNDLED_ZLIB STREQUAL "OFF" OR USE_BUNDLED_ZLIB STREQUAL "")
 	find_package(ZLIB)
 	if(ZLIB_FOUND)
 		list(APPEND LIBGIT2_SYSTEM_INCLUDES ${ZLIB_INCLUDE_DIRS})
@@ -17,16 +17,26 @@ if(USE_BUNDLED_ZLIB STREQUAL "OFF")
 			list(APPEND LIBGIT2_PC_REQUIRES "zlib")
 		endif()
 		add_feature_info(zlib ON "using system zlib")
+	elseif(USE_BUNDLED_ZLIB STREQUAL "OFF")
+		message(FATAL_ERROR "zlib was not found")
 	else()
-		message(STATUS "zlib was not found; using bundled 3rd-party sources." )
+		message(WARNING "zlib was not found; using bundled 3rd-party sources." )
+		set(USE_BUNDLED_ZLIB "Bundled")
 	endif()
 endif()
+
 if(USE_BUNDLED_ZLIB STREQUAL "Chromium")
 	add_subdirectory("${PROJECT_SOURCE_DIR}/deps/chromium-zlib" "${PROJECT_BINARY_DIR}/deps/chromium-zlib")
 	list(APPEND LIBGIT2_DEPENDENCY_INCLUDES "${PROJECT_SOURCE_DIR}/deps/chromium-zlib")
 	list(APPEND LIBGIT2_DEPENDENCY_OBJECTS $<TARGET_OBJECTS:chromium_zlib>)
 	add_feature_info(zlib ON "using (Chromium) bundled zlib")
-elseif(USE_BUNDLED_ZLIB OR NOT ZLIB_FOUND)
+elseif(USE_BUNDLED_ZLIB STREQUAL "zlib-ng")
+	add_subdirectory("${PROJECT_SOURCE_DIR}/deps/zlib-ng" "${PROJECT_BINARY_DIR}/deps/zlib-ng")
+	list(APPEND LIBGIT2_DEPENDENCY_INCLUDES "${PROJECT_SOURCE_DIR}/deps/zlib-ng")
+	list(APPEND LIBGIT2_DEPENDENCY_INCLUDES "${PROJECT_SOURCE_DIR}/deps/zlib-ng")
+	list(APPEND LIBGIT2_DEPENDENCY_OBJECTS $<TARGET_OBJECTS:zlibstatic>)
+	add_feature_info(zlib ON "using bundled zlib-ng")
+elseif(USE_BUNDLED_ZLIB)
 	add_subdirectory("${PROJECT_SOURCE_DIR}/deps/zlib" "${PROJECT_BINARY_DIR}/deps/zlib")
 	list(APPEND LIBGIT2_DEPENDENCY_INCLUDES "${PROJECT_SOURCE_DIR}/deps/zlib")
 	list(APPEND LIBGIT2_DEPENDENCY_OBJECTS $<TARGET_OBJECTS:zlib>)
--- a/deps/zlib-ng/CMakeLists.txt
+++ b/deps/zlib-ng/CMakeLists.txt
--- a/deps/zlib-ng/FAQ.zlib
+++ b/deps/zlib-ng/FAQ.zlib
@@ -0,0 +1,374 @@
+##
+# THIS IS AN UNMAINTAINED COPY OF THE ORIGINAL FILE DISTRIBUTED WITH ZLIB 1.2.11
+##
+
+
+
+
+                Frequently Asked Questions about zlib
+
+
+If your question is not there, please check the zlib home page
+https://zlib.net/ which may have more recent information.
+The latest zlib FAQ is at https://zlib.net/zlib_faq.html
+
+
+ 1. Is zlib Y2K-compliant?
+
+    Yes. zlib doesn't handle dates.
+
+ 2. Where can I get a Windows DLL version?
+
+    The zlib sources can be compiled without change to produce a DLL.  See the
+    file win32/DLL_FAQ.txt in the zlib distribution.  Pointers to the
+    precompiled DLL are found in the zlib web site at https://zlib.net/ .
+
+ 3. Where can I get a Visual Basic interface to zlib?
+
+    See
+        * https://marknelson.us/1997/01/01/zlib-engine/
+        * win32/DLL_FAQ.txt in the zlib distribution
+
+ 4. compress() returns Z_BUF_ERROR.
+
+    Make sure that before the call of compress(), the length of the compressed
+    buffer is equal to the available size of the compressed buffer and not
+    zero.  For Visual Basic, check that this parameter is passed by reference
+    ("as any"), not by value ("as long").
+
+ 5. deflate() or inflate() returns Z_BUF_ERROR.
+
+    Before making the call, make sure that avail_in and avail_out are not zero.
+    When setting the parameter flush equal to Z_FINISH, also make sure that
+    avail_out is big enough to allow processing all pending input.  Note that a
+    Z_BUF_ERROR is not fatal--another call to deflate() or inflate() can be
+    made with more input or output space.  A Z_BUF_ERROR may in fact be
+    unavoidable depending on how the functions are used, since it is not
+    possible to tell whether or not there is more output pending when
+    strm.avail_out returns with zero.  See https://zlib.net/zlib_how.html for a
+    heavily annotated example.
+
+ 6. Where's the zlib documentation (man pages, etc.)?
+
+    It's in zlib.h .  Examples of zlib usage are in the files test/example.c
+    and test/minigzip.c, with more in examples/ .
+
+ 7. Why don't you use GNU autoconf or libtool or ...?
+
+    Because we would like to keep zlib as a very small and simple package.
+    zlib is rather portable and doesn't need much configuration.
+
+ 8. I found a bug in zlib.
+
+    Most of the time, such problems are due to an incorrect usage of zlib.
+    Please try to reproduce the problem with a small program and send the
+    corresponding source to us at zlib@gzip.org .  Do not send multi-megabyte
+    data files without prior agreement.
+
+ 9. Why do I get "undefined reference to gzputc"?
+
+    If "make test" produces something like
+
+       example.o(.text+0x154): undefined reference to `gzputc'
+
+    check that you don't have old files libz.* in /usr/lib, /usr/local/lib or
+    /usr/X11R6/lib. Remove any old versions, then do "make install".
+
+10. I need a Delphi interface to zlib.
+
+    See the contrib/delphi directory in the zlib distribution.
+
+11. Can zlib handle .zip archives?
+
+    Not by itself, no.  See the directory contrib/minizip in the zlib
+    distribution.
+
+12. Can zlib handle .Z files?
+
+    No, sorry.  You have to spawn an uncompress or gunzip subprocess, or adapt
+    the code of uncompress on your own.
+
+13. How can I make a Unix shared library?
+
+    By default a shared (and a static) library is built for Unix.  So:
+
+    make distclean
+    ./configure
+    make
+
+14. How do I install a shared zlib library on Unix?
+
+    After the above, then:
+
+    make install
+
+    However, many flavors of Unix come with a shared zlib already installed.
+    Before going to the trouble of compiling a shared version of zlib and
+    trying to install it, you may want to check if it's already there!  If you
+    can #include <zlib.h>, it's there.  The -lz option will probably link to
+    it.  You can check the version at the top of zlib.h or with the
+    ZLIB_VERSION symbol defined in zlib.h .
+
+15. I have a question about OttoPDF.
+
+    We are not the authors of OttoPDF. The real author is on the OttoPDF web
+    site: Joel Hainley, jhainley@myndkryme.com.
+
+16. Can zlib decode Flate data in an Adobe PDF file?
+
+    Yes. See https://www.pdflib.com/ . To modify PDF forms, see
+    https://sourceforge.net/projects/acroformtool/ .
+
+17. Why am I getting this "register_frame_info not found" error on Solaris?
+
+    After installing zlib 1.1.4 on Solaris 2.6, running applications using zlib
+    generates an error such as:
+
+        ld.so.1: rpm: fatal: relocation error: file /usr/local/lib/libz.so:
+        symbol __register_frame_info: referenced symbol not found
+
+    The symbol __register_frame_info is not part of zlib, it is generated by
+    the C compiler (cc or gcc).  You must recompile applications using zlib
+    which have this problem.  This problem is specific to Solaris.  See
+    http://www.sunfreeware.com/ for Solaris versions of zlib and applications
+    using zlib.
+
+18. Why does gzip give an error on a file I make with compress/deflate?
+
+    The compress and deflate functions produce data in the zlib format, which
+    is different and incompatible with the gzip format.  The gz* functions in
+    zlib on the other hand use the gzip format.  Both the zlib and gzip formats
+    use the same compressed data format internally, but have different headers
+    and trailers around the compressed data.
+
+19. Ok, so why are there two different formats?
+
+    The gzip format was designed to retain the directory information about a
+    single file, such as the name and last modification date.  The zlib format
+    on the other hand was designed for in-memory and communication channel
+    applications, and has a much more compact header and trailer and uses a
+    faster integrity check than gzip.
+
+20. Well that's nice, but how do I make a gzip file in memory?
+
+    You can request that deflate write the gzip format instead of the zlib
+    format using deflateInit2().  You can also request that inflate decode the
+    gzip format using inflateInit2().  Read zlib.h for more details.
+
+21. Is zlib thread-safe?
+
+    Yes.  However any library routines that zlib uses and any application-
+    provided memory allocation routines must also be thread-safe.  zlib's gz*
+    functions use stdio library routines, and most of zlib's functions use the
+    library memory allocation routines by default.  zlib's *Init* functions
+    allow for the application to provide custom memory allocation routines.
+
+    Of course, you should only operate on any given zlib or gzip stream from a
+    single thread at a time.
+
+22. Can I use zlib in my commercial application?
+
+    Yes.  Please read the license in zlib.h.
+
+23. Is zlib under the GNU license?
+
+    No.  Please read the license in zlib.h.
+
+24. The license says that altered source versions must be "plainly marked". So
+    what exactly do I need to do to meet that requirement?
+
+    You need to change the ZLIB_VERSION and ZLIB_VERNUM #defines in zlib.h.  In
+    particular, the final version number needs to be changed to "f", and an
+    identification string should be appended to ZLIB_VERSION.  Version numbers
+    x.x.x.f are reserved for modifications to zlib by others than the zlib
+    maintainers.  For example, if the version of the base zlib you are altering
+    is "1.2.3.4", then in zlib.h you should change ZLIB_VERNUM to 0x123f, and
+    ZLIB_VERSION to something like "1.2.3.f-zachary-mods-v3".  You can also
+    update the version strings in deflate.c and inftrees.c.
+
+    For altered source distributions, you should also note the origin and
+    nature of the changes in zlib.h, as well as in ChangeLog and README, along
+    with the dates of the alterations.  The origin should include at least your
+    name (or your company's name), and an email address to contact for help or
+    issues with the library.
+
+    Note that distributing a compiled zlib library along with zlib.h and
+    zconf.h is also a source distribution, and so you should change
+    ZLIB_VERSION and ZLIB_VERNUM and note the origin and nature of the changes
+    in zlib.h as you would for a full source distribution.
+
+25. Will zlib work on a big-endian or little-endian architecture, and can I
+    exchange compressed data between them?
+
+    Yes and yes.
+
+26. Will zlib work on a 64-bit machine?
+
+    Yes.  It has been tested on 64-bit machines, and has no dependence on any
+    data types being limited to 32-bits in length.  If you have any
+    difficulties, please provide a complete problem report to zlib@gzip.org
+
+27. Will zlib decompress data from the PKWare Data Compression Library?
+
+    No.  The PKWare DCL uses a completely different compressed data format than
+    does PKZIP and zlib.  However, you can look in zlib's contrib/blast
+    directory for a possible solution to your problem.
+
+28. Can I access data randomly in a compressed stream?
+
+    No, not without some preparation.  If when compressing you periodically use
+    Z_FULL_FLUSH, carefully write all the pending data at those points, and
+    keep an index of those locations, then you can start decompression at those
+    points.  You have to be careful to not use Z_FULL_FLUSH too often, since it
+    can significantly degrade compression.  Alternatively, you can scan a
+    deflate stream once to generate an index, and then use that index for
+    random access.  See examples/zran.c .
+
+29. Does zlib work on MVS, OS/390, CICS, etc.?
+
+    It has in the past, but we have not heard of any recent evidence.  There
+    were working ports of zlib 1.1.4 to MVS, but those links no longer work.
+    If you know of recent, successful applications of zlib on these operating
+    systems, please let us know.  Thanks.
+
+30. Is there some simpler, easier to read version of inflate I can look at to
+    understand the deflate format?
+
+    First off, you should read RFC 1951.  Second, yes.  Look in zlib's
+    contrib/puff directory.
+
+31. Does zlib infringe on any patents?
+
+    As far as we know, no.  In fact, that was originally the whole point behind
+    zlib.  Look here for some more information:
+
+    https://www.gzip.org/#faq11
+
+32. Can zlib work with greater than 4 GB of data?
+
+    Yes.  inflate() and deflate() will process any amount of data correctly.
+    Each call of inflate() or deflate() is limited to input and output chunks
+    of the maximum value that can be stored in the compiler's "unsigned int"
+    type, but there is no limit to the number of chunks.  Note however that the
+    strm.total_in and strm_total_out counters may be limited to 4 GB.  These
+    counters are provided as a convenience and are not used internally by
+    inflate() or deflate().  The application can easily set up its own counters
+    updated after each call of inflate() or deflate() to count beyond 4 GB.
+    compress() and uncompress() may be limited to 4 GB, since they operate in a
+    single call.  gzseek() and gztell() may be limited to 4 GB depending on how
+    zlib is compiled.  See the zlibCompileFlags() function in zlib.h.
+
+    The word "may" appears several times above since there is a 4 GB limit only
+    if the compiler's "long" type is 32 bits.  If the compiler's "long" type is
+    64 bits, then the limit is 16 exabytes.
+
+33. Does zlib have any security vulnerabilities?
+
+    The only one that we are aware of is potentially in gzprintf().  If zlib is
+    compiled to use sprintf() or vsprintf(), then there is no protection
+    against a buffer overflow of an 8K string space (or other value as set by
+    gzbuffer()), other than the caller of gzprintf() assuring that the output
+    will not exceed 8K.  On the other hand, if zlib is compiled to use
+    snprintf() or vsnprintf(), which should normally be the case, then there is
+    no vulnerability.  The ./configure script will display warnings if an
+    insecure variation of sprintf() will be used by gzprintf().  Also the
+    zlibCompileFlags() function will return information on what variant of
+    sprintf() is used by gzprintf().
+
+    If you don't have snprintf() or vsnprintf() and would like one, you can
+    find a portable implementation here:
+
+        https://www.ijs.si/software/snprintf/
+
+    Note that you should be using the most recent version of zlib.  Versions
+    1.1.3 and before were subject to a double-free vulnerability, and versions
+    1.2.1 and 1.2.2 were subject to an access exception when decompressing
+    invalid compressed data.
+
+34. Is there a Java version of zlib?
+
+    Probably what you want is to use zlib in Java. zlib is already included
+    as part of the Java SDK in the java.util.zip package. If you really want
+    a version of zlib written in the Java language, look on the zlib home
+    page for links: https://zlib.net/ .
+
+35. I get this or that compiler or source-code scanner warning when I crank it
+    up to maximally-pedantic. Can't you guys write proper code?
+
+    Many years ago, we gave up attempting to avoid warnings on every compiler
+    in the universe.  It just got to be a waste of time, and some compilers
+    were downright silly as well as contradicted each other.  So now, we simply
+    make sure that the code always works.
+
+36. Valgrind (or some similar memory access checker) says that deflate is
+    performing a conditional jump that depends on an uninitialized value.
+    Isn't that a bug?
+
+    No.  That is intentional for performance reasons, and the output of deflate
+    is not affected.  This only started showing up recently since zlib 1.2.x
+    uses malloc() by default for allocations, whereas earlier versions used
+    calloc(), which zeros out the allocated memory.  Even though the code was
+    correct, versions 1.2.4 and later was changed to not stimulate these
+    checkers.
+
+37. Will zlib read the (insert any ancient or arcane format here) compressed
+    data format?
+
+    Probably not. Look in the comp.compression FAQ for pointers to various
+    formats and associated software.
+
+38. How can I encrypt/decrypt zip files with zlib?
+
+    zlib doesn't support encryption.  The original PKZIP encryption is very
+    weak and can be broken with freely available programs.  To get strong
+    encryption, use GnuPG, https://www.gnupg.org/ , which already includes zlib
+    compression.  For PKZIP compatible "encryption", look at
+    http://infozip.sourceforge.net/
+
+39. What's the difference between the "gzip" and "deflate" HTTP 1.1 encodings?
+
+    "gzip" is the gzip format, and "deflate" is the zlib format.  They should
+    probably have called the second one "zlib" instead to avoid confusion with
+    the raw deflate compressed data format.  While the HTTP 1.1 RFC 2616
+    correctly points to the zlib specification in RFC 1950 for the "deflate"
+    transfer encoding, there have been reports of servers and browsers that
+    incorrectly produce or expect raw deflate data per the deflate
+    specification in RFC 1951, most notably Microsoft.  So even though the
+    "deflate" transfer encoding using the zlib format would be the more
+    efficient approach (and in fact exactly what the zlib format was designed
+    for), using the "gzip" transfer encoding is probably more reliable due to
+    an unfortunate choice of name on the part of the HTTP 1.1 authors.
+
+    Bottom line: use the gzip format for HTTP 1.1 encoding.
+
+40. Does zlib support the new "Deflate64" format introduced by PKWare?
+
+    No.  PKWare has apparently decided to keep that format proprietary, since
+    they have not documented it as they have previous compression formats.  In
+    any case, the compression improvements are so modest compared to other more
+    modern approaches, that it's not worth the effort to implement.
+
+41. I'm having a problem with the zip functions in zlib, can you help?
+
+    There are no zip functions in zlib.  You are probably using minizip by
+    Giles Vollant, which is found in the contrib directory of zlib.  It is not
+    part of zlib.  In fact none of the stuff in contrib is part of zlib.  The
+    files in there are not supported by the zlib authors.  You need to contact
+    the authors of the respective contribution for help.
+
+42. The match.asm code in contrib is under the GNU General Public License.
+    Since it's part of zlib, doesn't that mean that all of zlib falls under the
+    GNU GPL?
+
+    No.  The files in contrib are not part of zlib.  They were contributed by
+    other authors and are provided as a convenience to the user within the zlib
+    distribution.  Each item in contrib has its own license.
+
+43. Is zlib subject to export controls?  What is its ECCN?
+
+    zlib is not subject to export controls, and so is classified as EAR99.
+
+44. Can you please sign these lengthy legal documents and fax them back to us
+    so that we can use your software in our product?
+
+    No. Go away. Shoo.
--- a/deps/zlib-ng/INDEX.md
+++ b/deps/zlib-ng/INDEX.md
@@ -0,0 +1,36 @@
+Contents
+--------
+
+| Name             | Description                                                    |
+|:-----------------|:---------------------------------------------------------------|
+| arch/            | Architecture-specific code                                     |
+| doc/             | Documentation for formats and algorithms                       |
+| test/example.c   | Zlib usages examples for build testing                         |
+| test/minigzip.c  | Minimal gzip-like functionality for build testing              |
+| test/infcover.c  | Inflate code coverage for build testing                        |
+| win32/           | Shared library version resources for Windows                   |
+| CMakeLists.txt   | Cmake build script                                             |
+| configure        | Bash configure/build script                                    |
+| adler32.c        | Compute the Adler-32 checksum of a data stream                 |
+| chunkset.*       | Inline functions to copy small data chunks                     |
+| compress.c       | Compress a memory buffer                                       |
+| deflate.*        | Compress data using the deflate algorithm                      |
+| deflate_fast.c   | Compress data using the deflate algorithm with fast strategy   |
+| deflate_medium.c | Compress data using the deflate algorithm with medium strategy |
+| deflate_slow.c   | Compress data using the deflate algorithm with slow strategy   |
+| functable.*      | Struct containing function pointers to optimized functions     |
+| gzguts.h         | Internal definitions for gzip operations                       |
+| gzlib.c          | Functions common to reading and writing gzip files             |
+| gzread.c         | Read gzip files                                                |
+| gzwrite.c        | Write gzip files                                               |
+| infback.*        | Inflate using a callback interface                             |
+| inflate.*        | Decompress data                                                |
+| inffast.*        | Decompress data with speed optimizations                       |
+| inffixed_tbl.h   | Table for decoding fixed codes                                 |
+| inftrees.h       | Generate Huffman trees for efficient decoding                  |
+| trees.*          | Output deflated data using Huffman coding                      |
+| uncompr.c        | Decompress a memory buffer                                     |
+| zconf.h.cmakein  | zconf.h template for cmake                                     |
+| zendian.h        | BYTE_ORDER for endian tests                                    |
+| zlib.map         | Linux symbol information                                       |
+| zlib.pc.in       | Pkg-config template                                            |
--- a/deps/zlib-ng/LICENSE.md
+++ b/deps/zlib-ng/LICENSE.md
@@ -0,0 +1,19 @@
+(C) 1995-2013 Jean-loup Gailly and Mark Adler
+
+This software is provided 'as-is', without any express or implied
+warranty. In no event will the authors be held liable for any damages
+arising from the use of this software.
+
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it
+freely, subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not
+   claim that you wrote the original software. If you use this software
+   in a product, an acknowledgment in the product documentation would be
+   appreciated but is not required.
+
+2. Altered source versions must be plainly marked as such, and must not be
+   misrepresented as being the original software.
+
+3. This notice may not be removed or altered from any source distribution.
--- a/deps/zlib-ng/Makefile.in
+++ b/deps/zlib-ng/Makefile.in
@@ -0,0 +1,395 @@
+# Makefile for zlib
+# Copyright (C) 1995-2016 Jean-loup Gailly, Mark Adler
+# For conditions of distribution and use, see copyright notice in zlib.h
+
+# To compile and test, type:
+#    ./configure; make test
+# Normally configure builds both a static and a shared library.
+# If you want to build just a static library, use: ./configure --static
+
+# To install /usr/local/lib/libz.* and /usr/local/include/zlib.h, type:
+#    make install
+# To install in $HOME instead of /usr/local, use:
+#    make install prefix=$HOME
+
+CC=cc
+
+CFLAGS=-O
+#CFLAGS=-O -DMAX_WBITS=14 -DMAX_MEM_LEVEL=7
+#CFLAGS=-g -DZLIB_DEBUG
+#CFLAGS=-O3 -Wall -Wwrite-strings -Wpointer-arith -Wconversion \
+#           -Wstrict-prototypes -Wmissing-prototypes
+
+SFLAGS=-O
+LDFLAGS=-L.
+LIBNAME1=libz-ng
+LIBNAME2=zlib-ng
+SUFFIX=-ng
+TEST_LIBS=$(LIBNAME1).a
+LDSHARED=$(CC)
+LDSHAREDFLAGS=-shared
+
+VER=2.1.2
+VER1=2
+
+STATICLIB=$(LIBNAME1).a
+SHAREDLIB=$(LIBNAME1).so
+SHAREDLIBV=$(LIBNAME1).so.$(VER)
+SHAREDLIBM=$(LIBNAME1).so.$(VER1)
+IMPORTLIB=
+SHAREDTARGET=$(LIBNAME1).so.$(VER)
+PKGFILE=$(LIBNAME2).pc
+
+LIBS=$(STATICLIB) $(SHAREDTARGET)
+
+AR=ar
+ARFLAGS=rc
+DEFFILE=
+RC=
+RCFLAGS=
+RCOBJS=
+STRIP=
+RANLIB=ranlib
+LDCONFIG=ldconfig
+LDSHAREDLIBC=
+EXE=
+
+SRCDIR=.
+INCLUDES=-I$(SRCDIR)
+
+BUILDDIR=.
+
+ARCHDIR=arch/generic
+ARCH_STATIC_OBJS=
+ARCH_SHARED_OBJS=
+
+prefix = /usr/local
+exec_prefix = ${prefix}
+bindir = ${exec_prefix}/bin
+libdir = ${exec_prefix}/lib
+sharedlibdir = ${libdir}
+includedir = ${prefix}/include
+mandir = ${prefix}/share/man
+man3dir = ${mandir}/man3
+pkgconfigdir = ${libdir}/pkgconfig
+
+OBJZ = \
+	adler32.o \
+	adler32_fold.o \
+	chunkset.o \
+	compare256.o \
+	compress.o \
+	cpu_features.o \
+	crc32_braid.o \
+	crc32_braid_comb.o \
+	crc32_fold.o \
+	deflate.o \
+	deflate_fast.o \
+	deflate_huff.o \
+	deflate_medium.o \
+	deflate_quick.o \
+	deflate_rle.o \
+	deflate_slow.o \
+	deflate_stored.o \
+	functable.o \
+	infback.o \
+	inflate.o \
+	inftrees.o \
+	insert_string.o \
+	insert_string_roll.o \
+	slide_hash.o \
+	trees.o \
+	uncompr.o \
+	zutil.o \
+	$(ARCH_STATIC_OBJS)
+
+OBJG = \
+	gzlib.o \
+	gzread.o \
+	gzwrite.o
+
+TESTOBJG =
+OBJC = $(OBJZ) $(OBJG)
+
+PIC_OBJZ = \
+	adler32.lo \
+	adler32_fold.lo \
+	chunkset.lo \
+	compare256.lo \
+	compress.lo \
+	cpu_features.lo \
+	crc32_braid.lo \
+	crc32_braid_comb.lo \
+	crc32_fold.lo \
+	deflate.lo \
+	deflate_fast.lo \
+	deflate_huff.lo \
+	deflate_medium.lo \
+	deflate_quick.lo \
+	deflate_rle.lo \
+	deflate_slow.lo \
+	deflate_stored.lo \
+	functable.lo \
+	infback.lo \
+	inflate.lo \
+	inftrees.lo \
+	insert_string.lo \
+	insert_string_roll.lo \
+	slide_hash.lo \
+	trees.lo \
+	uncompr.lo \
+	zutil.lo \
+	$(ARCH_SHARED_OBJS)
+
+PIC_OBJG = \
+	gzlib.lo \
+	gzread.lo \
+	gzwrite.lo
+
+PIC_TESTOBJG =
+PIC_OBJC = $(PIC_OBJZ) $(PIC_OBJG)
+
+OBJS = $(OBJC)
+
+PIC_OBJS = $(PIC_OBJC)
+
+all: static shared
+
+static: example$(EXE) minigzip$(EXE) makefixed$(EXE) maketrees$(EXE) makecrct$(EXE)
+
+shared: examplesh$(EXE) minigzipsh$(EXE)
+
+check: test
+
+.SECONDARY:
+
+$(ARCHDIR)/%.o: $(SRCDIR)/$(ARCHDIR)/%.c
+	$(MAKE) -C $(ARCHDIR) $(notdir $@)
+
+$(ARCHDIR)/%.lo: $(SRCDIR)/$(ARCHDIR)/%.c
+	$(MAKE) -C $(ARCHDIR) $(notdir $@)
+
+%.o: $(ARCHDIR)/%.o
+	-cp $< $@
+
+%.lo: $(ARCHDIR)/%.lo
+	-cp $< $@
+
+test: all
+	$(MAKE) -C test
+
+infcover.o: $(SRCDIR)/test/infcover.c zlib$(SUFFIX).h zconf$(SUFFIX).h zlib_name_mangling$(SUFFIX).h
+	$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/test/infcover.c
+
+infcover$(EXE): infcover.o $(STATICLIB)
+	$(CC) $(CFLAGS) $(LDFLAGS) -o $@ infcover.o $(STATICLIB)
+ifneq ($(STRIP),)
+	$(STRIP) $@
+endif
+
+cover: infcover$(EXE)
+	rm -f *.gcda
+	./infcover
+	gcov inf*.c
+
+$(STATICLIB): $(OBJS)
+	$(AR) $(ARFLAGS) $@ $(OBJS)
+	-@ ($(RANLIB) $@ || true) >/dev/null 2>&1
+
+example.o:
+	$(CC) $(CFLAGS) -DWITH_GZFILEOP $(INCLUDES) -c -o $@ $(SRCDIR)/test/example.c
+
+minigzip.o:
+	$(CC) $(CFLAGS) -DWITH_GZFILEOP $(INCLUDES) -c -o $@ $(SRCDIR)/test/minigzip.c
+
+makefixed.o:
+	$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/tools/makefixed.c
+
+maketrees.o:
+	$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/tools/maketrees.c
+
+makecrct.o:
+	$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/tools/makecrct.c
+
+zlibrc.o: $(SRCDIR)/win32/zlib$(SUFFIX)1.rc
+	$(RC) $(RCFLAGS) -o $@ $(SRCDIR)/win32/zlib$(SUFFIX)1.rc
+
+.SUFFIXES: .lo
+
+%.o: $(SRCDIR)/%.c
+	$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $<
+
+%.lo: $(SRCDIR)/%.c
+	$(CC) $(SFLAGS) -DPIC $(INCLUDES) -c -o $@ $<
+
+gzlib.o: $(SRCDIR)/gzlib.c
+	$(CC) $(CFLAGS) -DWITH_GZFILEOP $(INCLUDES) -c -o $@ $<
+
+gzlib.lo: $(SRCDIR)/gzlib.c
+	$(CC) $(SFLAGS) -DPIC -DWITH_GZFILEOP $(INCLUDES) -c -o $@ $<
+
+gzread.o: gzread.c
+	$(CC) $(CFLAGS) -DWITH_GZFILEOP $(INCLUDES) -c -o $@ $<
+
+gzread.lo: gzread.c
+	$(CC) $(SFLAGS) -DPIC -DWITH_GZFILEOP $(INCLUDES) -c -o $@ $<
+
+gzwrite.o: $(SRCDIR)/gzwrite.c
+	$(CC) $(CFLAGS) -DWITH_GZFILEOP $(INCLUDES) -c -o $@ $<
+
+gzwrite.lo: $(SRCDIR)/gzwrite.c
+	$(CC) $(SFLAGS) -DPIC -DWITH_GZFILEOP $(INCLUDES) -c -o $@ $<
+
+$(SHAREDTARGET): $(PIC_OBJS) $(DEFFILE) $(RCOBJS)
+ifneq ($(SHAREDTARGET),)
+	$(LDSHARED) $(CFLAGS) $(LDSHAREDFLAGS) $(LDFLAGS) -o $@ $(DEFFILE) $(PIC_OBJS) $(RCOBJS) $(LDSHAREDLIBC)
+ifneq ($(STRIP),)
+	$(STRIP) $@
+endif
+ifneq ($(SHAREDLIB),$(SHAREDTARGET))
+	rm -f $(SHAREDLIB) $(SHAREDLIBM)
+	ln -s $@ $(SHAREDLIB)
+	ln -s $@ $(SHAREDLIBM)
+endif
+endif
+
+example$(EXE): example.o $(TESTOBJG) $(STATICLIB)
+	$(CC) $(CFLAGS) $(LDFLAGS) -o $@ example.o $(TESTOBJG) $(TEST_LIBS) $(LDSHAREDLIBC)
+ifneq ($(STRIP),)
+	$(STRIP) $@
+endif
+
+minigzip$(EXE): minigzip.o $(TESTOBJG) $(STATICLIB)
+	$(CC) $(CFLAGS) $(LDFLAGS) -o $@ minigzip.o $(TESTOBJG) $(TEST_LIBS) $(LDSHAREDLIBC)
+ifneq ($(STRIP),)
+	$(STRIP) $@
+endif
+
+minigzipsh$(EXE): minigzip.o $(PIC_TESTOBJG) $(SHAREDTARGET)
+	$(CC) $(CFLAGS) $(LDFLAGS) -o $@ minigzip.o $(PIC_TESTOBJG) $(SHAREDLIB) $(LDSHAREDLIBC)
+ifneq ($(STRIP),)
+	$(STRIP) $@
+endif
+
+
+examplesh$(EXE): example.o $(PIC_TESTOBJG) $(SHAREDTARGET)
+	$(CC) $(CFLAGS) $(LDFLAGS) -o $@ example.o $(PIC_TESTOBJG) $(SHAREDLIB) $(LDSHAREDLIBC)
+ifneq ($(STRIP),)
+	$(STRIP) $@
+endif
+
+makefixed$(EXE): makefixed.o $(STATICLIB)
+	$(CC) $(CFLAGS) $(LDFLAGS) -o $@ makefixed.o $(TEST_LIBS) $(LDSHAREDLIBC)
+ifneq ($(STRIP),)
+	$(STRIP) $@
+endif
+
+maketrees$(EXE): maketrees.o $(STATICLIB)
+	$(CC) $(CFLAGS) $(LDFLAGS) -o $@ maketrees.o $(TEST_LIBS) $(LDSHAREDLIBC)
+ifneq ($(STRIP),)
+	$(STRIP) $@
+endif
+
+makecrct$(EXE): makecrct.o $(STATICLIB)
+	$(CC) $(CFLAGS) $(LDFLAGS) -o $@ makecrct.o $(TEST_LIBS) $(LDSHAREDLIBC)
+ifneq ($(STRIP),)
+	$(STRIP) $@
+endif
+
+install-shared: $(SHAREDTARGET)
+ifneq ($(SHAREDTARGET),)
+	-@if [ ! -d $(DESTDIR)$(sharedlibdir) ]; then mkdir -p $(DESTDIR)$(sharedlibdir); fi
+	rm -f $(DESTDIR)$(sharedlibdir)/$(SHAREDTARGET)
+	cp $(SHAREDTARGET) $(DESTDIR)$(sharedlibdir)
+	chmod 755 $(DESTDIR)$(sharedlibdir)/$(SHAREDTARGET)
+ifneq ($(SHAREDLIB),$(SHAREDTARGET))
+	rm -f $(DESTDIR)$(sharedlibdir)/$(SHAREDLIB) $(DESTDIR)$(sharedlibdir)/$(SHAREDLIBM)
+	ln -s $(SHAREDLIBV) $(DESTDIR)$(sharedlibdir)/$(SHAREDLIB)
+	ln -s $(SHAREDLIBV) $(DESTDIR)$(sharedlibdir)/$(SHAREDLIBM)
+	($(LDCONFIG) || true)  >/dev/null 2>&1
+# ldconfig is for Linux
+endif
+ifneq ($(IMPORTLIB),)
+	cp $(IMPORTLIB) $(DESTDIR)$(sharedlibdir)
+	chmod 644 $(DESTDIR)$(sharedlibdir)/$(IMPORTLIB)
+endif
+endif
+
+install-static: $(STATICLIB)
+	-@if [ ! -d $(DESTDIR)$(libdir)       ]; then mkdir -p $(DESTDIR)$(libdir); fi
+	rm -f $(DESTDIR)$(libdir)/$(STATICLIB)
+	cp $(STATICLIB) $(DESTDIR)$(libdir)
+	chmod 644 $(DESTDIR)$(libdir)/$(STATICLIB)
+	-@($(RANLIB) $(DESTDIR)$(libdir)/$(STATICLIB) || true) >/dev/null 2>&1
+# The ranlib in install-static is needed on NeXTSTEP which checks file times
+
+install-libs: install-shared install-static
+	-@if [ ! -d $(DESTDIR)$(man3dir)      ]; then mkdir -p $(DESTDIR)$(man3dir); fi
+	-@if [ ! -d $(DESTDIR)$(pkgconfigdir) ]; then mkdir -p $(DESTDIR)$(pkgconfigdir); fi
+	rm -f $(DESTDIR)$(pkgconfigdir)/$(PKGFILE)
+	cp $(PKGFILE) $(DESTDIR)$(pkgconfigdir)
+	chmod 644 $(DESTDIR)$(pkgconfigdir)/$(PKGFILE)
+
+install: install-libs
+	-@if [ ! -d $(DESTDIR)$(includedir)   ]; then mkdir -p $(DESTDIR)$(includedir); fi
+	rm -f $(DESTDIR)$(includedir)/zlib$(SUFFIX).h $(DESTDIR)$(includedir)/zconf$(SUFFIX).h $(DESTDIR)$(includedir)/zlib_name_mangling$(SUFFIX).h
+	cp zlib$(SUFFIX).h $(DESTDIR)$(includedir)/zlib$(SUFFIX).h
+	cp zconf$(SUFFIX).h $(DESTDIR)$(includedir)/zconf$(SUFFIX).h
+	cp zlib_name_mangling$(SUFFIX).h $(DESTDIR)$(includedir)/zlib_name_mangling$(SUFFIX).h
+	chmod 644 $(DESTDIR)$(includedir)/zlib$(SUFFIX).h $(DESTDIR)$(includedir)/zconf$(SUFFIX).h $(DESTDIR)$(includedir)/zlib_name_mangling$(SUFFIX).h
+
+uninstall-static:
+	cd $(DESTDIR)$(libdir) && rm -f $(STATICLIB)
+
+uninstall-shared:
+ifneq ($(SHAREDLIB),)
+	cd $(DESTDIR)$(sharedlibdir) && rm -f $(SHAREDLIBV) $(SHAREDLIB) $(SHAREDLIBM)
+endif
+ifneq ($(IMPORTLIB),)
+	cd $(DESTDIR)$(sharedlibdir) && rm -f $(IMPORTLIB)
+endif
+
+uninstall: uninstall-static uninstall-shared
+	cd $(DESTDIR)$(includedir) && rm -f zlib$(SUFFIX).h zconf$(SUFFIX).h zlib_name_mangling$(SUFFIX).h
+	cd $(DESTDIR)$(pkgconfigdir) && rm -f $(PKGFILE)
+
+mostlyclean: clean
+clean:
+	@if [ -f $(ARCHDIR)/Makefile ]; then $(MAKE) -C $(ARCHDIR) clean; fi
+	@if [ -f test/Makefile ]; then $(MAKE) -C test clean; fi
+	rm -f *.o *.lo *~ \
+	   example$(EXE) minigzip$(EXE) minigzipsh$(EXE) \
+	   infcover makefixed$(EXE) maketrees$(EXE) makecrct$(EXE) \
+	   $(STATICLIB) $(IMPORTLIB) $(SHAREDLIB) $(SHAREDLIBV) $(SHAREDLIBM) \
+	   foo.gz so_locations \
+	   _match.s maketree
+	rm -rf objs
+	rm -f *.gcda *.gcno *.gcov
+	rm -f a.out a.exe
+	rm -f *._h
+	rm -rf btmp1 btmp2 pkgtmp1 pkgtmp2
+
+maintainer-clean: distclean
+distclean: clean
+	@if [ -f $(ARCHDIR)/Makefile ]; then $(MAKE) -C $(ARCHDIR) distclean; fi
+	@if [ -f test/Makefile ]; then $(MAKE) -C test distclean; fi
+	rm -f $(PKGFILE) configure.log zconf.h zconf.h.cmakein zlib$(SUFFIX).h zlib_name_mangling$(SUFFIX)}.h *.pc
+	-@rm -f .DS_Store
+# Reset Makefile if building inside source tree
+	@if [ -f Makefile.in ]; then \
+	printf 'all:\n\t-@echo "Please use ./configure first.  Thank you."\n' > Makefile ; \
+	printf '\ndistclean:\n\t$(MAKE) -f Makefile.in distclean\n' >> Makefile ; \
+	touch -r $(SRCDIR)/Makefile.in Makefile ; fi
+# Reset zconf.h and zconf.h.cmakein if building inside source tree
+	@if [ -f zconf.h.in ]; then \
+	cp -p $(SRCDIR)/zconf.h.in zconf.h ; \
+	grep -v '^#cmakedefine' $(SRCDIR)/zconf.h.in > zconf.h.cmakein &&\
+	touch -r $(SRCDIR)/zconf.h.in zconf.h.cmakein ; fi
+# Cleanup these files if building outside source tree
+	@if [ ! -f README.md ]; then rm -f Makefile; fi
+# Remove arch and test directory if building outside source tree
+	@if [ ! -f $(ARCHDIR)/Makefile.in ]; then rm -rf arch; fi
+	@if [ ! -f test/Makefile.in ]; then rm -rf test; fi
+
+tags:
+	etags $(SRCDIR)/*.[ch]
--- a/deps/zlib-ng/PORTING.md
+++ b/deps/zlib-ng/PORTING.md
@@ -0,0 +1,79 @@
+Porting applications to use zlib-ng
+===================================
+
+Zlib-ng can be used/compiled in two different modes, that require some
+consideration by the application developer.
+
+zlib-compat mode
+----------------
+Zlib-ng can be compiled in zlib-compat mode, suitable for zlib-replacement
+in a single application or system-wide.
+
+Please note that zlib-ng in zlib-compat mode tries to maintain both API and
+ABI compatibility with the original zlib. Any issues regarding compatibility
+can be reported as bugs.
+
+In certain instances you may not be able to simply replace the zlib library/dll
+files and expect the application to work. The application may need to be
+recompiled against the zlib-ng headers and libs to ensure full compatibility.
+
+It is also possible for the deflate output stream to differ from the original
+zlib due to algorithmic differences between the two libraries. Any tests or
+applications that depend on the exact length of the deflate stream being a
+certain value will need to be updated.
+
+**Advantages:**
+- Easy to port to, since it only requires a recompile of the application and
+  no changes to the application code.
+
+**Disadvantages:**
+- Can conflict with a system-installed zlib, as that can often be linked in
+  by another library you are linking into your application. This can cause
+  crashes or incorrect output.
+- If your application is pre-allocating a memory buffer and you are providing
+  deflate/inflate init with your own allocator that allocates from that buffer
+  (looking at you nginx), you should be aware that zlib-ng needs to allocate
+  more memory than stock zlib needs. The same problem exists with Intel’s and
+  Cloudflare’s zlib forks. Doing this is not recommended since it makes it
+  very hard to maintain compatibility over time.
+
+**Build Considerations:**
+- Compile against the *zlib.h* provided by zlib-ng
+- Configuration header is named *zconf.h*
+- Static library is *libz.a* on Unix and macOS, or *zlib.lib* on Windows
+- Shared library is *libz.so* on Unix, *libz.dylib* on macOS, or *zlib1.dll*
+  on Windows
+- Type `z_size_t` is *unsigned __int64* on 64-bit Windows, and *unsigned long* on 32-bit Windows, Unix and macOS
+- Type `z_uintmax_t` is *unsigned long* in zlib-compat mode, and *size_t* with zlib-ng API
+
+zlib-ng native mode
+-------------------
+Zlib-ng in native mode is suitable for co-existing with the standard zlib
+library, allowing applications to implement support and testing separately.
+
+The zlib-ng native has implemented some modernization and simplifications
+in its API, intended to make life easier for application developers.
+
+**Advantages:**
+- Does not conflict with other zlib implementations, and can co-exist as a
+  system library along with zlib.
+- In certain places zlib-ng native uses more appropriate data types, removing
+  the need for some workarounds in the API compared to zlib.
+
+**Disadvantages:**
+- Requires minor changes to applications to use the prefixed zlib-ng
+  function calls and structs. Usually this means a small prefix `zng_` has to be added.
+
+**Build Considerations:**
+- Compile against *zlib-ng.h*
+- Configuration header is named *zconf-ng.h*
+- Static library is *libz-ng.a* on Unix and macOS, or *zlib-ng.lib* on Windows
+- Shared library is *libz-ng.so* on Unix, *libz-ng.dylib* on macOS, or
+  *zlib-ng2.dll* on Windows
+- Type `z_size_t` is *size_t*
+
+zlib-ng compile-time detection
+------------------------------
+
+To distinguish zlib-ng from other zlib implementations at compile-time check for the
+existence of `ZLIBNG_VERSION` defined in the zlib header.
--- a/deps/zlib-ng/README.md
+++ b/deps/zlib-ng/README.md
@@ -0,0 +1,216 @@
+| CI | Stable | Develop |
+|:---|:-------|:--------|
+| GitHub Actions | [![Stable Status](https://github.com/zlib-ng/zlib-ng/workflows/CI%20CMake/badge.svg?branch=stable)](https://github.com/zlib-ng/zlib-ng/actions) <br> [![Stable Status](https://github.com/zlib-ng/zlib-ng/workflows/CI%20Configure/badge.svg?branch=stable)](https://github.com/zlib-ng/zlib-ng/actions) <br> [![Stable Status](https://github.com/zlib-ng/zlib-ng/workflows/CI%20NMake/badge.svg?branch=stable)](https://github.com/zlib-ng/zlib-ng/actions) | [![Develop Status](https://github.com/zlib-ng/zlib-ng/workflows/CI%20CMake/badge.svg?branch=develop)](https://github.com/zlib-ng/zlib-ng/actions) <br> [![Develop Status](https://github.com/zlib-ng/zlib-ng/workflows/CI%20Configure/badge.svg?branch=develop)](https://github.com/zlib-ng/zlib-ng/actions) <br> [![Develop Status](https://github.com/zlib-ng/zlib-ng/workflows/CI%20NMake/badge.svg?branch=develop)](https://github.com/zlib-ng/zlib-ng/actions) |
+| CodeFactor     | [![CodeFactor](https://www.codefactor.io/repository/github/zlib-ng/zlib-ng/badge/stable)](https://www.codefactor.io/repository/github/zlib-ng/zlib-ng/overview/stable) | [![CodeFactor](https://www.codefactor.io/repository/github/zlib-ng/zlib-ng/badge/develop)](https://www.codefactor.io/repository/github/zlib-ng/zlib-ng/overview/develop) |
+| OSS-Fuzz       | [![Fuzzing Status](https://oss-fuzz-build-logs.storage.googleapis.com/badges/zlib-ng.svg)](https://bugs.chromium.org/p/oss-fuzz/issues/list?sort=-opened&can=1&q=proj:zlib-ng) | [![Fuzzing Status](https://oss-fuzz-build-logs.storage.googleapis.com/badges/zlib-ng.svg)](https://bugs.chromium.org/p/oss-fuzz/issues/list?sort=-opened&can=1&q=proj:zlib-ng) |
+| Codecov        | [![codecov](https://codecov.io/github/zlib-ng/zlib-ng/branch/stable/graph/badge.svg?token=uKsgK9LIuC)](https://codecov.io/github/zlib-ng/zlib-ng/tree/stable) | [![codecov](https://codecov.io/github/zlib-ng/zlib-ng/branch/develop/graph/badge.svg?token=uKsgK9LIuC)](https://codecov.io/github/zlib-ng/zlib-ng/tree/develop) |
+
+## zlib-ng
+*zlib data compression library for the next generation systems*
+
+Maintained by Hans Kristian Rosbach
+          aka Dead2 (zlib-ng àt circlestorm dót org)
+
+Features
+--------
+
+* Zlib compatible API with support for dual-linking
+* Modernized native API based on zlib API for ease of porting
+* Modern C11 syntax and a clean code layout
+* Deflate medium and quick algorithms based on Intel’s zlib fork
+* Support for CPU intrinsics when available
+  * Adler32 implementation using SSSE3, AVX2, AVX512, AVX512-VNNI, Neon, VMX & VSX
+  * CRC32-B implementation using PCLMULQDQ, VPCLMULQDQ, ACLE, & IBM Z
+  * Hash table implementation using CRC32-C intrinsics on x86 and ARM
+  * Slide hash implementations using SSE2, AVX2, Neon, VMX & VSX
+  * Compare256 implementations using SSE2, AVX2, Neon, & POWER9
+  * Inflate chunk copying using SSE2, SSSE3, AVX, Neon & VSX
+  * Support for hardware-accelerated deflate using IBM Z DFLTCC
+* Unaligned memory read/writes and large bit buffer improvements
+* Includes improvements from Cloudflare and Intel forks
+* Configure, CMake, and NMake build system support
+* Comprehensive set of CMake unit tests
+* Code sanitizers, fuzzing, and coverage
+* GitHub Actions continuous integration on Windows, macOS, and Linux
+  * Emulated CI for ARM, AARCH64, PPC, PPC64, RISCV, SPARC64, S390x using qemu
+
+
+History
+-------
+
+The motivation for this fork was seeing several 3rd party contributions with new optimizations not getting
+implemented into the official zlib repository.
+
+Mark Adler has been maintaining zlib for a very long time, and he has done a great job and hopefully he will continue
+for a long time yet. The idea of zlib-ng is not to replace zlib, but to co-exist as a drop-in replacement with a
+lower threshold for code change.
+
+zlib has a long history and is incredibly portable, even supporting many systems that predate the Internet.<br>
+That is great, but it can complicate further development and maintainability. The zlib code contains many workarounds
+for really old compilers or to accommodate systems with limitations such as operating in a 16-bit environment.
+
+Many of these workarounds are only maintenance burdens, some of them are pretty huge code-wise. With many workarounds
+cluttered throughout the code, it makes it harder for new programmers with an idea/interest for zlib to contribute.
+
+I decided to make a fork, merge all the Intel optimizations, some of the Cloudflare optimizations, plus a couple other
+smaller patches. Then started cleaning out workarounds, various dead code, all contrib and example code.<br>
+The result is a better performing and easier to maintain zlib-ng.
+
+A lot of improvements have gone into zlib-ng since its start, and numerous people and companies have contributed both
+small and big improvements, or valuable testing.
+
+
+Build
+-----
+<sup>Please read LICENSE.md, it is very simple and very liberal.</sup>
+
+There are two ways to build zlib-ng:
+
+### Cmake
+
+To build zlib-ng using the cross-platform makefile generator cmake.
+
+```
+cmake .
+cmake --build . --config Release
+ctest --verbose -C Release
+```
+
+Alternatively, you can use the cmake configuration GUI tool ccmake:
+
+```
+ccmake .
+```
+
+### Configure
+
+To build zlib-ng using the bash configure script:
+
+```
+./configure
+make
+make test
+```
+
+Build Options
+-------------
+
+| CMake                    | configure                | Description                                                                           | Default |
+|:-------------------------|:-------------------------|:--------------------------------------------------------------------------------------|---------|
+| ZLIB_COMPAT              | --zlib-compat            | Compile with zlib compatible API                                                      | OFF     |
+| ZLIB_ENABLE_TESTS        |                          | Build test binaries                                                                   | ON      |
+| WITH_GZFILEOP            | --without-gzfileops      | Compile with support for gzFile related functions                                     | ON      |
+| WITH_OPTIM               | --without-optimizations  | Build with optimisations                                                              | ON      |
+| WITH_NEW_STRATEGIES      | --without-new-strategies | Use new strategies                                                                    | ON      |
+| WITH_NATIVE_INSTRUCTIONS | --native                 | Compiles with full instruction set supported on this host (gcc/clang -march=native)   | OFF     |
+| WITH_SANITIZER           |                          | Build with sanitizer (memory, address, undefined)                                     | OFF     |
+| WITH_GTEST               |                          | Build gtest_zlib                                                                      | ON      |
+| WITH_FUZZERS             |                          | Build test/fuzz                                                                       | OFF     |
+| WITH_BENCHMARKS          |                          | Build test/benchmarks                                                                 | OFF     |
+| WITH_MAINTAINER_WARNINGS |                          | Build with project maintainer warnings                                                | OFF     |
+| WITH_CODE_COVERAGE       |                          | Enable code coverage reporting                                                        | OFF     |
+
+
+Install
+-------
+
+WARNING: We do not recommend manually installing unless you really know what you are doing, because this can
+potentially override the system default zlib library, and any incompatibility or wrong configuration of zlib-ng
+can make the whole system unusable, requiring recovery or reinstall.
+If you still want a manual install, we recommend using the /opt/ path prefix.
+
+For Linux distros, an alternative way to use zlib-ng (if compiled in zlib-compat mode) instead of zlib, is through
+the use of the _LD_PRELOAD_ environment variable. If the program is dynamically linked with zlib, then the program
+will temporarily attempt to use zlib-ng instead, without risking system-wide instability.
+
+```
+LD_PRELOAD=/opt/zlib-ng/libz.so.1.2.13.zlib-ng /usr/bin/program
+```
+
+### Cmake
+
+To install zlib-ng system-wide using cmake:
+
+```
+cmake --build . --target install
+```
+
+### Configure
+
+To install zlib-ng system-wide using the configure script:
+
+```
+make install
+```
+
+### Vcpkg
+
+Alternatively, you can build and install zlib-ng using the [vcpkg](https://github.com/Microsoft/vcpkg/) dependency manager:
+
+```sh or powershell
+git clone https://github.com/Microsoft/vcpkg.git
+cd vcpkg
+./bootstrap-vcpkg.sh # "./bootstrap-vcpkg.bat" for powershell
+./vcpkg integrate install
+./vcpkg install zlib-ng
+```
+
+The zlib-ng port in vcpkg is kept up to date by Microsoft team members and community contributors.
+If the version is out of date, please [create an issue or pull request](https://github.com/Microsoft/vcpkg) on the vcpkg repository.
+
+Contributing
+------------
+
+Zlib-ng is aiming to be open to contributions, and we would be delighted to receive pull requests on github.
+Help with testing and reviewing pull requests etc is also very much appreciated.
+
+Please check the Wiki for more info: [Contributing](https://github.com/zlib-ng/zlib-ng/wiki/Contributing)
+
+Acknowledgments
+----------------
+
+Thanks go out to all the people and companies who have taken the time to contribute
+code reviews, testing and/or patches. Zlib-ng would not have been nearly as good without you.
+
+The deflate format used by zlib was defined by Phil Katz.<br>
+The deflate and zlib specifications were written by L. Peter Deutsch.
+
+zlib was originally created by Jean-loup Gailly (compression) and Mark Adler (decompression).
+
+
+Advanced Build Options
+----------------------
+
+| CMake                           | configure             | Description                                                         | Default                |
+|:--------------------------------|:----------------------|:--------------------------------------------------------------------|------------------------|
+| FORCE_SSE2                      | --force-sse2          | Skip runtime check for SSE2 instructions (Always on for x86_64)     | OFF (x86)              |
+| WITH_AVX2                       |                       | Build with AVX2 intrinsics                                          | ON                     |
+| WITH_AVX512                     |                       | Build with AVX512 intrinsics                                        | ON                     |
+| WITH_AVX512VNNI                 |                       | Build with AVX512VNNI intrinsics                                    | ON                     |
+| WITH_SSE2                       |                       | Build with SSE2 intrinsics                                          | ON                     |
+| WITH_SSSE3                      |                       | Build with SSSE3 intrinsics                                         | ON                     |
+| WITH_SSE42                      |                       | Build with SSE42 intrinsics                                         | ON                     |
+| WITH_PCLMULQDQ                  |                       | Build with PCLMULQDQ intrinsics                                     | ON                     |
+| WITH_VPCLMULQDQ                 | --without-vpclmulqdq  | Build with VPCLMULQDQ intrinsics                                    | ON                     |
+| WITH_ACLE                       | --without-acle        | Build with ACLE intrinsics                                          | ON                     |
+| WITH_NEON                       | --without-neon        | Build with NEON intrinsics                                          | ON                     |
+| WITH_ALTIVEC                    | --without-altivec     | Build with AltiVec (VMX) intrinsics                                 | ON                     |
+| WITH_POWER8                     | --without-power8      | Build with POWER8 optimisations                                     | ON                     |
+| WITH_RVV                        |                       | Build with RVV intrinsics                                           | ON                     |
+| WITH_CRC32_VX                   | --without-crc32-vx    | Build with vectorized CRC32 on IBM Z                                | ON                     |
+| WITH_DFLTCC_DEFLATE             | --with-dfltcc-deflate | Build with DFLTCC intrinsics for compression on IBM Z               | OFF                    |
+| WITH_DFLTCC_INFLATE             | --with-dfltcc-inflate | Build with DFLTCC intrinsics for decompression on IBM Z             | OFF                    |
+| WITH_UNALIGNED                  | --without-unaligned   | Allow optimizations that use unaligned reads if safe on current arch| ON                     |
+| WITH_INFLATE_STRICT             |                       | Build with strict inflate distance checking                         | OFF                    |
+| WITH_INFLATE_ALLOW_INVALID_DIST |                       | Build with zero fill for inflate invalid distances                  | OFF                    |
+| INSTALL_UTILS                   |                       | Copy minigzip and minideflate during install                        | OFF                    |
+| ZLIBNG_ENABLE_TESTS             |                       | Test zlib-ng specific API                                           | ON                     |
+
+
+Related Projects
+----------------
+
+* Fork of the popular minizip                   https://github.com/zlib-ng/minizip-ng
+* Python tool to benchmark minigzip/minideflate https://github.com/zlib-ng/deflatebench
+* Python tool to benchmark pigz                 https://github.com/zlib-ng/pigzbench
+* 3rd party patches for zlib-ng compatibility   https://github.com/zlib-ng/patches
--- a/deps/zlib-ng/adler32.c
+++ b/deps/zlib-ng/adler32.c
@@ -0,0 +1,115 @@
+/* adler32.c -- compute the Adler-32 checksum of a data stream
+ * Copyright (C) 1995-2011, 2016 Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "zbuild.h"
+#include "functable.h"
+#include "adler32_p.h"
+
+/* ========================================================================= */
+Z_INTERNAL uint32_t adler32_c(uint32_t adler, const uint8_t *buf, size_t len) {
+    uint32_t sum2;
+    unsigned n;
+
+    /* split Adler-32 into component sums */
+    sum2 = (adler >> 16) & 0xffff;
+    adler &= 0xffff;
+
+    /* in case user likes doing a byte at a time, keep it fast */
+    if (UNLIKELY(len == 1))
+        return adler32_len_1(adler, buf, sum2);
+
+    /* initial Adler-32 value (deferred check for len == 1 speed) */
+    if (UNLIKELY(buf == NULL))
+        return 1L;
+
+    /* in case short lengths are provided, keep it somewhat fast */
+    if (UNLIKELY(len < 16))
+        return adler32_len_16(adler, buf, len, sum2);
+
+    /* do length NMAX blocks -- requires just one modulo operation */
+    while (len >= NMAX) {
+        len -= NMAX;
+#ifdef UNROLL_MORE
+        n = NMAX / 16;          /* NMAX is divisible by 16 */
+#else
+        n = NMAX / 8;           /* NMAX is divisible by 8 */
+#endif
+        do {
+#ifdef UNROLL_MORE
+            DO16(adler, sum2, buf);          /* 16 sums unrolled */
+            buf += 16;
+#else
+            DO8(adler, sum2, buf, 0);         /* 8 sums unrolled */
+            buf += 8;
+#endif
+        } while (--n);
+        adler %= BASE;
+        sum2 %= BASE;
+    }
+
+    /* do remaining bytes (less than NMAX, still just one modulo) */
+    return adler32_len_64(adler, buf, len, sum2);
+}
+
+#ifdef ZLIB_COMPAT
+unsigned long Z_EXPORT PREFIX(adler32_z)(unsigned long adler, const unsigned char *buf, size_t len) {
+    return (unsigned long)functable.adler32((uint32_t)adler, buf, len);
+}
+#else
+uint32_t Z_EXPORT PREFIX(adler32_z)(uint32_t adler, const unsigned char *buf, size_t len) {
+    return functable.adler32(adler, buf, len);
+}
+#endif
+
+/* ========================================================================= */
+#ifdef ZLIB_COMPAT
+unsigned long Z_EXPORT PREFIX(adler32)(unsigned long adler, const unsigned char *buf, unsigned int len) {
+    return (unsigned long)functable.adler32((uint32_t)adler, buf, len);
+}
+#else
+uint32_t Z_EXPORT PREFIX(adler32)(uint32_t adler, const unsigned char *buf, uint32_t len) {
+    return functable.adler32(adler, buf, len);
+}
+#endif
+
+/* ========================================================================= */
+static uint32_t adler32_combine_(uint32_t adler1, uint32_t adler2, z_off64_t len2) {
+    uint32_t sum1;
+    uint32_t sum2;
+    unsigned rem;
+
+    /* for negative len, return invalid adler32 as a clue for debugging */
+    if (len2 < 0)
+        return 0xffffffff;
+
+    /* the derivation of this formula is left as an exercise for the reader */
+    len2 %= BASE;                 /* assumes len2 >= 0 */
+    rem = (unsigned)len2;
+    sum1 = adler1 & 0xffff;
+    sum2 = rem * sum1;
+    sum2 %= BASE;
+    sum1 += (adler2 & 0xffff) + BASE - 1;
+    sum2 += ((adler1 >> 16) & 0xffff) + ((adler2 >> 16) & 0xffff) + BASE - rem;
+    if (sum1 >= BASE) sum1 -= BASE;
+    if (sum1 >= BASE) sum1 -= BASE;
+    if (sum2 >= ((unsigned long)BASE << 1)) sum2 -= ((unsigned long)BASE << 1);
+    if (sum2 >= BASE) sum2 -= BASE;
+    return sum1 | (sum2 << 16);
+}
+
+/* ========================================================================= */
+#ifdef ZLIB_COMPAT
+unsigned long Z_EXPORT PREFIX(adler32_combine)(unsigned long adler1, unsigned long adler2, z_off_t len2) {
+    return (unsigned long)adler32_combine_((uint32_t)adler1, (uint32_t)adler2, len2);
+}
+
+unsigned long Z_EXPORT PREFIX4(adler32_combine)(unsigned long adler1, unsigned long adler2, z_off64_t len2) {
+    return (unsigned long)adler32_combine_((uint32_t)adler1, (uint32_t)adler2, len2);
+}
+#else
+uint32_t Z_EXPORT PREFIX4(adler32_combine)(uint32_t adler1, uint32_t adler2, z_off64_t len2) {
+    return adler32_combine_(adler1, adler2, len2);
+}
+#endif
--- a/deps/zlib-ng/adler32_fold.c
+++ b/deps/zlib-ng/adler32_fold.c
@@ -0,0 +1,16 @@
+/* adler32_fold.c -- adler32 folding interface
+ * Copyright (C) 2022 Adam Stylinski
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "zbuild.h"
+#include "functable.h"
+#include "adler32_fold.h"
+
+#include <limits.h>
+
+Z_INTERNAL uint32_t adler32_fold_copy_c(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
+    adler = functable.adler32(adler, src, len);
+    memcpy(dst, src, len);
+    return adler;
+}
--- a/deps/zlib-ng/adler32_fold.h
+++ b/deps/zlib-ng/adler32_fold.h
@@ -0,0 +1,11 @@
+/* adler32_fold.h -- adler32 folding interface
+ * Copyright (C) 2022 Adam Stylinski
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef ADLER32_FOLD_H_
+#define ADLER32_FOLD_H_
+
+Z_INTERNAL uint32_t adler32_fold_copy_c(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
+
+#endif
--- a/deps/zlib-ng/adler32_p.h
+++ b/deps/zlib-ng/adler32_p.h
@@ -0,0 +1,70 @@
+/* adler32_p.h -- Private inline functions and macros shared with
+ *                different computation of the Adler-32 checksum
+ *                of a data stream.
+ * Copyright (C) 1995-2011, 2016 Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef ADLER32_P_H
+#define ADLER32_P_H
+
+#define BASE 65521U     /* largest prime smaller than 65536 */
+#define NMAX 5552
+/* NMAX is the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 */
+
+#define DO1(sum1, sum2, buf, i)  {(sum1) += buf[(i)]; (sum2) += (sum1);}
+#define DO2(sum1, sum2, buf, i)  {DO1(sum1, sum2, buf, i); DO1(sum1, sum2, buf, i+1);}
+#define DO4(sum1, sum2, buf, i)  {DO2(sum1, sum2, buf, i); DO2(sum1, sum2, buf, i+2);}
+#define DO8(sum1, sum2, buf, i)  {DO4(sum1, sum2, buf, i); DO4(sum1, sum2, buf, i+4);}
+#define DO16(sum1, sum2, buf)    {DO8(sum1, sum2, buf, 0); DO8(sum1, sum2, buf, 8);}
+
+static inline uint32_t adler32_len_1(uint32_t adler, const uint8_t *buf, uint32_t sum2) {
+    adler += buf[0];
+    adler %= BASE;
+    sum2 += adler;
+    sum2 %= BASE;
+    return adler | (sum2 << 16);
+}
+
+static inline uint32_t adler32_len_16(uint32_t adler, const uint8_t *buf, size_t len, uint32_t sum2) {
+    while (len) {
+        --len;
+        adler += *buf++;
+        sum2 += adler;
+    }
+    adler %= BASE;
+    sum2 %= BASE;            /* only added so many BASE's */
+    /* return recombined sums */
+    return adler | (sum2 << 16);
+}
+
+static inline uint32_t adler32_copy_len_16(uint32_t adler, const uint8_t *buf, uint8_t *dst, size_t len, uint32_t sum2) {
+    while (len--) {
+        *dst = *buf++;
+        adler += *dst++;
+        sum2 += adler;
+    }
+    adler %= BASE;
+    sum2 %= BASE;            /* only added so many BASE's */
+    /* return recombined sums */
+    return adler | (sum2 << 16);
+}
+
+static inline uint32_t adler32_len_64(uint32_t adler, const uint8_t *buf, size_t len, uint32_t sum2) {
+#ifdef UNROLL_MORE
+    while (len >= 16) {
+        len -= 16;
+        DO16(adler, sum2, buf);
+        buf += 16;
+#else
+    while (len >= 8) {
+        len -= 8;
+        DO8(adler, sum2, buf, 0);
+        buf += 8;
+#endif
+    }
+    /* Process tail (len < 16).  */
+    return adler32_len_16(adler, buf, len, sum2);
+}
+
+#endif /* ADLER32_P_H */
--- a/deps/zlib-ng/arch/.gitignore
+++ b/deps/zlib-ng/arch/.gitignore
@@ -0,0 +1,2 @@
+# ignore Makefiles; they're all automatically generated
+Makefile
--- a/deps/zlib-ng/arch/arm/Makefile.in
+++ b/deps/zlib-ng/arch/arm/Makefile.in
@@ -0,0 +1,77 @@
+# Makefile for zlib
+# Copyright (C) 1995-2013 Jean-loup Gailly, Mark Adler
+# For conditions of distribution and use, see copyright notice in zlib.h
+
+CC=
+CFLAGS=
+SFLAGS=
+INCLUDES=
+SUFFIX=
+
+ACLEFLAG=
+NEONFLAG=
+NOLTOFLAG=
+
+SRCDIR=.
+SRCTOP=../..
+TOPDIR=$(SRCTOP)
+
+all: \
+	adler32_neon.o adler32_neon.lo \
+	arm_features.o arm_features.lo \
+	chunkset_neon.o chunkset_neon.lo \
+	compare256_neon.o compare256_neon.lo \
+	crc32_acle.o crc32_acle.lo \
+	slide_hash_neon.o slide_hash_neon.lo \
+	insert_string_acle.o insert_string_acle.lo
+
+adler32_neon.o:
+	$(CC) $(CFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_neon.c
+
+adler32_neon.lo:
+	$(CC) $(SFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_neon.c
+
+arm_features.o:
+	$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/arm_features.c
+
+arm_features.lo:
+	$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/arm_features.c
+
+chunkset_neon.o:
+	$(CC) $(CFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_neon.c
+
+chunkset_neon.lo:
+	$(CC) $(SFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_neon.c
+
+compare256_neon.o:
+	$(CC) $(CFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_neon.c
+
+compare256_neon.lo:
+	$(CC) $(SFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_neon.c
+
+crc32_acle.o:
+	$(CC) $(CFLAGS) $(ACLEFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_acle.c
+
+crc32_acle.lo:
+	$(CC) $(SFLAGS) $(ACLEFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_acle.c
+
+slide_hash_neon.o:
+	$(CC) $(CFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_neon.c
+
+slide_hash_neon.lo:
+	$(CC) $(SFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_neon.c
+
+insert_string_acle.o:
+	$(CC) $(CFLAGS) $(ACLEFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_acle.c
+
+insert_string_acle.lo:
+	$(CC) $(SFLAGS) $(ACLEFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_acle.c
+
+mostlyclean: clean
+clean:
+	rm -f *.o *.lo *~
+	rm -rf objs
+	rm -f *.gcda *.gcno *.gcov
+
+distclean: clean
+	rm -f Makefile
--- a/deps/zlib-ng/arch/arm/adler32_neon.c
+++ b/deps/zlib-ng/arch/arm/adler32_neon.c
@@ -0,0 +1,215 @@
+/* Copyright (C) 1995-2011, 2016 Mark Adler
+ * Copyright (C) 2017 ARM Holdings Inc.
+ * Authors:
+ *   Adenilson Cavalcanti <adenilson.cavalcanti@arm.com>
+ *   Adam Stylinski <kungfujesus06@gmail.com>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+#ifdef ARM_NEON
+#include "neon_intrins.h"
+#include "../../zbuild.h"
+#include "../../adler32_p.h"
+
+static void NEON_accum32(uint32_t *s, const uint8_t *buf, size_t len) {
+    static const uint16_t ALIGNED_(16) taps[64] = {
+        64, 63, 62, 61, 60, 59, 58, 57,
+        56, 55, 54, 53, 52, 51, 50, 49,
+        48, 47, 46, 45, 44, 43, 42, 41,
+        40, 39, 38, 37, 36, 35, 34, 33,
+        32, 31, 30, 29, 28, 27, 26, 25,
+        24, 23, 22, 21, 20, 19, 18, 17,
+        16, 15, 14, 13, 12, 11, 10, 9,
+        8, 7, 6, 5, 4, 3, 2, 1 };
+
+    uint32x4_t adacc = vdupq_n_u32(0);
+    uint32x4_t s2acc = vdupq_n_u32(0);
+    uint32x4_t s2acc_0 = vdupq_n_u32(0);
+    uint32x4_t s2acc_1 = vdupq_n_u32(0);
+    uint32x4_t s2acc_2 = vdupq_n_u32(0);
+
+    adacc = vsetq_lane_u32(s[0], adacc, 0);
+    s2acc = vsetq_lane_u32(s[1], s2acc, 0);
+
+    uint32x4_t s3acc = vdupq_n_u32(0);
+    uint32x4_t adacc_prev = adacc;
+
+    uint16x8_t s2_0, s2_1, s2_2, s2_3;
+    s2_0 = s2_1 = s2_2 = s2_3 = vdupq_n_u16(0);
+
+    uint16x8_t s2_4, s2_5, s2_6, s2_7;
+    s2_4 = s2_5 = s2_6 = s2_7 = vdupq_n_u16(0);
+
+    size_t num_iter = len >> 2;
+    int rem = len & 3;
+
+    for (size_t i = 0; i < num_iter; ++i) {
+        uint8x16x4_t d0_d3 = vld1q_u8_x4(buf);
+
+        /* Unfortunately it doesn't look like there's a direct sum 8 bit to 32
+         * bit instruction, we'll have to make due summing to 16 bits first */
+        uint16x8x2_t hsum, hsum_fold;
+        hsum.val[0] = vpaddlq_u8(d0_d3.val[0]);
+        hsum.val[1] = vpaddlq_u8(d0_d3.val[1]);
+
+        hsum_fold.val[0] = vpadalq_u8(hsum.val[0], d0_d3.val[2]);
+        hsum_fold.val[1] = vpadalq_u8(hsum.val[1], d0_d3.val[3]);
+
+        adacc = vpadalq_u16(adacc, hsum_fold.val[0]);
+        s3acc = vaddq_u32(s3acc, adacc_prev);
+        adacc = vpadalq_u16(adacc, hsum_fold.val[1]);
+
+        /* If we do straight widening additions to the 16 bit values, we don't incur
+         * the usual penalties of a pairwise add. We can defer the multiplications
+         * until the very end. These will not overflow because we are incurring at
+         * most 408 loop iterations (NMAX / 64), and a given lane is only going to be
+         * summed into once. This means for the maximum input size, the largest value
+         * we will see is 255 * 102 = 26010, safely under uint16 max */
+        s2_0 = vaddw_u8(s2_0, vget_low_u8(d0_d3.val[0]));
+        s2_1 = vaddw_high_u8(s2_1, d0_d3.val[0]);
+        s2_2 = vaddw_u8(s2_2, vget_low_u8(d0_d3.val[1]));
+        s2_3 = vaddw_high_u8(s2_3, d0_d3.val[1]);
+        s2_4 = vaddw_u8(s2_4, vget_low_u8(d0_d3.val[2]));
+        s2_5 = vaddw_high_u8(s2_5, d0_d3.val[2]);
+        s2_6 = vaddw_u8(s2_6, vget_low_u8(d0_d3.val[3]));
+        s2_7 = vaddw_high_u8(s2_7, d0_d3.val[3]);
+
+        adacc_prev = adacc;
+        buf += 64;
+    }
+
+    s3acc = vshlq_n_u32(s3acc, 6);
+
+    if (rem) {
+        uint32x4_t s3acc_0 = vdupq_n_u32(0);
+        while (rem--) {
+            uint8x16_t d0 = vld1q_u8(buf);
+            uint16x8_t adler;
+            adler = vpaddlq_u8(d0);
+            s2_6 = vaddw_u8(s2_6, vget_low_u8(d0));
+            s2_7 = vaddw_high_u8(s2_7, d0);
+            adacc = vpadalq_u16(adacc, adler);
+            s3acc_0 = vaddq_u32(s3acc_0, adacc_prev);
+            adacc_prev = adacc;
+            buf += 16;
+        }
+
+        s3acc_0 = vshlq_n_u32(s3acc_0, 4);
+        s3acc = vaddq_u32(s3acc_0, s3acc);
+    }
+
+    uint16x8x4_t t0_t3 = vld1q_u16_x4(taps);
+    uint16x8x4_t t4_t7 = vld1q_u16_x4(taps + 32);
+
+    s2acc = vmlal_high_u16(s2acc, t0_t3.val[0], s2_0);
+    s2acc_0 = vmlal_u16(s2acc_0, vget_low_u16(t0_t3.val[0]), vget_low_u16(s2_0));
+    s2acc_1 = vmlal_high_u16(s2acc_1, t0_t3.val[1], s2_1);
+    s2acc_2 = vmlal_u16(s2acc_2, vget_low_u16(t0_t3.val[1]), vget_low_u16(s2_1));
+
+    s2acc = vmlal_high_u16(s2acc, t0_t3.val[2], s2_2);
+    s2acc_0 = vmlal_u16(s2acc_0, vget_low_u16(t0_t3.val[2]), vget_low_u16(s2_2));
+    s2acc_1 = vmlal_high_u16(s2acc_1, t0_t3.val[3], s2_3);
+    s2acc_2 = vmlal_u16(s2acc_2, vget_low_u16(t0_t3.val[3]), vget_low_u16(s2_3));
+
+    s2acc = vmlal_high_u16(s2acc, t4_t7.val[0], s2_4);
+    s2acc_0 = vmlal_u16(s2acc_0, vget_low_u16(t4_t7.val[0]), vget_low_u16(s2_4));
+    s2acc_1 = vmlal_high_u16(s2acc_1, t4_t7.val[1], s2_5);
+    s2acc_2 = vmlal_u16(s2acc_2, vget_low_u16(t4_t7.val[1]), vget_low_u16(s2_5));
+
+    s2acc = vmlal_high_u16(s2acc, t4_t7.val[2], s2_6);
+    s2acc_0 = vmlal_u16(s2acc_0, vget_low_u16(t4_t7.val[2]), vget_low_u16(s2_6));
+    s2acc_1 = vmlal_high_u16(s2acc_1, t4_t7.val[3], s2_7);
+    s2acc_2 = vmlal_u16(s2acc_2, vget_low_u16(t4_t7.val[3]), vget_low_u16(s2_7));
+
+    s2acc = vaddq_u32(s2acc_0, s2acc);
+    s2acc_2 = vaddq_u32(s2acc_1, s2acc_2);
+    s2acc = vaddq_u32(s2acc, s2acc_2);
+
+    uint32x2_t adacc2, s2acc2, as;
+    s2acc = vaddq_u32(s2acc, s3acc);
+    adacc2 = vpadd_u32(vget_low_u32(adacc), vget_high_u32(adacc));
+    s2acc2 = vpadd_u32(vget_low_u32(s2acc), vget_high_u32(s2acc));
+    as = vpadd_u32(adacc2, s2acc2);
+    s[0] = vget_lane_u32(as, 0);
+    s[1] = vget_lane_u32(as, 1);
+}
+
+static void NEON_handle_tail(uint32_t *pair, const uint8_t *buf, size_t len) {
+    unsigned int i;
+    for (i = 0; i < len; ++i) {
+        pair[0] += buf[i];
+        pair[1] += pair[0];
+    }
+}
+
+Z_INTERNAL uint32_t adler32_neon(uint32_t adler, const uint8_t *buf, size_t len) {
+    /* split Adler-32 into component sums */
+    uint32_t sum2 = (adler >> 16) & 0xffff;
+    adler &= 0xffff;
+
+    /* in case user likes doing a byte at a time, keep it fast */
+    if (len == 1)
+        return adler32_len_1(adler, buf, sum2);
+
+    /* initial Adler-32 value (deferred check for len == 1 speed) */
+    if (buf == NULL)
+        return 1L;
+
+    /* in case short lengths are provided, keep it somewhat fast */
+    if (len < 16)
+        return adler32_len_16(adler, buf, len, sum2);
+
+    uint32_t pair[2];
+    int n = NMAX;
+    unsigned int done = 0;
+
+    /* Split Adler-32 into component sums, it can be supplied by
+     * the caller sites (e.g. in a PNG file).
+     */
+    pair[0] = adler;
+    pair[1] = sum2;
+
+    /* If memory is not SIMD aligned, do scalar sums to an aligned
+     * offset, provided that doing so doesn't completely eliminate
+     * SIMD operation. Aligned loads are still faster on ARM, even
+     * though there's no explicit aligned load instruction */
+    unsigned int align_offset = ((uintptr_t)buf & 15);
+    unsigned int align_adj = (align_offset) ? 16 - align_offset : 0;
+
+    if (align_offset && len >= (16 + align_adj)) {
+        NEON_handle_tail(pair, buf, align_adj);
+        n -= align_adj;
+        done += align_adj;
+
+    } else {
+        /* If here, we failed the len criteria test, it wouldn't be
+         * worthwhile to do scalar aligning sums */
+        align_adj = 0;
+    }
+
+    while (done < len) {
+        int remaining = (int)(len - done);
+        n = MIN(remaining, (done == align_adj) ? n : NMAX);
+
+        if (n < 16)
+            break;
+
+        NEON_accum32(pair, buf + done, n >> 4);
+        pair[0] %= BASE;
+        pair[1] %= BASE;
+
+        int actual_nsums = (n >> 4) << 4;
+        done += actual_nsums;
+    }
+
+    /* Handle the tail elements. */
+    if (done < len) {
+        NEON_handle_tail(pair, (buf + done), len - done);
+        pair[0] %= BASE;
+        pair[1] %= BASE;
+    }
+
+    /* D = B * 65536 + A, see: https://en.wikipedia.org/wiki/Adler-32. */
+    return (pair[1] << 16) | pair[0];
+}
+
+#endif
--- a/deps/zlib-ng/arch/arm/arm_features.c
+++ b/deps/zlib-ng/arch/arm/arm_features.c
@@ -0,0 +1,82 @@
+#include "../../zbuild.h"
+#include "arm_features.h"
+
+#if defined(__linux__) && defined(HAVE_SYS_AUXV_H)
+#  include <sys/auxv.h>
+#  ifdef ARM_ASM_HWCAP
+#    include <asm/hwcap.h>
+#  endif
+#elif defined(__FreeBSD__) && defined(__aarch64__)
+#  include <machine/armreg.h>
+#  ifndef ID_AA64ISAR0_CRC32_VAL
+#    define ID_AA64ISAR0_CRC32_VAL ID_AA64ISAR0_CRC32
+#  endif
+#elif defined(__APPLE__)
+#  if !defined(_DARWIN_C_SOURCE)
+#    define _DARWIN_C_SOURCE /* enable types aliases (eg u_int) */
+#  endif
+#  include <sys/sysctl.h>
+#elif defined(_WIN32)
+#  include <windows.h>
+#endif
+
+static int arm_has_crc32() {
+#if defined(__linux__) && defined(ARM_AUXV_HAS_CRC32)
+#  ifdef HWCAP_CRC32
+    return (getauxval(AT_HWCAP) & HWCAP_CRC32) != 0 ? 1 : 0;
+#  else
+    return (getauxval(AT_HWCAP2) & HWCAP2_CRC32) != 0 ? 1 : 0;
+#  endif
+#elif defined(__FreeBSD__) && defined(__aarch64__)
+    return getenv("QEMU_EMULATING") == NULL
+      && ID_AA64ISAR0_CRC32_VAL(READ_SPECIALREG(id_aa64isar0_el1)) >= ID_AA64ISAR0_CRC32_BASE;
+#elif defined(__APPLE__)
+    int hascrc32;
+    size_t size = sizeof(hascrc32);
+    return sysctlbyname("hw.optional.armv8_crc32", &hascrc32, &size, NULL, 0) == 0
+      && hascrc32 == 1;
+#elif defined(_WIN32)
+    return IsProcessorFeaturePresent(PF_ARM_V8_CRC32_INSTRUCTIONS_AVAILABLE);
+#elif defined(ARM_NOCHECK_ACLE)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
+/* AArch64 has neon. */
+#if !defined(__aarch64__) && !defined(_M_ARM64)
+static inline int arm_has_neon() {
+#if defined(__linux__) && defined(ARM_AUXV_HAS_NEON)
+#  ifdef HWCAP_ARM_NEON
+    return (getauxval(AT_HWCAP) & HWCAP_ARM_NEON) != 0 ? 1 : 0;
+#  else
+    return (getauxval(AT_HWCAP) & HWCAP_NEON) != 0 ? 1 : 0;
+#  endif
+#elif defined(__APPLE__)
+    int hasneon;
+    size_t size = sizeof(hasneon);
+    return sysctlbyname("hw.optional.neon", &hasneon, &size, NULL, 0) == 0
+      && hasneon == 1;
+#elif defined(_M_ARM) && defined(WINAPI_FAMILY_PARTITION)
+#  if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_PHONE_APP)
+    return 1; /* Always supported */
+#  endif
+#endif
+
+#if defined(ARM_NOCHECK_NEON)
+    return 1;
+#else
+    return 0;
+#endif
+}
+#endif
+
+void Z_INTERNAL arm_check_features(struct arm_cpu_features *features) {
+#if defined(__aarch64__) || defined(_M_ARM64)
+    features->has_neon = 1; /* always available */
+#else
+    features->has_neon = arm_has_neon();
+#endif
+    features->has_crc32 = arm_has_crc32();
+}
--- a/deps/zlib-ng/arch/arm/arm_features.h
+++ b/deps/zlib-ng/arch/arm/arm_features.h
@@ -0,0 +1,15 @@
+/* arm_features.h -- check for ARM features.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef ARM_H_
+#define ARM_H_
+
+struct arm_cpu_features {
+    int has_neon;
+    int has_crc32;
+};
+
+void Z_INTERNAL arm_check_features(struct arm_cpu_features *features);
+
+#endif /* ARM_H_ */
--- a/deps/zlib-ng/arch/arm/chunkset_neon.c
+++ b/deps/zlib-ng/arch/arm/chunkset_neon.c
@@ -0,0 +1,101 @@
+/* chunkset_neon.c -- NEON inline functions to copy small data chunks.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef ARM_NEON
+#include "neon_intrins.h"
+#include "../../zbuild.h"
+#include "../generic/chunk_permute_table.h"
+
+typedef uint8x16_t chunk_t;
+
+#define CHUNK_SIZE 16
+
+#define HAVE_CHUNKMEMSET_2
+#define HAVE_CHUNKMEMSET_4
+#define HAVE_CHUNKMEMSET_8
+#define HAVE_CHUNK_MAG
+
+static const lut_rem_pair perm_idx_lut[13] = {
+    {0, 1},      /* 3 */
+    {0, 0},      /* don't care */
+    {1 * 32, 1}, /* 5 */
+    {2 * 32, 4}, /* 6 */
+    {3 * 32, 2}, /* 7 */
+    {0 * 32, 0}, /* don't care */
+    {4 * 32, 7}, /* 9 */
+    {5 * 32, 6}, /* 10 */
+    {6 * 32, 5}, /* 11 */
+    {7 * 32, 4}, /* 12 */
+    {8 * 32, 3}, /* 13 */
+    {9 * 32, 2}, /* 14 */
+    {10 * 32, 1},/* 15 */
+};
+
+static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
+    uint16_t tmp;
+    memcpy(&tmp, from, sizeof(tmp));
+    *chunk = vreinterpretq_u8_u16(vdupq_n_u16(tmp));
+}
+
+static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
+    uint32_t tmp;
+    memcpy(&tmp, from, sizeof(tmp));
+    *chunk = vreinterpretq_u8_u32(vdupq_n_u32(tmp));
+}
+
+static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
+    uint64_t tmp;
+    memcpy(&tmp, from, sizeof(tmp));
+    *chunk = vreinterpretq_u8_u64(vdupq_n_u64(tmp));
+}
+
+#define CHUNKSIZE        chunksize_neon
+#define CHUNKCOPY        chunkcopy_neon
+#define CHUNKUNROLL      chunkunroll_neon
+#define CHUNKMEMSET      chunkmemset_neon
+#define CHUNKMEMSET_SAFE chunkmemset_safe_neon
+
+static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
+    *chunk = vld1q_u8(s);
+}
+
+static inline void storechunk(uint8_t *out, chunk_t *chunk) {
+    vst1q_u8(out, *chunk);
+}
+
+static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t dist) {
+    lut_rem_pair lut_rem = perm_idx_lut[dist - 3];
+    *chunk_rem = lut_rem.remval;
+
+#ifdef Z_MEMORY_SANITIZER
+    /* See note in chunkset_ssse3.c for why this is ok */
+    __msan_unpoison(buf + dist, 16 - dist);
+#endif
+
+    /* This version of table is only available on aarch64 */
+#if defined(_M_ARM64) || defined(__aarch64__)
+    uint8x16_t ret_vec = vld1q_u8(buf);
+
+    uint8x16_t perm_vec = vld1q_u8(permute_table + lut_rem.idx);
+    return vqtbl1q_u8(ret_vec, perm_vec);
+#else
+    uint8x8_t ret0, ret1, a, b, perm_vec0, perm_vec1;
+    perm_vec0 = vld1_u8(permute_table + lut_rem.idx);
+    perm_vec1 = vld1_u8(permute_table + lut_rem.idx + 8);
+    a = vld1_u8(buf);
+    b = vld1_u8(buf + 8);
+    ret0 = vtbl1_u8(a, perm_vec0);
+    uint8x8x2_t ab = {{a, b}};
+    ret1 = vtbl2_u8(ab, perm_vec1);
+    return vcombine_u8(ret0, ret1);
+#endif
+}
+
+#include "chunkset_tpl.h"
+
+#define INFLATE_FAST     inflate_fast_neon
+
+#include "inffast_tpl.h"
+
+#endif
--- a/deps/zlib-ng/arch/arm/compare256_neon.c
+++ b/deps/zlib-ng/arch/arm/compare256_neon.c
@@ -0,0 +1,59 @@
+/* compare256_neon.c - NEON version of compare256
+ * Copyright (C) 2022 Nathan Moinvaziri
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "../../zbuild.h"
+
+#include "fallback_builtins.h"
+
+#if defined(ARM_NEON) && defined(HAVE_BUILTIN_CTZLL)
+#include "neon_intrins.h"
+
+static inline uint32_t compare256_neon_static(const uint8_t *src0, const uint8_t *src1) {
+    uint32_t len = 0;
+
+    do {
+        uint8x16_t a, b, cmp;
+        uint64_t lane;
+
+        a = vld1q_u8(src0);
+        b = vld1q_u8(src1);
+
+        cmp = veorq_u8(a, b);
+
+        lane = vgetq_lane_u64(vreinterpretq_u64_u8(cmp), 0);
+        if (lane) {
+            uint32_t match_byte = (uint32_t)__builtin_ctzll(lane) / 8;
+            return len + match_byte;
+        }
+        len += 8;
+        lane = vgetq_lane_u64(vreinterpretq_u64_u8(cmp), 1);
+        if (lane) {
+            uint32_t match_byte = (uint32_t)__builtin_ctzll(lane) / 8;
+            return len + match_byte;
+        }
+        len += 8;
+
+        src0 += 16, src1 += 16;
+    } while (len < 256);
+
+    return 256;
+}
+
+Z_INTERNAL uint32_t compare256_neon(const uint8_t *src0, const uint8_t *src1) {
+    return compare256_neon_static(src0, src1);
+}
+
+#define LONGEST_MATCH       longest_match_neon
+#define COMPARE256          compare256_neon_static
+
+#include "match_tpl.h"
+
+#define LONGEST_MATCH_SLOW
+#define LONGEST_MATCH       longest_match_slow_neon
+#define COMPARE256          compare256_neon_static
+
+#include "match_tpl.h"
+
+#endif
--- a/deps/zlib-ng/arch/arm/crc32_acle.c
+++ b/deps/zlib-ng/arch/arm/crc32_acle.c
@@ -0,0 +1,98 @@
+/* crc32_acle.c -- compute the CRC-32 of a data stream
+ * Copyright (C) 1995-2006, 2010, 2011, 2012 Mark Adler
+ * Copyright (C) 2016 Yang Zhang
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ *
+*/
+
+#ifdef ARM_ACLE
+#ifdef _MSC_VER
+#  include <intrin.h>
+#else
+#  include <arm_acle.h>
+#endif
+#include "../../zbuild.h"
+
+Z_INTERNAL uint32_t crc32_acle(uint32_t crc, const uint8_t *buf, size_t len) {
+    Z_REGISTER uint32_t c;
+    Z_REGISTER const uint16_t *buf2;
+    Z_REGISTER const uint32_t *buf4;
+
+    c = ~crc;
+    if (len && ((ptrdiff_t)buf & 1)) {
+        c = __crc32b(c, *buf++);
+        len--;
+    }
+
+    if ((len >= sizeof(uint16_t)) && ((ptrdiff_t)buf & sizeof(uint16_t))) {
+        buf2 = (const uint16_t *) buf;
+        c = __crc32h(c, *buf2++);
+        len -= sizeof(uint16_t);
+        buf4 = (const uint32_t *) buf2;
+    } else {
+        buf4 = (const uint32_t *) buf;
+    }
+
+#if defined(__aarch64__) || defined(_M_ARM64)
+    if ((len >= sizeof(uint32_t)) && ((ptrdiff_t)buf & sizeof(uint32_t))) {
+        c = __crc32w(c, *buf4++);
+        len -= sizeof(uint32_t);
+    }
+
+    if (len == 0) {
+        c = ~c;
+        return c;
+    }
+
+    const uint64_t *buf8 = (const uint64_t *) buf4;
+
+    while (len >= sizeof(uint64_t)) {
+        c = __crc32d(c, *buf8++);
+        len -= sizeof(uint64_t);
+    }
+
+    if (len >= sizeof(uint32_t)) {
+        buf4 = (const uint32_t *) buf8;
+        c = __crc32w(c, *buf4++);
+        len -= sizeof(uint32_t);
+        buf2 = (const uint16_t *) buf4;
+    } else {
+        buf2 = (const uint16_t *) buf8;
+    }
+
+    if (len >= sizeof(uint16_t)) {
+        c = __crc32h(c, *buf2++);
+        len -= sizeof(uint16_t);
+    }
+
+    buf = (const unsigned char *) buf2;
+#else /* __aarch64__ */
+
+    if (len == 0) {
+        c = ~c;
+        return c;
+    }
+
+    while (len >= sizeof(uint32_t)) {
+        c = __crc32w(c, *buf4++);
+        len -= sizeof(uint32_t);
+    }
+
+    if (len >= sizeof(uint16_t)) {
+        buf2 = (const uint16_t *) buf4;
+        c = __crc32h(c, *buf2++);
+        len -= sizeof(uint16_t);
+        buf = (const unsigned char *) buf2;
+    } else {
+        buf = (const unsigned char *) buf4;
+    }
+#endif /* __aarch64__ */
+
+    if (len) {
+        c = __crc32b(c, *buf);
+    }
+
+    c = ~c;
+    return c;
+}
+#endif
--- a/deps/zlib-ng/arch/arm/insert_string_acle.c
+++ b/deps/zlib-ng/arch/arm/insert_string_acle.c
@@ -0,0 +1,26 @@
+/* insert_string_acle.c -- insert_string integer hash variant using ACLE's CRC instructions
+ *
+ * Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ *
+ */
+
+#ifdef ARM_ACLE
+#ifndef _MSC_VER
+#  include <arm_acle.h>
+#endif
+#include "../../zbuild.h"
+#include "../../deflate.h"
+
+#define HASH_CALC(s, h, val) \
+    h = __crc32w(0, val)
+
+#define HASH_CALC_VAR       h
+#define HASH_CALC_VAR_INIT  uint32_t h = 0
+
+#define UPDATE_HASH         update_hash_acle
+#define INSERT_STRING       insert_string_acle
+#define QUICK_INSERT_STRING quick_insert_string_acle
+
+#include "../../insert_string_tpl.h"
+#endif
--- a/deps/zlib-ng/arch/arm/neon_intrins.h
+++ b/deps/zlib-ng/arch/arm/neon_intrins.h
@@ -0,0 +1,57 @@
+#ifndef ARM_NEON_INTRINS_H
+#define ARM_NEON_INTRINS_H
+
+#ifdef _M_ARM64
+#  include <arm64_neon.h>
+#else
+#  include <arm_neon.h>
+#endif
+
+#if defined(ARM_NEON) && !defined(__aarch64__) && !defined(_M_ARM64)
+/* Compatibility shim for the _high family of functions */
+#define vmull_high_u8(a, b) vmull_u8(vget_high_u8(a), vget_high_u8(b))
+#define vmlal_high_u8(a, b, c) vmlal_u8(a, vget_high_u8(b), vget_high_u8(c))
+#define vmlal_high_u16(a, b, c) vmlal_u16(a, vget_high_u16(b), vget_high_u16(c))
+#define vaddw_high_u8(a, b) vaddw_u8(a, vget_high_u8(b))
+#endif
+
+#ifdef ARM_NEON
+
+#define vqsubq_u16_x4_x1(out, a, b) do { \
+    out.val[0] = vqsubq_u16(a.val[0], b); \
+    out.val[1] = vqsubq_u16(a.val[1], b); \
+    out.val[2] = vqsubq_u16(a.val[2], b); \
+    out.val[3] = vqsubq_u16(a.val[3], b); \
+} while (0)
+
+
+#  ifndef ARM_NEON_HASLD4
+
+static inline uint16x8x4_t vld1q_u16_x4(uint16_t const *a) {
+    uint16x8x4_t ret = (uint16x8x4_t) {{
+                          vld1q_u16(a),
+                          vld1q_u16(a+8),
+                          vld1q_u16(a+16),
+                          vld1q_u16(a+24)}};
+    return ret;
+}
+
+static inline uint8x16x4_t vld1q_u8_x4(uint8_t const *a) {
+    uint8x16x4_t ret = (uint8x16x4_t) {{
+                          vld1q_u8(a),
+                          vld1q_u8(a+16),
+                          vld1q_u8(a+32),
+                          vld1q_u8(a+48)}};
+    return ret;
+}
+
+static inline void vst1q_u16_x4(uint16_t *p, uint16x8x4_t a) {
+    vst1q_u16(p, a.val[0]);
+    vst1q_u16(p + 8, a.val[1]);
+    vst1q_u16(p + 16, a.val[2]);
+    vst1q_u16(p + 24, a.val[3]);
+}
+#  endif // HASLD4 check
+#endif
+
+#endif // include guard ARM_NEON_INTRINS_H
--- a/deps/zlib-ng/arch/arm/slide_hash_neon.c
+++ b/deps/zlib-ng/arch/arm/slide_hash_neon.c
@@ -0,0 +1,46 @@
+/* slide_hash_neon.c -- Optimized hash table shifting for ARM with support for NEON instructions
+ * Copyright (C) 2017-2020 Mika T. Lindqvist
+ *
+ * Authors:
+ * Mika T. Lindqvist <postmaster@raasu.org>
+ * Jun He <jun.he@arm.com>
+ *
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef ARM_NEON
+#include "neon_intrins.h"
+#include "../../zbuild.h"
+#include "../../deflate.h"
+
+/* SIMD version of hash_chain rebase */
+static inline void slide_hash_chain(Pos *table, uint32_t entries, uint16_t wsize) {
+    Z_REGISTER uint16x8_t v;
+    uint16x8x4_t p0, p1;
+    Z_REGISTER size_t n;
+
+    size_t size = entries*sizeof(table[0]);
+    Assert((size % sizeof(uint16x8_t) * 8 == 0), "hash table size err");
+
+    Assert(sizeof(Pos) == 2, "Wrong Pos size");
+    v = vdupq_n_u16(wsize);
+
+    n = size / (sizeof(uint16x8_t) * 8);
+    do {
+        p0 = vld1q_u16_x4(table);
+        p1 = vld1q_u16_x4(table+32);
+        vqsubq_u16_x4_x1(p0, p0, v);
+        vqsubq_u16_x4_x1(p1, p1, v);
+        vst1q_u16_x4(table, p0);
+        vst1q_u16_x4(table+32, p1);
+        table += 64;
+    } while (--n);
+}
+
+Z_INTERNAL void slide_hash_neon(deflate_state *s) {
+    unsigned int wsize = s->w_size;
+
+    slide_hash_chain(s->head, HASH_SIZE, wsize);
+    slide_hash_chain(s->prev, wsize, wsize);
+}
+#endif
--- a/deps/zlib-ng/arch/generic/Makefile.in
+++ b/deps/zlib-ng/arch/generic/Makefile.in
@@ -0,0 +1,24 @@
+# Makefile for zlib
+# Copyright (C) 1995-2013 Jean-loup Gailly, Mark Adler
+# For conditions of distribution and use, see copyright notice in zlib.h
+
+CC=
+CFLAGS=
+SFLAGS=
+INCLUDES=
+
+SRCDIR=.
+SRCTOP=../..
+TOPDIR=$(SRCTOP)
+
+all:
+
+
+mostlyclean: clean
+clean:
+	rm -f *.o *.lo *~ \
+	rm -rf objs
+	rm -f *.gcda *.gcno *.gcov
+
+distclean: clean
+	rm -f Makefile
--- a/deps/zlib-ng/arch/generic/chunk_permute_table.h
+++ b/deps/zlib-ng/arch/generic/chunk_permute_table.h
@@ -0,0 +1,53 @@
+/* chunk_permute_table.h - shared AVX/SSSE3 permutation table for use with chunkmemset family of functions.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef CHUNK_PERMUTE_TABLE_H_
+#define CHUNK_PERMUTE_TABLE_H_
+
+#include "zbuild.h"
+
+/* Need entries for all numbers not an even modulus for 1, 2, 4, 8, 16 & 32 */
+static const ALIGNED_(32) uint8_t permute_table[26*32] = {
+    0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, /* dist 3 */
+    0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, /* dist 5 */
+    0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, /* dist 6 */
+    0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, /* dist 7 */
+    0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, /* dist 9 */
+    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, /* dist 10 */
+    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, /* dist 11 */
+    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, /* dist 12 */
+    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 0, 1, 2, 3, 4, 5, /* dist 13 */
+    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0, 1, 2, 3, /* dist 14 */
+    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0, 1, /* dist 15 */
+
+    /* Beyond dists of 15 means we have to permute from a vector > len(m128i). Because AVX couldn't permute
+     * beyond 128 bit lanes until AVX512 for sub 4-byte sequences, we have to do some math here for an eventual
+     * blend with a comparison. That means we need to wrap the indices with yet another derived table. For simplicity,
+     * we'll use absolute indexing here to derive a blend vector. This is actually a lot simpler with ARM's TBL, but,
+     * this is what we're dealt.
+     */
+
+    16, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, /* dist 17 */
+    16, 17, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, /* dist 18 */
+    16, 17, 18, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, /* dist 19 */
+    16, 17, 18, 19, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, /* dist 20 */
+    16, 17, 18, 19, 20, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, /* dist 21 */
+    16, 17, 18, 19, 20, 21, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, /* dist 22 */
+    16, 17, 18, 19, 20, 21, 22, 0, 1, 2, 3, 4, 5, 6, 7, 8, /* dist 23 */
+    16, 17, 18, 19, 20, 21, 22, 23, 0, 1, 2, 3, 4, 5, 6, 7, /* dist 24 */
+    16, 17, 18, 19, 20, 21, 22, 23, 24, 0, 1, 2, 3, 4, 5, 6, /* dist 25 */
+    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 0, 1, 2, 3, 4, 5, /* dist 26 */
+    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 0, 1, 2, 3, 4, /* dist 27 */
+    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 0, 1, 2, 3, /* dist 28 */
+    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 0, 1, 2, /* dist 29 */
+    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 0, 1, /* dist 30 */
+    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 0, /* dist 31 */
+};
+
+typedef struct lut_rem_pair_s {
+    uint16_t idx;
+    uint16_t remval;
+} lut_rem_pair;
+
+#endif
--- a/deps/zlib-ng/arch/power/Makefile.in
+++ b/deps/zlib-ng/arch/power/Makefile.in
@@ -0,0 +1,93 @@
+# Makefile for POWER-specific files
+# Copyright (C) 2020 Matheus Castanho <msc@linux.ibm.com>, IBM
+# Copyright (C) 2021 Mika T. Lindqvist <postmaster@raasu.org>
+# For conditions of distribution and use, see copyright notice in zlib.h
+
+CC=
+CFLAGS=
+SFLAGS=
+INCLUDES=
+SUFFIX=
+
+P8FLAGS=-mcpu=power8
+P9FLAGS=-mcpu=power9
+PPCFLAGS=-maltivec
+NOLTOFLAG=
+
+SRCDIR=.
+SRCTOP=../..
+TOPDIR=$(SRCTOP)
+
+all: power_features.o \
+     power_features.lo \
+     adler32_power8.o \
+     adler32_power8.lo \
+     adler32_vmx.o \
+     adler32_vmx.lo \
+     chunkset_power8.o \
+     chunkset_power8.lo \
+     compare256_power9.o \
+     compare256_power9.lo \
+     crc32_power8.o \
+     crc32_power8.lo \
+     slide_hash_power8.o \
+     slide_hash_power8.lo \
+     slide_hash_vmx.o \
+     slide_hash_vmx.lo
+
+power_features.o:
+	$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/power_features.c
+
+power_features.lo:
+	$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/power_features.c
+
+adler32_power8.o:
+	$(CC) $(CFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_power8.c
+
+adler32_power8.lo:
+	$(CC) $(SFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_power8.c
+
+adler32_vmx.o:
+	$(CC) $(CFLAGS) $(PPCFLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_vmx.c
+
+adler32_vmx.lo:
+	$(CC) $(SFLAGS) $(PPCFLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_vmx.c
+
+chunkset_power8.o:
+	$(CC) $(CFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_power8.c
+
+chunkset_power8.lo:
+	$(CC) $(SFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_power8.c
+
+compare256_power9.o:
+	$(CC) $(CFLAGS) $(P9FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_power9.c
+
+compare256_power9.lo:
+	$(CC) $(SFLAGS) $(P9FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_power9.c
+
+crc32_power8.o:
+	$(CC) $(CFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_power8.c
+
+crc32_power8.lo:
+	$(CC) $(SFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_power8.c
+
+slide_hash_power8.o:
+	$(CC) $(CFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_power8.c
+
+slide_hash_power8.lo:
+	$(CC) $(SFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_power8.c
+
+slide_hash_vmx.o:
+	$(CC) $(CFLAGS) ${PPCFLAGS} $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_vmx.c
+
+slide_hash_vmx.lo:
+	$(CC) $(SFLAGS) ${PPCFLAGS} $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_vmx.c
+
+mostlyclean: clean
+clean:
+	rm -f *.o *.lo *~
+	rm -rf objs
+	rm -f *.gcda *.gcno *.gcov
+
+distclean: clean
+	rm -f Makefile
--- a/deps/zlib-ng/arch/power/adler32_power8.c
+++ b/deps/zlib-ng/arch/power/adler32_power8.c
@@ -0,0 +1,153 @@
+/* Adler32 for POWER8 using VSX instructions.
+ * Copyright (C) 2020 IBM Corporation
+ * Author: Rogerio Alves <rcardoso@linux.ibm.com>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ *
+ * Calculate adler32 checksum for 16 bytes at once using POWER8+ VSX (vector)
+ * instructions.
+ *
+ * If adler32 do 1 byte at time on the first iteration s1 is s1_0 (_n means
+ * iteration n) is the initial value of adler - at start  _0 is 1 unless
+ * adler initial value is different than 1. So s1_1 = s1_0 + c[0] after
+ * the first calculation. For the iteration s1_2 = s1_1 + c[1] and so on.
+ * Hence, for iteration N, s1_N = s1_(N-1) + c[N] is the value of s1 on
+ * after iteration N.
+ *
+ * Therefore, for s2 and iteration N, s2_N = s2_0 + N*s1_N + N*c[0] +
+ * N-1*c[1] + ... + c[N]
+ *
+ * In a more general way:
+ *
+ * s1_N = s1_0 + sum(i=1 to N)c[i]
+ * s2_N = s2_0 + N*s1 + sum (i=1 to N)(N-i+1)*c[i]
+ *
+ * Where s1_N, s2_N are the values for s1, s2 after N iterations. So if we
+ * can process N-bit at time we can do this at once.
+ *
+ * Since VSX can support 16-bit vector instructions, we can process
+ * 16-bit at time using N = 16 we have:
+ *
+ * s1 = s1_16 = s1_(16-1) + c[16] = s1_0 + sum(i=1 to 16)c[i]
+ * s2 = s2_16 = s2_0 + 16*s1 + sum(i=1 to 16)(16-i+1)*c[i]
+ *
+ * After the first iteration we calculate the adler32 checksum for 16 bytes.
+ *
+ * For more background about adler32 please check the RFC:
+ * https://www.ietf.org/rfc/rfc1950.txt
+ */
+
+#ifdef POWER8_VSX
+
+#include <altivec.h>
+#include "zbuild.h"
+#include "adler32_p.h"
+
+/* Vector across sum unsigned int (saturate).  */
+static inline vector unsigned int vec_sumsu(vector unsigned int __a, vector unsigned int __b) {
+    __b = vec_sld(__a, __a, 8);
+    __b = vec_add(__b, __a);
+    __a = vec_sld(__b, __b, 4);
+    __a = vec_add(__a, __b);
+
+    return __a;
+}
+
+Z_INTERNAL uint32_t adler32_power8(uint32_t adler, const uint8_t *buf, size_t len) {
+    uint32_t s1 = adler & 0xffff;
+    uint32_t s2 = (adler >> 16) & 0xffff;
+
+    /* in case user likes doing a byte at a time, keep it fast */
+    if (UNLIKELY(len == 1))
+        return adler32_len_1(s1, buf, s2);
+
+    /* If buffer is empty or len=0 we need to return adler initial value.  */
+    if (UNLIKELY(buf == NULL))
+        return 1;
+
+    /* This is faster than VSX code for len < 64.  */
+    if (len < 64)
+        return adler32_len_64(s1, buf, len, s2);
+
+    /* Use POWER VSX instructions for len >= 64. */
+    const vector unsigned int v_zeros = { 0 };
+    const vector unsigned char v_mul = {16, 15, 14, 13, 12, 11, 10, 9, 8, 7,
+         6, 5, 4, 3, 2, 1};
+    const vector unsigned char vsh = vec_splat_u8(4);
+    const vector unsigned int vmask = {0xffffffff, 0x0, 0x0, 0x0};
+    vector unsigned int vs1 = { 0 };
+    vector unsigned int vs2 = { 0 };
+    vector unsigned int vs1_save = { 0 };
+    vector unsigned int vsum1, vsum2;
+    vector unsigned char vbuf;
+    int n;
+
+    vs1[0] = s1;
+    vs2[0] = s2;
+
+    /* Do length bigger than NMAX in blocks of NMAX size.  */
+    while (len >= NMAX) {
+        len -= NMAX;
+        n = NMAX / 16;
+        do {
+            vbuf = vec_xl(0, (unsigned char *) buf);
+            vsum1 = vec_sum4s(vbuf, v_zeros); /* sum(i=1 to 16) buf[i].  */
+            /* sum(i=1 to 16) buf[i]*(16-i+1).  */
+            vsum2 = vec_msum(vbuf, v_mul, v_zeros);
+            /* Save vs1.  */
+            vs1_save = vec_add(vs1_save, vs1);
+            /* Accumulate the sums.  */
+            vs1 = vec_add(vsum1, vs1);
+            vs2 = vec_add(vsum2, vs2);
+
+            buf += 16;
+        } while (--n);
+        /* Once each block of NMAX size.  */
+        vs1 = vec_sumsu(vs1, vsum1);
+        vs1_save = vec_sll(vs1_save, vsh); /* 16*vs1_save.  */
+        vs2 = vec_add(vs1_save, vs2);
+        vs2 = vec_sumsu(vs2, vsum2);
+
+        /* vs1[0] = (s1_i + sum(i=1 to 16)buf[i]) mod 65521.  */
+        vs1[0] = vs1[0] % BASE;
+        /* vs2[0] = s2_i + 16*s1_save +
+           sum(i=1 to 16)(16-i+1)*buf[i] mod 65521.  */
+        vs2[0] = vs2[0] % BASE;
+
+        vs1 = vec_and(vs1, vmask);
+        vs2 = vec_and(vs2, vmask);
+        vs1_save = v_zeros;
+    }
+
+    /* len is less than NMAX one modulo is needed.  */
+    if (len >= 16) {
+        while (len >= 16) {
+            len -= 16;
+
+            vbuf = vec_xl(0, (unsigned char *) buf);
+
+            vsum1 = vec_sum4s(vbuf, v_zeros); /* sum(i=1 to 16) buf[i].  */
+            /* sum(i=1 to 16) buf[i]*(16-i+1).  */
+            vsum2 = vec_msum(vbuf, v_mul, v_zeros);
+            /* Save vs1.  */
+            vs1_save = vec_add(vs1_save, vs1);
+            /* Accumulate the sums.  */
+            vs1 = vec_add(vsum1, vs1);
+            vs2 = vec_add(vsum2, vs2);
+
+            buf += 16;
+        }
+        /* Since the size will be always less than NMAX we do this once.  */
+        vs1 = vec_sumsu(vs1, vsum1);
+        vs1_save = vec_sll(vs1_save, vsh); /* 16*vs1_save.  */
+        vs2 = vec_add(vs1_save, vs2);
+        vs2 = vec_sumsu(vs2, vsum2);
+    }
+    /* Copy result back to s1, s2 (mod 65521).  */
+    s1 = vs1[0] % BASE;
+    s2 = vs2[0] % BASE;
+
+    /* Process tail (len < 16).  */
+    return adler32_len_16(s1, buf, len, s2);
+}
+
+#endif /* POWER8_VSX */
--- a/deps/zlib-ng/arch/power/adler32_vmx.c
+++ b/deps/zlib-ng/arch/power/adler32_vmx.c
@@ -0,0 +1,181 @@
+/* adler32_vmx.c -- compute the Adler-32 checksum of a data stream
+ * Copyright (C) 1995-2011 Mark Adler
+ * Copyright (C) 2017-2021 Mika T. Lindqvist <postmaster@raasu.org>
+ * Copyright (C) 2021 Adam Stylinski <kungfujesus06@gmail.com>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef PPC_VMX
+#include <altivec.h>
+#include "zbuild.h"
+#include "adler32_p.h"
+
+#define vmx_zero()  (vec_splat_u32(0))
+
+static inline void vmx_handle_head_or_tail(uint32_t *pair, const uint8_t *buf, size_t len) {
+    unsigned int i;
+    for (i = 0; i < len; ++i) {
+        pair[0] += buf[i];
+        pair[1] += pair[0];
+    }
+}
+
+static void vmx_accum32(uint32_t *s, const uint8_t *buf, size_t len) {
+    /* Different taps for the separable components of sums */
+    const vector unsigned char t0 = {64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49};
+    const vector unsigned char t1 = {48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33};
+    const vector unsigned char t2 = {32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17};
+    const vector unsigned char t3 = {16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1};
+    /* As silly and inefficient as it seems, creating 1 permutation vector to permute
+     * a 2 element vector from a single load + a subsequent shift is just barely faster
+     * than doing 2 indexed insertions into zero initialized vectors from unaligned memory. */
+    const vector unsigned char s0_perm = {0, 1, 2, 3, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
+    const vector unsigned char shift_vec = vec_sl(vec_splat_u8(8), vec_splat_u8(2));
+    vector unsigned int  adacc, s2acc;
+    vector unsigned int pair_vec = vec_ld(0, s);
+    adacc = vec_perm(pair_vec, pair_vec, s0_perm);
+    s2acc = vec_slo(pair_vec, shift_vec);
+
+    vector unsigned int zero = vmx_zero();
+    vector unsigned int s3acc = zero;
+    vector unsigned int s3acc_0 = zero;
+    vector unsigned int adacc_prev = adacc;
+    vector unsigned int adacc_prev_0 = zero;
+
+    vector unsigned int s2acc_0 = zero;
+    vector unsigned int s2acc_1 = zero;
+    vector unsigned int s2acc_2 = zero;
+
+    /* Maintain a running sum of a second half, this might help use break yet another
+     * data dependency bubble in the sum */
+    vector unsigned int adacc_0 = zero;
+
+    int num_iter = len / 4;
+    int rem = len & 3;
+
+    for (int i = 0; i < num_iter; ++i) {
+        vector unsigned char d0 = vec_ld(0, buf);
+        vector unsigned char d1 = vec_ld(16, buf);
+        vector unsigned char d2 = vec_ld(32, buf);
+        vector unsigned char d3 = vec_ld(48, buf);
+
+        /* The core operation of the loop, basically
+         * what is being unrolled below */
+        adacc = vec_sum4s(d0, adacc);
+        s3acc = vec_add(s3acc, adacc_prev);
+        s3acc_0 = vec_add(s3acc_0, adacc_prev_0);
+        s2acc = vec_msum(t0, d0, s2acc);
+
+        /* interleave dependent sums in here */
+        adacc_0 = vec_sum4s(d1, adacc_0);
+        s2acc_0 = vec_msum(t1, d1, s2acc_0);
+        adacc = vec_sum4s(d2, adacc);
+        s2acc_1 = vec_msum(t2, d2, s2acc_1);
+        s2acc_2 = vec_msum(t3, d3, s2acc_2);
+        adacc_0 = vec_sum4s(d3, adacc_0);
+
+        adacc_prev = adacc;
+        adacc_prev_0 = adacc_0;
+        buf += 64;
+    }
+
+    adacc = vec_add(adacc, adacc_0);
+    s3acc = vec_add(s3acc, s3acc_0);
+    s3acc = vec_sl(s3acc, vec_splat_u32(6));
+
+    if (rem) {
+        adacc_prev = vec_add(adacc_prev_0, adacc_prev);
+        adacc_prev = vec_sl(adacc_prev, vec_splat_u32(4));
+        while (rem--) {
+            vector unsigned char d0 = vec_ld(0, buf);
+            adacc = vec_sum4s(d0, adacc);
+            s3acc = vec_add(s3acc, adacc_prev);
+            s2acc = vec_msum(t3, d0, s2acc);
+            adacc_prev = vec_sl(adacc, vec_splat_u32(4));
+            buf += 16;
+        }
+    }
+
+
+    /* Sum up independent second sums */
+    s2acc = vec_add(s2acc, s2acc_0);
+    s2acc_2 = vec_add(s2acc_1, s2acc_2);
+    s2acc = vec_add(s2acc, s2acc_2);
+
+    s2acc = vec_add(s2acc, s3acc);
+
+    adacc = vec_add(adacc, vec_sld(adacc, adacc, 8));
+    s2acc = vec_add(s2acc, vec_sld(s2acc, s2acc, 8));
+    adacc = vec_add(adacc, vec_sld(adacc, adacc, 4));
+    s2acc = vec_add(s2acc, vec_sld(s2acc, s2acc, 4));
+
+    vec_ste(adacc, 0, s);
+    vec_ste(s2acc, 0, s+1);
+}
+
+Z_INTERNAL uint32_t adler32_vmx(uint32_t adler, const uint8_t *buf, size_t len) {
+    uint32_t sum2;
+    uint32_t pair[16] ALIGNED_(16);
+    memset(&pair[2], 0, 14);
+    int n = NMAX;
+    unsigned int done = 0, i;
+
+    /* Split Adler-32 into component sums, it can be supplied by
+     * the caller sites (e.g. in a PNG file).
+     */
+    sum2 = (adler >> 16) & 0xffff;
+    adler &= 0xffff;
+    pair[0] = adler;
+    pair[1] = sum2;
+
+    /* in case user likes doing a byte at a time, keep it fast */
+    if (UNLIKELY(len == 1))
+        return adler32_len_1(adler, buf, sum2);
+
+    /* initial Adler-32 value (deferred check for len == 1 speed) */
+    if (UNLIKELY(buf == NULL))
+        return 1L;
+
+    /* in case short lengths are provided, keep it somewhat fast */
+    if (UNLIKELY(len < 16))
+        return adler32_len_16(adler, buf, len, sum2);
+
+    // Align buffer
+    unsigned int al = 0;
+    if ((uintptr_t)buf & 0xf) {
+        al = 16-((uintptr_t)buf & 0xf);
+        if (al > len) {
+            al=len;
+        }
+        vmx_handle_head_or_tail(pair, buf, al);
+
+        done += al;
+        /* Rather than rebasing, we can reduce the max sums for the
+         * first round only */
+        n -= al;
+    }
+    for (i = al; i < len; i += n) {
+        int remaining = (int)(len-i);
+        n = MIN(remaining, (i == al) ? n : NMAX);
+
+        if (n < 16)
+            break;
+
+        vmx_accum32(pair, buf + i, n / 16);
+        pair[0] %= BASE;
+        pair[1] %= BASE;
+
+        done += (n / 16) * 16;
+    }
+
+    /* Handle the tail elements. */
+    if (done < len) {
+        vmx_handle_head_or_tail(pair, (buf + done), len - done);
+        pair[0] %= BASE;
+        pair[1] %= BASE;
+    }
+
+    /* D = B * 65536 + A, see: https://en.wikipedia.org/wiki/Adler-32. */
+    return (pair[1] << 16) | pair[0];
+}
+#endif
--- a/deps/zlib-ng/arch/power/chunkset_power8.c
+++ b/deps/zlib-ng/arch/power/chunkset_power8.c
@@ -0,0 +1,55 @@
+/* chunkset_power8.c -- VSX inline functions to copy small data chunks.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef POWER8_VSX
+#include <altivec.h>
+#include "../../zbuild.h"
+
+typedef vector unsigned char chunk_t;
+
+#define CHUNK_SIZE 16
+
+#define HAVE_CHUNKMEMSET_2
+#define HAVE_CHUNKMEMSET_4
+#define HAVE_CHUNKMEMSET_8
+
+static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
+    uint16_t tmp;
+    memcpy(&tmp, from, sizeof(tmp));
+    *chunk = (vector unsigned char)vec_splats(tmp);
+}
+
+static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
+    uint32_t tmp;
+    memcpy(&tmp, from, sizeof(tmp));
+    *chunk = (vector unsigned char)vec_splats(tmp);
+}
+
+static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
+    uint64_t tmp;
+    memcpy(&tmp, from, sizeof(tmp));
+    *chunk = (vector unsigned char)vec_splats(tmp);
+}
+
+static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
+    *chunk = vec_xl(0, s);
+}
+
+static inline void storechunk(uint8_t *out, chunk_t *chunk) {
+    vec_xst(*chunk, 0, out);
+}
+
+#define CHUNKSIZE        chunksize_power8
+#define CHUNKCOPY        chunkcopy_power8
+#define CHUNKUNROLL      chunkunroll_power8
+#define CHUNKMEMSET      chunkmemset_power8
+#define CHUNKMEMSET_SAFE chunkmemset_safe_power8
+
+#include "chunkset_tpl.h"
+
+#define INFLATE_FAST     inflate_fast_power8
+
+#include "inffast_tpl.h"
+
+#endif
--- a/deps/zlib-ng/arch/power/compare256_power9.c
+++ b/deps/zlib-ng/arch/power/compare256_power9.c
@@ -0,0 +1,66 @@
+/* compare256_power9.c - Power9 version of compare256
+ * Copyright (C) 2019 Matheus Castanho <msc@linux.ibm.com>, IBM
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef POWER9
+#include <altivec.h>
+#include "../../zbuild.h"
+#include "../../zendian.h"
+
+/* Older versions of GCC misimplemented semantics for these bit counting builtins.
+ * https://gcc.gnu.org/git/gitweb.cgi?p=gcc.git;h=3f30f2d1dbb3228b8468b26239fe60c2974ce2ac */
+#if defined(__GNUC__) && (__GNUC__ < 12)
+#  define zng_vec_vctzlsbb(vc, len) __asm__ volatile("vctzlsbb %0, %1\n\t" : "=r" (len) : "v" (vc))
+#  define zng_vec_vclzlsbb(vc, len) __asm__ volatile("vclzlsbb %0, %1\n\t" : "=r" (len) : "v" (vc))
+#else
+#  define zng_vec_vctzlsbb(vc, len) len = __builtin_vec_vctzlsbb(vc)
+#  define zng_vec_vclzlsbb(vc, len) len = __builtin_vec_vclzlsbb(vc)
+#endif
+
+static inline uint32_t compare256_power9_static(const uint8_t *src0, const uint8_t *src1) {
+    uint32_t len = 0, cmplen;
+
+    do {
+        vector unsigned char vsrc0, vsrc1, vc;
+
+        vsrc0 = *((vector unsigned char *)src0);
+        vsrc1 = *((vector unsigned char *)src1);
+
+        /* Compare 16 bytes at a time. Each byte of vc will be either
+         * all ones or all zeroes, depending on the result of the comparison. */
+        vc = (vector unsigned char)vec_cmpne(vsrc0, vsrc1);
+
+        /* Since the index of matching bytes will contain only zeroes
+         * on vc (since we used cmpne), counting the number of consecutive
+         * bytes where LSB == 0 is the same as counting the length of the match. */
+#if BYTE_ORDER == LITTLE_ENDIAN
+        zng_vec_vctzlsbb(vc, cmplen);
+#else
+        zng_vec_vclzlsbb(vc, cmplen);
+#endif
+        if (cmplen != 16)
+            return len + cmplen;
+
+        src0 += 16, src1 += 16, len += 16;
+    } while (len < 256);
+
+   return 256;
+}
+
+Z_INTERNAL uint32_t compare256_power9(const uint8_t *src0, const uint8_t *src1) {
+    return compare256_power9_static(src0, src1);
+}
+
+#define LONGEST_MATCH       longest_match_power9
+#define COMPARE256          compare256_power9_static
+
+#include "match_tpl.h"
+
+#define LONGEST_MATCH_SLOW
+#define LONGEST_MATCH       longest_match_slow_power9
+#define COMPARE256          compare256_power9_static
+
+#include "match_tpl.h"
+
+#endif
--- a/deps/zlib-ng/arch/power/crc32_constants.h
+++ b/deps/zlib-ng/arch/power/crc32_constants.h
--- a/deps/zlib-ng/arch/power/crc32_power8.c
+++ b/deps/zlib-ng/arch/power/crc32_power8.c
@@ -0,0 +1,589 @@
+/* crc32 for POWER8 using VSX instructions
+ * Copyright (C) 2021 IBM Corporation
+ *
+ * Author: Rogerio Alves <rogealve@br.ibm.com>
+ *
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ *
+ * Calculate the checksum of data that is 16 byte aligned and a multiple of
+ * 16 bytes.
+ *
+ * The first step is to reduce it to 1024 bits. We do this in 8 parallel
+ * chunks in order to mask the latency of the vpmsum instructions. If we
+ * have more than 32 kB of data to checksum we repeat this step multiple
+ * times, passing in the previous 1024 bits.
+ *
+ * The next step is to reduce the 1024 bits to 64 bits. This step adds
+ * 32 bits of 0s to the end - this matches what a CRC does. We just
+ * calculate constants that land the data in this 32 bits.
+ *
+ * We then use fixed point Barrett reduction to compute a mod n over GF(2)
+ * for n = CRC using POWER8 instructions. We use x = 32.
+ *
+ * http://en.wikipedia.org/wiki/Barrett_reduction
+ *
+ * This code uses gcc vector builtins instead using assembly directly.
+ */
+
+#include <altivec.h>
+#include "zendian.h"
+#include "zbuild.h"
+
+#include "crc32_constants.h"
+#include "crc32_braid_tbl.h"
+
+#if defined (__clang__)
+#include "fallback_builtins.h"
+#endif
+
+#define MAX_SIZE    32768
+#define VMX_ALIGN	16
+#define VMX_ALIGN_MASK	(VMX_ALIGN-1)
+
+static unsigned int crc32_align(unsigned int crc, const unsigned char *p, unsigned long len) {
+    while (len--)
+        crc = crc_table[(crc ^ *p++) & 0xff] ^ (crc >> 8);
+    return crc;
+}
+
+static unsigned int ALIGNED_(32) __crc32_vpmsum(unsigned int crc, const void* p, unsigned long len);
+
+Z_INTERNAL uint32_t crc32_power8(uint32_t crc, const unsigned char *p, size_t _len) {
+    unsigned int prealign;
+    unsigned int tail;
+
+    unsigned long len = (unsigned long) _len;
+
+    if (p == (const unsigned char *) 0x0)
+        return 0;
+
+    crc ^= 0xffffffff;
+
+    if (len < VMX_ALIGN + VMX_ALIGN_MASK) {
+        crc = crc32_align(crc, p, len);
+        goto out;
+    }
+
+    if ((unsigned long)p & VMX_ALIGN_MASK) {
+        prealign = VMX_ALIGN - ((unsigned long)p & VMX_ALIGN_MASK);
+        crc = crc32_align(crc, p, prealign);
+        len -= prealign;
+        p += prealign;
+    }
+
+    crc = __crc32_vpmsum(crc, p, len & ~VMX_ALIGN_MASK);
+
+    tail = len & VMX_ALIGN_MASK;
+    if (tail) {
+        p += len & ~VMX_ALIGN_MASK;
+        crc = crc32_align(crc, p, tail);
+    }
+
+out:
+    crc ^= 0xffffffff;
+
+    return crc;
+}
+
+/* When we have a load-store in a single-dispatch group and address overlap
+ * such that forward is not allowed (load-hit-store) the group must be flushed.
+ * A group ending NOP prevents the flush.
+ */
+#define GROUP_ENDING_NOP __asm__("ori 2,2,0" ::: "memory")
+
+#if BYTE_ORDER == BIG_ENDIAN
+#define BYTESWAP_DATA
+#endif
+
+#ifdef BYTESWAP_DATA
+#define VEC_PERM(vr, va, vb, vc) vr = vec_perm(va, vb, (__vector unsigned char) vc)
+#if BYTE_ORDER == LITTLE_ENDIAN
+/* Byte reverse permute constant LE. */
+static const __vector unsigned long long vperm_const ALIGNED_(16) = { 0x08090A0B0C0D0E0FUL, 0x0001020304050607UL };
+#else
+static const __vector unsigned long long vperm_const ALIGNED_(16) = { 0x0F0E0D0C0B0A0908UL, 0X0706050403020100UL };
+#endif
+#else
+#define VEC_PERM(vr, va, vb, vc)
+#endif
+
+static unsigned int ALIGNED_(32) __crc32_vpmsum(unsigned int crc, const void* p, unsigned long len) {
+
+    const __vector unsigned long long vzero = {0,0};
+    const __vector unsigned long long vones = {0xffffffffffffffffUL, 0xffffffffffffffffUL};
+
+    const __vector unsigned long long vmask_32bit =
+        (__vector unsigned long long)vec_sld((__vector unsigned char)vzero, (__vector unsigned char)vones, 4);
+
+    const __vector unsigned long long vmask_64bit =
+        (__vector unsigned long long)vec_sld((__vector unsigned char)vzero, (__vector unsigned char)vones, 8);
+
+    __vector unsigned long long vcrc;
+
+    __vector unsigned long long vconst1, vconst2;
+
+    /* vdata0-vdata7 will contain our data (p). */
+    __vector unsigned long long vdata0, vdata1, vdata2, vdata3, vdata4, vdata5, vdata6, vdata7;
+
+    /* v0-v7 will contain our checksums */
+    __vector unsigned long long v0 = {0,0};
+    __vector unsigned long long v1 = {0,0};
+    __vector unsigned long long v2 = {0,0};
+    __vector unsigned long long v3 = {0,0};
+    __vector unsigned long long v4 = {0,0};
+    __vector unsigned long long v5 = {0,0};
+    __vector unsigned long long v6 = {0,0};
+    __vector unsigned long long v7 = {0,0};
+
+
+    /* Vector auxiliary variables. */
+    __vector unsigned long long va0, va1, va2, va3, va4, va5, va6, va7;
+
+    unsigned int offset; /* Constant table offset. */
+
+    unsigned long i; /* Counter. */
+    unsigned long chunks;
+
+    unsigned long block_size;
+    int next_block = 0;
+
+    /* Align by 128 bits. The last 128 bit block will be processed at end. */
+    unsigned long length = len & 0xFFFFFFFFFFFFFF80UL;
+
+    vcrc = (__vector unsigned long long)__builtin_pack_vector_int128(0UL, crc);
+
+    /* Short version. */
+    if (len < 256) {
+        /* Calculate where in the constant table we need to start. */
+        offset = 256 - len;
+
+        vconst1 = vec_ld(offset, vcrc_short_const);
+        vdata0 = vec_ld(0, (__vector unsigned long long*) p);
+        VEC_PERM(vdata0, vdata0, vconst1, vperm_const);
+
+        /* xor initial value */
+        vdata0 = vec_xor(vdata0, vcrc);
+
+        vdata0 = (__vector unsigned long long) __builtin_crypto_vpmsumw(
+            (__vector unsigned int)vdata0, (__vector unsigned int)vconst1);
+        v0 = vec_xor(v0, vdata0);
+
+        for (i = 16; i < len; i += 16) {
+            vconst1 = vec_ld(offset + i, vcrc_short_const);
+            vdata0 = vec_ld(i, (__vector unsigned long long*) p);
+            VEC_PERM(vdata0, vdata0, vconst1, vperm_const);
+            vdata0 = (__vector unsigned long long) __builtin_crypto_vpmsumw(
+                (__vector unsigned int)vdata0, (__vector unsigned int)vconst1);
+            v0 = vec_xor(v0, vdata0);
+        }
+    } else {
+
+        /* Load initial values. */
+        vdata0 = vec_ld(0, (__vector unsigned long long*) p);
+        vdata1 = vec_ld(16, (__vector unsigned long long*) p);
+
+        VEC_PERM(vdata0, vdata0, vdata0, vperm_const);
+        VEC_PERM(vdata1, vdata1, vdata1, vperm_const);
+
+        vdata2 = vec_ld(32, (__vector unsigned long long*) p);
+        vdata3 = vec_ld(48, (__vector unsigned long long*) p);
+
+        VEC_PERM(vdata2, vdata2, vdata2, vperm_const);
+        VEC_PERM(vdata3, vdata3, vdata3, vperm_const);
+
+        vdata4 = vec_ld(64, (__vector unsigned long long*) p);
+        vdata5 = vec_ld(80, (__vector unsigned long long*) p);
+
+        VEC_PERM(vdata4, vdata4, vdata4, vperm_const);
+        VEC_PERM(vdata5, vdata5, vdata5, vperm_const);
+
+        vdata6 = vec_ld(96, (__vector unsigned long long*) p);
+        vdata7 = vec_ld(112, (__vector unsigned long long*) p);
+
+        VEC_PERM(vdata6, vdata6, vdata6, vperm_const);
+        VEC_PERM(vdata7, vdata7, vdata7, vperm_const);
+
+        /* xor in initial value */
+        vdata0 = vec_xor(vdata0, vcrc);
+
+        p = (char *)p + 128;
+
+        do {
+            /* Checksum in blocks of MAX_SIZE. */
+            block_size = length;
+            if (block_size > MAX_SIZE) {
+                block_size = MAX_SIZE;
+            }
+
+            length = length - block_size;
+
+            /*
+             * Work out the offset into the constants table to start at. Each
+             * constant is 16 bytes, and it is used against 128 bytes of input
+             * data - 128 / 16 = 8
+             */
+            offset = (MAX_SIZE/8) - (block_size/8);
+            /* We reduce our final 128 bytes in a separate step */
+            chunks = (block_size/128)-1;
+
+            vconst1 = vec_ld(offset, vcrc_const);
+
+            va0 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata0,
+                                           (__vector unsigned long long)vconst1);
+            va1 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata1,
+                                           (__vector unsigned long long)vconst1);
+            va2 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata2,
+                                           (__vector unsigned long long)vconst1);
+            va3 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata3,
+                                           (__vector unsigned long long)vconst1);
+            va4 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata4,
+                                           (__vector unsigned long long)vconst1);
+            va5 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata5,
+                                           (__vector unsigned long long)vconst1);
+            va6 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata6,
+                                           (__vector unsigned long long)vconst1);
+            va7 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata7,
+                                           (__vector unsigned long long)vconst1);
+
+            if (chunks > 1) {
+                offset += 16;
+                vconst2 = vec_ld(offset, vcrc_const);
+                GROUP_ENDING_NOP;
+
+                vdata0 = vec_ld(0, (__vector unsigned long long*) p);
+                VEC_PERM(vdata0, vdata0, vdata0, vperm_const);
+
+                vdata1 = vec_ld(16, (__vector unsigned long long*) p);
+                VEC_PERM(vdata1, vdata1, vdata1, vperm_const);
+
+                vdata2 = vec_ld(32, (__vector unsigned long long*) p);
+                VEC_PERM(vdata2, vdata2, vdata2, vperm_const);
+
+                vdata3 = vec_ld(48, (__vector unsigned long long*) p);
+                VEC_PERM(vdata3, vdata3, vdata3, vperm_const);
+
+                vdata4 = vec_ld(64, (__vector unsigned long long*) p);
+                VEC_PERM(vdata4, vdata4, vdata4, vperm_const);
+
+                vdata5 = vec_ld(80, (__vector unsigned long long*) p);
+                VEC_PERM(vdata5, vdata5, vdata5, vperm_const);
+
+                vdata6 = vec_ld(96, (__vector unsigned long long*) p);
+                VEC_PERM(vdata6, vdata6, vdata6, vperm_const);
+
+                vdata7 = vec_ld(112, (__vector unsigned long long*) p);
+                VEC_PERM(vdata7, vdata7, vdata7, vperm_const);
+
+                p = (char *)p + 128;
+
+                /*
+                 * main loop. Each iteration calculates the CRC for a 128-byte
+                 * block.
+                 */
+                for (i = 0; i < chunks-2; i++) {
+                    vconst1 = vec_ld(offset, vcrc_const);
+                    offset += 16;
+                    GROUP_ENDING_NOP;
+
+                    v0 = vec_xor(v0, va0);
+                    va0 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata0,
+                                                   (__vector unsigned long long)vconst2);
+                    vdata0 = vec_ld(0, (__vector unsigned long long*) p);
+                    VEC_PERM(vdata0, vdata0, vdata0, vperm_const);
+                    GROUP_ENDING_NOP;
+
+                    v1 = vec_xor(v1, va1);
+                    va1 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata1,
+                                                   (__vector unsigned long long)vconst2);
+                    vdata1 = vec_ld(16, (__vector unsigned long long*) p);
+                    VEC_PERM(vdata1, vdata1, vdata1, vperm_const);
+                    GROUP_ENDING_NOP;
+
+                    v2 = vec_xor(v2, va2);
+                    va2 = __builtin_crypto_vpmsumd((__vector unsigned long long)
+                                                   vdata2, (__vector unsigned long long)vconst2);
+                    vdata2 = vec_ld(32, (__vector unsigned long long*) p);
+                    VEC_PERM(vdata2, vdata2, vdata2, vperm_const);
+                    GROUP_ENDING_NOP;
+
+                    v3 = vec_xor(v3, va3);
+                    va3 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata3,
+                                                   (__vector unsigned long long)vconst2);
+                    vdata3 = vec_ld(48, (__vector unsigned long long*) p);
+                    VEC_PERM(vdata3, vdata3, vdata3, vperm_const);
+
+                    vconst2 = vec_ld(offset, vcrc_const);
+                    GROUP_ENDING_NOP;
+
+                    v4 = vec_xor(v4, va4);
+                    va4 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata4,
+                                                   (__vector unsigned long long)vconst1);
+                    vdata4 = vec_ld(64, (__vector unsigned long long*) p);
+                    VEC_PERM(vdata4, vdata4, vdata4, vperm_const);
+                    GROUP_ENDING_NOP;
+
+                    v5 = vec_xor(v5, va5);
+                    va5 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata5,
+                                                   (__vector unsigned long long)vconst1);
+                    vdata5 = vec_ld(80, (__vector unsigned long long*) p);
+                    VEC_PERM(vdata5, vdata5, vdata5, vperm_const);
+                    GROUP_ENDING_NOP;
+
+                    v6 = vec_xor(v6, va6);
+                    va6 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata6,
+                                                   (__vector unsigned long long)vconst1);
+                    vdata6 = vec_ld(96, (__vector unsigned long long*) p);
+                    VEC_PERM(vdata6, vdata6, vdata6, vperm_const);
+                    GROUP_ENDING_NOP;
+
+                    v7 = vec_xor(v7, va7);
+                    va7 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata7,
+                                                   (__vector unsigned long long)vconst1);
+                    vdata7 = vec_ld(112, (__vector unsigned long long*) p);
+                    VEC_PERM(vdata7, vdata7, vdata7, vperm_const);
+
+                    p = (char *)p + 128;
+                }
+
+                /* First cool down */
+                vconst1 = vec_ld(offset, vcrc_const);
+                offset += 16;
+
+                v0 = vec_xor(v0, va0);
+                va0 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata0,
+                                               (__vector unsigned long long)vconst1);
+                GROUP_ENDING_NOP;
+
+                v1 = vec_xor(v1, va1);
+                va1 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata1,
+                                               (__vector unsigned long long)vconst1);
+                GROUP_ENDING_NOP;
+
+                v2 = vec_xor(v2, va2);
+                va2 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata2,
+                                               (__vector unsigned long long)vconst1);
+                GROUP_ENDING_NOP;
+
+                v3 = vec_xor(v3, va3);
+                va3 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata3,
+                                               (__vector unsigned long long)vconst1);
+                GROUP_ENDING_NOP;
+
+                v4 = vec_xor(v4, va4);
+                va4 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata4,
+                                               (__vector unsigned long long)vconst1);
+                GROUP_ENDING_NOP;
+
+                v5 = vec_xor(v5, va5);
+                va5 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata5,
+                                               (__vector unsigned long long)vconst1);
+                GROUP_ENDING_NOP;
+
+                v6 = vec_xor(v6, va6);
+                va6 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata6,
+                                               (__vector unsigned long long)vconst1);
+                GROUP_ENDING_NOP;
+
+                v7 = vec_xor(v7, va7);
+                va7 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata7,
+                                               (__vector unsigned long long)vconst1);
+            }/* else */
+
+            /* Second cool down. */
+            v0 = vec_xor(v0, va0);
+            v1 = vec_xor(v1, va1);
+            v2 = vec_xor(v2, va2);
+            v3 = vec_xor(v3, va3);
+            v4 = vec_xor(v4, va4);
+            v5 = vec_xor(v5, va5);
+            v6 = vec_xor(v6, va6);
+            v7 = vec_xor(v7, va7);
+
+            /*
+             * vpmsumd produces a 96 bit result in the least significant bits
+             * of the register. Since we are bit reflected we have to shift it
+             * left 32 bits so it occupies the least significant bits in the
+             * bit reflected domain.
+             */
+            v0 = (__vector unsigned long long)vec_sld((__vector unsigned char)v0,
+                                                      (__vector unsigned char)vzero, 4);
+            v1 = (__vector unsigned long long)vec_sld((__vector unsigned char)v1,
+                                                      (__vector unsigned char)vzero, 4);
+            v2 = (__vector unsigned long long)vec_sld((__vector unsigned char)v2,
+                                                      (__vector unsigned char)vzero, 4);
+            v3 = (__vector unsigned long long)vec_sld((__vector unsigned char)v3,
+                                                      (__vector unsigned char)vzero, 4);
+            v4 = (__vector unsigned long long)vec_sld((__vector unsigned char)v4,
+                                                      (__vector unsigned char)vzero, 4);
+            v5 = (__vector unsigned long long)vec_sld((__vector unsigned char)v5,
+                                                      (__vector unsigned char)vzero, 4);
+            v6 = (__vector unsigned long long)vec_sld((__vector unsigned char)v6,
+                                                      (__vector unsigned char)vzero, 4);
+            v7 = (__vector unsigned long long)vec_sld((__vector unsigned char)v7,
+                                                      (__vector unsigned char)vzero, 4);
+
+            /* xor with the last 1024 bits. */
+            va0 = vec_ld(0, (__vector unsigned long long*) p);
+            VEC_PERM(va0, va0, va0, vperm_const);
+
+            va1 = vec_ld(16, (__vector unsigned long long*) p);
+            VEC_PERM(va1, va1, va1, vperm_const);
+
+            va2 = vec_ld(32, (__vector unsigned long long*) p);
+            VEC_PERM(va2, va2, va2, vperm_const);
+
+            va3 = vec_ld(48, (__vector unsigned long long*) p);
+            VEC_PERM(va3, va3, va3, vperm_const);
+
+            va4 = vec_ld(64, (__vector unsigned long long*) p);
+            VEC_PERM(va4, va4, va4, vperm_const);
+
+            va5 = vec_ld(80, (__vector unsigned long long*) p);
+            VEC_PERM(va5, va5, va5, vperm_const);
+
+            va6 = vec_ld(96, (__vector unsigned long long*) p);
+            VEC_PERM(va6, va6, va6, vperm_const);
+
+            va7 = vec_ld(112, (__vector unsigned long long*) p);
+            VEC_PERM(va7, va7, va7, vperm_const);
+
+            p = (char *)p + 128;
+
+            vdata0 = vec_xor(v0, va0);
+            vdata1 = vec_xor(v1, va1);
+            vdata2 = vec_xor(v2, va2);
+            vdata3 = vec_xor(v3, va3);
+            vdata4 = vec_xor(v4, va4);
+            vdata5 = vec_xor(v5, va5);
+            vdata6 = vec_xor(v6, va6);
+            vdata7 = vec_xor(v7, va7);
+
+            /* Check if we have more blocks to process */
+            next_block = 0;
+            if (length != 0) {
+                next_block = 1;
+
+                /* zero v0-v7 */
+                v0 = vec_xor(v0, v0);
+                v1 = vec_xor(v1, v1);
+                v2 = vec_xor(v2, v2);
+                v3 = vec_xor(v3, v3);
+                v4 = vec_xor(v4, v4);
+                v5 = vec_xor(v5, v5);
+                v6 = vec_xor(v6, v6);
+                v7 = vec_xor(v7, v7);
+            }
+            length = length + 128;
+
+        } while (next_block);
+
+        /* Calculate how many bytes we have left. */
+        length = (len & 127);
+
+        /* Calculate where in (short) constant table we need to start. */
+        offset = 128 - length;
+
+        v0 = vec_ld(offset, vcrc_short_const);
+        v1 = vec_ld(offset + 16, vcrc_short_const);
+        v2 = vec_ld(offset + 32, vcrc_short_const);
+        v3 = vec_ld(offset + 48, vcrc_short_const);
+        v4 = vec_ld(offset + 64, vcrc_short_const);
+        v5 = vec_ld(offset + 80, vcrc_short_const);
+        v6 = vec_ld(offset + 96, vcrc_short_const);
+        v7 = vec_ld(offset + 112, vcrc_short_const);
+
+        offset += 128;
+
+        v0 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
+            (__vector unsigned int)vdata0, (__vector unsigned int)v0);
+        v1 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
+            (__vector unsigned int)vdata1, (__vector unsigned int)v1);
+        v2 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
+            (__vector unsigned int)vdata2, (__vector unsigned int)v2);
+        v3 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
+            (__vector unsigned int)vdata3, (__vector unsigned int)v3);
+        v4 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
+            (__vector unsigned int)vdata4, (__vector unsigned int)v4);
+        v5 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
+            (__vector unsigned int)vdata5, (__vector unsigned int)v5);
+        v6 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
+            (__vector unsigned int)vdata6, (__vector unsigned int)v6);
+        v7 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
+            (__vector unsigned int)vdata7, (__vector unsigned int)v7);
+
+        /* Now reduce the tail (0-112 bytes). */
+        for (i = 0; i < length; i+=16) {
+            vdata0 = vec_ld(i,(__vector unsigned long long*)p);
+            VEC_PERM(vdata0, vdata0, vdata0, vperm_const);
+            va0 = vec_ld(offset + i,vcrc_short_const);
+            va0 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
+                (__vector unsigned int)vdata0, (__vector unsigned int)va0);
+            v0 = vec_xor(v0, va0);
+        }
+
+        /* xor all parallel chunks together. */
+        v0 = vec_xor(v0, v1);
+        v2 = vec_xor(v2, v3);
+        v4 = vec_xor(v4, v5);
+        v6 = vec_xor(v6, v7);
+
+        v0 = vec_xor(v0, v2);
+        v4 = vec_xor(v4, v6);
+
+        v0 = vec_xor(v0, v4);
+    }
+
+    /* Barrett Reduction */
+    vconst1 = vec_ld(0, v_Barrett_const);
+    vconst2 = vec_ld(16, v_Barrett_const);
+
+    v1 = (__vector unsigned long long)vec_sld((__vector unsigned char)v0,
+                                              (__vector unsigned char)v0, 8);
+    v0 = vec_xor(v1,v0);
+
+    /* shift left one bit */
+    __vector unsigned char vsht_splat = vec_splat_u8 (1);
+    v0 = (__vector unsigned long long)vec_sll((__vector unsigned char)v0, vsht_splat);
+
+    v0 = vec_and(v0, vmask_64bit);
+
+    /*
+     * The reflected version of Barrett reduction. Instead of bit
+     * reflecting our data (which is expensive to do), we bit reflect our
+     * constants and our algorithm, which means the intermediate data in
+     * our vector registers goes from 0-63 instead of 63-0. We can reflect
+     * the algorithm because we don't carry in mod 2 arithmetic.
+     */
+
+    /* bottom 32 bits of a */
+    v1 = vec_and(v0, vmask_32bit);
+
+    /* ma */
+    v1 = __builtin_crypto_vpmsumd((__vector unsigned long long)v1,
+                                  (__vector unsigned long long)vconst1);
+
+    /* bottom 32bits of ma */
+    v1 = vec_and(v1, vmask_32bit);
+    /* qn */
+    v1 = __builtin_crypto_vpmsumd((__vector unsigned long long)v1,
+                                  (__vector unsigned long long)vconst2);
+    /* a - qn, subtraction is xor in GF(2) */
+    v0 = vec_xor (v0, v1);
+
+    /*
+     * Since we are bit reflected, the result (ie the low 32 bits) is in
+     * the high 32 bits. We just need to shift it left 4 bytes
+     * V0 [ 0 1 X 3 ]
+     * V0 [ 0 X 2 3 ]
+     */
+
+    /* shift result into top 64 bits of */
+    v0 = (__vector unsigned long long)vec_sld((__vector unsigned char)v0,
+                                              (__vector unsigned char)vzero, 4);
+
+#if BYTE_ORDER == BIG_ENDIAN
+    return v0[0];
+#else
+    return v0[1];
+#endif
+}
--- a/deps/zlib-ng/arch/power/fallback_builtins.h
+++ b/deps/zlib-ng/arch/power/fallback_builtins.h
@@ -0,0 +1,31 @@
+/* Helper functions to work around issues with clang builtins
+ * Copyright (C) 2021 IBM Corporation
+ *
+ * Authors:
+ *   Daniel Black <daniel@linux.vnet.ibm.com>
+ *   Rogerio Alves <rogealve@br.ibm.com>
+ *   Tulio Magno Quites Machado Filho <tuliom@linux.ibm.com>
+ *
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef POWER_BUILTINS_H
+#define POWER_BUILTINS_H
+
+/*
+ * These stubs fix clang incompatibilities with GCC builtins.
+ */
+
+#ifndef __builtin_crypto_vpmsumw
+#define __builtin_crypto_vpmsumw __builtin_crypto_vpmsumb
+#endif
+#ifndef __builtin_crypto_vpmsumd
+#define __builtin_crypto_vpmsumd __builtin_crypto_vpmsumb
+#endif
+
+static inline __vector unsigned long long __attribute__((overloadable))
+vec_ld(int __a, const __vector unsigned long long* __b) {
+    return (__vector unsigned long long)__builtin_altivec_lvx(__a, __b);
+}
+
+#endif
--- a/deps/zlib-ng/arch/power/power_features.c
+++ b/deps/zlib-ng/arch/power/power_features.c
@@ -0,0 +1,42 @@
+/* power_features.c - POWER feature check
+ * Copyright (C) 2020 Matheus Castanho <msc@linux.ibm.com>, IBM
+ * Copyright (C) 2021-2022 Mika T. Lindqvist <postmaster@raasu.org>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef HAVE_SYS_AUXV_H
+#  include <sys/auxv.h>
+#endif
+#ifdef __FreeBSD__
+#  include <machine/cpu.h>
+#endif
+#include "../../zbuild.h"
+#include "power_features.h"
+
+void Z_INTERNAL power_check_features(struct power_cpu_features *features) {
+#ifdef PPC_FEATURES
+    unsigned long hwcap;
+#ifdef __FreeBSD__
+    elf_aux_info(AT_HWCAP, &hwcap, sizeof(hwcap));
+#else
+    hwcap = getauxval(AT_HWCAP);
+#endif
+
+    if (hwcap & PPC_FEATURE_HAS_ALTIVEC)
+        features->has_altivec = 1;
+#endif
+
+#ifdef POWER_FEATURES
+    unsigned long hwcap2;
+#ifdef __FreeBSD__
+    elf_aux_info(AT_HWCAP2, &hwcap2, sizeof(hwcap2));
+#else
+    hwcap2 = getauxval(AT_HWCAP2);
+#endif
+
+    if (hwcap2 & PPC_FEATURE2_ARCH_2_07)
+        features->has_arch_2_07 = 1;
+    if (hwcap2 & PPC_FEATURE2_ARCH_3_00)
+        features->has_arch_3_00 = 1;
+#endif
+}
--- a/deps/zlib-ng/arch/power/power_features.h
+++ b/deps/zlib-ng/arch/power/power_features.h
@@ -0,0 +1,18 @@
+/* power_features.h -- check for POWER CPU features
+ * Copyright (C) 2020 Matheus Castanho <msc@linux.ibm.com>, IBM
+ * Copyright (C) 2021 Mika T. Lindqvist <postmaster@raasu.org>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef POWER_H_
+#define POWER_H_
+
+struct power_cpu_features {
+    int has_altivec;
+    int has_arch_2_07;
+    int has_arch_3_00;
+};
+
+void Z_INTERNAL power_check_features(struct power_cpu_features *features);
+
+#endif /* POWER_H_ */
--- a/deps/zlib-ng/arch/power/slide_hash_power8.c
+++ b/deps/zlib-ng/arch/power/slide_hash_power8.c
@@ -0,0 +1,12 @@
+/* Optimized slide_hash for POWER processors
+ * Copyright (C) 2019-2020 IBM Corporation
+ * Author: Matheus Castanho <msc@linux.ibm.com>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef POWER8_VSX
+
+#define SLIDE_PPC slide_hash_power8
+#include "slide_ppc_tpl.h"
+
+#endif /* POWER8_VSX */
--- a/deps/zlib-ng/arch/power/slide_hash_vmx.c
+++ b/deps/zlib-ng/arch/power/slide_hash_vmx.c
@@ -0,0 +1,10 @@
+/* Optimized slide_hash for PowerPC processors with VMX instructions
+ * Copyright (C) 2017-2021 Mika T. Lindqvist <postmaster@raasu.org>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+#ifdef PPC_VMX
+
+#define SLIDE_PPC slide_hash_vmx
+#include "slide_ppc_tpl.h"
+
+#endif /* PPC_VMX */
--- a/deps/zlib-ng/arch/power/slide_ppc_tpl.h
+++ b/deps/zlib-ng/arch/power/slide_ppc_tpl.h
@@ -0,0 +1,31 @@
+/* Optimized slide_hash for PowerPC processors
+ * Copyright (C) 2017-2021 Mika T. Lindqvist <postmaster@raasu.org>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include <altivec.h>
+#include "zbuild.h"
+#include "deflate.h"
+
+static inline void slide_hash_chain(Pos *table, uint32_t entries, uint16_t wsize) {
+    const vector unsigned short vmx_wsize = vec_splats(wsize);
+    Pos *p = table;
+
+    do {
+        vector unsigned short value, result;
+
+        value = vec_ld(0, p);
+        result = vec_subs(value, vmx_wsize);
+        vec_st(result, 0, p);
+
+        p += 8;
+        entries -= 8;
+   } while (entries > 0);
+}
+
+void Z_INTERNAL SLIDE_PPC(deflate_state *s) {
+    uint16_t wsize = s->w_size;
+
+    slide_hash_chain(s->head, HASH_SIZE, wsize);
+    slide_hash_chain(s->prev, wsize, wsize);
+}
--- a/deps/zlib-ng/arch/riscv/README.md
+++ b/deps/zlib-ng/arch/riscv/README.md
@@ -0,0 +1,45 @@
+# Building RISC-V Target with Cmake #
+
+> **Warning**
+> We cannot detect rvv support at runtime, running the rvv code on a no-rvv target is a risk. Users should disable the rvv when the target does not support it. 
+>
+> We will have a better solution when the kernels update `hwcap` or `hwprobe` for risc-v.
+
+## Prerequisite: Build RISC-V Clang Toolchain and QEMU ##
+
+If you don't have prebuilt clang and riscv64 qemu, you can refer to the [script](https://github.com/sifive/prepare-riscv-toolchain-qemu/blob/main/prepare_riscv_toolchain_qemu.sh) to get the source. Copy the script to the zlib-ng root directory, and run it to download the source and build them. Modify the content according to your conditions (e.g., toolchain version).
+
+```bash
+./prepare_riscv_toolchain_qemu.sh
+```
+
+After running script, clang & qemu are built in `build-toolchain-qemu/riscv-clang/` & `build-toolchain-qemu/riscv-qemu/`.
+
+`build-toolchain-qemu/riscv-clang/` is your `TOOLCHAIN_PATH`.
+`build-toolchain-qemu/riscv-qemu/bin/qemu-riscv64` is your `QEMU_PATH`.
+
+You can also download the prebuilt toolchain & qemu from [the release page](https://github.com/sifive/prepare-riscv-toolchain-qemu/releases), and enjoy using them.
+
+## Cross-Compile for RISC-V Target ##
+
+```bash
+cmake -G Ninja -B ./build-riscv \
+  -D CMAKE_TOOLCHAIN_FILE=./cmake/toolchain-riscv.cmake \
+  -D CMAKE_INSTALL_PREFIX=./build-riscv/install \
+  -D TOOLCHAIN_PATH={TOOLCHAIN_PATH} \
+  -D QEMU_PATH={QEMU_PATH} \
+  .
+
+cmake --build ./build-riscv
+```
+
+Disable the option if there is no RVV support:
+```
+-D WITH_RVV=OFF
+```
+
+## Run Unittests on User Mode QEMU ##
+
+```bash
+cd ./build-riscv && ctest --verbose
+```
--- a/deps/zlib-ng/arch/riscv/riscv_features.c
+++ b/deps/zlib-ng/arch/riscv/riscv_features.c
@@ -0,0 +1,15 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "../../zbuild.h"
+#include "riscv_features.h"
+
+/* TODO: detect risc-v cpu info at runtime when the kernel updates hwcap or hwprobe for risc-v */
+void Z_INTERNAL riscv_check_features(struct riscv_cpu_features *features) {
+#if defined(__riscv_v) && defined(__linux__)
+    features->has_rvv = 1;
+#else
+    features->has_rvv = 0;
+#endif
+}
--- a/deps/zlib-ng/arch/riscv/riscv_features.h
+++ b/deps/zlib-ng/arch/riscv/riscv_features.h
@@ -0,0 +1,18 @@
+/* riscv_features.h -- check for riscv features.
+ *
+ * Copyright (C) 2023 SiFive, Inc. All rights reserved.
+ * Contributed by Alex Chiang <alex.chiang@sifive.com>
+ * 
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef RISCV_H_
+#define RISCV_H_
+
+struct riscv_cpu_features {
+    int has_rvv;
+};
+
+void Z_INTERNAL riscv_check_features(struct riscv_cpu_features *features);
+
+#endif /* RISCV_H_ */
--- a/deps/zlib-ng/arch/s390/Makefile.in
+++ b/deps/zlib-ng/arch/s390/Makefile.in
@@ -0,0 +1,54 @@
+# Makefile for zlib-ng
+# Copyright (C) 1995-2013 Jean-loup Gailly, Mark Adler
+# For conditions of distribution and use, see copyright notice in zlib.h
+
+CC=
+CFLAGS=
+SFLAGS=
+INCLUDES=
+SUFFIX=
+VGFMAFLAG=
+NOLTOFLAG=
+
+SRCDIR=.
+SRCTOP=../..
+TOPDIR=$(SRCTOP)
+
+s390_features.o:
+	$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/s390_features.c
+
+s390_features.lo:
+	$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/s390_features.c
+
+dfltcc_common.o:
+	$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/dfltcc_common.c
+
+dfltcc_common.lo:
+	$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/dfltcc_common.c
+
+dfltcc_deflate.o:
+	$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/dfltcc_deflate.c
+
+dfltcc_deflate.lo:
+	$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/dfltcc_deflate.c
+
+dfltcc_inflate.o:
+	$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/dfltcc_inflate.c
+
+dfltcc_inflate.lo:
+	$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/dfltcc_inflate.c
+
+crc32-vx.o:
+	$(CC) $(CFLAGS) $(VGFMAFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32-vx.c
+
+crc32-vx.lo:
+	$(CC) $(SFLAGS) $(VGFMAFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32-vx.c
+
+mostlyclean: clean
+clean:
+	rm -f *.o *.lo *~
+	rm -rf objs
+	rm -f *.gcda *.gcno *.gcov
+
+distclean: clean
+	rm -f Makefile
--- a/deps/zlib-ng/arch/s390/README.md
+++ b/deps/zlib-ng/arch/s390/README.md
@@ -0,0 +1,284 @@
+# Introduction
+
+This directory contains SystemZ deflate hardware acceleration support.
+It can be enabled using the following build commands:
+
+    $ ./configure --with-dfltcc-deflate --with-dfltcc-inflate
+    $ make
+
+or
+
+    $ cmake -DWITH_DFLTCC_DEFLATE=1 -DWITH_DFLTCC_INFLATE=1 .
+    $ make
+
+When built like this, zlib-ng would compress using hardware on level 1,
+and using software on all other levels. Decompression will always happen
+in hardware. In order to enable hardware compression for levels 1-6
+(i.e. to make it used by default) one could add
+`-DDFLTCC_LEVEL_MASK=0x7e` to CFLAGS when building zlib-ng.
+
+SystemZ deflate hardware acceleration is available on [IBM z15](
+https://www.ibm.com/products/z15) and newer machines under the name [
+"Integrated Accelerator for zEnterprise Data Compression"](
+https://www.ibm.com/support/z-content-solutions/compression/). The
+programming interface to it is a machine instruction called DEFLATE
+CONVERSION CALL (DFLTCC). It is documented in Chapter 26 of [Principles
+of Operation](https://publibfp.dhe.ibm.com/epubs/pdf/a227832c.pdf). Both
+the code and the rest of this document refer to this feature simply as
+"DFLTCC".
+
+# Performance
+
+Performance figures are published [here](
+https://github.com/iii-i/zlib-ng/wiki/Performance-with-dfltcc-patch-applied-and-dfltcc-support-built-on-dfltcc-enabled-machine
+). The compression speed-up can be as high as 110x and the decompression
+speed-up can be as high as 15x.
+
+# Limitations
+
+Two DFLTCC compression calls with identical inputs are not guaranteed to
+produce identical outputs. Therefore care should be taken when using
+hardware compression when reproducible results are desired. In
+particular, zlib-ng-specific `zng_deflateSetParams` call allows setting
+`Z_DEFLATE_REPRODUCIBLE` parameter, which disables DFLTCC support for a
+particular stream.
+
+DFLTCC does not support every single zlib-ng feature, in particular:
+
+* `inflate(Z_BLOCK)` and `inflate(Z_TREES)`
+* `inflateMark()`
+* `inflatePrime()`
+* `inflateSyncPoint()`
+
+When used, these functions will either switch to software, or, in case
+this is not possible, gracefully fail.
+
+# Code structure
+
+All SystemZ-specific code lives in `arch/s390` directory and is
+integrated with the rest of zlib-ng using hook macros.
+
+## Hook macros
+
+DFLTCC takes as arguments a parameter block, an input buffer, an output
+buffer and a window. `ZALLOC_DEFLATE_STATE()`, `ZALLOC_INFLATE_STATE()`,
+`ZFREE_STATE()`, `ZCOPY_DEFLATE_STATE()`, `ZCOPY_INFLATE_STATE()`,
+`ZALLOC_WINDOW()`, `ZCOPY_WINDOW()` and `TRY_FREE_WINDOW()` macros encapsulate
+allocation  details for the parameter block (which is allocated alongside
+zlib-ng state) and the window (which must be page-aligned and large enough).
+
+Software and hardware window formats do not match, therefore,
+`deflateSetDictionary()`, `deflateGetDictionary()`, `inflateSetDictionary()`
+and `inflateGetDictionary()` need special handling, which is triggered using
+`DEFLATE_SET_DICTIONARY_HOOK()`, `DEFLATE_GET_DICTIONARY_HOOK()`,
+`INFLATE_SET_DICTIONARY_HOOK()` and `INFLATE_GET_DICTIONARY_HOOK()` macros.
+
+`deflateResetKeep()` and `inflateResetKeep()` update the DFLTCC
+parameter block using `DEFLATE_RESET_KEEP_HOOK()` and
+`INFLATE_RESET_KEEP_HOOK()` macros.
+
+`INFLATE_PRIME_HOOK()`, `INFLATE_MARK_HOOK()` and
+`INFLATE_SYNC_POINT_HOOK()` macros make the respective unsupported
+calls gracefully fail.
+
+`DEFLATE_PARAMS_HOOK()` implements switching between hardware and
+software compression mid-stream using `deflateParams()`. Switching
+normally entails flushing the current block, which might not be possible
+in low memory situations. `deflateParams()` uses `DEFLATE_DONE()` hook
+in order to detect and gracefully handle such situations.
+
+The algorithm implemented in hardware has different compression ratio
+than the one implemented in software. `DEFLATE_BOUND_ADJUST_COMPLEN()`
+and `DEFLATE_NEED_CONSERVATIVE_BOUND()` macros make `deflateBound()`
+return the correct results for the hardware implementation.
+
+Actual compression and decompression are handled by `DEFLATE_HOOK()` and
+`INFLATE_TYPEDO_HOOK()` macros. Since inflation with DFLTCC manages the
+window on its own, calling `updatewindow()` is suppressed using
+`INFLATE_NEED_UPDATEWINDOW()` macro.
+
+In addition to compression, DFLTCC computes CRC-32 and Adler-32
+checksums, therefore, whenever it's used, software checksumming is
+suppressed using `DEFLATE_NEED_CHECKSUM()` and `INFLATE_NEED_CHECKSUM()`
+macros.
+
+While software always produces reproducible compression results, this
+is not the case for DFLTCC. Therefore, zlib-ng users are given the
+ability to specify whether or not reproducible compression results
+are required. While it is always possible to specify this setting
+before the compression begins, it is not always possible to do so in
+the middle of a deflate stream - the exact conditions for that are
+determined by `DEFLATE_CAN_SET_REPRODUCIBLE()` macro.
+
+## SystemZ-specific code
+
+When zlib-ng is built with DFLTCC, the hooks described above are
+converted to calls to functions, which are implemented in
+`arch/s390/dfltcc_*` files. The functions can be grouped in three broad
+categories:
+
+* Base DFLTCC support, e.g. wrapping the machine instruction -
+  `dfltcc()` and allocating aligned memory - `dfltcc_alloc_state()`.
+* Translating between software and hardware data formats, e.g.
+  `dfltcc_deflate_set_dictionary()`.
+* Translating between software and hardware state machines, e.g.
+  `dfltcc_deflate()` and `dfltcc_inflate()`.
+
+The functions from the first two categories are fairly simple, however,
+various quirks in both software and hardware state machines make the
+functions from the third category quite complicated.
+
+### `dfltcc_deflate()` function
+
+This function is called by `deflate()` and has the following
+responsibilities:
+
+* Checking whether DFLTCC can be used with the current stream. If this
+  is not the case, then it returns `0`, making `deflate()` use some
+  other function in order to compress in software. Otherwise it returns
+  `1`.
+* Block management and Huffman table generation. DFLTCC ends blocks only
+  when explicitly instructed to do so by the software. Furthermore,
+  whether to use fixed or dynamic Huffman tables must also be determined
+  by the software. Since looking at data in order to gather statistics
+  would negate performance benefits, the following approach is used: the
+  first `DFLTCC_FIRST_FHT_BLOCK_SIZE` bytes are placed into a fixed
+  block, and every next `DFLTCC_BLOCK_SIZE` bytes are placed into
+  dynamic blocks.
+* Writing EOBS. Block Closing Control bit in the parameter block
+  instructs DFLTCC to write EOBS, however, certain conditions need to be
+  met: input data length must be non-zero or Continuation Flag must be
+  set. To put this in simpler terms, DFLTCC will silently refuse to
+  write EOBS if this is the only thing that it is asked to do. Since the
+  code has to be able to emit EOBS in software anyway, in order to avoid
+  tricky corner cases Block Closing Control is never used. Whether to
+  write EOBS is instead controlled by `soft_bcc` variable.
+* Triggering block post-processing. Depending on flush mode, `deflate()`
+  must perform various additional actions when a block or a stream ends.
+  `dfltcc_deflate()` informs `deflate()` about this using
+  `block_state *result` parameter.
+* Converting software state fields into hardware parameter block fields,
+  and vice versa. For example, `wrap` and Check Value Type or `bi_valid`
+  and Sub-Byte Boundary. Certain fields cannot be translated and must
+  persist untouched in the parameter block between calls, for example,
+  Continuation Flag or Continuation State Buffer.
+* Handling flush modes and low-memory situations. These aspects are
+  quite intertwined and pervasive. The general idea here is that the
+  code must not do anything in software - whether explicitly by e.g.
+  calling `send_eobs()`, or implicitly - by returning to `deflate()`
+  with certain return and `*result` values, when Continuation Flag is
+  set.
+* Ending streams. When a new block is started and flush mode is
+  `Z_FINISH`, Block Header Final parameter block bit is used to mark
+  this block as final. However, sometimes an empty final block is
+  needed, and, unfortunately, just like with EOBS, DFLTCC will silently
+  refuse to do this. The general idea of DFLTCC implementation is to
+  rely as much as possible on the existing code. Here in order to do
+  this, the code pretends that it does not support DFLTCC, which makes
+  `deflate()` call a software compression function, which writes an
+  empty final block. Whether this is required is controlled by
+  `need_empty_block` variable.
+* Error handling. This is simply converting
+  Operation-Ending-Supplemental Code to string. Errors can only happen
+  due to things like memory corruption, and therefore they don't affect
+  the `deflate()` return code.
+
+### `dfltcc_inflate()` function
+
+This function is called by `inflate()` from the `TYPEDO` state (that is,
+when all the metadata is parsed and the stream is positioned at the type
+bits of deflate block header) and it's responsible for the following:
+
+* Falling back to software when flush mode is `Z_BLOCK` or `Z_TREES`.
+  Unfortunately, there is no way to ask DFLTCC to stop decompressing on
+  block or tree boundary.
+* `inflate()` decompression loop management. This is controlled using
+  the return value, which can be either `DFLTCC_INFLATE_BREAK` or
+  `DFLTCC_INFLATE_CONTINUE`.
+* Converting software state fields into hardware parameter block fields,
+  and vice versa. For example, `whave` and History Length or `wnext` and
+  History Offset.
+* Ending streams. This instructs `inflate()` to return `Z_STREAM_END`
+  and is controlled by `last` state field.
+* Error handling. Like deflate, error handling comprises
+  Operation-Ending-Supplemental Code to string conversion. Unlike
+  deflate, errors may happen due to bad inputs, therefore they are
+  propagated to `inflate()` by setting `mode` field to `MEM` or `BAD`.
+
+# Testing
+
+Given complexity of DFLTCC machine instruction, it is not clear whether
+QEMU TCG will ever support it. At the time of writing, one has to have
+access to an IBM z15+ VM or LPAR in order to test DFLTCC support. Since
+DFLTCC is a non-privileged instruction, neither special VM/LPAR
+configuration nor root are required.
+
+zlib-ng CI uses an IBM-provided z15 self-hosted builder for the DFLTCC
+testing. There are no IBM Z builds of GitHub Actions runner, and
+stable qemu-user has problems with .NET apps, so the builder runs the
+x86_64 runner version with qemu-user built from the master branch.
+
+## Configuring the builder.
+
+### Install prerequisites.
+
+```
+$ sudo dnf install docker
+```
+
+### Add services.
+
+```
+$ sudo cp self-hosted-builder/*.service /etc/systemd/system/
+$ sudo systemctl daemon-reload
+```
+
+### Create a config file.
+
+```
+$ sudo tee /etc/actions-runner
+repo=<owner>/<name>
+access_token=<ghp_***>
+```
+
+Access token should have the repo scope, consult
+https://docs.github.com/en/rest/reference/actions#create-a-registration-token-for-a-repository
+for details.
+
+### Autostart the x86_64 emulation support.
+
+```
+$ sudo systemctl enable --now qemu-user-static
+```
+
+### Autostart the runner.
+
+```
+$ sudo systemctl enable --now actions-runner
+```
+
+## Rebuilding the image
+
+In order to update the `iiilinuxibmcom/actions-runner` image, e.g. to get the
+latest OS security fixes, use the following commands:
+
+```
+$ sudo docker build \
+      --pull \
+      -f self-hosted-builder/actions-runner.Dockerfile \
+      -t iiilinuxibmcom/actions-runner
+$ sudo systemctl restart actions-runner
+```
+
+## Removing persistent data
+
+The `actions-runner` service stores various temporary data, such as runner
+registration information, work directories and logs, in the `actions-runner`
+volume. In order to remove it and start from scratch, e.g. when switching the
+runner to a different repository, use the following commands:
+
+```
+$ sudo systemctl stop actions-runner
+$ sudo docker rm -f actions-runner
+$ sudo docker volume rm actions-runner
+```
--- a/deps/zlib-ng/arch/s390/crc32-vx.c
+++ b/deps/zlib-ng/arch/s390/crc32-vx.c
@@ -0,0 +1,222 @@
+/*
+ * Hardware-accelerated CRC-32 variants for Linux on z Systems
+ *
+ * Use the z/Architecture Vector Extension Facility to accelerate the
+ * computing of bitreflected CRC-32 checksums.
+ *
+ * This CRC-32 implementation algorithm is bitreflected and processes
+ * the least-significant bit first (Little-Endian).
+ *
+ * This code was originally written by Hendrik Brueckner
+ * <brueckner@linux.vnet.ibm.com> for use in the Linux kernel and has been
+ * relicensed under the zlib license.
+ */
+
+#include "../../zbuild.h"
+#include "crc32_braid_p.h"
+
+#include <vecintrin.h>
+
+typedef unsigned char uv16qi __attribute__((vector_size(16)));
+typedef unsigned int uv4si __attribute__((vector_size(16)));
+typedef unsigned long long uv2di __attribute__((vector_size(16)));
+
+static uint32_t crc32_le_vgfm_16(uint32_t crc, const uint8_t *buf, size_t len) {
+    /*
+     * The CRC-32 constant block contains reduction constants to fold and
+     * process particular chunks of the input data stream in parallel.
+     *
+     * For the CRC-32 variants, the constants are precomputed according to
+     * these definitions:
+     *
+     *      R1 = [(x4*128+32 mod P'(x) << 32)]' << 1
+     *      R2 = [(x4*128-32 mod P'(x) << 32)]' << 1
+     *      R3 = [(x128+32 mod P'(x) << 32)]'   << 1
+     *      R4 = [(x128-32 mod P'(x) << 32)]'   << 1
+     *      R5 = [(x64 mod P'(x) << 32)]'       << 1
+     *      R6 = [(x32 mod P'(x) << 32)]'       << 1
+     *
+     *      The bitreflected Barret reduction constant, u', is defined as
+     *      the bit reversal of floor(x**64 / P(x)).
+     *
+     *      where P(x) is the polynomial in the normal domain and the P'(x) is the
+     *      polynomial in the reversed (bitreflected) domain.
+     *
+     * CRC-32 (IEEE 802.3 Ethernet, ...) polynomials:
+     *
+     *      P(x)  = 0x04C11DB7
+     *      P'(x) = 0xEDB88320
+     */
+    const uv16qi perm_le2be = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0};  /* BE->LE mask */
+    const uv2di r2r1 = {0x1C6E41596, 0x154442BD4};                                     /* R2, R1 */
+    const uv2di r4r3 = {0x0CCAA009E, 0x1751997D0};                                     /* R4, R3 */
+    const uv2di r5 = {0, 0x163CD6124};                                                 /* R5 */
+    const uv2di ru_poly = {0, 0x1F7011641};                                            /* u' */
+    const uv2di crc_poly = {0, 0x1DB710641};                                           /* P'(x) << 1 */
+
+    /*
+     * Load the initial CRC value.
+     *
+     * The CRC value is loaded into the rightmost word of the
+     * vector register and is later XORed with the LSB portion
+     * of the loaded input data.
+     */
+    uv2di v0 = {0, 0};
+    v0 = (uv2di)vec_insert(crc, (uv4si)v0, 3);
+
+    /* Load a 64-byte data chunk and XOR with CRC */
+    uv2di v1 = vec_perm(((uv2di *)buf)[0], ((uv2di *)buf)[0], perm_le2be);
+    uv2di v2 = vec_perm(((uv2di *)buf)[1], ((uv2di *)buf)[1], perm_le2be);
+    uv2di v3 = vec_perm(((uv2di *)buf)[2], ((uv2di *)buf)[2], perm_le2be);
+    uv2di v4 = vec_perm(((uv2di *)buf)[3], ((uv2di *)buf)[3], perm_le2be);
+
+    v1 ^= v0;
+    buf += 64;
+    len -= 64;
+
+    while (len >= 64) {
+        /* Load the next 64-byte data chunk */
+        uv16qi part1 = vec_perm(((uv16qi *)buf)[0], ((uv16qi *)buf)[0], perm_le2be);
+        uv16qi part2 = vec_perm(((uv16qi *)buf)[1], ((uv16qi *)buf)[1], perm_le2be);
+        uv16qi part3 = vec_perm(((uv16qi *)buf)[2], ((uv16qi *)buf)[2], perm_le2be);
+        uv16qi part4 = vec_perm(((uv16qi *)buf)[3], ((uv16qi *)buf)[3], perm_le2be);
+
+        /*
+         * Perform a GF(2) multiplication of the doublewords in V1 with
+         * the R1 and R2 reduction constants in V0.  The intermediate result
+         * is then folded (accumulated) with the next data chunk in PART1 and
+         * stored in V1. Repeat this step for the register contents
+         * in V2, V3, and V4 respectively.
+         */
+        v1 = (uv2di)vec_gfmsum_accum_128(r2r1, v1, part1);
+        v2 = (uv2di)vec_gfmsum_accum_128(r2r1, v2, part2);
+        v3 = (uv2di)vec_gfmsum_accum_128(r2r1, v3, part3);
+        v4 = (uv2di)vec_gfmsum_accum_128(r2r1, v4, part4);
+
+        buf += 64;
+        len -= 64;
+    }
+
+    /*
+     * Fold V1 to V4 into a single 128-bit value in V1.  Multiply V1 with R3
+     * and R4 and accumulating the next 128-bit chunk until a single 128-bit
+     * value remains.
+     */
+    v1 = (uv2di)vec_gfmsum_accum_128(r4r3, v1, (uv16qi)v2);
+    v1 = (uv2di)vec_gfmsum_accum_128(r4r3, v1, (uv16qi)v3);
+    v1 = (uv2di)vec_gfmsum_accum_128(r4r3, v1, (uv16qi)v4);
+
+    while (len >= 16) {
+        /* Load next data chunk */
+        v2 = vec_perm(*(uv2di *)buf, *(uv2di *)buf, perm_le2be);
+
+        /* Fold next data chunk */
+        v1 = (uv2di)vec_gfmsum_accum_128(r4r3, v1, (uv16qi)v2);
+
+        buf += 16;
+        len -= 16;
+    }
+
+    /*
+     * Set up a vector register for byte shifts.  The shift value must
+     * be loaded in bits 1-4 in byte element 7 of a vector register.
+     * Shift by 8 bytes: 0x40
+     * Shift by 4 bytes: 0x20
+     */
+    uv16qi v9 = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+    v9 = vec_insert((unsigned char)0x40, v9, 7);
+
+    /*
+     * Prepare V0 for the next GF(2) multiplication: shift V0 by 8 bytes
+     * to move R4 into the rightmost doubleword and set the leftmost
+     * doubleword to 0x1.
+     */
+    v0 = vec_srb(r4r3, (uv2di)v9);
+    v0[0] = 1;
+
+    /*
+     * Compute GF(2) product of V1 and V0.  The rightmost doubleword
+     * of V1 is multiplied with R4.  The leftmost doubleword of V1 is
+     * multiplied by 0x1 and is then XORed with rightmost product.
+     * Implicitly, the intermediate leftmost product becomes padded
+     */
+    v1 = (uv2di)vec_gfmsum_128(v0, v1);
+
+    /*
+     * Now do the final 32-bit fold by multiplying the rightmost word
+     * in V1 with R5 and XOR the result with the remaining bits in V1.
+     *
+     * To achieve this by a single VGFMAG, right shift V1 by a word
+     * and store the result in V2 which is then accumulated.  Use the
+     * vector unpack instruction to load the rightmost half of the
+     * doubleword into the rightmost doubleword element of V1; the other
+     * half is loaded in the leftmost doubleword.
+     * The vector register with CONST_R5 contains the R5 constant in the
+     * rightmost doubleword and the leftmost doubleword is zero to ignore
+     * the leftmost product of V1.
+     */
+    v9 = vec_insert((unsigned char)0x20, v9, 7);
+    v2 = vec_srb(v1, (uv2di)v9);
+    v1 = vec_unpackl((uv4si)v1);  /* Split rightmost doubleword */
+    v1 = (uv2di)vec_gfmsum_accum_128(r5, v1, (uv16qi)v2);
+
+    /*
+     * Apply a Barret reduction to compute the final 32-bit CRC value.
+     *
+     * The input values to the Barret reduction are the degree-63 polynomial
+     * in V1 (R(x)), degree-32 generator polynomial, and the reduction
+     * constant u.  The Barret reduction result is the CRC value of R(x) mod
+     * P(x).
+     *
+     * The Barret reduction algorithm is defined as:
+     *
+     *    1. T1(x) = floor( R(x) / x^32 ) GF2MUL u
+     *    2. T2(x) = floor( T1(x) / x^32 ) GF2MUL P(x)
+     *    3. C(x)  = R(x) XOR T2(x) mod x^32
+     *
+     *  Note: The leftmost doubleword of vector register containing
+     *  CONST_RU_POLY is zero and, thus, the intermediate GF(2) product
+     *  is zero and does not contribute to the final result.
+     */
+
+    /* T1(x) = floor( R(x) / x^32 ) GF2MUL u */
+    v2 = vec_unpackl((uv4si)v1);
+    v2 = (uv2di)vec_gfmsum_128(ru_poly, v2);
+
+    /*
+     * Compute the GF(2) product of the CRC polynomial with T1(x) in
+     * V2 and XOR the intermediate result, T2(x), with the value in V1.
+     * The final result is stored in word element 2 of V2.
+     */
+    v2 = vec_unpackl((uv4si)v2);
+    v2 = (uv2di)vec_gfmsum_accum_128(crc_poly, v2, (uv16qi)v1);
+
+    return ((uv4si)v2)[2];
+}
+
+#define VX_MIN_LEN 64
+#define VX_ALIGNMENT 16L
+#define VX_ALIGN_MASK (VX_ALIGNMENT - 1)
+
+uint32_t Z_INTERNAL crc32_s390_vx(uint32_t crc, const unsigned char *buf, size_t len) {
+    size_t prealign, aligned, remaining;
+
+    if (len < VX_MIN_LEN + VX_ALIGN_MASK)
+        return PREFIX(crc32_braid)(crc, buf, len);
+
+    if ((uintptr_t)buf & VX_ALIGN_MASK) {
+        prealign = VX_ALIGNMENT - ((uintptr_t)buf & VX_ALIGN_MASK);
+        len -= prealign;
+        crc = PREFIX(crc32_braid)(crc, buf, prealign);
+        buf += prealign;
+    }
+    aligned = len & ~VX_ALIGN_MASK;
+    remaining = len & VX_ALIGN_MASK;
+
+    crc = crc32_le_vgfm_16(crc ^ 0xffffffff, buf, aligned) ^ 0xffffffff;
+
+    if (remaining)
+        crc = PREFIX(crc32_braid)(crc, buf + aligned, remaining);
+
+    return crc;
+}
--- a/deps/zlib-ng/arch/s390/dfltcc_common.c
+++ b/deps/zlib-ng/arch/s390/dfltcc_common.c
@@ -0,0 +1,40 @@
+/* dfltcc_deflate.c - IBM Z DEFLATE CONVERSION CALL general support. */
+
+#include "zbuild.h"
+#include "dfltcc_common.h"
+#include "dfltcc_detail.h"
+
+/*
+   Memory management.
+
+   DFLTCC requires parameter blocks and window to be aligned. zlib-ng allows
+   users to specify their own allocation functions, so using e.g.
+   `posix_memalign' is not an option. Thus, we overallocate and take the
+   aligned portion of the buffer.
+*/
+
+static const int PAGE_ALIGN = 0x1000;
+
+void Z_INTERNAL *PREFIX(dfltcc_alloc_window)(PREFIX3(streamp) strm, uInt items, uInt size) {
+    void *p;
+    void *w;
+
+    /* To simplify freeing, we store the pointer to the allocated buffer right
+     * before the window. Note that DFLTCC always uses HB_SIZE bytes.
+     */
+    p = ZALLOC(strm, sizeof(void *) + MAX(items * size, HB_SIZE) + PAGE_ALIGN, sizeof(unsigned char));
+    if (p == NULL)
+        return NULL;
+    w = ALIGN_UP((char *)p + sizeof(void *), PAGE_ALIGN);
+    *(void **)((char *)w - sizeof(void *)) = p;
+    return w;
+}
+
+void Z_INTERNAL PREFIX(dfltcc_copy_window)(void *dest, const void *src, size_t n) {
+    memcpy(dest, src, MAX(n, HB_SIZE));
+}
+
+void Z_INTERNAL PREFIX(dfltcc_free_window)(PREFIX3(streamp) strm, void *w) {
+    if (w)
+        ZFREE(strm, *(void **)((unsigned char *)w - sizeof(void *)));
+}
--- a/deps/zlib-ng/arch/s390/dfltcc_common.h
+++ b/deps/zlib-ng/arch/s390/dfltcc_common.h
@@ -0,0 +1,44 @@
+#ifndef DFLTCC_COMMON_H
+#define DFLTCC_COMMON_H
+
+#include "zutil.h"
+
+void Z_INTERNAL *PREFIX(dfltcc_alloc_window)(PREFIX3(streamp) strm, uInt items, uInt size);
+void Z_INTERNAL PREFIX(dfltcc_copy_window)(void *dest, const void *src, size_t n);
+void Z_INTERNAL PREFIX(dfltcc_free_window)(PREFIX3(streamp) strm, void *w);
+
+#define ZFREE_STATE ZFREE
+
+#define ZALLOC_WINDOW PREFIX(dfltcc_alloc_window)
+
+#define ZCOPY_WINDOW PREFIX(dfltcc_copy_window)
+
+#define ZFREE_WINDOW PREFIX(dfltcc_free_window)
+
+#define TRY_FREE_WINDOW PREFIX(dfltcc_free_window)
+
+#define DFLTCC_BLOCK_HEADER_BITS 3
+#define DFLTCC_HLITS_COUNT_BITS 5
+#define DFLTCC_HDISTS_COUNT_BITS 5
+#define DFLTCC_HCLENS_COUNT_BITS 4
+#define DFLTCC_MAX_HCLENS 19
+#define DFLTCC_HCLEN_BITS 3
+#define DFLTCC_MAX_HLITS 286
+#define DFLTCC_MAX_HDISTS 30
+#define DFLTCC_MAX_HLIT_HDIST_BITS 7
+#define DFLTCC_MAX_SYMBOL_BITS 16
+#define DFLTCC_MAX_EOBS_BITS 15
+#define DFLTCC_MAX_PADDING_BITS 7
+
+#define DEFLATE_BOUND_COMPLEN(source_len) \
+    ((DFLTCC_BLOCK_HEADER_BITS + \
+      DFLTCC_HLITS_COUNT_BITS + \
+      DFLTCC_HDISTS_COUNT_BITS + \
+      DFLTCC_HCLENS_COUNT_BITS + \
+      DFLTCC_MAX_HCLENS * DFLTCC_HCLEN_BITS + \
+      (DFLTCC_MAX_HLITS + DFLTCC_MAX_HDISTS) * DFLTCC_MAX_HLIT_HDIST_BITS + \
+      (source_len) * DFLTCC_MAX_SYMBOL_BITS + \
+      DFLTCC_MAX_EOBS_BITS + \
+      DFLTCC_MAX_PADDING_BITS) >> 3)
+
+#endif
--- a/deps/zlib-ng/arch/s390/dfltcc_deflate.c
+++ b/deps/zlib-ng/arch/s390/dfltcc_deflate.c
@@ -0,0 +1,404 @@
+/* dfltcc_deflate.c - IBM Z DEFLATE CONVERSION CALL compression support. */
+
+/*
+   Use the following commands to build zlib-ng with DFLTCC compression support:
+
+        $ ./configure --with-dfltcc-deflate
+   or
+
+        $ cmake -DWITH_DFLTCC_DEFLATE=1 .
+
+   and then
+
+        $ make
+*/
+
+#include "zbuild.h"
+#include "deflate.h"
+#include "trees_emit.h"
+#include "dfltcc_deflate.h"
+#include "dfltcc_detail.h"
+
+struct dfltcc_deflate_state {
+    struct dfltcc_state common;
+    uint16_t level_mask;               /* Levels on which to use DFLTCC */
+    uint32_t block_size;               /* New block each X bytes */
+    size_t block_threshold;            /* New block after total_in > X */
+    uint32_t dht_threshold;            /* New block only if avail_in >= X */
+};
+
+#define GET_DFLTCC_DEFLATE_STATE(state) ((struct dfltcc_deflate_state *)GET_DFLTCC_STATE(state))
+
+void Z_INTERNAL *PREFIX(dfltcc_alloc_deflate_state)(PREFIX3(streamp) strm) {
+    return dfltcc_alloc_state(strm, sizeof(deflate_state), sizeof(struct dfltcc_deflate_state));
+}
+
+void Z_INTERNAL PREFIX(dfltcc_reset_deflate_state)(PREFIX3(streamp) strm) {
+    deflate_state *state = (deflate_state *)strm->state;
+    struct dfltcc_deflate_state *dfltcc_state = GET_DFLTCC_DEFLATE_STATE(state);
+
+    dfltcc_reset_state(&dfltcc_state->common);
+
+    /* Initialize tuning parameters */
+    dfltcc_state->level_mask = DFLTCC_LEVEL_MASK;
+    dfltcc_state->block_size = DFLTCC_BLOCK_SIZE;
+    dfltcc_state->block_threshold = DFLTCC_FIRST_FHT_BLOCK_SIZE;
+    dfltcc_state->dht_threshold = DFLTCC_DHT_MIN_SAMPLE_SIZE;
+}
+
+void Z_INTERNAL PREFIX(dfltcc_copy_deflate_state)(void *dst, const void *src) {
+    dfltcc_copy_state(dst, src, sizeof(deflate_state), sizeof(struct dfltcc_deflate_state));
+}
+
+static inline int dfltcc_can_deflate_with_params(PREFIX3(streamp) strm, int level, uInt window_bits, int strategy,
+                                       int reproducible) {
+    deflate_state *state = (deflate_state *)strm->state;
+    struct dfltcc_deflate_state *dfltcc_state = GET_DFLTCC_DEFLATE_STATE(state);
+
+    /* Unsupported compression settings */
+    if ((dfltcc_state->level_mask & (1 << level)) == 0)
+        return 0;
+    if (window_bits != HB_BITS)
+        return 0;
+    if (strategy != Z_FIXED && strategy != Z_DEFAULT_STRATEGY)
+        return 0;
+    if (reproducible)
+        return 0;
+
+    /* Unsupported hardware */
+    if (!is_bit_set(dfltcc_state->common.af.fns, DFLTCC_GDHT) ||
+            !is_bit_set(dfltcc_state->common.af.fns, DFLTCC_CMPR) ||
+            !is_bit_set(dfltcc_state->common.af.fmts, DFLTCC_FMT0))
+        return 0;
+
+    return 1;
+}
+
+int Z_INTERNAL PREFIX(dfltcc_can_deflate)(PREFIX3(streamp) strm) {
+    deflate_state *state = (deflate_state *)strm->state;
+
+    return dfltcc_can_deflate_with_params(strm, state->level, state->w_bits, state->strategy, state->reproducible);
+}
+
+static inline void dfltcc_gdht(PREFIX3(streamp) strm) {
+    deflate_state *state = (deflate_state *)strm->state;
+    struct dfltcc_param_v0 *param = &GET_DFLTCC_STATE(state)->param;
+    size_t avail_in = strm->avail_in;
+
+    dfltcc(DFLTCC_GDHT, param, NULL, NULL, &strm->next_in, &avail_in, NULL);
+}
+
+static inline dfltcc_cc dfltcc_cmpr(PREFIX3(streamp) strm) {
+    deflate_state *state = (deflate_state *)strm->state;
+    struct dfltcc_param_v0 *param = &GET_DFLTCC_STATE(state)->param;
+    size_t avail_in = strm->avail_in;
+    size_t avail_out = strm->avail_out;
+    dfltcc_cc cc;
+
+    cc = dfltcc(DFLTCC_CMPR | HBT_CIRCULAR,
+                param, &strm->next_out, &avail_out,
+                &strm->next_in, &avail_in, state->window);
+    strm->total_in += (strm->avail_in - avail_in);
+    strm->total_out += (strm->avail_out - avail_out);
+    strm->avail_in = avail_in;
+    strm->avail_out = avail_out;
+    return cc;
+}
+
+static inline void send_eobs(PREFIX3(streamp) strm, const struct dfltcc_param_v0 *param) {
+    deflate_state *state = (deflate_state *)strm->state;
+
+    send_bits(state, PREFIX(bi_reverse)(param->eobs >> (15 - param->eobl), param->eobl), param->eobl, state->bi_buf, state->bi_valid);
+    PREFIX(flush_pending)(strm);
+    if (state->pending != 0) {
+        /* The remaining data is located in pending_out[0:pending]. If someone
+         * calls put_byte() - this might happen in deflate() - the byte will be
+         * placed into pending_buf[pending], which is incorrect. Move the
+         * remaining data to the beginning of pending_buf so that put_byte() is
+         * usable again.
+         */
+        memmove(state->pending_buf, state->pending_out, state->pending);
+        state->pending_out = state->pending_buf;
+    }
+#ifdef ZLIB_DEBUG
+    state->compressed_len += param->eobl;
+#endif
+}
+
+int Z_INTERNAL PREFIX(dfltcc_deflate)(PREFIX3(streamp) strm, int flush, block_state *result) {
+    deflate_state *state = (deflate_state *)strm->state;
+    struct dfltcc_deflate_state *dfltcc_state = GET_DFLTCC_DEFLATE_STATE(state);
+    struct dfltcc_param_v0 *param = &dfltcc_state->common.param;
+    uInt masked_avail_in;
+    dfltcc_cc cc;
+    int need_empty_block;
+    int soft_bcc;
+    int no_flush;
+
+    if (!PREFIX(dfltcc_can_deflate)(strm)) {
+        /* Clear history. */
+        if (flush == Z_FULL_FLUSH)
+            param->hl = 0;
+        return 0;
+    }
+
+again:
+    masked_avail_in = 0;
+    soft_bcc = 0;
+    no_flush = flush == Z_NO_FLUSH;
+
+    /* No input data. Return, except when Continuation Flag is set, which means
+     * that DFLTCC has buffered some output in the parameter block and needs to
+     * be called again in order to flush it.
+     */
+    if (strm->avail_in == 0 && !param->cf) {
+        /* A block is still open, and the hardware does not support closing
+         * blocks without adding data. Thus, close it manually.
+         */
+        if (!no_flush && param->bcf) {
+            send_eobs(strm, param);
+            param->bcf = 0;
+        }
+        /* Let one of deflate_* functions write a trailing empty block. */
+        if (flush == Z_FINISH)
+            return 0;
+        /* Clear history. */
+        if (flush == Z_FULL_FLUSH)
+            param->hl = 0;
+        /* Trigger block post-processing if necessary. */
+        *result = no_flush ? need_more : block_done;
+        return 1;
+    }
+
+    /* There is an open non-BFINAL block, we are not going to close it just
+     * yet, we have compressed more than DFLTCC_BLOCK_SIZE bytes and we see
+     * more than DFLTCC_DHT_MIN_SAMPLE_SIZE bytes. Open a new block with a new
+     * DHT in order to adapt to a possibly changed input data distribution.
+     */
+    if (param->bcf && no_flush &&
+            strm->total_in > dfltcc_state->block_threshold &&
+            strm->avail_in >= dfltcc_state->dht_threshold) {
+        if (param->cf) {
+            /* We need to flush the DFLTCC buffer before writing the
+             * End-of-block Symbol. Mask the input data and proceed as usual.
+             */
+            masked_avail_in += strm->avail_in;
+            strm->avail_in = 0;
+            no_flush = 0;
+        } else {
+            /* DFLTCC buffer is empty, so we can manually write the
+             * End-of-block Symbol right away.
+             */
+            send_eobs(strm, param);
+            param->bcf = 0;
+            dfltcc_state->block_threshold = strm->total_in + dfltcc_state->block_size;
+        }
+    }
+
+    /* No space for compressed data. If we proceed, dfltcc_cmpr() will return
+     * DFLTCC_CC_OP1_TOO_SHORT without buffering header bits, but we will still
+     * set BCF=1, which is wrong. Avoid complications and return early.
+     */
+    if (strm->avail_out == 0) {
+        *result = need_more;
+        return 1;
+    }
+
+    /* The caller gave us too much data. Pass only one block worth of
+     * uncompressed data to DFLTCC and mask the rest, so that on the next
+     * iteration we start a new block.
+     */
+    if (no_flush && strm->avail_in > dfltcc_state->block_size) {
+        masked_avail_in += (strm->avail_in - dfltcc_state->block_size);
+        strm->avail_in = dfltcc_state->block_size;
+    }
+
+    /* When we have an open non-BFINAL deflate block and caller indicates that
+     * the stream is ending, we need to close an open deflate block and open a
+     * BFINAL one.
+     */
+    need_empty_block = flush == Z_FINISH && param->bcf && !param->bhf;
+
+    /* Translate stream to parameter block */
+    param->cvt = state->wrap == 2 ? CVT_CRC32 : CVT_ADLER32;
+    if (!no_flush)
+        /* We need to close a block. Always do this in software - when there is
+         * no input data, the hardware will not honor BCC. */
+        soft_bcc = 1;
+    if (flush == Z_FINISH && !param->bcf)
+        /* We are about to open a BFINAL block, set Block Header Final bit
+         * until the stream ends.
+         */
+        param->bhf = 1;
+    /* DFLTCC-CMPR will write to next_out, so make sure that buffers with
+     * higher precedence are empty.
+     */
+    Assert(state->pending == 0, "There must be no pending bytes");
+    Assert(state->bi_valid < 8, "There must be less than 8 pending bits");
+    param->sbb = (unsigned int)state->bi_valid;
+    if (param->sbb > 0)
+        *strm->next_out = (unsigned char)state->bi_buf;
+    /* Honor history and check value */
+    param->nt = 0;
+    if (state->wrap == 1)
+        param->cv = strm->adler;
+    else if (state->wrap == 2)
+        param->cv = ZSWAP32(state->crc_fold.value);
+
+    /* When opening a block, choose a Huffman-Table Type */
+    if (!param->bcf) {
+        if (state->strategy == Z_FIXED || (strm->total_in == 0 && dfltcc_state->block_threshold > 0))
+            param->htt = HTT_FIXED;
+        else {
+            param->htt = HTT_DYNAMIC;
+            dfltcc_gdht(strm);
+        }
+    }
+
+    /* Deflate */
+    do {
+        cc = dfltcc_cmpr(strm);
+        if (strm->avail_in < 4096 && masked_avail_in > 0)
+            /* We are about to call DFLTCC with a small input buffer, which is
+             * inefficient. Since there is masked data, there will be at least
+             * one more DFLTCC call, so skip the current one and make the next
+             * one handle more data.
+             */
+            break;
+    } while (cc == DFLTCC_CC_AGAIN);
+
+    /* Translate parameter block to stream */
+    strm->msg = oesc_msg(dfltcc_state->common.msg, param->oesc);
+    state->bi_valid = param->sbb;
+    if (state->bi_valid == 0)
+        state->bi_buf = 0; /* Avoid accessing next_out */
+    else
+        state->bi_buf = *strm->next_out & ((1 << state->bi_valid) - 1);
+    if (state->wrap == 1)
+        strm->adler = param->cv;
+    else if (state->wrap == 2)
+        state->crc_fold.value = ZSWAP32(param->cv);
+
+    /* Unmask the input data */
+    strm->avail_in += masked_avail_in;
+    masked_avail_in = 0;
+
+    /* If we encounter an error, it means there is a bug in DFLTCC call */
+    Assert(cc != DFLTCC_CC_OP2_CORRUPT || param->oesc == 0, "BUG");
+
+    /* Update Block-Continuation Flag. It will be used to check whether to call
+     * GDHT the next time.
+     */
+    if (cc == DFLTCC_CC_OK) {
+        if (soft_bcc) {
+            send_eobs(strm, param);
+            param->bcf = 0;
+            dfltcc_state->block_threshold = strm->total_in + dfltcc_state->block_size;
+        } else
+            param->bcf = 1;
+        if (flush == Z_FINISH) {
+            if (need_empty_block)
+                /* Make the current deflate() call also close the stream */
+                return 0;
+            else {
+                bi_windup(state);
+                *result = finish_done;
+            }
+        } else {
+            if (flush == Z_FULL_FLUSH)
+                param->hl = 0; /* Clear history */
+            *result = flush == Z_NO_FLUSH ? need_more : block_done;
+        }
+    } else {
+        param->bcf = 1;
+        *result = need_more;
+    }
+    if (strm->avail_in != 0 && strm->avail_out != 0)
+        goto again; /* deflate() must use all input or all output */
+    return 1;
+}
+
+/*
+   Switching between hardware and software compression.
+
+   DFLTCC does not support all zlib settings, e.g. generation of non-compressed
+   blocks or alternative window sizes. When such settings are applied on the
+   fly with deflateParams, we need to convert between hardware and software
+   window formats.
+*/
+static int dfltcc_was_deflate_used(PREFIX3(streamp) strm) {
+    deflate_state *state = (deflate_state *)strm->state;
+    struct dfltcc_param_v0 *param = &GET_DFLTCC_STATE(state)->param;
+
+    return strm->total_in > 0 || param->nt == 0 || param->hl > 0;
+}
+
+int Z_INTERNAL PREFIX(dfltcc_deflate_params)(PREFIX3(streamp) strm, int level, int strategy, int *flush) {
+    deflate_state *state = (deflate_state *)strm->state;
+    int could_deflate = PREFIX(dfltcc_can_deflate)(strm);
+    int can_deflate = dfltcc_can_deflate_with_params(strm, level, state->w_bits, strategy, state->reproducible);
+
+    if (can_deflate == could_deflate)
+        /* We continue to work in the same mode - no changes needed */
+        return Z_OK;
+
+    if (!dfltcc_was_deflate_used(strm))
+        /* DFLTCC was not used yet - no changes needed */
+        return Z_OK;
+
+    /* For now, do not convert between window formats - simply get rid of the old data instead */
+    *flush = Z_FULL_FLUSH;
+    return Z_OK;
+}
+
+int Z_INTERNAL PREFIX(dfltcc_deflate_done)(PREFIX3(streamp) strm, int flush) {
+    deflate_state *state = (deflate_state *)strm->state;
+    struct dfltcc_state *dfltcc_state = GET_DFLTCC_STATE(state);
+    struct dfltcc_param_v0 *param = &dfltcc_state->param;
+
+    /* When deflate(Z_FULL_FLUSH) is called with small avail_out, it might
+     * close the block without resetting the compression state. Detect this
+     * situation and return that deflation is not done.
+     */
+    if (flush == Z_FULL_FLUSH && strm->avail_out == 0)
+        return 0;
+
+    /* Return that deflation is not done if DFLTCC is used and either it
+     * buffered some data (Continuation Flag is set), or has not written EOBS
+     * yet (Block-Continuation Flag is set).
+     */
+    return !PREFIX(dfltcc_can_deflate)(strm) || (!param->cf && !param->bcf);
+}
+
+int Z_INTERNAL PREFIX(dfltcc_can_set_reproducible)(PREFIX3(streamp) strm, int reproducible) {
+    deflate_state *state = (deflate_state *)strm->state;
+
+    return reproducible != state->reproducible && !dfltcc_was_deflate_used(strm);
+}
+
+/*
+   Preloading history.
+*/
+int Z_INTERNAL PREFIX(dfltcc_deflate_set_dictionary)(PREFIX3(streamp) strm,
+                                                const unsigned char *dictionary, uInt dict_length) {
+    deflate_state *state = (deflate_state *)strm->state;
+    struct dfltcc_state *dfltcc_state = GET_DFLTCC_STATE(state);
+    struct dfltcc_param_v0 *param = &dfltcc_state->param;
+
+    append_history(param, state->window, dictionary, dict_length);
+    state->strstart = 1; /* Add FDICT to zlib header */
+    state->block_start = state->strstart; /* Make deflate_stored happy */
+    return Z_OK;
+}
+
+int Z_INTERNAL PREFIX(dfltcc_deflate_get_dictionary)(PREFIX3(streamp) strm, unsigned char *dictionary, uInt *dict_length) {
+    deflate_state *state = (deflate_state *)strm->state;
+    struct dfltcc_state *dfltcc_state = GET_DFLTCC_STATE(state);
+    struct dfltcc_param_v0 *param = &dfltcc_state->param;
+
+    if (dictionary)
+        get_history(param, state->window, dictionary);
+    if (dict_length)
+        *dict_length = param->hl;
+    return Z_OK;
+}
--- a/deps/zlib-ng/arch/s390/dfltcc_deflate.h
+++ b/deps/zlib-ng/arch/s390/dfltcc_deflate.h
@@ -0,0 +1,60 @@
+#ifndef DFLTCC_DEFLATE_H
+#define DFLTCC_DEFLATE_H
+
+#include "dfltcc_common.h"
+
+void Z_INTERNAL *PREFIX(dfltcc_alloc_deflate_state)(PREFIX3(streamp));
+void Z_INTERNAL PREFIX(dfltcc_reset_deflate_state)(PREFIX3(streamp));
+void Z_INTERNAL PREFIX(dfltcc_copy_deflate_state)(void *dst, const void *src);
+int Z_INTERNAL PREFIX(dfltcc_can_deflate)(PREFIX3(streamp) strm);
+int Z_INTERNAL PREFIX(dfltcc_deflate)(PREFIX3(streamp) strm, int flush, block_state *result);
+int Z_INTERNAL PREFIX(dfltcc_deflate_params)(PREFIX3(streamp) strm, int level, int strategy, int *flush);
+int Z_INTERNAL PREFIX(dfltcc_deflate_done)(PREFIX3(streamp) strm, int flush);
+int Z_INTERNAL PREFIX(dfltcc_can_set_reproducible)(PREFIX3(streamp) strm, int reproducible);
+int Z_INTERNAL PREFIX(dfltcc_deflate_set_dictionary)(PREFIX3(streamp) strm,
+                                                const unsigned char *dictionary, uInt dict_length);
+int Z_INTERNAL PREFIX(dfltcc_deflate_get_dictionary)(PREFIX3(streamp) strm, unsigned char *dictionary, uInt* dict_length);
+
+#define ZALLOC_DEFLATE_STATE PREFIX(dfltcc_alloc_deflate_state)
+#define ZCOPY_DEFLATE_STATE PREFIX(dfltcc_copy_deflate_state)
+
+#define DEFLATE_SET_DICTIONARY_HOOK(strm, dict, dict_len) \
+    do { \
+        if (PREFIX(dfltcc_can_deflate)((strm))) \
+            return PREFIX(dfltcc_deflate_set_dictionary)((strm), (dict), (dict_len)); \
+    } while (0)
+
+#define DEFLATE_GET_DICTIONARY_HOOK(strm, dict, dict_len) \
+    do { \
+        if (PREFIX(dfltcc_can_deflate)((strm))) \
+            return PREFIX(dfltcc_deflate_get_dictionary)((strm), (dict), (dict_len)); \
+    } while (0)
+
+#define DEFLATE_RESET_KEEP_HOOK PREFIX(dfltcc_reset_deflate_state)
+
+#define DEFLATE_PARAMS_HOOK(strm, level, strategy, hook_flush) \
+    do { \
+        int err; \
+\
+        err = PREFIX(dfltcc_deflate_params)((strm), (level), (strategy), (hook_flush)); \
+        if (err == Z_STREAM_ERROR) \
+            return err; \
+    } while (0)
+
+#define DEFLATE_DONE PREFIX(dfltcc_deflate_done)
+
+#define DEFLATE_BOUND_ADJUST_COMPLEN(strm, complen, source_len) \
+    do { \
+        if (deflateStateCheck((strm)) || PREFIX(dfltcc_can_deflate)((strm))) \
+            (complen) = DEFLATE_BOUND_COMPLEN(source_len); \
+    } while (0)
+
+#define DEFLATE_NEED_CONSERVATIVE_BOUND(strm) (PREFIX(dfltcc_can_deflate)((strm)))
+
+#define DEFLATE_HOOK PREFIX(dfltcc_deflate)
+
+#define DEFLATE_NEED_CHECKSUM(strm) (!PREFIX(dfltcc_can_deflate)((strm)))
+
+#define DEFLATE_CAN_SET_REPRODUCIBLE PREFIX(dfltcc_can_set_reproducible)
+
+#endif
--- a/deps/zlib-ng/arch/s390/dfltcc_detail.h
+++ b/deps/zlib-ng/arch/s390/dfltcc_detail.h
@@ -0,0 +1,312 @@
+#include "../../zbuild.h"
+#include <stdio.h>
+
+#ifdef HAVE_SYS_SDT_H
+#include <sys/sdt.h>
+#endif
+
+/*
+   Tuning parameters.
+ */
+#ifndef DFLTCC_LEVEL_MASK
+#define DFLTCC_LEVEL_MASK 0x2
+#endif
+#ifndef DFLTCC_BLOCK_SIZE
+#define DFLTCC_BLOCK_SIZE 1048576
+#endif
+#ifndef DFLTCC_FIRST_FHT_BLOCK_SIZE
+#define DFLTCC_FIRST_FHT_BLOCK_SIZE 4096
+#endif
+#ifndef DFLTCC_DHT_MIN_SAMPLE_SIZE
+#define DFLTCC_DHT_MIN_SAMPLE_SIZE 4096
+#endif
+#ifndef DFLTCC_RIBM
+#define DFLTCC_RIBM 0
+#endif
+
+/*
+   Parameter Block for Query Available Functions.
+ */
+#define static_assert(c, msg) __attribute__((unused)) static char static_assert_failed_ ## msg[c ? 1 : -1]
+
+struct dfltcc_qaf_param {
+    char fns[16];
+    char reserved1[8];
+    char fmts[2];
+    char reserved2[6];
+};
+
+#define DFLTCC_SIZEOF_QAF 32
+static_assert(sizeof(struct dfltcc_qaf_param) == DFLTCC_SIZEOF_QAF, qaf);
+
+static inline int is_bit_set(const char *bits, int n) {
+    return bits[n / 8] & (1 << (7 - (n % 8)));
+}
+
+static inline void clear_bit(char *bits, int n) {
+    bits[n / 8] &= ~(1 << (7 - (n % 8)));
+}
+
+#define DFLTCC_FACILITY 151
+
+static inline int is_dfltcc_enabled(void) {
+    uint64_t facilities[(DFLTCC_FACILITY / 64) + 1];
+    Z_REGISTER uint8_t r0 __asm__("r0");
+
+    memset(facilities, 0, sizeof(facilities));
+    r0 = sizeof(facilities) / sizeof(facilities[0]) - 1;
+    /* STFLE is supported since z9-109 and only in z/Architecture mode. When
+     * compiling with -m31, gcc defaults to ESA mode, however, since the kernel
+     * is 64-bit, it's always z/Architecture mode at runtime.
+     */
+    __asm__ volatile(
+#ifndef __clang__
+                     ".machinemode push\n"
+                     ".machinemode zarch\n"
+#endif
+                     "stfle %[facilities]\n"
+#ifndef __clang__
+                     ".machinemode pop\n"
+#endif
+                     : [facilities] "=Q" (facilities), [r0] "+r" (r0) :: "cc");
+    return is_bit_set((const char *)facilities, DFLTCC_FACILITY);
+}
+
+#define DFLTCC_FMT0 0
+
+/*
+   Parameter Block for Generate Dynamic-Huffman Table, Compress and Expand.
+ */
+#define CVT_CRC32 0
+#define CVT_ADLER32 1
+#define HTT_FIXED 0
+#define HTT_DYNAMIC 1
+
+struct dfltcc_param_v0 {
+    uint16_t pbvn;                     /* Parameter-Block-Version Number */
+    uint8_t mvn;                       /* Model-Version Number */
+    uint8_t ribm;                      /* Reserved for IBM use */
+    uint32_t reserved32 : 31;
+    uint32_t cf : 1;                   /* Continuation Flag */
+    uint8_t reserved64[8];
+    uint32_t nt : 1;                   /* New Task */
+    uint32_t reserved129 : 1;
+    uint32_t cvt : 1;                  /* Check Value Type */
+    uint32_t reserved131 : 1;
+    uint32_t htt : 1;                  /* Huffman-Table Type */
+    uint32_t bcf : 1;                  /* Block-Continuation Flag */
+    uint32_t bcc : 1;                  /* Block Closing Control */
+    uint32_t bhf : 1;                  /* Block Header Final */
+    uint32_t reserved136 : 1;
+    uint32_t reserved137 : 1;
+    uint32_t dhtgc : 1;                /* DHT Generation Control */
+    uint32_t reserved139 : 5;
+    uint32_t reserved144 : 5;
+    uint32_t sbb : 3;                  /* Sub-Byte Boundary */
+    uint8_t oesc;                      /* Operation-Ending-Supplemental Code */
+    uint32_t reserved160 : 12;
+    uint32_t ifs : 4;                  /* Incomplete-Function Status */
+    uint16_t ifl;                      /* Incomplete-Function Length */
+    uint8_t reserved192[8];
+    uint8_t reserved256[8];
+    uint8_t reserved320[4];
+    uint16_t hl;                       /* History Length */
+    uint32_t reserved368 : 1;
+    uint16_t ho : 15;                  /* History Offset */
+    uint32_t cv;                       /* Check Value */
+    uint32_t eobs : 15;                /* End-of-block Symbol */
+    uint32_t reserved431: 1;
+    uint8_t eobl : 4;                  /* End-of-block Length */
+    uint32_t reserved436 : 12;
+    uint32_t reserved448 : 4;
+    uint16_t cdhtl : 12;               /* Compressed-Dynamic-Huffman Table
+                                          Length */
+    uint8_t reserved464[6];
+    uint8_t cdht[288];                 /* Compressed-Dynamic-Huffman Table */
+    uint8_t reserved[24];
+    uint8_t ribm2[8];                  /* Reserved for IBM use */
+    uint8_t csb[1152];                 /* Continuation-State Buffer */
+};
+
+#define DFLTCC_SIZEOF_GDHT_V0 384
+#define DFLTCC_SIZEOF_CMPR_XPND_V0 1536
+static_assert(offsetof(struct dfltcc_param_v0, csb) == DFLTCC_SIZEOF_GDHT_V0, gdht_v0);
+static_assert(sizeof(struct dfltcc_param_v0) == DFLTCC_SIZEOF_CMPR_XPND_V0, cmpr_xpnd_v0);
+
+static inline z_const char *oesc_msg(char *buf, int oesc) {
+    if (oesc == 0x00)
+        return NULL; /* Successful completion */
+    else {
+        sprintf(buf, "Operation-Ending-Supplemental Code is 0x%.2X", oesc);
+        return buf;
+    }
+}
+
+/*
+   C wrapper for the DEFLATE CONVERSION CALL instruction.
+ */
+typedef enum {
+    DFLTCC_CC_OK = 0,
+    DFLTCC_CC_OP1_TOO_SHORT = 1,
+    DFLTCC_CC_OP2_TOO_SHORT = 2,
+    DFLTCC_CC_OP2_CORRUPT = 2,
+    DFLTCC_CC_AGAIN = 3,
+} dfltcc_cc;
+
+#define DFLTCC_QAF 0
+#define DFLTCC_GDHT 1
+#define DFLTCC_CMPR 2
+#define DFLTCC_XPND 4
+#define HBT_CIRCULAR (1 << 7)
+#define DFLTCC_FN_MASK ((1 << 7) - 1)
+#define HB_BITS 15
+#define HB_SIZE (1 << HB_BITS)
+
+static inline dfltcc_cc dfltcc(int fn, void *param,
+                               unsigned char **op1, size_t *len1,
+                               z_const unsigned char **op2, size_t *len2, void *hist) {
+    unsigned char *t2 = op1 ? *op1 : NULL;
+#ifdef Z_MEMORY_SANITIZER
+    unsigned char *orig_t2 = t2;
+#endif
+    size_t t3 = len1 ? *len1 : 0;
+    z_const unsigned char *t4 = op2 ? *op2 : NULL;
+    size_t t5 = len2 ? *len2 : 0;
+    Z_REGISTER int r0 __asm__("r0") = fn;
+    Z_REGISTER void *r1 __asm__("r1") = param;
+    Z_REGISTER unsigned char *r2 __asm__("r2") = t2;
+    Z_REGISTER size_t r3 __asm__("r3") = t3;
+    Z_REGISTER z_const unsigned char *r4 __asm__("r4") = t4;
+    Z_REGISTER size_t r5 __asm__("r5") = t5;
+    int cc;
+
+    __asm__ volatile(
+#ifdef HAVE_SYS_SDT_H
+                     STAP_PROBE_ASM(zlib, dfltcc_entry, STAP_PROBE_ASM_TEMPLATE(5))
+#endif
+                     ".insn rrf,0xb9390000,%[r2],%[r4],%[hist],0\n"
+#ifdef HAVE_SYS_SDT_H
+                     STAP_PROBE_ASM(zlib, dfltcc_exit, STAP_PROBE_ASM_TEMPLATE(5))
+#endif
+                     "ipm %[cc]\n"
+                     : [r2] "+r" (r2)
+                     , [r3] "+r" (r3)
+                     , [r4] "+r" (r4)
+                     , [r5] "+r" (r5)
+                     , [cc] "=r" (cc)
+                     : [r0] "r" (r0)
+                     , [r1] "r" (r1)
+                     , [hist] "r" (hist)
+#ifdef HAVE_SYS_SDT_H
+                     , STAP_PROBE_ASM_OPERANDS(5, r2, r3, r4, r5, hist)
+#endif
+                     : "cc", "memory");
+    t2 = r2; t3 = r3; t4 = r4; t5 = r5;
+
+#ifdef Z_MEMORY_SANITIZER
+    switch (fn & DFLTCC_FN_MASK) {
+    case DFLTCC_QAF:
+        __msan_unpoison(param, DFLTCC_SIZEOF_QAF);
+        break;
+    case DFLTCC_GDHT:
+        __msan_unpoison(param, DFLTCC_SIZEOF_GDHT_V0);
+        break;
+    case DFLTCC_CMPR:
+        __msan_unpoison(param, DFLTCC_SIZEOF_CMPR_XPND_V0);
+        __msan_unpoison(orig_t2, t2 - orig_t2 + (((struct dfltcc_param_v0 *)param)->sbb == 0 ? 0 : 1));
+        break;
+    case DFLTCC_XPND:
+        __msan_unpoison(param, DFLTCC_SIZEOF_CMPR_XPND_V0);
+        __msan_unpoison(orig_t2, t2 - orig_t2);
+        break;
+    }
+#endif
+
+    if (op1)
+        *op1 = t2;
+    if (len1)
+        *len1 = t3;
+    if (op2)
+        *op2 = t4;
+    if (len2)
+        *len2 = t5;
+    return (cc >> 28) & 3;
+}
+
+/*
+   Extension of inflate_state and deflate_state. Must be doubleword-aligned.
+*/
+struct dfltcc_state {
+    struct dfltcc_param_v0 param;      /* Parameter block. */
+    struct dfltcc_qaf_param af;        /* Available functions. */
+    char msg[64];                      /* Buffer for strm->msg */
+};
+
+#define ALIGN_UP(p, size) (__typeof__(p))(((uintptr_t)(p) + ((size) - 1)) & ~((size) - 1))
+
+#define GET_DFLTCC_STATE(state) ((struct dfltcc_state *)((char *)(state) + ALIGN_UP(sizeof(*state), 8)))
+
+static inline void *dfltcc_alloc_state(PREFIX3(streamp) strm, uInt size, uInt extension_size) {
+    return ZALLOC(strm, 1, ALIGN_UP(size, 8) + extension_size);
+}
+
+static inline void dfltcc_reset_state(struct dfltcc_state *dfltcc_state) {
+    /* Initialize available functions */
+    if (is_dfltcc_enabled()) {
+        dfltcc(DFLTCC_QAF, &dfltcc_state->param, NULL, NULL, NULL, NULL, NULL);
+        memmove(&dfltcc_state->af, &dfltcc_state->param, sizeof(dfltcc_state->af));
+    } else
+        memset(&dfltcc_state->af, 0, sizeof(dfltcc_state->af));
+
+    /* Initialize parameter block */
+    memset(&dfltcc_state->param, 0, sizeof(dfltcc_state->param));
+    dfltcc_state->param.nt = 1;
+    dfltcc_state->param.ribm = DFLTCC_RIBM;
+}
+
+static inline void dfltcc_copy_state(void *dst, const void *src, uInt size, uInt extension_size) {
+    memcpy(dst, src, ALIGN_UP(size, 8) + extension_size);
+}
+
+static inline void append_history(struct dfltcc_param_v0 *param, unsigned char *history,
+                                  const unsigned char *buf, uInt count) {
+    size_t offset;
+    size_t n;
+
+    /* Do not use more than 32K */
+    if (count > HB_SIZE) {
+        buf += count - HB_SIZE;
+        count = HB_SIZE;
+    }
+    offset = (param->ho + param->hl) % HB_SIZE;
+    if (offset + count <= HB_SIZE)
+        /* Circular history buffer does not wrap - copy one chunk */
+        memcpy(history + offset, buf, count);
+    else {
+        /* Circular history buffer wraps - copy two chunks */
+        n = HB_SIZE - offset;
+        memcpy(history + offset, buf, n);
+        memcpy(history, buf + n, count - n);
+    }
+    n = param->hl + count;
+    if (n <= HB_SIZE)
+        /* All history fits into buffer - no need to discard anything */
+        param->hl = n;
+    else {
+        /* History does not fit into buffer - discard extra bytes */
+        param->ho = (param->ho + (n - HB_SIZE)) % HB_SIZE;
+        param->hl = HB_SIZE;
+    }
+}
+
+static inline void get_history(struct dfltcc_param_v0 *param, const unsigned char *history,
+                               unsigned char *buf) {
+    if (param->ho + param->hl <= HB_SIZE)
+        /* Circular history buffer does not wrap - copy one chunk */
+        memcpy(buf, history + param->ho, param->hl);
+    else {
+        /* Circular history buffer wraps - copy two chunks */
+        memcpy(buf, history + param->ho, HB_SIZE - param->ho);
+        memcpy(buf + HB_SIZE - param->ho, history, param->ho + param->hl - HB_SIZE);
+    }
+}
--- a/deps/zlib-ng/arch/s390/dfltcc_inflate.c
+++ b/deps/zlib-ng/arch/s390/dfltcc_inflate.c
@@ -0,0 +1,205 @@
+/* dfltcc_inflate.c - IBM Z DEFLATE CONVERSION CALL decompression support. */
+
+/*
+   Use the following commands to build zlib-ng with DFLTCC decompression support:
+
+        $ ./configure --with-dfltcc-inflate
+   or
+
+        $ cmake -DWITH_DFLTCC_INFLATE=1 .
+
+   and then
+
+        $ make
+*/
+
+#include "zbuild.h"
+#include "zutil.h"
+#include "inftrees.h"
+#include "inflate.h"
+#include "dfltcc_inflate.h"
+#include "dfltcc_detail.h"
+
+struct inflate_state Z_INTERNAL *PREFIX(dfltcc_alloc_inflate_state)(PREFIX3(streamp) strm) {
+    return (struct inflate_state *)dfltcc_alloc_state(strm, sizeof(struct inflate_state), sizeof(struct dfltcc_state));
+}
+
+void Z_INTERNAL PREFIX(dfltcc_reset_inflate_state)(PREFIX3(streamp) strm) {
+    struct inflate_state *state = (struct inflate_state *)strm->state;
+    struct dfltcc_state *dfltcc_state = GET_DFLTCC_STATE(state);
+
+    dfltcc_reset_state(dfltcc_state);
+}
+
+void Z_INTERNAL PREFIX(dfltcc_copy_inflate_state)(struct inflate_state *dst, const struct inflate_state *src) {
+    dfltcc_copy_state(dst, src, sizeof(struct inflate_state), sizeof(struct dfltcc_state));
+}
+
+int Z_INTERNAL PREFIX(dfltcc_can_inflate)(PREFIX3(streamp) strm) {
+    struct inflate_state *state = (struct inflate_state *)strm->state;
+    struct dfltcc_state *dfltcc_state = GET_DFLTCC_STATE(state);
+
+    /* Unsupported hardware */
+    return is_bit_set(dfltcc_state->af.fns, DFLTCC_XPND) && is_bit_set(dfltcc_state->af.fmts, DFLTCC_FMT0);
+}
+
+static inline dfltcc_cc dfltcc_xpnd(PREFIX3(streamp) strm) {
+    struct inflate_state *state = (struct inflate_state *)strm->state;
+    struct dfltcc_param_v0 *param = &GET_DFLTCC_STATE(state)->param;
+    size_t avail_in = strm->avail_in;
+    size_t avail_out = strm->avail_out;
+    dfltcc_cc cc;
+
+    cc = dfltcc(DFLTCC_XPND | HBT_CIRCULAR,
+                param, &strm->next_out, &avail_out,
+                &strm->next_in, &avail_in, state->window);
+    strm->avail_in = avail_in;
+    strm->avail_out = avail_out;
+    return cc;
+}
+
+dfltcc_inflate_action Z_INTERNAL PREFIX(dfltcc_inflate)(PREFIX3(streamp) strm, int flush, int *ret) {
+    struct inflate_state *state = (struct inflate_state *)strm->state;
+    struct dfltcc_state *dfltcc_state = GET_DFLTCC_STATE(state);
+    struct dfltcc_param_v0 *param = &dfltcc_state->param;
+    dfltcc_cc cc;
+
+    if (flush == Z_BLOCK || flush == Z_TREES) {
+        /* DFLTCC does not support stopping on block boundaries */
+        if (PREFIX(dfltcc_inflate_disable)(strm)) {
+            *ret = Z_STREAM_ERROR;
+            return DFLTCC_INFLATE_BREAK;
+        } else
+            return DFLTCC_INFLATE_SOFTWARE;
+    }
+
+    if (state->last) {
+        if (state->bits != 0) {
+            strm->next_in++;
+            strm->avail_in--;
+            state->bits = 0;
+        }
+        state->mode = CHECK;
+        return DFLTCC_INFLATE_CONTINUE;
+    }
+
+    if (strm->avail_in == 0 && !param->cf)
+        return DFLTCC_INFLATE_BREAK;
+
+    if (PREFIX(inflate_ensure_window)(state)) {
+        state->mode = MEM;
+        return DFLTCC_INFLATE_CONTINUE;
+    }
+
+    /* Translate stream to parameter block */
+    param->cvt = ((state->wrap & 4) && state->flags) ? CVT_CRC32 : CVT_ADLER32;
+    param->sbb = state->bits;
+    if (param->hl)
+        param->nt = 0; /* Honor history for the first block */
+    if (state->wrap & 4)
+        param->cv = state->flags ? ZSWAP32(state->check) : state->check;
+
+    /* Inflate */
+    do {
+        cc = dfltcc_xpnd(strm);
+    } while (cc == DFLTCC_CC_AGAIN);
+
+    /* Translate parameter block to stream */
+    strm->msg = oesc_msg(dfltcc_state->msg, param->oesc);
+    state->last = cc == DFLTCC_CC_OK;
+    state->bits = param->sbb;
+    if (state->wrap & 4)
+        strm->adler = state->check = state->flags ? ZSWAP32(param->cv) : param->cv;
+    if (cc == DFLTCC_CC_OP2_CORRUPT && param->oesc != 0) {
+        /* Report an error if stream is corrupted */
+        state->mode = BAD;
+        return DFLTCC_INFLATE_CONTINUE;
+    }
+    state->mode = TYPEDO;
+    /* Break if operands are exhausted, otherwise continue looping */
+    return (cc == DFLTCC_CC_OP1_TOO_SHORT || cc == DFLTCC_CC_OP2_TOO_SHORT) ?
+        DFLTCC_INFLATE_BREAK : DFLTCC_INFLATE_CONTINUE;
+}
+
+int Z_INTERNAL PREFIX(dfltcc_was_inflate_used)(PREFIX3(streamp) strm) {
+    struct inflate_state *state = (struct inflate_state *)strm->state;
+    struct dfltcc_param_v0 *param = &GET_DFLTCC_STATE(state)->param;
+
+    return !param->nt;
+}
+
+/*
+   Rotates a circular buffer.
+   The implementation is based on https://cplusplus.com/reference/algorithm/rotate/
+ */
+static void rotate(unsigned char *start, unsigned char *pivot, unsigned char *end) {
+    unsigned char *p = pivot;
+    unsigned char tmp;
+
+    while (p != start) {
+        tmp = *start;
+        *start = *p;
+        *p = tmp;
+
+        start++;
+        p++;
+
+        if (p == end)
+            p = pivot;
+        else if (start == pivot)
+            pivot = p;
+    }
+}
+
+int Z_INTERNAL PREFIX(dfltcc_inflate_disable)(PREFIX3(streamp) strm) {
+    struct inflate_state *state = (struct inflate_state *)strm->state;
+    struct dfltcc_state *dfltcc_state = GET_DFLTCC_STATE(state);
+    struct dfltcc_param_v0 *param = &dfltcc_state->param;
+
+    if (!PREFIX(dfltcc_can_inflate)(strm))
+        return 0;
+    if (PREFIX(dfltcc_was_inflate_used)(strm))
+        /* DFLTCC has already decompressed some data. Since there is not
+         * enough information to resume decompression in software, the call
+         * must fail.
+         */
+        return 1;
+    /* DFLTCC was not used yet - decompress in software */
+    memset(&dfltcc_state->af, 0, sizeof(dfltcc_state->af));
+    /* Convert the window from the hardware to the software format */
+    rotate(state->window, state->window + param->ho, state->window + HB_SIZE);
+    state->whave = state->wnext = MIN(param->hl, state->wsize);
+    return 0;
+}
+
+/*
+   Preloading history.
+*/
+int Z_INTERNAL PREFIX(dfltcc_inflate_set_dictionary)(PREFIX3(streamp) strm,
+                                                     const unsigned char *dictionary, uInt dict_length) {
+    struct inflate_state *state = (struct inflate_state *)strm->state;
+    struct dfltcc_state *dfltcc_state = GET_DFLTCC_STATE(state);
+    struct dfltcc_param_v0 *param = &dfltcc_state->param;
+
+    if (PREFIX(inflate_ensure_window)(state)) {
+        state->mode = MEM;
+        return Z_MEM_ERROR;
+    }
+
+    append_history(param, state->window, dictionary, dict_length);
+    state->havedict = 1;
+    return Z_OK;
+}
+
+int Z_INTERNAL PREFIX(dfltcc_inflate_get_dictionary)(PREFIX3(streamp) strm,
+                                                     unsigned char *dictionary, uInt *dict_length) {
+    struct inflate_state *state = (struct inflate_state *)strm->state;
+    struct dfltcc_state *dfltcc_state = GET_DFLTCC_STATE(state);
+    struct dfltcc_param_v0 *param = &dfltcc_state->param;
+
+    if (dictionary && state->window)
+        get_history(param, state->window, dictionary);
+    if (dict_length)
+        *dict_length = param->hl;
+    return Z_OK;
+}
--- a/deps/zlib-ng/arch/s390/dfltcc_inflate.h
+++ b/deps/zlib-ng/arch/s390/dfltcc_inflate.h
@@ -0,0 +1,70 @@
+#ifndef DFLTCC_INFLATE_H
+#define DFLTCC_INFLATE_H
+
+#include "dfltcc_common.h"
+
+struct inflate_state Z_INTERNAL *PREFIX(dfltcc_alloc_inflate_state)(PREFIX3(streamp) strm);
+void Z_INTERNAL PREFIX(dfltcc_reset_inflate_state)(PREFIX3(streamp) strm);
+void Z_INTERNAL PREFIX(dfltcc_copy_inflate_state)(struct inflate_state *dst, const struct inflate_state *src);
+int Z_INTERNAL PREFIX(dfltcc_can_inflate)(PREFIX3(streamp) strm);
+typedef enum {
+    DFLTCC_INFLATE_CONTINUE,
+    DFLTCC_INFLATE_BREAK,
+    DFLTCC_INFLATE_SOFTWARE,
+} dfltcc_inflate_action;
+dfltcc_inflate_action Z_INTERNAL PREFIX(dfltcc_inflate)(PREFIX3(streamp) strm, int flush, int *ret);
+int Z_INTERNAL PREFIX(dfltcc_was_inflate_used)(PREFIX3(streamp) strm);
+int Z_INTERNAL PREFIX(dfltcc_inflate_disable)(PREFIX3(streamp) strm);
+int Z_INTERNAL PREFIX(dfltcc_inflate_set_dictionary)(PREFIX3(streamp) strm,
+                                                     const unsigned char *dictionary, uInt dict_length);
+int Z_INTERNAL PREFIX(dfltcc_inflate_get_dictionary)(PREFIX3(streamp) strm,
+                                                     unsigned char *dictionary, uInt* dict_length);
+
+#define ZALLOC_INFLATE_STATE PREFIX(dfltcc_alloc_inflate_state)
+#define ZCOPY_INFLATE_STATE PREFIX(dfltcc_copy_inflate_state)
+
+#define INFLATE_RESET_KEEP_HOOK PREFIX(dfltcc_reset_inflate_state)
+
+#define INFLATE_PRIME_HOOK(strm, bits, value) \
+    do { if (PREFIX(dfltcc_inflate_disable)((strm))) return Z_STREAM_ERROR; } while (0)
+
+#define INFLATE_TYPEDO_HOOK(strm, flush) \
+    if (PREFIX(dfltcc_can_inflate)((strm))) { \
+        dfltcc_inflate_action action; \
+\
+        RESTORE(); \
+        action = PREFIX(dfltcc_inflate)((strm), (flush), &ret); \
+        LOAD(); \
+        if (action == DFLTCC_INFLATE_CONTINUE) \
+            break; \
+        else if (action == DFLTCC_INFLATE_BREAK) \
+            goto inf_leave; \
+    }
+
+#define INFLATE_NEED_CHECKSUM(strm) (!PREFIX(dfltcc_can_inflate)((strm)))
+
+#define INFLATE_NEED_UPDATEWINDOW(strm) (!PREFIX(dfltcc_can_inflate)((strm)))
+
+#define INFLATE_MARK_HOOK(strm) \
+    do { \
+        if (PREFIX(dfltcc_was_inflate_used)((strm))) return -(1L << 16); \
+    } while (0)
+
+#define INFLATE_SYNC_POINT_HOOK(strm) \
+    do { \
+        if (PREFIX(dfltcc_was_inflate_used)((strm))) return Z_STREAM_ERROR; \
+    } while (0)
+
+#define INFLATE_SET_DICTIONARY_HOOK(strm, dict, dict_len) \
+    do { \
+        if (PREFIX(dfltcc_can_inflate)((strm))) \
+            return PREFIX(dfltcc_inflate_set_dictionary)((strm), (dict), (dict_len)); \
+    } while (0)
+
+#define INFLATE_GET_DICTIONARY_HOOK(strm, dict, dict_len) \
+    do { \
+        if (PREFIX(dfltcc_can_inflate)((strm))) \
+            return PREFIX(dfltcc_inflate_get_dictionary)((strm), (dict), (dict_len)); \
+    } while (0)
+
+#endif
--- a/deps/zlib-ng/arch/s390/s390_features.c
+++ b/deps/zlib-ng/arch/s390/s390_features.c
@@ -0,0 +1,14 @@
+#include "../../zbuild.h"
+#include "s390_features.h"
+
+#ifdef HAVE_SYS_AUXV_H
+#  include <sys/auxv.h>
+#endif
+
+#ifndef HWCAP_S390_VXRS
+#define HWCAP_S390_VXRS HWCAP_S390_VX
+#endif
+
+void Z_INTERNAL s390_check_features(struct s390_cpu_features *features) {
+    features->has_vx = getauxval(AT_HWCAP) & HWCAP_S390_VXRS;
+}
--- a/deps/zlib-ng/arch/s390/s390_features.h
+++ b/deps/zlib-ng/arch/s390/s390_features.h
@@ -0,0 +1,10 @@
+#ifndef S390_FEATURES_H_
+#define S390_FEATURES_H_
+
+struct s390_cpu_features {
+    int has_vx;
+};
+
+void Z_INTERNAL s390_check_features(struct s390_cpu_features *features);
+
+#endif
--- a/deps/zlib-ng/arch/s390/self-hosted-builder/actions-runner.Dockerfile
+++ b/deps/zlib-ng/arch/s390/self-hosted-builder/actions-runner.Dockerfile
@@ -0,0 +1,45 @@
+# Self-Hosted IBM Z Github Actions Runner.
+
+# Temporary image: amd64 dependencies.
+FROM amd64/ubuntu:20.04 as ld-prefix
+ENV DEBIAN_FRONTEND=noninteractive
+RUN apt-get update && apt-get -y install ca-certificates libicu66 libssl1.1
+
+# Main image.
+FROM s390x/ubuntu:20.04
+
+# Packages for zlib-ng testing.
+ENV DEBIAN_FRONTEND=noninteractive
+RUN apt-get update && apt-get -y install \
+        clang-11 \
+        cmake \
+        curl \
+        gcc \
+        git \
+        jq \
+        libxml2-dev \
+        libxslt-dev \
+        llvm-11-tools \
+        ninja-build \
+        python-is-python3 \
+        python3 \
+        python3-dev \
+        python3-pip
+
+# amd64 dependencies.
+COPY --from=ld-prefix / /usr/x86_64-linux-gnu/
+RUN ln -fs ../lib/x86_64-linux-gnu/ld-linux-x86-64.so.2 /usr/x86_64-linux-gnu/lib64/
+RUN ln -fs /etc/resolv.conf /usr/x86_64-linux-gnu/etc/
+ENV QEMU_LD_PREFIX=/usr/x86_64-linux-gnu
+
+# amd64 Github Actions Runner.
+RUN useradd -m actions-runner
+USER actions-runner
+WORKDIR /home/actions-runner
+RUN curl -L https://github.com/actions/runner/releases/download/v2.287.1/actions-runner-linux-x64-2.287.1.tar.gz | tar -xz
+VOLUME /home/actions-runner
+
+# Scripts.
+COPY fs/ /
+ENTRYPOINT ["/usr/bin/entrypoint"]
+CMD ["/usr/bin/actions-runner"]
--- a/deps/zlib-ng/arch/s390/self-hosted-builder/actions-runner.service
+++ b/deps/zlib-ng/arch/s390/self-hosted-builder/actions-runner.service
@@ -0,0 +1,24 @@
+[Unit]
+Description=Self-Hosted IBM Z Github Actions Runner
+Wants=qemu-user-static
+After=qemu-user-static
+StartLimitIntervalSec=0
+
+[Service]
+Type=simple
+Restart=always
+ExecStartPre=-/usr/bin/docker rm --force actions-runner
+ExecStart=/usr/bin/docker run \
+              --env-file=/etc/actions-runner \
+              --init \
+              --interactive \
+              --name=actions-runner \
+              --rm \
+              --volume=actions-runner:/home/actions-runner \
+              iiilinuxibmcom/actions-runner
+ExecStop=/bin/sh -c "docker exec actions-runner kill -INT -- -1"
+ExecStop=/bin/sh -c "docker wait actions-runner"
+ExecStop=/bin/sh -c "docker rm actions-runner"
+
+[Install]
+WantedBy=multi-user.target
--- a/deps/zlib-ng/arch/s390/self-hosted-builder/fs/usr/bin/actions-runner
+++ b/deps/zlib-ng/arch/s390/self-hosted-builder/fs/usr/bin/actions-runner
@@ -0,0 +1,40 @@
+#!/bin/bash
+
+#
+# Ephemeral runner startup script.
+#
+# Expects the following environment variables:
+#
+# - repo=<owner>/<name>
+# - access_token=<ghp_***>
+#
+
+set -e -u
+
+# Check the cached registration token.
+token_file=registration-token.json
+set +e
+expires_at=$(jq --raw-output .expires_at "$token_file" 2>/dev/null)
+status=$?
+set -e
+if [[ $status -ne 0 || $(date +%s) -ge $(date -d "$expires_at" +%s) ]]; then
+    # Refresh the cached registration token.
+    curl \
+        -X POST \
+        -H "Accept: application/vnd.github.v3+json" \
+        -H "Authorization: token $access_token" \
+        "https://api.github.com/repos/$repo/actions/runners/registration-token" \
+        -o "$token_file"
+fi
+
+# (Re-)register the runner.
+registration_token=$(jq --raw-output .token "$token_file")
+./config.sh remove --token "$registration_token" || true
+./config.sh \
+    --url "https://github.com/$repo" \
+    --token "$registration_token" \
+    --labels z15 \
+    --ephemeral
+
+# Run one job.
+./run.sh
--- a/deps/zlib-ng/arch/s390/self-hosted-builder/fs/usr/bin/entrypoint
+++ b/deps/zlib-ng/arch/s390/self-hosted-builder/fs/usr/bin/entrypoint
@@ -0,0 +1,30 @@
+#!/bin/bash
+
+#
+# Container entrypoint that waits for all spawned processes.
+#
+
+set -e -u
+
+# Create a FIFO and start reading from its read end.
+tempdir=$(mktemp -d "/tmp/done.XXXXXXXXXX")
+trap 'rm -r "$tempdir"' EXIT
+done="$tempdir/pipe"
+mkfifo "$done"
+cat "$done" & waiter=$!
+
+# Start the workload. Its descendants will inherit the FIFO's write end.
+status=0
+if [ "$#" -eq 0 ]; then
+    bash 9>"$done" || status=$?
+else
+    "$@" 9>"$done" || status=$?
+fi
+
+# When the workload and all of its descendants exit, the FIFO's write end will
+# be closed and `cat "$done"` will exit. Wait until it happens. This is needed
+# in order to handle SelfUpdater, which the workload may start in background
+# before exiting.
+wait "$waiter"
+
+exit "$status"
--- a/deps/zlib-ng/arch/s390/self-hosted-builder/qemu-user-static.service
+++ b/deps/zlib-ng/arch/s390/self-hosted-builder/qemu-user-static.service
@@ -0,0 +1,11 @@
+[Unit]
+Description=Support for transparent execution of non-native binaries with QEMU user emulation
+
+[Service]
+Type=oneshot
+# The source code for iiilinuxibmcom/qemu-user-static is at https://github.com/iii-i/qemu-user-static/tree/v6.1.0-1
+# TODO: replace it with multiarch/qemu-user-static once version >6.1 is available
+ExecStart=/usr/bin/docker run --rm --interactive --privileged iiilinuxibmcom/qemu-user-static --reset -p yes
+
+[Install]
+WantedBy=multi-user.target
--- a/deps/zlib-ng/arch/x86/Makefile.in
+++ b/deps/zlib-ng/arch/x86/Makefile.in
@@ -0,0 +1,147 @@
+# Makefile for zlib
+# Copyright (C) 1995-2013 Jean-loup Gailly, Mark Adler
+# For conditions of distribution and use, see copyright notice in zlib.h
+
+CC=
+CFLAGS=
+SFLAGS=
+INCLUDES=
+SUFFIX=
+
+AVX512FLAG=-mavx512f -mavx512dq -mavx512vl -mavx512bw
+AVX512VNNIFLAG=-mavx512vnni
+AVX2FLAG=-mavx2
+SSE2FLAG=-msse2
+SSSE3FLAG=-mssse3
+SSE42FLAG=-msse4.2
+PCLMULFLAG=-mpclmul
+VPCLMULFLAG=-mvpclmulqdq
+XSAVEFLAG=-mxsave
+NOLTOFLAG=
+
+SRCDIR=.
+SRCTOP=../..
+TOPDIR=$(SRCTOP)
+
+all: \
+	x86_features.o x86_features.lo \
+	adler32_avx2.o adler32_avx2.lo \
+	adler32_avx512.o adler32_avx512.lo \
+	adler32_avx512_vnni.o adler32_avx512_vnni.lo \
+	adler32_sse42.o adler32_sse42.lo \
+	adler32_ssse3.o adler32_ssse3.lo \
+	chunkset_avx2.o chunkset_avx2.lo \
+	chunkset_sse2.o chunkset_sse2.lo \
+	chunkset_ssse3.o chunkset_ssse3.lo \
+	compare256_avx2.o compare256_avx2.lo \
+	compare256_sse2.o compare256_sse2.lo \
+	insert_string_sse42.o insert_string_sse42.lo \
+	crc32_pclmulqdq.o crc32_pclmulqdq.lo \
+	crc32_vpclmulqdq.o crc32_vpclmulqdq.lo \
+	slide_hash_avx2.o slide_hash_avx2.lo \
+	slide_hash_sse2.o slide_hash_sse2.lo
+
+x86_features.o:
+	$(CC) $(CFLAGS) $(XSAVEFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/x86_features.c
+
+x86_features.lo:
+	$(CC) $(SFLAGS) $(XSAVEFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/x86_features.c
+
+chunkset_avx2.o:
+	$(CC) $(CFLAGS) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_avx2.c
+
+chunkset_avx2.lo:
+	$(CC) $(SFLAGS) $(AVX2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_avx2.c
+
+chunkset_sse2.o:
+	$(CC) $(CFLAGS) $(SSE2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_sse2.c
+
+chunkset_sse2.lo:
+	$(CC) $(SFLAGS) $(SSE2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_sse2.c
+
+chunkset_ssse3.o:
+	$(CC) $(CFLAGS) $(SSSE3FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_ssse3.c
+
+chunkset_ssse3.lo:
+	$(CC) $(SFLAGS) $(SSSE3FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_ssse3.c
+
+compare256_avx2.o:
+	$(CC) $(CFLAGS) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_avx2.c
+
+compare256_avx2.lo:
+	$(CC) $(SFLAGS) $(AVX2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_avx2.c
+
+compare256_sse2.o:
+	$(CC) $(CFLAGS) $(SSE2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_sse2.c
+
+compare256_sse2.lo:
+	$(CC) $(SFLAGS) $(SSE2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_sse2.c
+
+insert_string_sse42.o:
+	$(CC) $(CFLAGS) $(SSE42FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_sse42.c
+
+insert_string_sse42.lo:
+	$(CC) $(SFLAGS) $(SSE42FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_sse42.c
+
+crc32_pclmulqdq.o:
+	$(CC) $(CFLAGS) $(PCLMULFLAG) $(SSE42FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_pclmulqdq.c
+
+crc32_pclmulqdq.lo:
+	$(CC) $(SFLAGS) $(PCLMULFLAG) $(SSE42FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_pclmulqdq.c
+
+crc32_vpclmulqdq.o:
+	$(CC) $(CFLAGS) $(PCLMULFLAG) $(SSE42FLAG) $(VPCLMULFLAG) $(AVX512FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_vpclmulqdq.c
+
+crc32_vpclmulqdq.lo:
+	$(CC) $(SFLAGS) $(PCLMULFLAG) $(SSE42FLAG) $(VPCLMULFLAG) $(AVX512FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_vpclmulqdq.c
+
+slide_hash_avx2.o:
+	$(CC) $(CFLAGS) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_avx2.c
+
+slide_hash_avx2.lo:
+	$(CC) $(SFLAGS) $(AVX2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_avx2.c
+
+slide_hash_sse2.o:
+	$(CC) $(CFLAGS) $(SSE2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_sse2.c
+
+slide_hash_sse2.lo:
+	$(CC) $(SFLAGS) $(SSE2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_sse2.c
+
+adler32_avx2.o: $(SRCDIR)/adler32_avx2.c
+	$(CC) $(CFLAGS) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx2.c
+
+adler32_avx2.lo: $(SRCDIR)/adler32_avx2.c
+	$(CC) $(SFLAGS) $(AVX2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx2.c
+
+adler32_avx512.o: $(SRCDIR)/adler32_avx512.c
+	$(CC) $(CFLAGS) $(AVX512FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx512.c
+
+adler32_avx512.lo: $(SRCDIR)/adler32_avx512.c
+	$(CC) $(SFLAGS) $(AVX512FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx512.c
+
+adler32_avx512_vnni.o: $(SRCDIR)/adler32_avx512_vnni.c
+	$(CC) $(CFLAGS) $(AVX512VNNIFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx512_vnni.c
+
+adler32_avx512_vnni.lo: $(SRCDIR)/adler32_avx512_vnni.c
+	$(CC) $(SFLAGS) $(AVX512VNNIFLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx512_vnni.c
+
+adler32_ssse3.o: $(SRCDIR)/adler32_ssse3.c
+	$(CC) $(CFLAGS) $(SSSE3FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_ssse3.c
+
+adler32_ssse3.lo: $(SRCDIR)/adler32_ssse3.c
+	$(CC) $(SFLAGS) $(SSSE3FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_ssse3.c
+
+adler32_sse42.o: $(SRCDIR)/adler32_sse42.c
+	$(CC) $(CFLAGS) $(SSE42FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_sse42.c
+
+adler32_sse42.lo: $(SRCDIR)/adler32_sse42.c
+	$(CC) $(SFLAGS) $(SSE42FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_sse42.c
+
+mostlyclean: clean
+clean:
+	rm -f *.o *.lo *~
+	rm -rf objs
+	rm -f *.gcda *.gcno *.gcov
+
+distclean: clean
+	rm -f Makefile
--- a/deps/zlib-ng/arch/x86/adler32_avx2.c
+++ b/deps/zlib-ng/arch/x86/adler32_avx2.c
@@ -0,0 +1,17 @@
+/* adler32_avx2.c -- compute the Adler-32 checksum of a data stream
+ * Copyright (C) 1995-2011 Mark Adler
+ * Authors:
+ *   Brian Bockelman <bockelman@gmail.com>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include <immintrin.h>
+
+#ifdef X86_AVX2
+
+#include "adler32_avx2_tpl.h"
+
+#define COPY
+#include "adler32_avx2_tpl.h"
+
+#endif
--- a/deps/zlib-ng/arch/x86/adler32_avx2_p.h
+++ b/deps/zlib-ng/arch/x86/adler32_avx2_p.h
@@ -0,0 +1,32 @@
+/* adler32_avx2_p.h -- adler32 avx2 utility functions
+ * Copyright (C) 2022 Adam Stylinski
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef ADLER32_AVX2_P_H_
+#define ADLER32_AVX2_P_H_
+
+#if defined(X86_AVX2) || defined(X86_AVX512VNNI)
+
+/* 32 bit horizontal sum, adapted from Agner Fog's vector library. */
+static inline uint32_t hsum256(__m256i x) {
+    __m128i sum1  = _mm_add_epi32(_mm256_extracti128_si256(x, 1),
+                                  _mm256_castsi256_si128(x));
+    __m128i sum2  = _mm_add_epi32(sum1, _mm_unpackhi_epi64(sum1, sum1));
+    __m128i sum3  = _mm_add_epi32(sum2, _mm_shuffle_epi32(sum2, 1));
+    return (uint32_t)_mm_cvtsi128_si32(sum3);
+}
+
+static inline uint32_t partial_hsum256(__m256i x) {
+    /* We need a permutation vector to extract every other integer. The
+     * rest are going to be zeros */
+    const __m256i perm_vec = _mm256_setr_epi32(0, 2, 4, 6, 1, 1, 1, 1);
+    __m256i non_zero = _mm256_permutevar8x32_epi32(x, perm_vec);
+    __m128i non_zero_sse = _mm256_castsi256_si128(non_zero);
+    __m128i sum2  = _mm_add_epi32(non_zero_sse,_mm_unpackhi_epi64(non_zero_sse, non_zero_sse));
+    __m128i sum3  = _mm_add_epi32(sum2, _mm_shuffle_epi32(sum2, 1));
+    return (uint32_t)_mm_cvtsi128_si32(sum3);
+}
+#endif
+
+#endif
--- a/deps/zlib-ng/arch/x86/adler32_avx2_tpl.h
+++ b/deps/zlib-ng/arch/x86/adler32_avx2_tpl.h
@@ -0,0 +1,141 @@
+/* adler32_avx2_tpl.h -- adler32 avx2 vectorized function templates
+ * Copyright (C) 2022 Adam Stylinski
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "../../zbuild.h"
+#include <immintrin.h>
+#include "../../adler32_fold.h"
+#include "../../adler32_p.h"
+#include "../../fallback_builtins.h"
+#include "adler32_avx2_p.h"
+
+#ifdef X86_SSE42
+extern uint32_t adler32_fold_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
+extern uint32_t adler32_ssse3(uint32_t adler, const uint8_t *src, size_t len);
+
+#define copy_sub32(a, b, c, d) adler32_fold_copy_sse42(a, b, c, d)
+#define sub32(a, b, c) adler32_ssse3(a, b, c)
+#else
+#define copy_sub32(a, b, c, d) adler32_copy_len_16(adler0, c, b, d, adler1)
+#define sub32(a, b, c) adler32_len_16(adler0, b, c, adler1)
+#endif
+
+#ifdef COPY
+Z_INTERNAL uint32_t adler32_fold_copy_avx2(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
+#else
+Z_INTERNAL uint32_t adler32_avx2(uint32_t adler, const uint8_t *src, size_t len) {
+#endif
+    if (src == NULL) return 1L;
+    if (len == 0) return adler;
+
+    uint32_t adler0, adler1;
+    adler1 = (adler >> 16) & 0xffff;
+    adler0 = adler & 0xffff;
+
+rem_peel:
+    if (len < 16) {
+#ifdef COPY
+        return adler32_copy_len_16(adler0, src, dst, len, adler1);
+#else
+        return adler32_len_16(adler0, src, len, adler1);
+#endif
+    } else if (len < 32) {
+#ifdef COPY
+        return copy_sub32(adler, dst, src, len);
+#else
+        return sub32(adler, src, len);
+#endif
+    }
+
+    __m256i vs1, vs2;
+
+    const __m256i dot2v = _mm256_setr_epi8(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15,
+                                           14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
+    const __m256i dot3v = _mm256_set1_epi16(1);
+    const __m256i zero = _mm256_setzero_si256();
+
+    while (len >= 32) {
+       vs1 = _mm256_zextsi128_si256(_mm_cvtsi32_si128(adler0));
+       vs2 = _mm256_zextsi128_si256(_mm_cvtsi32_si128(adler1));
+       __m256i vs1_0 = vs1;
+       __m256i vs3 = _mm256_setzero_si256();
+
+       size_t k = MIN(len, NMAX);
+       k -= k % 32;
+       len -= k;
+
+       while (k >= 32) {
+           /*
+              vs1 = adler + sum(c[i])
+              vs2 = sum2 + 32 vs1 + sum( (32-i+1) c[i] )
+           */
+           __m256i vbuf = _mm256_loadu_si256((__m256i*)src);
+           src += 32;
+           k -= 32;
+
+           __m256i vs1_sad = _mm256_sad_epu8(vbuf, zero); // Sum of abs diff, resulting in 2 x int32's
+                                                          //
+#ifdef COPY
+            _mm256_storeu_si256((__m256i*)dst, vbuf);
+            dst += 32;
+#endif
+           vs1 = _mm256_add_epi32(vs1, vs1_sad);
+           vs3 = _mm256_add_epi32(vs3, vs1_0);
+           __m256i v_short_sum2 = _mm256_maddubs_epi16(vbuf, dot2v); // sum 32 uint8s to 16 shorts
+           __m256i vsum2 = _mm256_madd_epi16(v_short_sum2, dot3v); // sum 16 shorts to 8 uint32s
+           vs2 = _mm256_add_epi32(vsum2, vs2);
+           vs1_0 = vs1;
+       }
+
+       /* Defer the multiplication with 32 to outside of the loop */
+       vs3 = _mm256_slli_epi32(vs3, 5);
+       vs2 = _mm256_add_epi32(vs2, vs3);
+
+       /* The compiler is generating the following sequence for this integer modulus
+        * when done the scalar way, in GPRs:
+
+        adler = (s1_unpack[0] % BASE) + (s1_unpack[1] % BASE) + (s1_unpack[2] % BASE) + (s1_unpack[3] % BASE) +
+                (s1_unpack[4] % BASE) + (s1_unpack[5] % BASE) + (s1_unpack[6] % BASE) + (s1_unpack[7] % BASE);
+
+        mov    $0x80078071,%edi // move magic constant into 32 bit register %edi
+        ...
+        vmovd  %xmm1,%esi // move vector lane 0 to 32 bit register %esi
+        mov    %rsi,%rax  // zero-extend this value to 64 bit precision in %rax
+        imul   %rdi,%rsi // do a signed multiplication with magic constant and vector element
+        shr    $0x2f,%rsi // shift right by 47
+        imul   $0xfff1,%esi,%esi // do a signed multiplication with value truncated to 32 bits with 0xfff1
+        sub    %esi,%eax // subtract lower 32 bits of original vector value from modified one above
+        ...
+        // repeats for each element with vpextract instructions
+
+        This is tricky with AVX2 for a number of reasons:
+            1.) There's no 64 bit multiplication instruction, but there is a sequence to get there
+            2.) There's ways to extend vectors to 64 bit precision, but no simple way to truncate
+                back down to 32 bit precision later (there is in AVX512)
+            3.) Full width integer multiplications aren't cheap
+
+        We can, however, and do a relatively cheap sequence for horizontal sums.
+        Then, we simply do the integer modulus on the resulting 64 bit GPR, on a scalar value. It was
+        previously thought that casting to 64 bit precision was needed prior to the horizontal sum, but
+        that is simply not the case, as NMAX is defined as the maximum number of scalar sums that can be
+        performed on the maximum possible inputs before overflow
+        */
+
+
+        /* In AVX2-land, this trip through GPRs will probably be unvoidable, as there's no cheap and easy
+         * conversion from 64 bit integer to 32 bit (needed for the inexpensive modulus with a constant).
+         * This casting to 32 bit is cheap through GPRs (just register aliasing). See above for exactly
+         * what the compiler is doing to avoid integer divisions. */
+        adler0 = partial_hsum256(vs1) % BASE;
+        adler1 = hsum256(vs2) % BASE;
+    }
+
+    adler = adler0 | (adler1 << 16);
+
+    if (len) {
+        goto rem_peel;
+    }
+
+    return adler;
+}
--- a/deps/zlib-ng/arch/x86/adler32_avx512.c
+++ b/deps/zlib-ng/arch/x86/adler32_avx512.c
@@ -0,0 +1,16 @@
+/* adler32_avx512.c -- compute the Adler-32 checksum of a data stream
+ * Copyright (C) 1995-2011 Mark Adler
+ * Authors:
+ *   Adam Stylinski <kungfujesus06@gmail.com>
+ *   Brian Bockelman <bockelman@gmail.com>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef X86_AVX512
+
+#include "adler32_avx512_tpl.h"
+
+#define COPY
+#include "adler32_avx512_tpl.h"
+
+#endif
--- a/deps/zlib-ng/arch/x86/adler32_avx512_p.h
+++ b/deps/zlib-ng/arch/x86/adler32_avx512_p.h
@@ -0,0 +1,46 @@
+#ifndef AVX512_FUNCS_H
+#define AVX512_FUNCS_H
+
+#include <immintrin.h>
+#include <stdint.h>
+/* Written because *_add_epi32(a) sets off ubsan */
+static inline uint32_t _mm512_reduce_add_epu32(__m512i x) {
+    __m256i a = _mm512_extracti64x4_epi64(x, 1);
+    __m256i b = _mm512_extracti64x4_epi64(x, 0);
+
+    __m256i a_plus_b = _mm256_add_epi32(a, b);
+    __m128i c = _mm256_extracti128_si256(a_plus_b, 1);
+    __m128i d = _mm256_extracti128_si256(a_plus_b, 0);
+    __m128i c_plus_d = _mm_add_epi32(c, d);
+
+    __m128i sum1 = _mm_unpackhi_epi64(c_plus_d, c_plus_d);
+    __m128i sum2 = _mm_add_epi32(sum1, c_plus_d);
+    __m128i sum3 = _mm_shuffle_epi32(sum2, 0x01);
+    __m128i sum4 = _mm_add_epi32(sum2, sum3);
+
+    return _mm_cvtsi128_si32(sum4);
+}
+
+static inline uint32_t partial_hsum(__m512i x) {
+    /* We need a permutation vector to extract every other integer. The
+     * rest are going to be zeros. Marking this const so the compiler stands
+     * a better chance of keeping this resident in a register through entire
+     * loop execution. We certainly have enough zmm registers (32) */
+    const __m512i perm_vec = _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14,
+                                               1, 1, 1, 1, 1,  1,  1,  1);
+
+    __m512i non_zero = _mm512_permutexvar_epi32(perm_vec, x);
+
+    /* From here, it's a simple 256 bit wide reduction sum */
+    __m256i non_zero_avx = _mm512_castsi512_si256(non_zero);
+
+    /* See Agner Fog's vectorclass for a decent reference. Essentially, phadd is
+     * pretty slow, much slower than the longer instruction sequence below */
+    __m128i sum1  = _mm_add_epi32(_mm256_extracti128_si256(non_zero_avx, 1),
+                                  _mm256_castsi256_si128(non_zero_avx));
+    __m128i sum2  = _mm_add_epi32(sum1,_mm_unpackhi_epi64(sum1, sum1));
+    __m128i sum3  = _mm_add_epi32(sum2,_mm_shuffle_epi32(sum2, 1));
+    return (uint32_t)_mm_cvtsi128_si32(sum3);
+}
+
+#endif
--- a/deps/zlib-ng/arch/x86/adler32_avx512_tpl.h
+++ b/deps/zlib-ng/arch/x86/adler32_avx512_tpl.h
@@ -0,0 +1,106 @@
+/* adler32_avx512_tpl.h -- adler32 avx512 vectorized function templates
+ * Copyright (C) 2022 Adam Stylinski
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "../../zbuild.h"
+#include "../../adler32_p.h"
+#include "../../adler32_fold.h"
+#include "../../cpu_features.h"
+#include "../../fallback_builtins.h"
+#include <immintrin.h>
+#include "adler32_avx512_p.h"
+
+#ifdef X86_AVX512
+
+#ifdef COPY
+Z_INTERNAL uint32_t adler32_fold_copy_avx512(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
+#else
+Z_INTERNAL uint32_t adler32_avx512(uint32_t adler, const uint8_t *src, size_t len) {
+#endif
+
+    if (src == NULL) return 1L;
+    if (len == 0) return adler;
+
+    uint32_t adler0, adler1;
+    adler1 = (adler >> 16) & 0xffff;
+    adler0 = adler & 0xffff;
+
+rem_peel:
+    if (len < 64) {
+        /* This handles the remaining copies, just call normal adler checksum after this */
+#ifdef COPY
+        __mmask64 storemask = (0xFFFFFFFFFFFFFFFFUL >> (64 - len));
+        __m512i copy_vec = _mm512_maskz_loadu_epi8(storemask, src);
+        _mm512_mask_storeu_epi8(dst, storemask, copy_vec);
+#endif
+
+#ifdef X86_AVX2
+        return adler32_avx2(adler, src, len);
+#elif defined(X86_SSSE3)
+        return adler32_ssse3(adler, src, len);
+#else
+        return adler32_len_16(adler0, src, len, adler1);
+#endif
+    }
+
+    __m512i vbuf, vs1_0, vs3;
+
+    const __m512i dot2v = _mm512_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+                                          20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37,
+                                          38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55,
+                                          56, 57, 58, 59, 60, 61, 62, 63, 64);
+    const __m512i dot3v = _mm512_set1_epi16(1);
+    const __m512i zero = _mm512_setzero_si512();
+    size_t k;
+
+    while (len >= 64) {
+        __m512i vs1 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(adler0));
+        __m512i vs2 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(adler1));
+        vs1_0 = vs1;
+        vs3 = _mm512_setzero_si512();
+
+        k = MIN(len, NMAX);
+        k -= k % 64;
+        len -= k;
+
+        while (k >= 64) {
+            /*
+               vs1 = adler + sum(c[i])
+               vs2 = sum2 + 64 vs1 + sum( (64-i+1) c[i] )
+            */
+            vbuf = _mm512_loadu_si512(src);
+#ifdef COPY
+            _mm512_storeu_si512(dst, vbuf);
+            dst += 64;
+#endif
+            src += 64;
+            k -= 64;
+
+            __m512i vs1_sad = _mm512_sad_epu8(vbuf, zero);
+            __m512i v_short_sum2 = _mm512_maddubs_epi16(vbuf, dot2v);
+            vs1 = _mm512_add_epi32(vs1_sad, vs1);
+            vs3 = _mm512_add_epi32(vs3, vs1_0);
+            __m512i vsum2 = _mm512_madd_epi16(v_short_sum2, dot3v);
+            vs2 = _mm512_add_epi32(vsum2, vs2);
+            vs1_0 = vs1;
+        }
+
+        vs3 = _mm512_slli_epi32(vs3, 6);
+        vs2 = _mm512_add_epi32(vs2, vs3);
+
+        adler0 = partial_hsum(vs1) % BASE;
+        adler1 = _mm512_reduce_add_epu32(vs2) % BASE;
+    }
+
+    adler = adler0 | (adler1 << 16);
+
+    /* Process tail (len < 64). */
+    if (len) {
+        goto rem_peel;
+    }
+
+    return adler;
+}
+
+#endif
--- a/deps/zlib-ng/arch/x86/adler32_avx512_vnni.c
+++ b/deps/zlib-ng/arch/x86/adler32_avx512_vnni.c
@@ -0,0 +1,225 @@
+/* adler32_avx512_vnni.c -- compute the Adler-32 checksum of a data stream
+ * Based on Brian Bockelman's AVX2 version
+ * Copyright (C) 1995-2011 Mark Adler
+ * Authors:
+ *   Adam Stylinski <kungfujesus06@gmail.com>
+ *   Brian Bockelman <bockelman@gmail.com>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef X86_AVX512VNNI
+
+#include "../../zbuild.h"
+#include "../../adler32_p.h"
+#include "../../cpu_features.h"
+#include "../../fallback_builtins.h"
+#include <immintrin.h>
+#include "../../adler32_fold.h"
+#include "adler32_avx512_p.h"
+#include "adler32_avx2_p.h"
+
+Z_INTERNAL uint32_t adler32_avx512_vnni(uint32_t adler, const uint8_t *src, size_t len) {
+    if (src == NULL) return 1L;
+    if (len == 0) return adler;
+
+    uint32_t adler0, adler1;
+    adler1 = (adler >> 16) & 0xffff;
+    adler0 = adler & 0xffff;
+
+rem_peel:
+    if (len < 32)
+#if defined(X86_SSSE3)
+        return adler32_ssse3(adler, src, len);
+#else
+        return adler32_len_16(adler0, src, len, adler1);
+#endif
+
+    if (len < 64)
+#ifdef X86_AVX2
+        return adler32_avx2(adler, src, len);
+#elif defined(X86_SSE3)
+        return adler32_ssse3(adler, src, len);
+#else
+        return adler32_len_16(adler0, src, len, adler1);
+#endif
+
+    const __m512i dot2v = _mm512_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+                                          20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37,
+                                          38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55,
+                                          56, 57, 58, 59, 60, 61, 62, 63, 64);
+
+    const __m512i zero = _mm512_setzero_si512();
+    __m512i vs1, vs2;
+
+    while (len >= 64) {
+        vs1 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(adler0));
+        vs2 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(adler1));
+        size_t k = MIN(len, NMAX);
+        k -= k % 64;
+        len -= k;
+        __m512i vs1_0 = vs1;
+        __m512i vs3 = _mm512_setzero_si512();
+        /* We might get a tad bit more ILP here if we sum to a second register in the loop */
+        __m512i vs2_1 = _mm512_setzero_si512();
+        __m512i vbuf0, vbuf1;
+
+        /* Remainder peeling */
+        if (k % 128) {
+            vbuf1 = _mm512_loadu_si512((__m512i*)src);
+
+            src += 64;
+            k -= 64;
+
+            __m512i vs1_sad = _mm512_sad_epu8(vbuf1, zero);
+            vs1 = _mm512_add_epi32(vs1, vs1_sad);
+            vs3 = _mm512_add_epi32(vs3, vs1_0);
+            vs2 = _mm512_dpbusd_epi32(vs2, vbuf1, dot2v);
+            vs1_0 = vs1;
+        }
+
+        /* Manually unrolled this loop by 2 for an decent amount of ILP */
+        while (k >= 128) {
+            /*
+               vs1 = adler + sum(c[i])
+               vs2 = sum2 + 64 vs1 + sum( (64-i+1) c[i] )
+            */
+            vbuf0 = _mm512_loadu_si512((__m512i*)src);
+            vbuf1 = _mm512_loadu_si512((__m512i*)(src + 64));
+            src += 128;
+            k -= 128;
+
+            __m512i vs1_sad = _mm512_sad_epu8(vbuf0, zero);
+            vs1 = _mm512_add_epi32(vs1, vs1_sad);
+            vs3 = _mm512_add_epi32(vs3, vs1_0);
+            /* multiply-add, resulting in 16 ints. Fuse with sum stage from prior versions, as we now have the dp
+             * instructions to eliminate them */
+            vs2 = _mm512_dpbusd_epi32(vs2, vbuf0, dot2v);
+
+            vs3 = _mm512_add_epi32(vs3, vs1);
+            vs1_sad = _mm512_sad_epu8(vbuf1, zero);
+            vs1 = _mm512_add_epi32(vs1, vs1_sad);
+            vs2_1 = _mm512_dpbusd_epi32(vs2_1, vbuf1, dot2v);
+            vs1_0 = vs1;
+        }
+
+        vs3 = _mm512_slli_epi32(vs3, 6);
+        vs2 = _mm512_add_epi32(vs2, vs3);
+        vs2 = _mm512_add_epi32(vs2, vs2_1);
+
+        adler0 = partial_hsum(vs1) % BASE;
+        adler1 = _mm512_reduce_add_epu32(vs2) % BASE;
+    }
+
+    adler = adler0 | (adler1 << 16);
+
+    /* Process tail (len < 64). */
+    if (len) {
+        goto rem_peel;
+    }
+
+    return adler;
+}
+
+Z_INTERNAL uint32_t adler32_fold_copy_avx512_vnni(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
+    if (src == NULL) return 1L;
+    if (len == 0) return adler;
+
+    uint32_t adler0, adler1;
+    adler1 = (adler >> 16) & 0xffff;
+    adler0 = adler & 0xffff;
+
+rem_peel_copy:
+    if (len < 32) {
+        /* This handles the remaining copies, just call normal adler checksum after this */
+        __mmask32 storemask = (0xFFFFFFFFUL >> (32 - len));
+        __m256i copy_vec = _mm256_maskz_loadu_epi8(storemask, src);
+        _mm256_mask_storeu_epi8(dst, storemask, copy_vec);
+
+#if defined(X86_SSSE3)
+        return adler32_ssse3(adler, src, len);
+#else
+        return adler32_len_16(adler0, src, len, adler1);
+#endif
+    }
+
+    const __m256i dot2v = _mm256_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+                                          20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32);
+
+    const __m256i zero = _mm256_setzero_si256();
+    __m256i vs1, vs2;
+
+    while (len >= 32) {
+        vs1 = _mm256_zextsi128_si256(_mm_cvtsi32_si128(adler0));
+        vs2 = _mm256_zextsi128_si256(_mm_cvtsi32_si128(adler1));
+        size_t k = MIN(len, NMAX);
+        k -= k % 32;
+        len -= k;
+        __m256i vs1_0 = vs1;
+        __m256i vs3 = _mm256_setzero_si256();
+        /* We might get a tad bit more ILP here if we sum to a second register in the loop */
+        __m256i vs2_1 = _mm256_setzero_si256();
+        __m256i vbuf0, vbuf1;
+
+        /* Remainder peeling */
+        if (k % 64) {
+            vbuf1 = _mm256_loadu_si256((__m256i*)src);
+            _mm256_storeu_si256((__m256i*)dst, vbuf1);
+            dst += 32;
+
+            src += 32;
+            k -= 32;
+
+            __m256i vs1_sad = _mm256_sad_epu8(vbuf1, zero);
+            vs1 = _mm256_add_epi32(vs1, vs1_sad);
+            vs3 = _mm256_add_epi32(vs3, vs1_0);
+            vs2 = _mm256_dpbusd_epi32(vs2, vbuf1, dot2v);
+            vs1_0 = vs1;
+        }
+
+        /* Manually unrolled this loop by 2 for an decent amount of ILP */
+        while (k >= 64) {
+            /*
+               vs1 = adler + sum(c[i])
+               vs2 = sum2 + 64 vs1 + sum( (64-i+1) c[i] )
+            */
+            vbuf0 = _mm256_loadu_si256((__m256i*)src);
+            vbuf1 = _mm256_loadu_si256((__m256i*)(src + 32));
+            _mm256_storeu_si256((__m256i*)dst, vbuf0);
+            _mm256_storeu_si256((__m256i*)(dst + 32), vbuf1);
+            dst += 64;
+            src += 64;
+            k -= 64;
+
+            __m256i vs1_sad = _mm256_sad_epu8(vbuf0, zero);
+            vs1 = _mm256_add_epi32(vs1, vs1_sad);
+            vs3 = _mm256_add_epi32(vs3, vs1_0);
+            /* multiply-add, resulting in 16 ints. Fuse with sum stage from prior versions, as we now have the dp
+             * instructions to eliminate them */
+            vs2 = _mm256_dpbusd_epi32(vs2, vbuf0, dot2v);
+
+            vs3 = _mm256_add_epi32(vs3, vs1);
+            vs1_sad = _mm256_sad_epu8(vbuf1, zero);
+            vs1 = _mm256_add_epi32(vs1, vs1_sad);
+            vs2_1 = _mm256_dpbusd_epi32(vs2_1, vbuf1, dot2v);
+            vs1_0 = vs1;
+        }
+
+        vs3 = _mm256_slli_epi32(vs3, 5);
+        vs2 = _mm256_add_epi32(vs2, vs3);
+        vs2 = _mm256_add_epi32(vs2, vs2_1);
+
+        adler0 = partial_hsum256(vs1) % BASE;
+        adler1 = hsum256(vs2) % BASE;
+    }
+
+    adler = adler0 | (adler1 << 16);
+
+    /* Process tail (len < 64). */
+    if (len) {
+        goto rem_peel_copy;
+    }
+
+    return adler;
+}
+
+#endif
--- a/deps/zlib-ng/arch/x86/adler32_sse42.c
+++ b/deps/zlib-ng/arch/x86/adler32_sse42.c
@@ -0,0 +1,121 @@
+/* adler32_sse42.c -- compute the Adler-32 checksum of a data stream
+ * Copyright (C) 1995-2011 Mark Adler
+ * Authors:
+ *   Adam Stylinski <kungfujesus06@gmail.com>
+ *   Brian Bockelman <bockelman@gmail.com>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "../../zbuild.h"
+#include "../../adler32_p.h"
+#include "../../adler32_fold.h"
+#include "adler32_ssse3_p.h"
+#include <immintrin.h>
+
+#ifdef X86_SSE42
+
+Z_INTERNAL uint32_t adler32_fold_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
+    uint32_t adler0, adler1;
+    adler1 = (adler >> 16) & 0xffff;
+    adler0 = adler & 0xffff;
+
+rem_peel:
+    if (len < 16) {
+       return adler32_copy_len_16(adler0, src, dst, len, adler1);
+    }
+
+    __m128i vbuf, vbuf_0;
+    __m128i vs1_0, vs3, vs1, vs2, vs2_0, v_sad_sum1, v_short_sum2, v_short_sum2_0,
+            v_sad_sum2, vsum2, vsum2_0;
+    __m128i zero = _mm_setzero_si128();
+    const __m128i dot2v = _mm_setr_epi8(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17);
+    const __m128i dot2v_0 = _mm_setr_epi8(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
+    const __m128i dot3v = _mm_set1_epi16(1);
+    size_t k;
+
+    while (len >= 16) {
+
+        k = MIN(len, NMAX);
+        k -= k % 16;
+        len -= k;
+
+        vs1 = _mm_cvtsi32_si128(adler0);
+        vs2 = _mm_cvtsi32_si128(adler1);
+
+        vs3 = _mm_setzero_si128();
+        vs2_0 = _mm_setzero_si128();
+        vs1_0 = vs1;
+
+        while (k >= 32) {
+            /*
+               vs1 = adler + sum(c[i])
+               vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] )
+            */
+            vbuf = _mm_loadu_si128((__m128i*)src);
+            vbuf_0 = _mm_loadu_si128((__m128i*)(src + 16));
+            src += 32;
+            k -= 32;
+
+            v_sad_sum1 = _mm_sad_epu8(vbuf, zero);
+            v_sad_sum2 = _mm_sad_epu8(vbuf_0, zero);
+            _mm_storeu_si128((__m128i*)dst, vbuf);
+            _mm_storeu_si128((__m128i*)(dst + 16), vbuf_0);
+            dst += 32;
+
+            v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v);
+            v_short_sum2_0 = _mm_maddubs_epi16(vbuf_0, dot2v_0);
+
+            vs1 = _mm_add_epi32(v_sad_sum1, vs1);
+            vs3 = _mm_add_epi32(vs1_0, vs3);
+
+            vsum2 = _mm_madd_epi16(v_short_sum2, dot3v);
+            vsum2_0 = _mm_madd_epi16(v_short_sum2_0, dot3v);
+            vs1 = _mm_add_epi32(v_sad_sum2, vs1);
+            vs2 = _mm_add_epi32(vsum2, vs2);
+            vs2_0 = _mm_add_epi32(vsum2_0, vs2_0);
+            vs1_0 = vs1;
+        }
+
+        vs2 = _mm_add_epi32(vs2_0, vs2);
+        vs3 = _mm_slli_epi32(vs3, 5);
+        vs2 = _mm_add_epi32(vs3, vs2);
+        vs3 = _mm_setzero_si128();
+
+        while (k >= 16) {
+            /*
+               vs1 = adler + sum(c[i])
+               vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] )
+            */
+            vbuf = _mm_loadu_si128((__m128i*)src);
+            src += 16;
+            k -= 16;
+
+            v_sad_sum1 = _mm_sad_epu8(vbuf, zero);
+            v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v_0);
+
+            vs1 = _mm_add_epi32(v_sad_sum1, vs1);
+            vs3 = _mm_add_epi32(vs1_0, vs3);
+            vsum2 = _mm_madd_epi16(v_short_sum2, dot3v);
+            vs2 = _mm_add_epi32(vsum2, vs2);
+            vs1_0 = vs1;
+
+            _mm_storeu_si128((__m128i*)dst, vbuf);
+            dst += 16;
+        }
+
+        vs3 = _mm_slli_epi32(vs3, 4);
+        vs2 = _mm_add_epi32(vs2, vs3);
+
+        adler0 = partial_hsum(vs1) % BASE;
+        adler1 = hsum(vs2) % BASE;
+    }
+
+    /* If this is true, there's fewer than 16 elements remaining */
+    if (len) {
+        goto rem_peel;
+    }
+
+    return adler0 | (adler1 << 16);
+}
+
+#endif
--- a/deps/zlib-ng/arch/x86/adler32_ssse3.c
+++ b/deps/zlib-ng/arch/x86/adler32_ssse3.c
@@ -0,0 +1,156 @@
+/* adler32_ssse3.c -- compute the Adler-32 checksum of a data stream
+ * Copyright (C) 1995-2011 Mark Adler
+ * Authors:
+ *   Adam Stylinski <kungfujesus06@gmail.com>
+ *   Brian Bockelman <bockelman@gmail.com>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "../../zbuild.h"
+#include "../../adler32_p.h"
+#include "adler32_ssse3_p.h"
+
+#ifdef X86_SSSE3
+
+#include <immintrin.h>
+
+Z_INTERNAL uint32_t adler32_ssse3(uint32_t adler, const uint8_t *buf, size_t len) {
+    uint32_t sum2;
+
+     /* split Adler-32 into component sums */
+    sum2 = (adler >> 16) & 0xffff;
+    adler &= 0xffff;
+
+    /* in case user likes doing a byte at a time, keep it fast */
+    if (UNLIKELY(len == 1))
+        return adler32_len_1(adler, buf, sum2);
+
+    /* initial Adler-32 value (deferred check for len == 1 speed) */
+    if (UNLIKELY(buf == NULL))
+        return 1L;
+
+    /* in case short lengths are provided, keep it somewhat fast */
+    if (UNLIKELY(len < 16))
+        return adler32_len_16(adler, buf, len, sum2);
+
+    const __m128i dot2v = _mm_setr_epi8(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17);
+    const __m128i dot2v_0 = _mm_setr_epi8(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
+    const __m128i dot3v = _mm_set1_epi16(1);
+    const __m128i zero = _mm_setzero_si128();
+
+    __m128i vbuf, vs1_0, vs3, vs1, vs2, vs2_0, v_sad_sum1, v_short_sum2, v_short_sum2_0,
+            vbuf_0, v_sad_sum2, vsum2, vsum2_0;
+
+    /* If our buffer is unaligned (likely), make the determination whether
+     * or not there's enough of a buffer to consume to make the scalar, aligning
+     * additions worthwhile or if it's worth it to just eat the cost of an unaligned
+     * load. This is a pretty simple test, just test if 16 - the remainder + len is
+     * < 16 */
+    size_t max_iters = NMAX;
+    size_t rem = (uintptr_t)buf & 15;
+    size_t align_offset = 16 - rem;
+    size_t k = 0;
+    if (rem) {
+        if (len < 16 + align_offset) {
+            /* Let's eat the cost of this one unaligned load so that
+             * we don't completely skip over the vectorization. Doing
+             * 16 bytes at a time unaligned is is better than 16 + <= 15
+             * sums */
+            vbuf = _mm_loadu_si128((__m128i*)buf);
+            len -= 16;
+            buf += 16;
+            vs1 = _mm_cvtsi32_si128(adler);
+            vs2 = _mm_cvtsi32_si128(sum2);
+            vs3 = _mm_setzero_si128();
+            vs1_0 = vs1;
+            goto unaligned_jmp;
+        }
+
+        for (size_t i = 0; i < align_offset; ++i) {
+            adler += *(buf++);
+            sum2 += adler;
+        }
+
+        /* lop off the max number of sums based on the scalar sums done
+         * above */
+        len -= align_offset;
+        max_iters -= align_offset;
+    }
+
+
+    while (len >= 16) {
+        vs1 = _mm_cvtsi32_si128(adler);
+        vs2 = _mm_cvtsi32_si128(sum2);
+        vs3 = _mm_setzero_si128();
+        vs2_0 = _mm_setzero_si128();
+        vs1_0 = vs1;
+
+        k = (len < max_iters ? len : max_iters);
+        k -= k % 16;
+        len -= k;
+
+        while (k >= 32) {
+            /*
+               vs1 = adler + sum(c[i])
+               vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] )
+            */
+            vbuf = _mm_load_si128((__m128i*)buf);
+            vbuf_0 = _mm_load_si128((__m128i*)(buf + 16));
+            buf += 32;
+            k -= 32;
+
+            v_sad_sum1 = _mm_sad_epu8(vbuf, zero);
+            v_sad_sum2 = _mm_sad_epu8(vbuf_0, zero);
+            vs1 = _mm_add_epi32(v_sad_sum1, vs1);
+            vs3 = _mm_add_epi32(vs1_0, vs3);
+
+            vs1 = _mm_add_epi32(v_sad_sum2, vs1);
+            v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v);
+            vsum2 = _mm_madd_epi16(v_short_sum2, dot3v);
+            v_short_sum2_0 = _mm_maddubs_epi16(vbuf_0, dot2v_0);
+            vs2 = _mm_add_epi32(vsum2, vs2);
+            vsum2_0 = _mm_madd_epi16(v_short_sum2_0, dot3v);
+            vs2_0 = _mm_add_epi32(vsum2_0, vs2_0);
+            vs1_0 = vs1;
+        }
+
+        vs2 = _mm_add_epi32(vs2_0, vs2);
+        vs3 = _mm_slli_epi32(vs3, 5);
+        vs2 = _mm_add_epi32(vs3, vs2);
+        vs3 = _mm_setzero_si128();
+
+        while (k >= 16) {
+            /*
+               vs1 = adler + sum(c[i])
+               vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] )
+            */
+            vbuf = _mm_load_si128((__m128i*)buf);
+            buf += 16;
+            k -= 16;
+
+unaligned_jmp:
+            v_sad_sum1 = _mm_sad_epu8(vbuf, zero);
+            vs1 = _mm_add_epi32(v_sad_sum1, vs1);
+            vs3 = _mm_add_epi32(vs1_0, vs3);
+            v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v_0);
+            vsum2 = _mm_madd_epi16(v_short_sum2, dot3v);
+            vs2 = _mm_add_epi32(vsum2, vs2);
+            vs1_0 = vs1;
+        }
+
+        vs3 = _mm_slli_epi32(vs3, 4);
+        vs2 = _mm_add_epi32(vs2, vs3);
+
+        /* We don't actually need to do a full horizontal sum, since psadbw is actually doing
+         * a partial reduction sum implicitly and only summing to integers in vector positions
+         * 0 and 2. This saves us some contention on the shuffle port(s) */
+        adler = partial_hsum(vs1) % BASE;
+        sum2 = hsum(vs2) % BASE;
+        max_iters = NMAX;
+    }
+
+    /* Process tail (len < 16).  */
+    return adler32_len_16(adler, buf, len, sum2);
+}
+
+#endif
--- a/deps/zlib-ng/arch/x86/adler32_ssse3_p.h
+++ b/deps/zlib-ng/arch/x86/adler32_ssse3_p.h
@@ -0,0 +1,29 @@
+/* adler32_ssse3_p.h -- adler32 ssse3 utility functions
+ * Copyright (C) 2022 Adam Stylinski
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef ADLER32_SSSE3_P_H_
+#define ADLER32_SSSE3_P_H_
+
+#ifdef X86_SSSE3
+
+#include <immintrin.h>
+#include <stdint.h>
+
+static inline uint32_t partial_hsum(__m128i x) {
+    __m128i second_int = _mm_srli_si128(x, 8);
+    __m128i sum = _mm_add_epi32(x, second_int);
+    return _mm_cvtsi128_si32(sum);
+}
+
+static inline uint32_t hsum(__m128i x) {
+    __m128i sum1 = _mm_unpackhi_epi64(x, x);
+    __m128i sum2 = _mm_add_epi32(x, sum1);
+    __m128i sum3 = _mm_shuffle_epi32(sum2, 0x01);
+    __m128i sum4 = _mm_add_epi32(sum2, sum3);
+    return _mm_cvtsi128_si32(sum4);
+}
+#endif
+
+#endif
--- a/deps/zlib-ng/arch/x86/chunkset_avx2.c
+++ b/deps/zlib-ng/arch/x86/chunkset_avx2.c
@@ -0,0 +1,135 @@
+/* chunkset_avx2.c -- AVX2 inline functions to copy small data chunks.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+#include "zbuild.h"
+
+#ifdef X86_AVX2
+#include <immintrin.h>
+#include "../generic/chunk_permute_table.h"
+
+typedef __m256i chunk_t;
+
+#define CHUNK_SIZE 32
+
+#define HAVE_CHUNKMEMSET_2
+#define HAVE_CHUNKMEMSET_4
+#define HAVE_CHUNKMEMSET_8
+#define HAVE_CHUNK_MAG
+
+/* Populate don't cares so that this is a direct lookup (with some indirection into the permute table), because dist can
+ * never be 0 - 2, we'll start with an offset, subtracting 3 from the input */
+static const lut_rem_pair perm_idx_lut[29] = {
+    { 0, 2},                /* 3 */
+    { 0, 0},                /* don't care */
+    { 1 * 32, 2},           /* 5 */
+    { 2 * 32, 2},           /* 6 */
+    { 3 * 32, 4},           /* 7 */
+    { 0 * 32, 0},           /* don't care */
+    { 4 * 32, 5},           /* 9 */
+    { 5 * 32, 22},          /* 10 */
+    { 6 * 32, 21},          /* 11 */
+    { 7 * 32, 20},          /* 12 */
+    { 8 * 32, 6},           /* 13 */
+    { 9 * 32, 4},           /* 14 */
+    {10 * 32, 2},           /* 15 */
+    { 0 * 32, 0},           /* don't care */
+    {11 * 32, 15},          /* 17 */
+    {11 * 32 + 16, 14},     /* 18 */
+    {11 * 32 + 16 * 2, 13}, /* 19 */
+    {11 * 32 + 16 * 3, 12}, /* 20 */
+    {11 * 32 + 16 * 4, 11}, /* 21 */
+    {11 * 32 + 16 * 5, 10}, /* 22 */
+    {11 * 32 + 16 * 6,  9}, /* 23 */
+    {11 * 32 + 16 * 7,  8}, /* 24 */
+    {11 * 32 + 16 * 8,  7}, /* 25 */
+    {11 * 32 + 16 * 9,  6}, /* 26 */
+    {11 * 32 + 16 * 10, 5}, /* 27 */
+    {11 * 32 + 16 * 11, 4}, /* 28 */
+    {11 * 32 + 16 * 12, 3}, /* 29 */
+    {11 * 32 + 16 * 13, 2}, /* 30 */
+    {11 * 32 + 16 * 14, 1}  /* 31 */
+};
+
+static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
+    int16_t tmp;
+    memcpy(&tmp, from, sizeof(tmp));
+    *chunk = _mm256_set1_epi16(tmp);
+}
+
+static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
+    int32_t tmp;
+    memcpy(&tmp, from, sizeof(tmp));
+    *chunk = _mm256_set1_epi32(tmp);
+}
+
+static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
+    int64_t tmp;
+    memcpy(&tmp, from, sizeof(tmp));
+    *chunk = _mm256_set1_epi64x(tmp);
+}
+
+static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
+    *chunk = _mm256_loadu_si256((__m256i *)s);
+}
+
+static inline void storechunk(uint8_t *out, chunk_t *chunk) {
+    _mm256_storeu_si256((__m256i *)out, *chunk);
+}
+
+static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t dist) {
+    lut_rem_pair lut_rem = perm_idx_lut[dist - 3];
+    __m256i ret_vec;
+    /* While technically we only need to read 4 or 8 bytes into this vector register for a lot of cases, GCC is
+     * compiling this to a shared load for all branches, preferring the simpler code.  Given that the buf value isn't in
+     * GPRs to begin with the 256 bit load is _probably_ just as inexpensive */
+    *chunk_rem = lut_rem.remval;
+
+#ifdef Z_MEMORY_SANITIZER
+    /* See note in chunkset_ssse3.c for why this is ok */
+    __msan_unpoison(buf + dist, 32 - dist);
+#endif
+
+    if (dist < 16) {
+        /* This simpler case still requires us to shuffle in 128 bit lanes, so we must apply a static offset after
+         * broadcasting the first vector register to both halves. This is _marginally_ faster than doing two separate
+         * shuffles and combining the halves later */
+        const __m256i permute_xform =
+            _mm256_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                             16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16);
+        __m256i perm_vec = _mm256_load_si256((__m256i*)(permute_table+lut_rem.idx));
+        __m128i ret_vec0 = _mm_loadu_si128((__m128i*)buf);
+        perm_vec = _mm256_add_epi8(perm_vec, permute_xform);
+        ret_vec = _mm256_inserti128_si256(_mm256_castsi128_si256(ret_vec0), ret_vec0, 1);
+        ret_vec = _mm256_shuffle_epi8(ret_vec, perm_vec);
+    } else if (dist == 16) {
+        __m128i ret_vec0 = _mm_loadu_si128((__m128i*)buf);
+        return _mm256_inserti128_si256(_mm256_castsi128_si256(ret_vec0), ret_vec0, 1);
+    } else {
+        __m128i ret_vec0 = _mm_loadu_si128((__m128i*)buf);
+        __m128i ret_vec1 = _mm_loadu_si128((__m128i*)(buf + 16));
+        /* Take advantage of the fact that only the latter half of the 256 bit vector will actually differ */
+        __m128i perm_vec1 = _mm_load_si128((__m128i*)(permute_table + lut_rem.idx));
+        __m128i xlane_permutes = _mm_cmpgt_epi8(_mm_set1_epi8(16), perm_vec1);
+        __m128i xlane_res  = _mm_shuffle_epi8(ret_vec0, perm_vec1);
+        /* Since we can't wrap twice, we can simply keep the later half exactly how it is instead of having to _also_
+         * shuffle those values */
+        __m128i latter_half = _mm_blendv_epi8(ret_vec1, xlane_res, xlane_permutes);
+        ret_vec = _mm256_inserti128_si256(_mm256_castsi128_si256(ret_vec0), latter_half, 1);
+    }
+
+    return ret_vec;
+}
+
+#define CHUNKSIZE        chunksize_avx2
+#define CHUNKCOPY        chunkcopy_avx2
+#define CHUNKUNROLL      chunkunroll_avx2
+#define CHUNKMEMSET      chunkmemset_avx2
+#define CHUNKMEMSET_SAFE chunkmemset_safe_avx2
+
+#include "chunkset_tpl.h"
+
+#define INFLATE_FAST     inflate_fast_avx2
+
+#include "inffast_tpl.h"
+
+#endif
--- a/deps/zlib-ng/arch/x86/chunkset_sse2.c
+++ b/deps/zlib-ng/arch/x86/chunkset_sse2.c
@@ -0,0 +1,56 @@
+/* chunkset_sse2.c -- SSE2 inline functions to copy small data chunks.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "zbuild.h"
+
+#ifdef X86_SSE2
+#include <immintrin.h>
+
+typedef __m128i chunk_t;
+
+#define CHUNK_SIZE 16
+
+#define HAVE_CHUNKMEMSET_2
+#define HAVE_CHUNKMEMSET_4
+#define HAVE_CHUNKMEMSET_8
+
+static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
+    int16_t tmp;
+    memcpy(&tmp, from, sizeof(tmp));
+    *chunk = _mm_set1_epi16(tmp);
+}
+
+static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
+    int32_t tmp;
+    memcpy(&tmp, from, sizeof(tmp));
+    *chunk = _mm_set1_epi32(tmp);
+}
+
+static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
+    int64_t tmp;
+    memcpy(&tmp, from, sizeof(tmp));
+    *chunk = _mm_set1_epi64x(tmp);
+}
+
+static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
+    *chunk = _mm_loadu_si128((__m128i *)s);
+}
+
+static inline void storechunk(uint8_t *out, chunk_t *chunk) {
+    _mm_storeu_si128((__m128i *)out, *chunk);
+}
+
+#define CHUNKSIZE        chunksize_sse2
+#define CHUNKCOPY        chunkcopy_sse2
+#define CHUNKUNROLL      chunkunroll_sse2
+#define CHUNKMEMSET      chunkmemset_sse2
+#define CHUNKMEMSET_SAFE chunkmemset_safe_sse2
+
+#include "chunkset_tpl.h"
+
+#define INFLATE_FAST     inflate_fast_sse2
+
+#include "inffast_tpl.h"
+
+#endif
--- a/deps/zlib-ng/arch/x86/chunkset_ssse3.c
+++ b/deps/zlib-ng/arch/x86/chunkset_ssse3.c
@@ -0,0 +1,103 @@
+/* chunkset_ssse3.c -- SSSE3 inline functions to copy small data chunks.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "zbuild.h"
+
+/* This requires SSE2 support. While it's implicit with SSSE3, we can minimize
+ * code size by sharing the chunkcopy functions, which will certainly compile
+ * to identical machine code */
+#if defined(X86_SSSE3) && defined(X86_SSE2)
+#include <immintrin.h>
+#include "../generic/chunk_permute_table.h"
+
+typedef __m128i chunk_t;
+
+#define CHUNK_SIZE 16
+
+#define HAVE_CHUNKMEMSET_2
+#define HAVE_CHUNKMEMSET_4
+#define HAVE_CHUNKMEMSET_8
+#define HAVE_CHUNK_MAG
+#define HAVE_CHUNKCOPY
+#define HAVE_CHUNKUNROLL
+
+static const lut_rem_pair perm_idx_lut[13] = {
+    {0, 1},      /* 3 */
+    {0, 0},      /* don't care */
+    {1 * 32, 1}, /* 5 */
+    {2 * 32, 4}, /* 6 */
+    {3 * 32, 2}, /* 7 */
+    {0 * 32, 0}, /* don't care */
+    {4 * 32, 7}, /* 9 */
+    {5 * 32, 6}, /* 10 */
+    {6 * 32, 5}, /* 11 */
+    {7 * 32, 4}, /* 12 */
+    {8 * 32, 3}, /* 13 */
+    {9 * 32, 2}, /* 14 */
+    {10 * 32, 1},/* 15 */
+};
+
+
+static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
+    int16_t tmp;
+    memcpy(&tmp, from, sizeof(tmp));
+    *chunk = _mm_set1_epi16(tmp);
+}
+
+static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
+    int32_t tmp;
+    memcpy(&tmp, from, sizeof(tmp));
+    *chunk = _mm_set1_epi32(tmp);
+}
+
+static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
+    int64_t tmp;
+    memcpy(&tmp, from, sizeof(tmp));
+    *chunk = _mm_set1_epi64x(tmp);
+}
+
+static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
+    *chunk = _mm_loadu_si128((__m128i *)s);
+}
+
+static inline void storechunk(uint8_t *out, chunk_t *chunk) {
+    _mm_storeu_si128((__m128i *)out, *chunk);
+}
+
+static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t dist) {
+    lut_rem_pair lut_rem = perm_idx_lut[dist - 3];
+    __m128i perm_vec, ret_vec;
+#ifdef Z_MEMORY_SANITIZER
+    /* Important to note:
+     * This is _not_ to subvert the memory sanitizer but to instead unpoison some
+     * bytes we willingly and purposefully load uninitialized that we swizzle over
+     * in a vector register, anyway.  If what we assume is wrong about what is used,
+     * the memory sanitizer will still usefully flag it */
+    __msan_unpoison(buf + dist, 16 - dist);
+#endif
+    ret_vec = _mm_loadu_si128((__m128i*)buf);
+    *chunk_rem = lut_rem.remval;
+
+    perm_vec = _mm_load_si128((__m128i*)(permute_table + lut_rem.idx));
+    ret_vec = _mm_shuffle_epi8(ret_vec, perm_vec);
+
+    return ret_vec;
+}
+
+extern uint8_t* chunkcopy_sse2(uint8_t *out, uint8_t const *from, unsigned len);
+extern uint8_t* chunkunroll_sse2(uint8_t *out, unsigned *dist, unsigned *len);
+
+#define CHUNKSIZE        chunksize_ssse3
+#define CHUNKMEMSET      chunkmemset_ssse3
+#define CHUNKMEMSET_SAFE chunkmemset_safe_ssse3
+#define CHUNKCOPY        chunkcopy_sse2
+#define CHUNKUNROLL      chunkunroll_sse2
+
+#include "chunkset_tpl.h"
+
+#define INFLATE_FAST     inflate_fast_ssse3
+
+#include "inffast_tpl.h"
+
+#endif
--- a/deps/zlib-ng/arch/x86/compare256_avx2.c
+++ b/deps/zlib-ng/arch/x86/compare256_avx2.c
@@ -0,0 +1,63 @@
+/* compare256_avx2.c -- AVX2 version of compare256
+ * Copyright Mika T. Lindqvist  <postmaster@raasu.org>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "../../zbuild.h"
+
+#include "fallback_builtins.h"
+
+#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
+
+#include <immintrin.h>
+#ifdef _MSC_VER
+#  include <nmmintrin.h>
+#endif
+
+static inline uint32_t compare256_avx2_static(const uint8_t *src0, const uint8_t *src1) {
+    uint32_t len = 0;
+
+    do {
+        __m256i ymm_src0, ymm_src1, ymm_cmp;
+        ymm_src0 = _mm256_loadu_si256((__m256i*)src0);
+        ymm_src1 = _mm256_loadu_si256((__m256i*)src1);
+        ymm_cmp = _mm256_cmpeq_epi8(ymm_src0, ymm_src1); /* non-identical bytes = 00, identical bytes = FF */
+        unsigned mask = (unsigned)_mm256_movemask_epi8(ymm_cmp);
+        if (mask != 0xFFFFFFFF) {
+            uint32_t match_byte = (uint32_t)__builtin_ctz(~mask); /* Invert bits so identical = 0 */
+            return len + match_byte;
+        }
+
+        src0 += 32, src1 += 32, len += 32;
+
+        ymm_src0 = _mm256_loadu_si256((__m256i*)src0);
+        ymm_src1 = _mm256_loadu_si256((__m256i*)src1);
+        ymm_cmp = _mm256_cmpeq_epi8(ymm_src0, ymm_src1);
+        mask = (unsigned)_mm256_movemask_epi8(ymm_cmp);
+        if (mask != 0xFFFFFFFF) {
+            uint32_t match_byte = (uint32_t)__builtin_ctz(~mask);
+            return len + match_byte;
+        }
+
+        src0 += 32, src1 += 32, len += 32;
+    } while (len < 256);
+
+    return 256;
+}
+
+Z_INTERNAL uint32_t compare256_avx2(const uint8_t *src0, const uint8_t *src1) {
+    return compare256_avx2_static(src0, src1);
+}
+
+#define LONGEST_MATCH       longest_match_avx2
+#define COMPARE256          compare256_avx2_static
+
+#include "match_tpl.h"
+
+#define LONGEST_MATCH_SLOW
+#define LONGEST_MATCH       longest_match_slow_avx2
+#define COMPARE256          compare256_avx2_static
+
+#include "match_tpl.h"
+
+#endif
--- a/deps/zlib-ng/arch/x86/compare256_sse2.c
+++ b/deps/zlib-ng/arch/x86/compare256_sse2.c
@@ -0,0 +1,96 @@
+/* compare256_sse2.c -- SSE2 version of compare256
+ * Copyright Adam Stylinski <kungfujesus06@gmail.com>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "../../zbuild.h"
+
+#include "fallback_builtins.h"
+
+#if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ)
+
+#include <emmintrin.h>
+
+static inline uint32_t compare256_sse2_static(const uint8_t *src0, const uint8_t *src1) {
+    uint32_t len = 0;
+    int align_offset = ((uintptr_t)src0) & 15;
+    const uint8_t *end0 = src0 + 256;
+    const uint8_t *end1 = src1 + 256;
+    __m128i xmm_src0, xmm_src1, xmm_cmp;
+
+    /* Do the first load unaligned, than all subsequent ones we have at least
+     * one aligned load. Sadly aligning both loads is probably unrealistic */
+    xmm_src0 = _mm_loadu_si128((__m128i*)src0);
+    xmm_src1 = _mm_loadu_si128((__m128i*)src1);
+    xmm_cmp = _mm_cmpeq_epi8(xmm_src0, xmm_src1);
+
+    unsigned mask = (unsigned)_mm_movemask_epi8(xmm_cmp);
+
+    /* Compiler _may_ turn this branch into a ptest + movemask,
+     * since a lot of those uops are shared and fused */
+    if (mask != 0xFFFF) {
+        uint32_t match_byte = (uint32_t)__builtin_ctz(~mask);
+        return len + match_byte;
+    }
+
+    int align_adv = 16 - align_offset;
+    len += align_adv;
+    src0 += align_adv;
+    src1 += align_adv;
+
+    /* Do a flooring division (should just be a shift right) */
+    int num_iter = (256 - len) / 16;
+
+    for (int i = 0; i < num_iter; ++i) {
+        xmm_src0 = _mm_load_si128((__m128i*)src0);
+        xmm_src1 = _mm_loadu_si128((__m128i*)src1);
+        xmm_cmp = _mm_cmpeq_epi8(xmm_src0, xmm_src1);
+
+        mask = (unsigned)_mm_movemask_epi8(xmm_cmp);
+
+        /* Compiler _may_ turn this branch into a ptest + movemask,
+         * since a lot of those uops are shared and fused */
+        if (mask != 0xFFFF) {
+            uint32_t match_byte = (uint32_t)__builtin_ctz(~mask);
+            return len + match_byte;
+        }
+
+        len += 16, src0 += 16, src1 += 16;
+    }
+
+    if (align_offset) {
+        src0 = end0 - 16;
+        src1 = end1 - 16;
+        len = 256 - 16;
+
+        xmm_src0 = _mm_loadu_si128((__m128i*)src0);
+        xmm_src1 = _mm_loadu_si128((__m128i*)src1);
+        xmm_cmp = _mm_cmpeq_epi8(xmm_src0, xmm_src1);
+
+        mask = (unsigned)_mm_movemask_epi8(xmm_cmp);
+
+        if (mask != 0xFFFF) {
+            uint32_t match_byte = (uint32_t)__builtin_ctz(~mask);
+            return len + match_byte;
+        }
+    }
+
+    return 256;
+}
+
+Z_INTERNAL uint32_t compare256_sse2(const uint8_t *src0, const uint8_t *src1) {
+    return compare256_sse2_static(src0, src1);
+}
+
+#define LONGEST_MATCH       longest_match_sse2
+#define COMPARE256          compare256_sse2_static
+
+#include "match_tpl.h"
+
+#define LONGEST_MATCH_SLOW
+#define LONGEST_MATCH       longest_match_slow_sse2
+#define COMPARE256          compare256_sse2_static
+
+#include "match_tpl.h"
+
+#endif
--- a/deps/zlib-ng/arch/x86/crc32_fold_pclmulqdq_tpl.h
+++ b/deps/zlib-ng/arch/x86/crc32_fold_pclmulqdq_tpl.h
@@ -0,0 +1,186 @@
+/*
+ * Compute the CRC32 using a parallelized folding approach with the PCLMULQDQ
+ * instruction.
+ *
+ * A white paper describing this algorithm can be found at:
+ *     doc/crc-pclmulqdq.pdf
+ *
+ * Copyright (C) 2013 Intel Corporation. All rights reserved.
+ * Copyright (C) 2016 Marian Beermann (support for initial value)
+ * Authors:
+ *     Wajdi Feghali   <wajdi.k.feghali@intel.com>
+ *     Jim Guilford    <james.guilford@intel.com>
+ *     Vinodh Gopal    <vinodh.gopal@intel.com>
+ *     Erdinc Ozturk   <erdinc.ozturk@intel.com>
+ *     Jim Kukunas     <james.t.kukunas@linux.intel.com>
+ *
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef COPY
+Z_INTERNAL void CRC32_FOLD_COPY(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len) {
+#else
+Z_INTERNAL void CRC32_FOLD(crc32_fold *crc, const uint8_t *src, size_t len, uint32_t init_crc) {
+#endif
+    unsigned long algn_diff;
+    __m128i xmm_t0, xmm_t1, xmm_t2, xmm_t3;
+    __m128i xmm_crc0, xmm_crc1, xmm_crc2, xmm_crc3;
+    __m128i xmm_crc_part = _mm_setzero_si128();
+#ifdef COPY
+    char ALIGNED_(16) partial_buf[16] = { 0 };
+#else
+    __m128i xmm_initial = _mm_cvtsi32_si128(init_crc);
+    int32_t first = init_crc != 0;
+
+    /* Technically the CRC functions don't even call this for input < 64, but a bare minimum of 31
+     * bytes of input is needed for the aligning load that occurs.  If there's an initial CRC, to
+     * carry it forward through the folded CRC there must be 16 - src % 16 + 16 bytes available, which
+     * by definition can be up to 15 bytes + one full vector load. */
+    assert(len >= 31 || first == 0);
+#endif
+    crc32_fold_load((__m128i *)crc->fold, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
+
+    if (len < 16) {
+#ifdef COPY
+        if (len == 0)
+            return;
+
+        memcpy(partial_buf, src, len);
+        xmm_crc_part = _mm_load_si128((const __m128i *)partial_buf);
+        memcpy(dst, partial_buf, len);
+#endif
+        goto partial;
+    }
+
+    algn_diff = ((uintptr_t)16 - ((uintptr_t)src & 0xF)) & 0xF;
+    if (algn_diff) {
+        xmm_crc_part = _mm_loadu_si128((__m128i *)src);
+#ifdef COPY
+        _mm_storeu_si128((__m128i *)dst, xmm_crc_part);
+        dst += algn_diff;
+#else
+        XOR_INITIAL128(xmm_crc_part);
+
+        if (algn_diff < 4 && init_crc != 0) {
+            xmm_t0 = xmm_crc_part;
+            xmm_crc_part = _mm_loadu_si128((__m128i*)src + 1);
+            fold_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
+            xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0);
+            src += 16;
+            len -= 16;
+        }
+#endif
+
+        partial_fold(algn_diff, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, &xmm_crc_part);
+
+        src += algn_diff;
+        len -= algn_diff;
+    }
+
+#ifdef X86_VPCLMULQDQ
+    if (len >= 256) {
+#ifdef COPY
+        size_t n = fold_16_vpclmulqdq_copy(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, dst, src, len);
+        dst += n;
+#else
+        size_t n = fold_16_vpclmulqdq(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, src, len,
+            xmm_initial, first);
+        first = 0;
+#endif
+        len -= n;
+        src += n;
+    }
+#endif
+
+    while (len >= 64) {
+        len -= 64;
+        xmm_t0 = _mm_load_si128((__m128i *)src);
+        xmm_t1 = _mm_load_si128((__m128i *)src + 1);
+        xmm_t2 = _mm_load_si128((__m128i *)src + 2);
+        xmm_t3 = _mm_load_si128((__m128i *)src + 3);
+        src += 64;
+
+        fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
+#ifdef COPY
+        _mm_storeu_si128((__m128i *)dst, xmm_t0);
+        _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
+        _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
+        _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
+        dst += 64;
+#else
+        XOR_INITIAL128(xmm_t0);
+#endif
+
+        xmm_crc0 = _mm_xor_si128(xmm_crc0, xmm_t0);
+        xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t1);
+        xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t2);
+        xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t3);
+    }
+
+    /*
+     * len = num bytes left - 64
+     */
+    if (len >= 48) {
+        len -= 48;
+
+        xmm_t0 = _mm_load_si128((__m128i *)src);
+        xmm_t1 = _mm_load_si128((__m128i *)src + 1);
+        xmm_t2 = _mm_load_si128((__m128i *)src + 2);
+        src += 48;
+#ifdef COPY
+        _mm_storeu_si128((__m128i *)dst, xmm_t0);
+        _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
+        _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
+        dst += 48;
+#else
+        XOR_INITIAL128(xmm_t0);
+#endif
+        fold_3(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
+
+        xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t0);
+        xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t1);
+        xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t2);
+    } else if (len >= 32) {
+        len -= 32;
+
+        xmm_t0 = _mm_load_si128((__m128i *)src);
+        xmm_t1 = _mm_load_si128((__m128i *)src + 1);
+        src += 32;
+#ifdef COPY
+        _mm_storeu_si128((__m128i *)dst, xmm_t0);
+        _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
+        dst += 32;
+#else
+        XOR_INITIAL128(xmm_t0);
+#endif
+        fold_2(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
+
+        xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t0);
+        xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t1);
+    } else if (len >= 16) {
+        len -= 16;
+        xmm_t0 = _mm_load_si128((__m128i *)src);
+        src += 16;
+#ifdef COPY
+        _mm_storeu_si128((__m128i *)dst, xmm_t0);
+        dst += 16;
+#else
+        XOR_INITIAL128(xmm_t0);
+#endif
+        fold_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
+
+        xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0);
+    }
+
+partial:
+    if (len) {
+        memcpy(&xmm_crc_part, src, len);
+#ifdef COPY
+        _mm_storeu_si128((__m128i *)partial_buf, xmm_crc_part);
+        memcpy(dst, partial_buf, len);
+#endif
+        partial_fold(len, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, &xmm_crc_part);
+    }
+
+    crc32_fold_save((__m128i *)crc->fold, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
+}
--- a/deps/zlib-ng/arch/x86/crc32_fold_vpclmulqdq_tpl.h
+++ b/deps/zlib-ng/arch/x86/crc32_fold_vpclmulqdq_tpl.h
@@ -0,0 +1,107 @@
+/* crc32_fold_vpclmulqdq_tpl.h -- VPCMULQDQ-based CRC32 folding template.
+ * Copyright Wangyang Guo (wangyang.guo@intel.com)
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef COPY
+static size_t fold_16_vpclmulqdq_copy(__m128i *xmm_crc0, __m128i *xmm_crc1,
+    __m128i *xmm_crc2, __m128i *xmm_crc3, uint8_t *dst, const uint8_t *src, size_t len) {
+#else
+static size_t fold_16_vpclmulqdq(__m128i *xmm_crc0, __m128i *xmm_crc1,
+    __m128i *xmm_crc2, __m128i *xmm_crc3, const uint8_t *src, size_t len,
+    __m128i init_crc, int32_t first) {
+    __m512i zmm_initial = _mm512_zextsi128_si512(init_crc);
+#endif
+    __m512i zmm_t0, zmm_t1, zmm_t2, zmm_t3;
+    __m512i zmm_crc0, zmm_crc1, zmm_crc2, zmm_crc3;
+    __m512i z0, z1, z2, z3;
+    size_t len_tmp = len;
+    const __m512i zmm_fold4 = _mm512_set4_epi32(
+        0x00000001, 0x54442bd4, 0x00000001, 0xc6e41596);
+    const __m512i zmm_fold16 = _mm512_set4_epi32(
+        0x00000001, 0x1542778a, 0x00000001, 0x322d1430);
+
+    // zmm register init
+    zmm_crc0 = _mm512_setzero_si512();
+    zmm_t0 = _mm512_loadu_si512((__m512i *)src);
+#ifndef COPY
+    XOR_INITIAL512(zmm_t0);
+#endif
+    zmm_crc1 = _mm512_loadu_si512((__m512i *)src + 1);
+    zmm_crc2 = _mm512_loadu_si512((__m512i *)src + 2);
+    zmm_crc3 = _mm512_loadu_si512((__m512i *)src + 3);
+
+    /* already have intermediate CRC in xmm registers
+        * fold4 with 4 xmm_crc to get zmm_crc0
+    */
+    zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc0, 0);
+    zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc1, 1);
+    zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc2, 2);
+    zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc3, 3);
+    z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
+    zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
+    zmm_crc0 = _mm512_ternarylogic_epi32(zmm_crc0, z0, zmm_t0, 0x96);
+
+#ifdef COPY
+    _mm512_storeu_si512((__m512i *)dst, zmm_t0);
+    _mm512_storeu_si512((__m512i *)dst + 1, zmm_crc1);
+    _mm512_storeu_si512((__m512i *)dst + 2, zmm_crc2);
+    _mm512_storeu_si512((__m512i *)dst + 3, zmm_crc3);
+    dst += 256;
+#endif
+    len -= 256;
+    src += 256;
+
+    // fold-16 loops
+    while (len >= 256) {
+        zmm_t0 = _mm512_loadu_si512((__m512i *)src);
+        zmm_t1 = _mm512_loadu_si512((__m512i *)src + 1);
+        zmm_t2 = _mm512_loadu_si512((__m512i *)src + 2);
+        zmm_t3 = _mm512_loadu_si512((__m512i *)src + 3);
+
+        z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold16, 0x01);
+        z1 = _mm512_clmulepi64_epi128(zmm_crc1, zmm_fold16, 0x01);
+        z2 = _mm512_clmulepi64_epi128(zmm_crc2, zmm_fold16, 0x01);
+        z3 = _mm512_clmulepi64_epi128(zmm_crc3, zmm_fold16, 0x01);
+
+        zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold16, 0x10);
+        zmm_crc1 = _mm512_clmulepi64_epi128(zmm_crc1, zmm_fold16, 0x10);
+        zmm_crc2 = _mm512_clmulepi64_epi128(zmm_crc2, zmm_fold16, 0x10);
+        zmm_crc3 = _mm512_clmulepi64_epi128(zmm_crc3, zmm_fold16, 0x10);
+
+        zmm_crc0 = _mm512_ternarylogic_epi32(zmm_crc0, z0, zmm_t0, 0x96);
+        zmm_crc1 = _mm512_ternarylogic_epi32(zmm_crc1, z1, zmm_t1, 0x96);
+        zmm_crc2 = _mm512_ternarylogic_epi32(zmm_crc2, z2, zmm_t2, 0x96);
+        zmm_crc3 = _mm512_ternarylogic_epi32(zmm_crc3, z3, zmm_t3, 0x96);
+
+#ifdef COPY
+        _mm512_storeu_si512((__m512i *)dst, zmm_t0);
+        _mm512_storeu_si512((__m512i *)dst + 1, zmm_t1);
+        _mm512_storeu_si512((__m512i *)dst + 2, zmm_t2);
+        _mm512_storeu_si512((__m512i *)dst + 3, zmm_t3);
+        dst += 256;
+#endif
+        len -= 256;
+        src += 256;
+    }
+    // zmm_crc[0,1,2,3] -> zmm_crc0
+    z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
+    zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
+    zmm_crc0 = _mm512_ternarylogic_epi32(zmm_crc0, z0, zmm_crc1, 0x96);
+
+    z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
+    zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
+    zmm_crc0 = _mm512_ternarylogic_epi32(zmm_crc0, z0, zmm_crc2, 0x96);
+
+    z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
+    zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
+    zmm_crc0 = _mm512_ternarylogic_epi32(zmm_crc0, z0, zmm_crc3, 0x96);
+
+    // zmm_crc0 -> xmm_crc[0, 1, 2, 3]
+    *xmm_crc0 = _mm512_extracti32x4_epi32(zmm_crc0, 0);
+    *xmm_crc1 = _mm512_extracti32x4_epi32(zmm_crc0, 1);
+    *xmm_crc2 = _mm512_extracti32x4_epi32(zmm_crc0, 2);
+    *xmm_crc3 = _mm512_extracti32x4_epi32(zmm_crc0, 3);
+
+    return (len_tmp - len);  // return n bytes processed
+}
--- a/deps/zlib-ng/arch/x86/crc32_pclmulqdq.c
+++ b/deps/zlib-ng/arch/x86/crc32_pclmulqdq.c
@@ -0,0 +1,30 @@
+/*
+ * Compute the CRC32 using a parallelized folding approach with the PCLMULQDQ
+ * instruction.
+ *
+ * A white paper describing this algorithm can be found at:
+ *     doc/crc-pclmulqdq.pdf
+ *
+ * Copyright (C) 2013 Intel Corporation. All rights reserved.
+ * Copyright (C) 2016 Marian Beermann (support for initial value)
+ * Authors:
+ *     Wajdi Feghali   <wajdi.k.feghali@intel.com>
+ *     Jim Guilford    <james.guilford@intel.com>
+ *     Vinodh Gopal    <vinodh.gopal@intel.com>
+ *     Erdinc Ozturk   <erdinc.ozturk@intel.com>
+ *     Jim Kukunas     <james.t.kukunas@linux.intel.com>
+ *
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef X86_PCLMULQDQ_CRC
+
+#define CRC32_FOLD_COPY  crc32_fold_pclmulqdq_copy
+#define CRC32_FOLD       crc32_fold_pclmulqdq
+#define CRC32_FOLD_RESET crc32_fold_pclmulqdq_reset
+#define CRC32_FOLD_FINAL crc32_fold_pclmulqdq_final
+#define CRC32            crc32_pclmulqdq
+
+#include "crc32_pclmulqdq_tpl.h"
+
+#endif
--- a/deps/zlib-ng/arch/x86/crc32_pclmulqdq_tpl.h
+++ b/deps/zlib-ng/arch/x86/crc32_pclmulqdq_tpl.h
@@ -0,0 +1,363 @@
+/*
+ * Compute the CRC32 using a parallelized folding approach with the PCLMULQDQ
+ * instruction.
+ *
+ * A white paper describing this algorithm can be found at:
+ *     doc/crc-pclmulqdq.pdf
+ *
+ * Copyright (C) 2013 Intel Corporation. All rights reserved.
+ * Copyright (C) 2016 Marian Beermann (support for initial value)
+ * Authors:
+ *     Wajdi Feghali   <wajdi.k.feghali@intel.com>
+ *     Jim Guilford    <james.guilford@intel.com>
+ *     Vinodh Gopal    <vinodh.gopal@intel.com>
+ *     Erdinc Ozturk   <erdinc.ozturk@intel.com>
+ *     Jim Kukunas     <james.t.kukunas@linux.intel.com>
+ *
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "../../zbuild.h"
+
+#include <immintrin.h>
+#include <wmmintrin.h>
+#include <smmintrin.h> // _mm_extract_epi32
+#ifdef X86_VPCLMULQDQ
+#  include <immintrin.h>
+#endif
+
+#include "../../crc32_fold.h"
+#include "../../crc32_braid_p.h"
+#include "../../fallback_builtins.h"
+#include <assert.h>
+
+#ifdef X86_VPCLMULQDQ
+static size_t fold_16_vpclmulqdq(__m128i *xmm_crc0, __m128i *xmm_crc1,
+    __m128i *xmm_crc2, __m128i *xmm_crc3, const uint8_t *src, size_t len, __m128i init_crc,
+    int32_t first);
+static size_t fold_16_vpclmulqdq_copy(__m128i *xmm_crc0, __m128i *xmm_crc1,
+    __m128i *xmm_crc2, __m128i *xmm_crc3, uint8_t *dst, const uint8_t *src, size_t len);
+#endif
+
+static void fold_1(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3) {
+    const __m128i xmm_fold4 = _mm_set_epi32( 0x00000001, 0x54442bd4,
+                                             0x00000001, 0xc6e41596);
+    __m128i x_tmp3;
+    __m128 ps_crc0, ps_crc3, ps_res;
+
+    x_tmp3 = *xmm_crc3;
+
+    *xmm_crc3 = *xmm_crc0;
+    *xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
+    *xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x10);
+    ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
+    ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
+    ps_res = _mm_xor_ps(ps_crc0, ps_crc3);
+
+    *xmm_crc0 = *xmm_crc1;
+    *xmm_crc1 = *xmm_crc2;
+    *xmm_crc2 = x_tmp3;
+    *xmm_crc3 = _mm_castps_si128(ps_res);
+}
+
+static void fold_2(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3) {
+    const __m128i xmm_fold4 = _mm_set_epi32( 0x00000001, 0x54442bd4,
+                                             0x00000001, 0xc6e41596);
+    __m128i x_tmp3, x_tmp2;
+    __m128 ps_crc0, ps_crc1, ps_crc2, ps_crc3, ps_res31, ps_res20;
+
+    x_tmp3 = *xmm_crc3;
+    x_tmp2 = *xmm_crc2;
+
+    *xmm_crc3 = *xmm_crc1;
+    *xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01);
+    *xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x10);
+    ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
+    ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
+    ps_res31 = _mm_xor_ps(ps_crc3, ps_crc1);
+
+    *xmm_crc2 = *xmm_crc0;
+    *xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
+    *xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x10);
+    ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
+    ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
+    ps_res20 = _mm_xor_ps(ps_crc0, ps_crc2);
+
+    *xmm_crc0 = x_tmp2;
+    *xmm_crc1 = x_tmp3;
+    *xmm_crc2 = _mm_castps_si128(ps_res20);
+    *xmm_crc3 = _mm_castps_si128(ps_res31);
+}
+
+static void fold_3(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3) {
+    const __m128i xmm_fold4 = _mm_set_epi32( 0x00000001, 0x54442bd4,
+                                             0x00000001, 0xc6e41596);
+    __m128i x_tmp3;
+    __m128 ps_crc0, ps_crc1, ps_crc2, ps_crc3, ps_res32, ps_res21, ps_res10;
+
+    x_tmp3 = *xmm_crc3;
+
+    *xmm_crc3 = *xmm_crc2;
+    *xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x01);
+    *xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x10);
+    ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
+    ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
+    ps_res32 = _mm_xor_ps(ps_crc2, ps_crc3);
+
+    *xmm_crc2 = *xmm_crc1;
+    *xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01);
+    *xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x10);
+    ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
+    ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
+    ps_res21 = _mm_xor_ps(ps_crc1, ps_crc2);
+
+    *xmm_crc1 = *xmm_crc0;
+    *xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
+    *xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x10);
+    ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
+    ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
+    ps_res10 = _mm_xor_ps(ps_crc0, ps_crc1);
+
+    *xmm_crc0 = x_tmp3;
+    *xmm_crc1 = _mm_castps_si128(ps_res10);
+    *xmm_crc2 = _mm_castps_si128(ps_res21);
+    *xmm_crc3 = _mm_castps_si128(ps_res32);
+}
+
+static void fold_4(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3) {
+    const __m128i xmm_fold4 = _mm_set_epi32( 0x00000001, 0x54442bd4,
+                                             0x00000001, 0xc6e41596);
+    __m128i x_tmp0, x_tmp1, x_tmp2, x_tmp3;
+    __m128 ps_crc0, ps_crc1, ps_crc2, ps_crc3;
+    __m128 ps_t0, ps_t1, ps_t2, ps_t3;
+    __m128 ps_res0, ps_res1, ps_res2, ps_res3;
+
+    x_tmp0 = *xmm_crc0;
+    x_tmp1 = *xmm_crc1;
+    x_tmp2 = *xmm_crc2;
+    x_tmp3 = *xmm_crc3;
+
+    *xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
+    x_tmp0 = _mm_clmulepi64_si128(x_tmp0, xmm_fold4, 0x10);
+    ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
+    ps_t0 = _mm_castsi128_ps(x_tmp0);
+    ps_res0 = _mm_xor_ps(ps_crc0, ps_t0);
+
+    *xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01);
+    x_tmp1 = _mm_clmulepi64_si128(x_tmp1, xmm_fold4, 0x10);
+    ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
+    ps_t1 = _mm_castsi128_ps(x_tmp1);
+    ps_res1 = _mm_xor_ps(ps_crc1, ps_t1);
+
+    *xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x01);
+    x_tmp2 = _mm_clmulepi64_si128(x_tmp2, xmm_fold4, 0x10);
+    ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
+    ps_t2 = _mm_castsi128_ps(x_tmp2);
+    ps_res2 = _mm_xor_ps(ps_crc2, ps_t2);
+
+    *xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x01);
+    x_tmp3 = _mm_clmulepi64_si128(x_tmp3, xmm_fold4, 0x10);
+    ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
+    ps_t3 = _mm_castsi128_ps(x_tmp3);
+    ps_res3 = _mm_xor_ps(ps_crc3, ps_t3);
+
+    *xmm_crc0 = _mm_castps_si128(ps_res0);
+    *xmm_crc1 = _mm_castps_si128(ps_res1);
+    *xmm_crc2 = _mm_castps_si128(ps_res2);
+    *xmm_crc3 = _mm_castps_si128(ps_res3);
+}
+
+static const unsigned ALIGNED_(32) pshufb_shf_table[60] = {
+    0x84838281, 0x88878685, 0x8c8b8a89, 0x008f8e8d, /* shl 15 (16 - 1)/shr1 */
+    0x85848382, 0x89888786, 0x8d8c8b8a, 0x01008f8e, /* shl 14 (16 - 3)/shr2 */
+    0x86858483, 0x8a898887, 0x8e8d8c8b, 0x0201008f, /* shl 13 (16 - 4)/shr3 */
+    0x87868584, 0x8b8a8988, 0x8f8e8d8c, 0x03020100, /* shl 12 (16 - 4)/shr4 */
+    0x88878685, 0x8c8b8a89, 0x008f8e8d, 0x04030201, /* shl 11 (16 - 5)/shr5 */
+    0x89888786, 0x8d8c8b8a, 0x01008f8e, 0x05040302, /* shl 10 (16 - 6)/shr6 */
+    0x8a898887, 0x8e8d8c8b, 0x0201008f, 0x06050403, /* shl  9 (16 - 7)/shr7 */
+    0x8b8a8988, 0x8f8e8d8c, 0x03020100, 0x07060504, /* shl  8 (16 - 8)/shr8 */
+    0x8c8b8a89, 0x008f8e8d, 0x04030201, 0x08070605, /* shl  7 (16 - 9)/shr9 */
+    0x8d8c8b8a, 0x01008f8e, 0x05040302, 0x09080706, /* shl  6 (16 -10)/shr10*/
+    0x8e8d8c8b, 0x0201008f, 0x06050403, 0x0a090807, /* shl  5 (16 -11)/shr11*/
+    0x8f8e8d8c, 0x03020100, 0x07060504, 0x0b0a0908, /* shl  4 (16 -12)/shr12*/
+    0x008f8e8d, 0x04030201, 0x08070605, 0x0c0b0a09, /* shl  3 (16 -13)/shr13*/
+    0x01008f8e, 0x05040302, 0x09080706, 0x0d0c0b0a, /* shl  2 (16 -14)/shr14*/
+    0x0201008f, 0x06050403, 0x0a090807, 0x0e0d0c0b  /* shl  1 (16 -15)/shr15*/
+};
+
+static void partial_fold(const size_t len, __m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2,
+                         __m128i *xmm_crc3, __m128i *xmm_crc_part) {
+    const __m128i xmm_fold4 = _mm_set_epi32( 0x00000001, 0x54442bd4,
+                                             0x00000001, 0xc6e41596);
+    const __m128i xmm_mask3 = _mm_set1_epi32((int32_t)0x80808080);
+
+    __m128i xmm_shl, xmm_shr, xmm_tmp1, xmm_tmp2, xmm_tmp3;
+    __m128i xmm_a0_0, xmm_a0_1;
+    __m128 ps_crc3, psa0_0, psa0_1, ps_res;
+
+    xmm_shl = _mm_load_si128((__m128i *)(pshufb_shf_table + (4 * (len - 1))));
+    xmm_shr = xmm_shl;
+    xmm_shr = _mm_xor_si128(xmm_shr, xmm_mask3);
+
+    xmm_a0_0 = _mm_shuffle_epi8(*xmm_crc0, xmm_shl);
+
+    *xmm_crc0 = _mm_shuffle_epi8(*xmm_crc0, xmm_shr);
+    xmm_tmp1 = _mm_shuffle_epi8(*xmm_crc1, xmm_shl);
+    *xmm_crc0 = _mm_or_si128(*xmm_crc0, xmm_tmp1);
+
+    *xmm_crc1 = _mm_shuffle_epi8(*xmm_crc1, xmm_shr);
+    xmm_tmp2 = _mm_shuffle_epi8(*xmm_crc2, xmm_shl);
+    *xmm_crc1 = _mm_or_si128(*xmm_crc1, xmm_tmp2);
+
+    *xmm_crc2 = _mm_shuffle_epi8(*xmm_crc2, xmm_shr);
+    xmm_tmp3 = _mm_shuffle_epi8(*xmm_crc3, xmm_shl);
+    *xmm_crc2 = _mm_or_si128(*xmm_crc2, xmm_tmp3);
+
+    *xmm_crc3 = _mm_shuffle_epi8(*xmm_crc3, xmm_shr);
+    *xmm_crc_part = _mm_shuffle_epi8(*xmm_crc_part, xmm_shl);
+    *xmm_crc3 = _mm_or_si128(*xmm_crc3, *xmm_crc_part);
+
+    xmm_a0_1 = _mm_clmulepi64_si128(xmm_a0_0, xmm_fold4, 0x10);
+    xmm_a0_0 = _mm_clmulepi64_si128(xmm_a0_0, xmm_fold4, 0x01);
+
+    ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
+    psa0_0 = _mm_castsi128_ps(xmm_a0_0);
+    psa0_1 = _mm_castsi128_ps(xmm_a0_1);
+
+    ps_res = _mm_xor_ps(ps_crc3, psa0_0);
+    ps_res = _mm_xor_ps(ps_res, psa0_1);
+
+    *xmm_crc3 = _mm_castps_si128(ps_res);
+}
+
+static inline void crc32_fold_load(__m128i *fold, __m128i *fold0, __m128i *fold1, __m128i *fold2, __m128i *fold3) {
+    *fold0 = _mm_load_si128(fold + 0);
+    *fold1 = _mm_load_si128(fold + 1);
+    *fold2 = _mm_load_si128(fold + 2);
+    *fold3 = _mm_load_si128(fold + 3);
+}
+
+static inline void crc32_fold_save(__m128i *fold, const __m128i *fold0, const __m128i *fold1,
+                                   const __m128i *fold2, const __m128i *fold3) {
+    _mm_storeu_si128(fold + 0, *fold0);
+    _mm_storeu_si128(fold + 1, *fold1);
+    _mm_storeu_si128(fold + 2, *fold2);
+    _mm_storeu_si128(fold + 3, *fold3);
+}
+
+Z_INTERNAL uint32_t CRC32_FOLD_RESET(crc32_fold *crc) {
+    __m128i xmm_crc0 = _mm_cvtsi32_si128(0x9db42487);
+    __m128i xmm_zero = _mm_setzero_si128();
+    crc32_fold_save((__m128i *)crc->fold, &xmm_crc0, &xmm_zero, &xmm_zero, &xmm_zero);
+    return 0;
+}
+
+#define ONCE(op)                 if (first) { first = 0; op; }
+#define XOR_INITIAL128(where)    ONCE(where = _mm_xor_si128(where, xmm_initial))
+#ifdef X86_VPCLMULQDQ
+#  define XOR_INITIAL512(where)  ONCE(where = _mm512_xor_si512(where, zmm_initial))
+#endif
+
+#ifdef X86_VPCLMULQDQ
+#  include "crc32_fold_vpclmulqdq_tpl.h"
+#endif
+#include "crc32_fold_pclmulqdq_tpl.h"
+#define COPY
+#ifdef X86_VPCLMULQDQ
+#  include "crc32_fold_vpclmulqdq_tpl.h"
+#endif
+#include "crc32_fold_pclmulqdq_tpl.h"
+
+static const unsigned ALIGNED_(16) crc_k[] = {
+    0xccaa009e, 0x00000000, /* rk1 */
+    0x751997d0, 0x00000001, /* rk2 */
+    0xccaa009e, 0x00000000, /* rk5 */
+    0x63cd6124, 0x00000001, /* rk6 */
+    0xf7011640, 0x00000001, /* rk7 */
+    0xdb710640, 0x00000001  /* rk8 */
+};
+
+static const unsigned ALIGNED_(16) crc_mask[4] = {
+    0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000
+};
+
+static const unsigned ALIGNED_(16) crc_mask2[4] = {
+    0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF
+};
+
+Z_INTERNAL uint32_t CRC32_FOLD_FINAL(crc32_fold *crc) {
+    const __m128i xmm_mask  = _mm_load_si128((__m128i *)crc_mask);
+    const __m128i xmm_mask2 = _mm_load_si128((__m128i *)crc_mask2);
+    __m128i xmm_crc0, xmm_crc1, xmm_crc2, xmm_crc3;
+    __m128i x_tmp0, x_tmp1, x_tmp2, crc_fold;
+
+    crc32_fold_load((__m128i *)crc->fold, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
+
+    /*
+     * k1
+     */
+    crc_fold = _mm_load_si128((__m128i *)crc_k);
+
+    x_tmp0 = _mm_clmulepi64_si128(xmm_crc0, crc_fold, 0x10);
+    xmm_crc0 = _mm_clmulepi64_si128(xmm_crc0, crc_fold, 0x01);
+    xmm_crc1 = _mm_xor_si128(xmm_crc1, x_tmp0);
+    xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_crc0);
+
+    x_tmp1 = _mm_clmulepi64_si128(xmm_crc1, crc_fold, 0x10);
+    xmm_crc1 = _mm_clmulepi64_si128(xmm_crc1, crc_fold, 0x01);
+    xmm_crc2 = _mm_xor_si128(xmm_crc2, x_tmp1);
+    xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_crc1);
+
+    x_tmp2 = _mm_clmulepi64_si128(xmm_crc2, crc_fold, 0x10);
+    xmm_crc2 = _mm_clmulepi64_si128(xmm_crc2, crc_fold, 0x01);
+    xmm_crc3 = _mm_xor_si128(xmm_crc3, x_tmp2);
+    xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc2);
+
+    /*
+     * k5
+     */
+    crc_fold = _mm_load_si128((__m128i *)(crc_k + 4));
+
+    xmm_crc0 = xmm_crc3;
+    xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0);
+    xmm_crc0 = _mm_srli_si128(xmm_crc0, 8);
+    xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc0);
+
+    xmm_crc0 = xmm_crc3;
+    xmm_crc3 = _mm_slli_si128(xmm_crc3, 4);
+    xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0x10);
+    xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc0);
+    xmm_crc3 = _mm_and_si128(xmm_crc3, xmm_mask2);
+
+    /*
+     * k7
+     */
+    xmm_crc1 = xmm_crc3;
+    xmm_crc2 = xmm_crc3;
+    crc_fold = _mm_load_si128((__m128i *)(crc_k + 8));
+
+    xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0);
+    xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc2);
+    xmm_crc3 = _mm_and_si128(xmm_crc3, xmm_mask);
+
+    xmm_crc2 = xmm_crc3;
+    xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0x10);
+    xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc2);
+    xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc1);
+
+    crc->value = ~((uint32_t)_mm_extract_epi32(xmm_crc3, 2));
+
+    return crc->value;
+}
+
+Z_INTERNAL uint32_t CRC32(uint32_t crc32, const uint8_t *buf, size_t len) {
+    /* For lens < 64, crc32_braid method is faster. The CRC32 instruction for
+     * these short lengths might also prove to be effective */
+    if (len < 64)
+        return PREFIX(crc32_braid)(crc32, buf, len);
+
+    crc32_fold ALIGNED_(16) crc_state;
+    CRC32_FOLD_RESET(&crc_state);
+    CRC32_FOLD(&crc_state, buf, len, crc32);
+    return CRC32_FOLD_FINAL(&crc_state);
+}
--- a/deps/zlib-ng/arch/x86/crc32_vpclmulqdq.c
+++ b/deps/zlib-ng/arch/x86/crc32_vpclmulqdq.c
@@ -0,0 +1,17 @@
+/* crc32_vpclmulqdq.c -- VPCMULQDQ-based CRC32 folding implementation.
+ * Copyright Wangyang Guo (wangyang.guo@intel.com)
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#if defined(X86_PCLMULQDQ_CRC) && defined(X86_VPCLMULQDQ_CRC)
+
+#define X86_VPCLMULQDQ
+#define CRC32_FOLD_COPY  crc32_fold_vpclmulqdq_copy
+#define CRC32_FOLD       crc32_fold_vpclmulqdq
+#define CRC32_FOLD_RESET crc32_fold_vpclmulqdq_reset
+#define CRC32_FOLD_FINAL crc32_fold_vpclmulqdq_final
+#define CRC32            crc32_vpclmulqdq
+
+#include "crc32_pclmulqdq_tpl.h"
+
+#endif
--- a/deps/zlib-ng/arch/x86/insert_string_sse42.c
+++ b/deps/zlib-ng/arch/x86/insert_string_sse42.c
@@ -0,0 +1,50 @@
+/* insert_string_sse42.c -- insert_string integer hash variant using SSE4.2's CRC instructions
+ *
+ * Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ *
+ */
+
+#include "../../zbuild.h"
+#include <immintrin.h>
+#ifdef _MSC_VER
+#  include <nmmintrin.h>
+#endif
+#include "../../deflate.h"
+
+#ifdef X86_SSE42_CRC_INTRIN
+#  ifdef _MSC_VER
+#    define HASH_CALC(s, h, val)\
+        h = _mm_crc32_u32(h, val)
+#  else
+#    define HASH_CALC(s, h, val)\
+        h = __builtin_ia32_crc32si(h, val)
+#  endif
+#else
+#  ifdef _MSC_VER
+#    define HASH_CALC(s, h, val) {\
+        __asm mov edx, h\
+        __asm mov eax, val\
+        __asm crc32 eax, edx\
+        __asm mov h, eax\
+    }
+#  else
+#    define HASH_CALC(s, h, val) \
+        __asm__ __volatile__ (\
+            "crc32 %1,%0\n\t"\
+            : "+r" (h)\
+            : "r" (val)\
+        );
+#  endif
+#endif
+
+#define HASH_CALC_VAR       h
+#define HASH_CALC_VAR_INIT  uint32_t h = 0
+
+#define UPDATE_HASH         update_hash_sse42
+#define INSERT_STRING       insert_string_sse42
+#define QUICK_INSERT_STRING quick_insert_string_sse42
+
+#ifdef X86_SSE42
+#  include "../../insert_string_tpl.h"
+#endif
--- a/deps/zlib-ng/arch/x86/slide_hash_avx2.c
+++ b/deps/zlib-ng/arch/x86/slide_hash_avx2.c
@@ -0,0 +1,39 @@
+/*
+ * AVX2 optimized hash slide, based on Intel's slide_sse implementation
+ *
+ * Copyright (C) 2017 Intel Corporation
+ * Authors:
+ *   Arjan van de Ven   <arjan@linux.intel.com>
+ *   Jim Kukunas        <james.t.kukunas@linux.intel.com>
+ *   Mika T. Lindqvist  <postmaster@raasu.org>
+ *
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+#include "../../zbuild.h"
+#include "../../deflate.h"
+
+#include <immintrin.h>
+
+static inline void slide_hash_chain(Pos *table, uint32_t entries, const __m256i wsize) {
+    table += entries;
+    table -= 16;
+
+    do {
+        __m256i value, result;
+
+        value = _mm256_loadu_si256((__m256i *)table);
+        result = _mm256_subs_epu16(value, wsize);
+        _mm256_storeu_si256((__m256i *)table, result);
+
+        table -= 16;
+        entries -= 16;
+    } while (entries > 0);
+}
+
+Z_INTERNAL void slide_hash_avx2(deflate_state *s) {
+    uint16_t wsize = (uint16_t)s->w_size;
+    const __m256i ymm_wsize = _mm256_set1_epi16((short)wsize);
+
+    slide_hash_chain(s->head, HASH_SIZE, ymm_wsize);
+    slide_hash_chain(s->prev, wsize, ymm_wsize);
+}
--- a/deps/zlib-ng/arch/x86/slide_hash_sse2.c
+++ b/deps/zlib-ng/arch/x86/slide_hash_sse2.c
@@ -0,0 +1,62 @@
+/*
+ * SSE optimized hash slide
+ *
+ * Copyright (C) 2017 Intel Corporation
+ * Authors:
+ *   Arjan van de Ven   <arjan@linux.intel.com>
+ *   Jim Kukunas        <james.t.kukunas@linux.intel.com>
+ *
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+#include "../../zbuild.h"
+#include "../../deflate.h"
+
+#include <immintrin.h>
+#include <assert.h>
+
+static inline void slide_hash_chain(Pos *table0, Pos *table1, uint32_t entries0,
+                                    uint32_t entries1, const __m128i wsize) {
+    uint32_t entries;
+    Pos *table;
+    __m128i value0, value1, result0, result1;
+
+    int on_chain = 0;
+
+next_chain:
+    table = (on_chain) ? table1 : table0;
+    entries = (on_chain) ? entries1 : entries0;
+
+    table += entries;
+    table -= 16;
+
+    /* ZALLOC allocates this pointer unless the user chose a custom allocator.
+     * Our alloc function is aligned to 64 byte boundaries */
+    do {
+        value0 = _mm_load_si128((__m128i *)table);
+        value1 = _mm_load_si128((__m128i *)(table + 8));
+        result0 = _mm_subs_epu16(value0, wsize);
+        result1 = _mm_subs_epu16(value1, wsize);
+        _mm_store_si128((__m128i *)table, result0);
+        _mm_store_si128((__m128i *)(table + 8), result1);
+
+        table -= 16;
+        entries -= 16;
+    } while (entries > 0);
+
+    ++on_chain;
+    if (on_chain > 1) {
+        return;
+    } else {
+        goto next_chain;
+    }
+}
+
+Z_INTERNAL void slide_hash_sse2(deflate_state *s) {
+    uint16_t wsize = (uint16_t)s->w_size;
+    const __m128i xmm_wsize = _mm_set1_epi16((short)wsize);
+
+    assert(((uintptr_t)s->head & 15) == 0);
+    assert(((uintptr_t)s->prev & 15) == 0);
+
+    slide_hash_chain(s->head, s->prev, HASH_SIZE, wsize, xmm_wsize);
+}
--- a/deps/zlib-ng/arch/x86/x86_features.c
+++ b/deps/zlib-ng/arch/x86/x86_features.c
@@ -0,0 +1,97 @@
+/* x86_features.c - x86 feature check
+ *
+ * Copyright (C) 2013 Intel Corporation. All rights reserved.
+ * Author:
+ *  Jim Kukunas
+ *
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "../../zbuild.h"
+#include "x86_features.h"
+
+#ifdef _WIN32
+#  include <intrin.h>
+#else
+// Newer versions of GCC and clang come with cpuid.h
+#  include <cpuid.h>
+#endif
+
+#include <string.h>
+
+static inline void cpuid(int info, unsigned* eax, unsigned* ebx, unsigned* ecx, unsigned* edx) {
+#ifdef _WIN32
+    unsigned int registers[4];
+    __cpuid((int *)registers, info);
+
+    *eax = registers[0];
+    *ebx = registers[1];
+    *ecx = registers[2];
+    *edx = registers[3];
+#else
+    __cpuid(info, *eax, *ebx, *ecx, *edx);
+#endif
+}
+
+static inline void cpuidex(int info, int subinfo, unsigned* eax, unsigned* ebx, unsigned* ecx, unsigned* edx) {
+#ifdef _WIN32
+    unsigned int registers[4];
+    __cpuidex((int *)registers, info, subinfo);
+
+    *eax = registers[0];
+    *ebx = registers[1];
+    *ecx = registers[2];
+    *edx = registers[3];
+#else
+    __cpuid_count(info, subinfo, *eax, *ebx, *ecx, *edx);
+#endif
+}
+
+static inline uint64_t xgetbv(unsigned int xcr) {
+#ifdef _WIN32
+    return _xgetbv(xcr);
+#else
+    uint32_t eax, edx;
+    __asm__ ( ".byte 0x0f, 0x01, 0xd0" : "=a"(eax), "=d"(edx) : "c"(xcr));
+    return (uint64_t)(edx) << 32 | eax;
+#endif
+}
+
+void Z_INTERNAL x86_check_features(struct x86_cpu_features *features) {
+    unsigned eax, ebx, ecx, edx;
+    unsigned maxbasic;
+
+    cpuid(0, &maxbasic, &ebx, &ecx, &edx);
+    cpuid(1 /*CPU_PROCINFO_AND_FEATUREBITS*/, &eax, &ebx, &ecx, &edx);
+
+    features->has_sse2 = edx & 0x4000000;
+    features->has_ssse3 = ecx & 0x200;
+    features->has_sse42 = ecx & 0x100000;
+    features->has_pclmulqdq = ecx & 0x2;
+
+    if (ecx & 0x08000000) {
+        uint64_t xfeature = xgetbv(0);
+
+        features->has_os_save_ymm = ((xfeature & 0x06) == 0x06);
+        features->has_os_save_zmm = ((xfeature & 0xe6) == 0xe6);
+    }
+
+    if (maxbasic >= 7) {
+        cpuidex(7, 0, &eax, &ebx, &ecx, &edx);
+
+        // check BMI1 bit
+        // Reference: https://software.intel.com/sites/default/files/article/405250/how-to-detect-new-instruction-support-in-the-4th-generation-intel-core-processor-family.pdf
+        features->has_vpclmulqdq = ecx & 0x400;
+
+        // check AVX2 bit if the OS supports saving YMM registers
+        if (features->has_os_save_ymm) {
+            features->has_avx2 = ebx & 0x20;
+        }
+
+        // check AVX512 bits if the OS supports saving ZMM registers
+        if (features->has_os_save_zmm) {
+            features->has_avx512 = ebx & 0x00010000;
+            features->has_avx512vnni = ecx & 0x800;
+        }
+    }
+}
--- a/deps/zlib-ng/arch/x86/x86_features.h
+++ b/deps/zlib-ng/arch/x86/x86_features.h
@@ -0,0 +1,24 @@
+/* x86_features.h -- check for CPU features
+* Copyright (C) 2013 Intel Corporation Jim Kukunas
+* For conditions of distribution and use, see copyright notice in zlib.h
+*/
+
+#ifndef X86_FEATURES_H_
+#define X86_FEATURES_H_
+
+struct x86_cpu_features {
+    int has_avx2;
+    int has_avx512;
+    int has_avx512vnni;
+    int has_sse2;
+    int has_ssse3;
+    int has_sse42;
+    int has_pclmulqdq;
+    int has_vpclmulqdq;
+    int has_os_save_ymm;
+    int has_os_save_zmm;
+};
+
+void Z_INTERNAL x86_check_features(struct x86_cpu_features *features);
+
+#endif /* CPU_H_ */
--- a/deps/zlib-ng/chunkset.c
+++ b/deps/zlib-ng/chunkset.c
@@ -0,0 +1,42 @@
+/* chunkset.c -- inline functions to copy small data chunks.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "zbuild.h"
+
+typedef uint64_t chunk_t;
+
+#define CHUNK_SIZE 8
+
+#define HAVE_CHUNKMEMSET_4
+#define HAVE_CHUNKMEMSET_8
+
+static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
+    uint8_t *dest = (uint8_t *)chunk;
+    memcpy(dest, from, sizeof(uint32_t));
+    memcpy(dest+4, from, sizeof(uint32_t));
+}
+
+static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
+    memcpy(chunk, from, sizeof(uint64_t));
+}
+
+static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
+    memcpy(chunk, (uint8_t *)s, sizeof(uint64_t));
+}
+
+static inline void storechunk(uint8_t *out, chunk_t *chunk) {
+    memcpy(out, chunk, sizeof(uint64_t));
+}
+
+#define CHUNKSIZE        chunksize_c
+#define CHUNKCOPY        chunkcopy_c
+#define CHUNKUNROLL      chunkunroll_c
+#define CHUNKMEMSET      chunkmemset_c
+#define CHUNKMEMSET_SAFE chunkmemset_safe_c
+
+#include "chunkset_tpl.h"
+
+#define INFLATE_FAST     inflate_fast_c
+
+#include "inffast_tpl.h"
--- a/deps/zlib-ng/chunkset_tpl.h
+++ b/deps/zlib-ng/chunkset_tpl.h
@@ -0,0 +1,200 @@
+/* chunkset_tpl.h -- inline functions to copy small data chunks.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "zbuild.h"
+#include <stdlib.h>
+
+#if CHUNK_SIZE == 32 && defined(X86_SSSE3) && defined(X86_SSE2)
+extern uint8_t* chunkmemset_ssse3(uint8_t *out, unsigned dist, unsigned len);
+#endif
+
+/* Returns the chunk size */
+Z_INTERNAL uint32_t CHUNKSIZE(void) {
+    return sizeof(chunk_t);
+}
+
+/* Behave like memcpy, but assume that it's OK to overwrite at least
+   chunk_t bytes of output even if the length is shorter than this,
+   that the length is non-zero, and that `from` lags `out` by at least
+   sizeof chunk_t bytes (or that they don't overlap at all or simply that
+   the distance is less than the length of the copy).
+
+   Aside from better memory bus utilisation, this means that short copies
+   (chunk_t bytes or fewer) will fall straight through the loop
+   without iteration, which will hopefully make the branch prediction more
+   reliable. */
+#ifndef HAVE_CHUNKCOPY
+Z_INTERNAL uint8_t* CHUNKCOPY(uint8_t *out, uint8_t const *from, unsigned len) {
+    Assert(len > 0, "chunkcopy should never have a length 0");
+    chunk_t chunk;
+    int32_t align = ((len - 1) % sizeof(chunk_t)) + 1;
+    loadchunk(from, &chunk);
+    storechunk(out, &chunk);
+    out += align;
+    from += align;
+    len -= align;
+    while (len > 0) {
+        loadchunk(from, &chunk);
+        storechunk(out, &chunk);
+        out += sizeof(chunk_t);
+        from += sizeof(chunk_t);
+        len -= sizeof(chunk_t);
+    }
+    return out;
+}
+#endif
+
+/* Perform short copies until distance can be rewritten as being at least
+   sizeof chunk_t.
+
+   This assumes that it's OK to overwrite at least the first
+   2*sizeof(chunk_t) bytes of output even if the copy is shorter than this.
+   This assumption holds because inflate_fast() starts every iteration with at
+   least 258 bytes of output space available (258 being the maximum length
+   output from a single token; see inflate_fast()'s assumptions below). */
+#ifndef HAVE_CHUNKUNROLL
+Z_INTERNAL uint8_t* CHUNKUNROLL(uint8_t *out, unsigned *dist, unsigned *len) {
+    unsigned char const *from = out - *dist;
+    chunk_t chunk;
+    while (*dist < *len && *dist < sizeof(chunk_t)) {
+        loadchunk(from, &chunk);
+        storechunk(out, &chunk);
+        out += *dist;
+        *len -= *dist;
+        *dist += *dist;
+    }
+    return out;
+}
+#endif
+
+#ifndef HAVE_CHUNK_MAG
+/* Loads a magazine to feed into memory of the pattern */
+static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t dist) {
+        /* This code takes string of length dist from "from" and repeats
+         * it for as many times as can fit in a chunk_t (vector register) */
+        uint32_t cpy_dist;
+        uint32_t bytes_remaining = sizeof(chunk_t);
+        chunk_t chunk_load;
+        uint8_t *cur_chunk = (uint8_t *)&chunk_load;
+        while (bytes_remaining) {
+            cpy_dist = MIN(dist, bytes_remaining);
+            memcpy(cur_chunk, buf, cpy_dist);
+            bytes_remaining -= cpy_dist;
+            cur_chunk += cpy_dist;
+            /* This allows us to bypass an expensive integer division since we're effectively
+             * counting in this loop, anyway */
+            *chunk_rem = cpy_dist;
+        }
+
+        return chunk_load;
+}
+#endif
+
+/* Copy DIST bytes from OUT - DIST into OUT + DIST * k, for 0 <= k < LEN/DIST.
+   Return OUT + LEN. */
+Z_INTERNAL uint8_t* CHUNKMEMSET(uint8_t *out, unsigned dist, unsigned len) {
+    /* Debug performance related issues when len < sizeof(uint64_t):
+       Assert(len >= sizeof(uint64_t), "chunkmemset should be called on larger chunks"); */
+    Assert(dist > 0, "chunkmemset cannot have a distance 0");
+    /* Only AVX2 */
+#if CHUNK_SIZE == 32 && defined(X86_SSSE3) && defined(X86_SSE2)
+    if (len <= 16) {
+        return chunkmemset_ssse3(out, dist, len);
+    }
+#endif
+
+    uint8_t *from = out - dist;
+
+    if (dist == 1) {
+        memset(out, *from, len);
+        return out + len;
+    } else if (dist > sizeof(chunk_t)) {
+        return CHUNKCOPY(out, out - dist, len);
+    }
+
+    chunk_t chunk_load;
+    uint32_t chunk_mod = 0;
+
+    /* TODO: possibly build up a permutation table for this if not an even modulus */
+#ifdef HAVE_CHUNKMEMSET_2
+    if (dist == 2) {
+        chunkmemset_2(from, &chunk_load);
+    } else
+#endif
+#ifdef HAVE_CHUNKMEMSET_4
+    if (dist == 4) {
+        chunkmemset_4(from, &chunk_load);
+    } else
+#endif
+#ifdef HAVE_CHUNKMEMSET_8
+    if (dist == 8) {
+        chunkmemset_8(from, &chunk_load);
+    } else if (dist == sizeof(chunk_t)) {
+        loadchunk(from, &chunk_load);
+    } else
+#endif
+    {
+        chunk_load = GET_CHUNK_MAG(from, &chunk_mod, dist);
+    }
+
+    /* If we're lucky enough and dist happens to be an even modulus of our vector length,
+     * we can do two stores per loop iteration, which for most ISAs, especially x86, is beneficial */
+    if (chunk_mod == 0) {
+        while (len >= (2 * sizeof(chunk_t))) {
+            storechunk(out, &chunk_load);
+            storechunk(out + sizeof(chunk_t), &chunk_load);
+            out += 2 * sizeof(chunk_t);
+            len -= 2 * sizeof(chunk_t);
+        }
+    }
+
+    /* If we don't have a "dist" length that divides evenly into a vector
+     * register, we can write the whole vector register but we need only
+     * advance by the amount of the whole string that fits in our chunk_t.
+     * If we do divide evenly into the vector length, adv_amount = chunk_t size*/
+    uint32_t adv_amount = sizeof(chunk_t) - chunk_mod;
+    while (len >= sizeof(chunk_t)) {
+        storechunk(out, &chunk_load);
+        len -= adv_amount;
+        out += adv_amount;
+    }
+
+    if (len) {
+        memcpy(out, &chunk_load, len);
+        out += len;
+    }
+
+    return out;
+}
+
+Z_INTERNAL uint8_t* CHUNKMEMSET_SAFE(uint8_t *out, unsigned dist, unsigned len, unsigned left) {
+#if !defined(UNALIGNED64_OK)
+#  if !defined(UNALIGNED_OK)
+    static const uint32_t align_mask = 7;
+#  else
+    static const uint32_t align_mask = 3;
+#  endif
+#endif
+
+    len = MIN(len, left);
+    uint8_t *from = out - dist;
+#if !defined(UNALIGNED64_OK)
+    while (((uintptr_t)out & align_mask) && (len > 0)) {
+        *out++ = *from++;
+        --len;
+        --left;
+    }
+#endif
+    if (left < (unsigned)(3 * sizeof(chunk_t))) {
+        while (len > 0) {
+            *out++ = *from++;
+            --len;
+        }
+        return out;
+    }
+    if (len)
+        return CHUNKMEMSET(out, dist, len);
+
+    return out;
+}
--- a/deps/zlib-ng/cmake/detect-arch.c
+++ b/deps/zlib-ng/cmake/detect-arch.c
@@ -0,0 +1,111 @@
+// archdetect.c -- Detect compiler architecture and raise preprocessor error
+//                 containing a simple arch identifier.
+// Copyright (C) 2019 Hans Kristian Rosbach
+// Licensed under the Zlib license, see LICENSE.md for details
+
+// x86_64
+#if defined(__x86_64__) || defined(_M_X64)
+    #error archfound x86_64
+
+// x86
+#elif defined(__i386) || defined(_M_IX86)
+    #error archfound i686
+
+// ARM
+#elif defined(__aarch64__) || defined(__arm64__) || defined(_M_ARM64)
+    #error archfound aarch64
+#elif defined(__arm__) || defined(__arm) || defined(_M_ARM) || defined(__TARGET_ARCH_ARM)
+    #if defined(__ARM64_ARCH_8__) || defined(__ARMv8__) || defined(__ARMv8_A__)
+        #error archfound armv8
+    #elif defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__)
+        #error archfound armv7
+    #elif defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6T2__) || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6M__)
+        #error archfound armv6
+    #elif defined(__ARM_ARCH_5T__) || defined(__ARM_ARCH_5TE__) || defined(__ARM_ARCH_5TEJ__)
+        #error archfound armv5
+    #elif defined(__ARM_ARCH_4T__) || defined(__TARGET_ARCH_5E__)
+        #error archfound armv4
+    #elif defined(__ARM_ARCH_3__) || defined(__TARGET_ARCH_3M__)
+        #error archfound armv3
+    #elif defined(__ARM_ARCH_2__)
+        #error archfound armv2
+    #endif
+
+// PowerPC
+#elif defined(__powerpc__) || defined(_ppc__) || defined(__PPC__)
+    #if defined(__64BIT__) || defined(__powerpc64__) || defined(__ppc64__)
+        #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+            #error archfound powerpc64le
+        #else
+            #error archfound powerpc64
+        #endif
+    #else
+        #error archfound powerpc
+    #endif
+
+// --------------- Less common architectures alphabetically below ---------------
+
+// ALPHA
+#elif defined(__alpha__) || defined(__alpha)
+    #error archfound alpha
+
+// Blackfin
+#elif defined(__BFIN__)
+    #error archfound blackfin
+
+// Itanium
+#elif defined(__ia64) || defined(_M_IA64)
+    #error archfound ia64
+
+// MIPS
+#elif defined(__mips__) || defined(__mips)
+    #error archfound mips
+
+// Motorola 68000-series
+#elif defined(__m68k__)
+    #error archfound m68k
+
+// SuperH
+#elif defined(__sh__)
+    #error archfound sh
+
+// SPARC
+#elif defined(__sparc__) || defined(__sparc)
+    #if defined(__sparcv9) || defined(__sparc_v9__)
+        #error archfound sparc9
+    #elif defined(__sparcv8) || defined(__sparc_v8__)
+        #error archfound sparc8
+    #endif
+
+// SystemZ
+#elif defined(__370__)
+    #error archfound s370
+#elif defined(__s390__)
+    #error archfound s390
+#elif defined(__s390x) || defined(__zarch__)
+    #error archfound s390x
+
+// PARISC
+#elif defined(__hppa__)
+    #error archfound parisc
+
+// RS-6000
+#elif defined(__THW_RS6000)
+    #error archfound rs6000
+
+// RISC-V
+#elif defined(__riscv)
+    #if __riscv_xlen == 64
+        #error archfound riscv64
+    #elif __riscv_xlen == 32
+        #error archfound riscv32
+    #endif
+
+// Emscripten (WebAssembly)
+#elif defined(__EMSCRIPTEN__)
+    #error archfound wasm32
+
+// return 'unrecognized' if we do not know what architecture this is
+#else
+    #error archfound unrecognized
+#endif
--- a/deps/zlib-ng/cmake/detect-arch.cmake
+++ b/deps/zlib-ng/cmake/detect-arch.cmake
@@ -0,0 +1,101 @@
+# detect-arch.cmake -- Detect compiler architecture and set ARCH and BASEARCH
+# Copyright (C) 2019 Hans Kristian Rosbach
+# Licensed under the Zlib license, see LICENSE.md for details
+set(ARCHDETECT_FOUND TRUE)
+
+if(CMAKE_OSX_ARCHITECTURES)
+    # If multiple architectures are requested (universal build), pick only the first
+    list(GET CMAKE_OSX_ARCHITECTURES 0 ARCH)
+elseif(MSVC)
+    if("${MSVC_C_ARCHITECTURE_ID}" STREQUAL "X86")
+        set(ARCH "i686")
+    elseif("${MSVC_C_ARCHITECTURE_ID}" STREQUAL "x64")
+        set(ARCH "x86_64")
+    elseif("${MSVC_C_ARCHITECTURE_ID}" STREQUAL "ARM" OR "${MSVC_C_ARCHITECTURE_ID}" STREQUAL "ARMV7")
+        set(ARCH "arm")
+    elseif ("${MSVC_C_ARCHITECTURE_ID}" STREQUAL "ARM64")
+        set(ARCH "aarch64")
+    endif()
+elseif(EMSCRIPTEN)
+    set(ARCH "wasm32")
+elseif(CMAKE_CROSSCOMPILING)
+    set(ARCH ${CMAKE_C_COMPILER_TARGET})
+else()
+    # Let preprocessor parse archdetect.c and raise an error containing the arch identifier
+    enable_language(C)
+    try_run(
+        run_result_unused
+        compile_result_unused
+        ${CMAKE_CURRENT_BINARY_DIR}
+        ${CMAKE_CURRENT_LIST_DIR}/detect-arch.c
+        COMPILE_OUTPUT_VARIABLE RAWOUTPUT
+        CMAKE_FLAGS CMAKE_OSX_ARCHITECTURES=${CMAKE_OSX_ARCHITECTURES}
+    )
+
+    # Find basearch tag, and extract the arch word into BASEARCH variable
+    string(REGEX REPLACE ".*archfound ([a-zA-Z0-9_]+).*" "\\1" ARCH "${RAWOUTPUT}")
+    if(NOT ARCH)
+        set(ARCH unknown)
+    endif()
+endif()
+
+# Make sure we have ARCH set
+if(NOT ARCH OR ARCH STREQUAL "unknown")
+    set(ARCH ${CMAKE_SYSTEM_PROCESSOR})
+    message(STATUS "Arch not recognized, falling back to cmake arch: '${ARCH}'")
+else()
+    message(STATUS "Arch detected: '${ARCH}'")
+endif()
+
+# Base arch detection
+if("${ARCH}" MATCHES "(x86_64|AMD64|i[3-6]86)")
+    set(BASEARCH "x86")
+    set(BASEARCH_X86_FOUND TRUE)
+elseif("${ARCH}" MATCHES "(arm(v[0-9])?|aarch64)")
+    set(BASEARCH "arm")
+    set(BASEARCH_ARM_FOUND TRUE)
+elseif("${ARCH}" MATCHES "ppc(64(le)?)?|powerpc(64(le)?)?")
+    set(BASEARCH "ppc")
+    set(BASEARCH_PPC_FOUND TRUE)
+elseif("${ARCH}" MATCHES "alpha")
+    set(BASEARCH "alpha")
+    set(BASEARCH_ALPHA_FOUND TRUE)
+elseif("${ARCH}" MATCHES "blackfin")
+    set(BASEARCH "blackfin")
+    set(BASEARCH_BLACKFIN_FOUND TRUE)
+elseif("${ARCH}" MATCHES "ia64")
+    set(BASEARCH "ia64")
+    set(BASEARCH_IA64_FOUND TRUE)
+elseif("${ARCH}" MATCHES "mips")
+    set(BASEARCH "mips")
+    set(BASEARCH_MIPS_FOUND TRUE)
+elseif("${ARCH}" MATCHES "m68k")
+    set(BASEARCH "m68k")
+    set(BASEARCH_M68K_FOUND TRUE)
+elseif("${ARCH}" MATCHES "sh")
+    set(BASEARCH "sh")
+    set(BASEARCH_SH_FOUND TRUE)
+elseif("${ARCH}" MATCHES "sparc[89]?")
+    set(BASEARCH "sparc")
+    set(BASEARCH_SPARC_FOUND TRUE)
+elseif("${ARCH}" MATCHES "s3[679]0x?")
+    set(BASEARCH "s360")
+    set(BASEARCH_S360_FOUND TRUE)
+elseif("${ARCH}" MATCHES "parisc")
+    set(BASEARCH "parisc")
+    set(BASEARCH_PARISC_FOUND TRUE)
+elseif("${ARCH}" MATCHES "rs6000")
+    set(BASEARCH "rs6000")
+    set(BASEARCH_RS6000_FOUND TRUE)
+elseif("${ARCH}" MATCHES "riscv(32|64)")
+    set(BASEARCH "riscv")
+    set(BASEARCH_RISCV_FOUND TRUE)
+elseif("${ARCH}" MATCHES "wasm32")
+    set(BASEARCH "wasm32")
+    set(BASEARCH_WASM32_FOUND TRUE)
+else()
+    set(BASEARCH "x86")
+    set(BASEARCH_X86_FOUND TRUE)
+    message(STATUS "Basearch '${ARCH}' not recognized, defaulting to 'x86'.")
+endif()
+message(STATUS "Basearch of '${ARCH}' has been detected as: '${BASEARCH}'")
--- a/deps/zlib-ng/cmake/detect-coverage.cmake
+++ b/deps/zlib-ng/cmake/detect-coverage.cmake
@@ -0,0 +1,46 @@
+# detect-coverage.cmake -- Detect supported compiler coverage flags
+# Licensed under the Zlib license, see LICENSE.md for details
+
+macro(add_code_coverage)
+    # Check for -coverage flag support for Clang/GCC
+    if(CMAKE_VERSION VERSION_LESS 3.14)
+        set(CMAKE_REQUIRED_LIBRARIES -lgcov)
+    else()
+        set(CMAKE_REQUIRED_LINK_OPTIONS -coverage)
+    endif()
+    check_c_compiler_flag(-coverage HAVE_COVERAGE)
+    set(CMAKE_REQUIRED_LIBRARIES)
+    set(CMAKE_REQUIRED_LINK_OPTIONS)
+
+    if(HAVE_COVERAGE)
+        add_compile_options(-coverage)
+        add_link_options(-coverage)
+        message(STATUS "Code coverage enabled using: -coverage")
+    else()
+        # Some versions of GCC don't support -coverage shorthand
+        if(CMAKE_VERSION VERSION_LESS 3.14)
+            set(CMAKE_REQUIRED_LIBRARIES -lgcov)
+        else()
+            set(CMAKE_REQUIRED_LINK_OPTIONS -lgcov -fprofile-arcs)
+        endif()
+        check_c_compiler_flag("-ftest-coverage -fprofile-arcs -fprofile-values" HAVE_TEST_COVERAGE)
+        set(CMAKE_REQUIRED_LIBRARIES)
+        set(CMAKE_REQUIRED_LINK_OPTIONS)
+
+        if(HAVE_TEST_COVERAGE)
+            add_compile_options(-ftest-coverage -fprofile-arcs -fprofile-values)
+            add_link_options(-lgcov -fprofile-arcs)
+            message(STATUS "Code coverage enabled using: -ftest-coverage")
+        else()
+            message(WARNING "Compiler does not support code coverage")
+            set(WITH_CODE_COVERAGE OFF)
+        endif()
+    endif()
+
+    # Set optimization level to zero for code coverage builds
+    if (WITH_CODE_COVERAGE)
+        # Use CMake compiler flag variables due to add_compile_options failure on Windows GCC
+        set(CMAKE_C_FLAGS "-O0 ${CMAKE_C_FLAGS}")
+        set(CMAKE_CXX_FLAGS "-O0 ${CMAKE_CXX_FLAGS}")
+    endif()
+endmacro()
--- a/deps/zlib-ng/cmake/detect-install-dirs.cmake
+++ b/deps/zlib-ng/cmake/detect-install-dirs.cmake
@@ -0,0 +1,43 @@
+# detect-install-dirs.cmake -- Detect install directory parameters
+# Copyright (C) 2021 Hans Kristian Rosbach
+# Licensed under the Zlib license, see LICENSE.md for details
+
+# Determine installation directory for executables
+if (DEFINED BIN_INSTALL_DIR)
+    set(BIN_INSTALL_DIR "${BIN_INSTALL_DIR}" CACHE PATH "Installation directory for executables (Deprecated)" FORCE)
+    set(CMAKE_INSTALL_BINDIR "${BIN_INSTALL_DIR}")
+elseif (DEFINED INSTALL_BIN_DIR)
+    set(CMAKE_INSTALL_BINDIR "${INSTALL_BIN_DIR}")
+endif()
+
+# Determine installation directory for libraries
+if (DEFINED LIB_INSTALL_DIR)
+    set(LIB_INSTALL_DIR "${LIB_INSTALL_DIR}" CACHE PATH "Installation directory for libraries (Deprecated)" FORCE)
+    set(CMAKE_INSTALL_LIBDIR "${LIB_INSTALL_DIR}")
+elseif (DEFINED INSTALL_LIB_DIR)
+    set(CMAKE_INSTALL_LIBDIR "${INSTALL_LIB_DIR}")
+endif()
+
+# Determine installation directory for include files
+if (DEFINED INC_INSTALL_DIR)
+    set(INC_INSTALL_DIR "${INC_INSTALL_DIR}" CACHE PATH "Installation directory for headers (Deprecated)" FORCE)
+    set(CMAKE_INSTALL_INCLUDEDIR "${INC_INSTALL_DIR}")
+elseif (DEFINED INSTALL_INC_DIR)
+    set(CMAKE_INSTALL_INCLUDEDIR "${INSTALL_INC_DIR}")
+endif()
+
+# Define GNU standard installation directories
+include(GNUInstallDirs)
+
+# Determine installation directory for pkgconfig files
+if (DEFINED PKGCONFIG_INSTALL_DIR)
+    set(PKGCONFIG_INSTALL_DIR "${PKGCONFIG_INSTALL_DIR}" CACHE PATH "Installation directory for pkgconfig (.pc) files" FORCE)
+elseif (DEFINED INSTALL_PKGCONFIG_DIR)
+    set(PKGCONFIG_INSTALL_DIR "${INSTALL_PKGCONFIG_DIR}" CACHE PATH "Installation directory for pkgconfig (.pc) files" FORCE)
+elseif (DEFINED CMAKE_INSTALL_PKGCONFIGDIR)
+    set(PKGCONFIG_INSTALL_DIR "${CMAKE_INSTALL_PKGCONFIGDIR}" CACHE PATH "Installation directory for pkgconfig (.pc) files" FORCE)
+elseif (DEFINED CMAKE_INSTALL_FULL_PKGCONFIGDIR)
+    set(PKGCONFIG_INSTALL_DIR "${CMAKE_INSTALL_FULL_PKGCONFIGDIR}" CACHE PATH "Installation directory for pkgconfig (.pc) files" FORCE)
+else()
+    set(PKGCONFIG_INSTALL_DIR "${CMAKE_INSTALL_LIBDIR}/pkgconfig" CACHE PATH "Installation directory for pkgconfig (.pc) files")
+endif()
--- a/deps/zlib-ng/cmake/detect-intrinsics.cmake
+++ b/deps/zlib-ng/cmake/detect-intrinsics.cmake
@@ -0,0 +1,548 @@
+# detect-intrinsics.cmake -- Detect compiler intrinsics support
+# Licensed under the Zlib license, see LICENSE.md for details
+
+macro(check_acle_compiler_flag)
+    if(MSVC)
+        # Both ARM and ARM64-targeting msvc support intrinsics, but
+        # ARM msvc is missing some intrinsics introduced with ARMv8, e.g. crc32
+        if(MSVC_C_ARCHITECTURE_ID STREQUAL "ARM64")
+            set(HAVE_ACLE_FLAG TRUE)
+        endif()
+    else()
+        if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
+            if(NOT NATIVEFLAG AND NOT HAVE_ACLE_FLAG)
+                set(ACLEFLAG "-march=armv8-a+crc" CACHE INTERNAL "Compiler option to enable ACLE support")
+            endif()
+        endif()
+        # Check whether compiler supports ACLE flag
+        set(CMAKE_REQUIRED_FLAGS "${ACLEFLAG} ${NATIVEFLAG}")
+        check_c_source_compiles(
+            "int main() { return 0; }"
+            HAVE_ACLE_FLAG FAIL_REGEX "not supported")
+        if(NOT NATIVEFLAG AND NOT HAVE_ACLE_FLAG)
+            set(ACLEFLAG "-march=armv8-a+crc+simd" CACHE INTERNAL "Compiler option to enable ACLE support" FORCE)
+            # Check whether compiler supports ACLE flag
+            set(CMAKE_REQUIRED_FLAGS "${ACLEFLAG}")
+            check_c_source_compiles(
+                "int main() { return 0; }"
+                HAVE_ACLE_FLAG2 FAIL_REGEX "not supported")
+            set(HAVE_ACLE_FLAG ${HAVE_ACLE_FLAG2} CACHE INTERNAL "Have compiler option to enable ACLE intrinsics" FORCE)
+            unset(HAVE_ACLE_FLAG2 CACHE) # Don't cache this internal variable
+        endif()
+        set(CMAKE_REQUIRED_FLAGS)
+    endif()
+endmacro()
+
+macro(check_avx512_intrinsics)
+    if(CMAKE_C_COMPILER_ID MATCHES "Intel")
+        if(CMAKE_HOST_UNIX OR APPLE)
+            set(AVX512FLAG "-mavx512f -mavx512dq -mavx512bw -mavx512vl")
+        else()
+            set(AVX512FLAG "/arch:AVX512")
+        endif()
+    elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
+        if(NOT NATIVEFLAG)
+            # For CPUs that can benefit from AVX512, it seems GCC generates suboptimal
+            # instruction scheduling unless you specify a reasonable -mtune= target
+            set(AVX512FLAG "-mavx512f -mavx512dq -mavx512bw -mavx512vl")
+            if(NOT CMAKE_GENERATOR_TOOLSET MATCHES "ClangCl")
+                check_c_compiler_flag("-mtune=cascadelake" HAVE_CASCADE_LAKE)
+                if(HAVE_CASCADE_LAKE)
+                    set(AVX512FLAG "${AVX512FLAG} -mtune=cascadelake")
+                else()
+                    set(AVX512FLAG "${AVX512FLAG} -mtune=skylake-avx512")
+                endif()
+                unset(HAVE_CASCADE_LAKE)
+            endif()
+        endif()
+    elseif(MSVC)
+        set(AVX512FLAG "/arch:AVX512")
+    endif()
+    # Check whether compiler supports AVX512 intrinsics
+    set(CMAKE_REQUIRED_FLAGS "${AVX512FLAG} ${NATIVEFLAG}")
+    check_c_source_compile_or_run(
+        "#include <immintrin.h>
+        int main(void) {
+            __m512i x = _mm512_set1_epi8(2);
+            const __m512i y = _mm512_set_epi32(0x1020304, 0x5060708, 0x90a0b0c, 0xd0e0f10,
+                                               0x11121314, 0x15161718, 0x191a1b1c, 0x1d1e1f20,
+                                               0x21222324, 0x25262728, 0x292a2b2c, 0x2d2e2f30,
+                                               0x31323334, 0x35363738, 0x393a3b3c, 0x3d3e3f40);
+            x = _mm512_sub_epi8(x, y);
+            (void)x;
+            return 0;
+        }"
+        HAVE_AVX512_INTRIN
+    )
+
+    # Evidently both GCC and clang were late to implementing these
+    check_c_source_compile_or_run(
+        "#include <immintrin.h>
+        int main(void) {
+            __mmask16 a = 0xFF;
+            a = _knot_mask16(a);
+            (void)a;
+            return 0;
+        }"
+        HAVE_MASK_INTRIN
+    )
+    set(CMAKE_REQUIRED_FLAGS)
+endmacro()
+
+macro(check_avx512vnni_intrinsics)
+    if(CMAKE_C_COMPILER_ID MATCHES "Intel")
+        if(CMAKE_HOST_UNIX OR APPLE)
+            set(AVX512VNNIFLAG "-mavx512f -mavx512bw -mavx512dq -mavx512vl -mavx512vnni")
+        else()
+            set(AVX512VNNIFLAG "/arch:AVX512")
+        endif()
+    elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
+        if(NOT NATIVEFLAG)
+            set(AVX512VNNIFLAG "-mavx512f -mavx512dq -mavx512bw -mavx512vl -mavx512vnni")
+            if(NOT CMAKE_GENERATOR_TOOLSET MATCHES "ClangCl")
+                check_c_compiler_flag("-mtune=cascadelake" HAVE_CASCADE_LAKE)
+                if(HAVE_CASCADE_LAKE)
+                    set(AVX512VNNIFLAG "${AVX512VNNIFLAG} -mtune=cascadelake")
+                else()
+                    set(AVX512VNNIFLAG "${AVX512VNNIFLAG} -mtune=skylake-avx512")
+                endif()
+                unset(HAVE_CASCADE_LAKE)
+            endif()
+        endif()
+    elseif(MSVC)
+        set(AVX512VNNIFLAG "/arch:AVX512")
+    endif()
+
+    # Check whether compiler supports AVX512vnni intrinsics
+    set(CMAKE_REQUIRED_FLAGS "${AVX512VNNIFLAG} ${NATIVEFLAG}")
+    check_c_source_compile_or_run(
+        "#include <immintrin.h>
+        int main(void) {
+            __m512i x = _mm512_set1_epi8(2);
+            const __m512i y = _mm512_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+                                              20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37,
+                                              38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55,
+                                              56, 57, 58, 59, 60, 61, 62, 63, 64);
+            __m512i z = _mm512_setzero_epi32();
+            z = _mm512_dpbusd_epi32(z, x, y);
+            (void)z;
+            return 0;
+        }"
+        HAVE_AVX512VNNI_INTRIN
+    )
+    set(CMAKE_REQUIRED_FLAGS)
+endmacro()
+
+macro(check_avx2_intrinsics)
+    if(CMAKE_C_COMPILER_ID MATCHES "Intel")
+        if(CMAKE_HOST_UNIX OR APPLE)
+            set(AVX2FLAG "-mavx2")
+        else()
+            set(AVX2FLAG "/arch:AVX2")
+        endif()
+    elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
+        if(NOT NATIVEFLAG)
+            set(AVX2FLAG "-mavx2")
+        endif()
+    elseif(MSVC)
+        set(AVX2FLAG "/arch:AVX2")
+    endif()
+    # Check whether compiler supports AVX2 intrinics
+    set(CMAKE_REQUIRED_FLAGS "${AVX2FLAG} ${NATIVEFLAG}")
+    check_c_source_compile_or_run(
+        "#include <immintrin.h>
+        int main(void) {
+            __m256i x = _mm256_set1_epi16(2);
+            const __m256i y = _mm256_set1_epi16(1);
+            x = _mm256_subs_epu16(x, y);
+            (void)x;
+            return 0;
+        }"
+        HAVE_AVX2_INTRIN
+    )
+    set(CMAKE_REQUIRED_FLAGS)
+endmacro()
+
+macro(check_neon_compiler_flag)
+    if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
+        if(NOT NATIVEFLAG)
+            if("${ARCH}" MATCHES "aarch64")
+                set(NEONFLAG "-march=armv8-a+simd")
+            else()
+                set(NEONFLAG "-mfpu=neon")
+            endif()
+        endif()
+    endif()
+    # Check whether compiler supports NEON flag
+    set(CMAKE_REQUIRED_FLAGS "${NEONFLAG} ${NATIVEFLAG}")
+    check_c_source_compiles(
+        "#ifdef _M_ARM64
+        #  include <arm64_neon.h>
+        #else
+        #  include <arm_neon.h>
+        #endif
+        int main() { return 0; }"
+        MFPU_NEON_AVAILABLE FAIL_REGEX "not supported")
+    set(CMAKE_REQUIRED_FLAGS)
+endmacro()
+
+macro(check_neon_ld4_intrinsics)
+    if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
+        if(NOT NATIVEFLAG)
+            if("${ARCH}" MATCHES "aarch64")
+                set(NEONFLAG "-march=armv8-a+simd")
+            else()
+                set(NEONFLAG "-mfpu=neon")
+            endif()
+        endif()
+    endif()
+    # Check whether compiler supports loading 4 neon vecs into a register range
+    set(CMAKE_REQUIRED_FLAGS "${NEONFLAG}")
+    check_c_source_compiles(
+        "#ifdef _M_ARM64
+        #  include <arm64_neon.h>
+        #else
+        #  include <arm_neon.h>
+        #endif
+        int main(void) {
+            int stack_var[16];
+            int32x4x4_t v = vld1q_s32_x4(stack_var);
+            (void)v;
+            return 0;
+        }"
+        NEON_HAS_LD4)
+    set(CMAKE_REQUIRED_FLAGS)
+endmacro()
+
+macro(check_pclmulqdq_intrinsics)
+    if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
+        if(NOT NATIVEFLAG)
+            set(PCLMULFLAG "-mpclmul")
+        endif()
+    endif()
+    # Check whether compiler supports PCLMULQDQ intrinsics
+    if(NOT (APPLE AND "${ARCH}" MATCHES "i386"))
+        # The pclmul code currently crashes on Mac in 32bit mode. Avoid for now.
+        set(CMAKE_REQUIRED_FLAGS "${PCLMULFLAG} ${NATIVEFLAG}")
+        check_c_source_compile_or_run(
+            "#include <immintrin.h>
+            int main(void) {
+                __m128i a = _mm_setzero_si128();
+                __m128i b = _mm_setzero_si128();
+                __m128i c = _mm_clmulepi64_si128(a, b, 0x10);
+                (void)c;
+                return 0;
+            }"
+            HAVE_PCLMULQDQ_INTRIN
+        )
+        set(CMAKE_REQUIRED_FLAGS)
+    else()
+        set(HAVE_PCLMULQDQ_INTRIN OFF)
+    endif()
+endmacro()
+
+macro(check_vpclmulqdq_intrinsics)
+    if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
+        if(NOT NATIVEFLAG)
+            set(VPCLMULFLAG "-mvpclmulqdq -mavx512f")
+        endif()
+    endif()
+    # Check whether compiler supports VPCLMULQDQ intrinsics
+    if(NOT (APPLE AND "${ARCH}" MATCHES "i386"))
+        set(CMAKE_REQUIRED_FLAGS "${VPCLMULFLAG} ${NATIVEFLAG}")
+        check_c_source_compile_or_run(
+            "#include <immintrin.h>
+            int main(void) {
+                __m512i a = _mm512_setzero_si512();
+                __m512i b = _mm512_setzero_si512();
+                __m512i c = _mm512_clmulepi64_epi128(a, b, 0x10);
+                (void)c;
+                return 0;
+            }"
+            HAVE_VPCLMULQDQ_INTRIN
+        )
+        set(CMAKE_REQUIRED_FLAGS)
+    else()
+        set(HAVE_VPCLMULQDQ_INTRIN OFF)
+    endif()
+endmacro()
+
+macro(check_ppc_intrinsics)
+    # Check if compiler supports AltiVec
+    set(CMAKE_REQUIRED_FLAGS "-maltivec")
+    check_c_source_compiles(
+        "#include <altivec.h>
+        int main(void)
+        {
+            vector int a = vec_splats(0);
+            vector int b = vec_splats(0);
+            a = vec_add(a, b);
+            return 0;
+        }"
+        HAVE_ALTIVEC
+        )
+    set(CMAKE_REQUIRED_FLAGS)
+
+    if(HAVE_ALTIVEC)
+        set(PPCFLAGS "-maltivec")
+    endif()
+
+    set(CMAKE_REQUIRED_FLAGS "-maltivec -mno-vsx")
+    check_c_source_compiles(
+        "#include <altivec.h>
+        int main(void)
+        {
+            vector int a = vec_splats(0);
+            vector int b = vec_splats(0);
+            a = vec_add(a, b);
+            return 0;
+        }"
+        HAVE_NOVSX
+        )
+    set(CMAKE_REQUIRED_FLAGS)
+
+    if(HAVE_NOVSX)
+        set(PPCFLAGS "${PPCFLAGS} -mno-vsx")
+    endif()
+
+    # Check if we have what we need for AltiVec optimizations
+    set(CMAKE_REQUIRED_FLAGS "${PPCFLAGS} ${NATIVEFLAG}")
+    check_c_source_compiles(
+        "#include <sys/auxv.h>
+        #ifdef __FreeBSD__
+        #include <machine/cpu.h>
+        #endif
+        int main() {
+        #ifdef __FreeBSD__
+            unsigned long hwcap;
+            elf_aux_info(AT_HWCAP, &hwcap, sizeof(hwcap));
+            return (hwcap & PPC_FEATURE_HAS_ALTIVEC);
+        #else
+            return (getauxval(AT_HWCAP) & PPC_FEATURE_HAS_ALTIVEC);
+        #endif
+        }"
+        HAVE_VMX
+    )
+    set(CMAKE_REQUIRED_FLAGS)
+endmacro()
+
+macro(check_power8_intrinsics)
+    if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
+        if(NOT NATIVEFLAG)
+            set(POWER8FLAG "-mcpu=power8")
+        endif()
+    endif()
+    # Check if we have what we need for POWER8 optimizations
+    set(CMAKE_REQUIRED_FLAGS "${POWER8FLAG} ${NATIVEFLAG}")
+    check_c_source_compiles(
+        "#include <sys/auxv.h>
+        #ifdef __FreeBSD__
+        #include <machine/cpu.h>
+        #endif
+        int main() {
+        #ifdef __FreeBSD__
+            unsigned long hwcap;
+            elf_aux_info(AT_HWCAP2, &hwcap, sizeof(hwcap));
+            return (hwcap & PPC_FEATURE2_ARCH_2_07);
+        #else
+            return (getauxval(AT_HWCAP2) & PPC_FEATURE2_ARCH_2_07);
+        #endif
+        }"
+        HAVE_POWER8_INTRIN
+    )
+    set(CMAKE_REQUIRED_FLAGS)
+endmacro()
+
+macro(check_rvv_intrinsics)
+    if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
+        if(NOT NATIVEFLAG)
+            set(RISCVFLAG "-march=rv64gcv")
+        endif()
+    endif()
+    # Check whether compiler supports RVV
+    set(CMAKE_REQUIRED_FLAGS "${RISCVFLAG} ${NATIVEFLAG}")
+    check_c_source_compiles(
+        "#include <riscv_vector.h>
+        int main() { 
+            return 0; 
+        }" 
+        HAVE_RVV_INTRIN
+    )
+    set(CMAKE_REQUIRED_FLAGS)
+endmacro()
+
+macro(check_s390_intrinsics)
+    check_c_source_compiles(
+        "#include <sys/auxv.h>
+        #ifndef HWCAP_S390_VXRS
+        #define HWCAP_S390_VXRS HWCAP_S390_VX
+        #endif
+        int main() {
+            return (getauxval(AT_HWCAP) & HWCAP_S390_VXRS);
+        }"
+        HAVE_S390_INTRIN
+    )
+endmacro()
+
+macro(check_power9_intrinsics)
+    if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
+        if(NOT NATIVEFLAG)
+            set(POWER9FLAG "-mcpu=power9")
+        endif()
+    endif()
+    # Check if we have what we need for POWER9 optimizations
+    set(CMAKE_REQUIRED_FLAGS "${POWER9FLAG} ${NATIVEFLAG}")
+    check_c_source_compiles(
+        "int main() {
+            return 0;
+        }"
+        HAVE_POWER9_INTRIN
+    )
+    set(CMAKE_REQUIRED_FLAGS)
+endmacro()
+
+macro(check_sse2_intrinsics)
+    if(CMAKE_C_COMPILER_ID MATCHES "Intel")
+        if(CMAKE_HOST_UNIX OR APPLE)
+            set(SSE2FLAG "-msse2")
+        else()
+            set(SSE2FLAG "/arch:SSE2")
+        endif()
+    elseif(MSVC)
+        if(NOT "${ARCH}" MATCHES "x86_64")
+            set(SSE2FLAG "/arch:SSE2")
+        endif()
+    elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
+        if(NOT NATIVEFLAG)
+            set(SSE2FLAG "-msse2")
+        endif()
+    endif()
+    # Check whether compiler supports SSE2 intrinsics
+    set(CMAKE_REQUIRED_FLAGS "${SSE2FLAG} ${NATIVEFLAG}")
+    check_c_source_compile_or_run(
+        "#include <immintrin.h>
+        int main(void) {
+            __m128i zero = _mm_setzero_si128();
+            (void)zero;
+            return 0;
+        }"
+        HAVE_SSE2_INTRIN
+    )
+    set(CMAKE_REQUIRED_FLAGS)
+endmacro()
+
+macro(check_ssse3_intrinsics)
+    if(CMAKE_C_COMPILER_ID MATCHES "Intel")
+        if(CMAKE_HOST_UNIX OR APPLE)
+            set(SSSE3FLAG "-mssse3")
+        else()
+            set(SSSE3FLAG "/arch:SSSE3")
+        endif()
+    elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
+        if(NOT NATIVEFLAG)
+            set(SSSE3FLAG "-mssse3")
+        endif()
+    endif()
+    # Check whether compiler supports SSSE3 intrinsics
+    set(CMAKE_REQUIRED_FLAGS "${SSSE3FLAG} ${NATIVEFLAG}")
+    check_c_source_compile_or_run(
+        "#include <immintrin.h>
+        int main(void) {
+            __m128i u, v, w;
+            u = _mm_set1_epi32(1);
+            v = _mm_set1_epi32(2);
+            w = _mm_hadd_epi32(u, v);
+            (void)w;
+            return 0;
+        }"
+        HAVE_SSSE3_INTRIN
+    )
+endmacro()
+
+macro(check_sse42_intrinsics)
+    if(CMAKE_C_COMPILER_ID MATCHES "Intel")
+        if(CMAKE_HOST_UNIX OR APPLE)
+            set(SSE42FLAG "-msse4.2")
+        else()
+            set(SSE42FLAG "/arch:SSE4.2")
+        endif()
+    elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
+        if(NOT NATIVEFLAG)
+            set(SSE42FLAG "-msse4.2")
+        endif()
+    endif()
+    # Check whether compiler supports SSE4.2 CRC inline asm
+    set(CMAKE_REQUIRED_FLAGS "${SSE42FLAG} ${NATIVEFLAG}")
+    check_c_source_compile_or_run(
+        "int main(void) {
+            unsigned val = 0, h = 0;
+        #if defined(_MSC_VER)
+            { __asm mov edx, h __asm mov eax, val __asm crc32 eax, edx __asm mov h, eax }
+        #else
+            __asm__ __volatile__ ( \"crc32 %1,%0\" : \"+r\" (h) : \"r\" (val) );
+        #endif
+            return (int)h;
+        }"
+        HAVE_SSE42CRC_INLINE_ASM
+    )
+    # Check whether compiler supports SSE4.2 CRC intrinsics
+    check_c_source_compile_or_run(
+        "#include <immintrin.h>
+        int main(void) {
+            unsigned crc = 0;
+            char c = 'c';
+        #if defined(_MSC_VER)
+            crc = _mm_crc32_u32(crc, c);
+        #else
+            crc = __builtin_ia32_crc32qi(crc, c);
+        #endif
+            (void)crc;
+            return 0;
+        }"
+        HAVE_SSE42CRC_INTRIN
+    )
+    set(CMAKE_REQUIRED_FLAGS)
+endmacro()
+
+macro(check_vgfma_intrinsics)
+    if(NOT NATIVEFLAG)
+        set(VGFMAFLAG "-march=z13")
+        if(CMAKE_C_COMPILER_ID MATCHES "GNU")
+            set(VGFMAFLAG "${VGFMAFLAG} -mzarch")
+        endif()
+        if(CMAKE_C_COMPILER_ID MATCHES "Clang")
+            set(VGFMAFLAG "${VGFMAFLAG} -fzvector")
+        endif()
+    endif()
+    # Check whether compiler supports "VECTOR GALOIS FIELD MULTIPLY SUM AND ACCUMULATE" intrinsic
+    set(CMAKE_REQUIRED_FLAGS "${VGFMAFLAG} ${NATIVEFLAG}")
+    check_c_source_compiles(
+        "#include <vecintrin.h>
+        int main(void) {
+            unsigned long long a __attribute__((vector_size(16))) = { 0 };
+            unsigned long long b __attribute__((vector_size(16))) = { 0 };
+            unsigned char c __attribute__((vector_size(16))) = { 0 };
+            c = vec_gfmsum_accum_128(a, b, c);
+            return c[0];
+        }"
+        HAVE_VGFMA_INTRIN FAIL_REGEX "not supported")
+    set(CMAKE_REQUIRED_FLAGS)
+endmacro()
+
+macro(check_xsave_intrinsics)
+    if(NOT NATIVEFLAG AND NOT MSVC)
+        set(XSAVEFLAG "-mxsave")
+    endif()
+    set(CMAKE_REQUIRED_FLAGS "${XSAVEFLAG} ${NATIVEFLAG}")
+    check_c_source_compiles(
+        "#ifdef _WIN32
+         #  include <intrin.h>
+         #else
+         #  include <x86gprintrin.h>
+         #endif
+         int main(void) {
+             return _xgetbv(0);
+         }"
+        HAVE_XSAVE_INTRIN FAIL_REGEX "not supported")
+    set(CMAKE_REQUIRED_FLAGS)
+endmacro()
--- a/deps/zlib-ng/cmake/detect-sanitizer.cmake
+++ b/deps/zlib-ng/cmake/detect-sanitizer.cmake
@@ -0,0 +1,166 @@
+# detect-sanitizer.cmake -- Detect supported compiler sanitizer flags
+# Licensed under the Zlib license, see LICENSE.md for details
+
+macro(add_common_sanitizer_flags)
+    if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
+        add_compile_options(-g3)
+    endif()
+    check_c_compiler_flag(-fno-omit-frame-pointer HAVE_NO_OMIT_FRAME_POINTER)
+    if(HAVE_NO_OMIT_FRAME_POINTER)
+        add_compile_options(-fno-omit-frame-pointer)
+        add_link_options(-fno-omit-frame-pointer)
+    endif()
+    check_c_compiler_flag(-fno-optimize-sibling-calls HAVE_NO_OPTIMIZE_SIBLING_CALLS)
+    if(HAVE_NO_OPTIMIZE_SIBLING_CALLS)
+        add_compile_options(-fno-optimize-sibling-calls)
+        add_link_options(-fno-optimize-sibling-calls)
+    endif()
+endmacro()
+
+macro(check_sanitizer_support known_checks supported_checks)
+    set(available_checks "")
+
+    # Build list of supported sanitizer flags by incrementally trying compilation with
+    # known sanitizer checks
+
+    foreach(check ${known_checks})
+        if(available_checks STREQUAL "")
+            set(compile_checks "${check}")
+        else()
+            set(compile_checks "${available_checks},${check}")
+        endif()
+
+        set(CMAKE_REQUIRED_FLAGS -fsanitize=${compile_checks})
+
+        check_c_source_compiles("int main() { return 0; }" HAVE_SANITIZER_${check}
+            FAIL_REGEX "not supported|unrecognized command|unknown option")
+
+        set(CMAKE_REQUIRED_FLAGS)
+
+        if(HAVE_SANITIZER_${check})
+            set(available_checks ${compile_checks})
+        endif()
+    endforeach()
+
+    set(${supported_checks} ${available_checks})
+endmacro()
+
+macro(add_address_sanitizer)
+    set(known_checks
+        address
+        pointer-compare
+        pointer-subtract
+        )
+
+    check_sanitizer_support("${known_checks}" supported_checks)
+    if(NOT ${supported_checks} STREQUAL "")
+        message(STATUS "Address sanitizer is enabled: ${supported_checks}")
+        add_compile_options(-fsanitize=${supported_checks})
+        add_link_options(-fsanitize=${supported_checks})
+        add_common_sanitizer_flags()
+    else()
+        message(STATUS "Address sanitizer is not supported")
+    endif()
+
+    if(CMAKE_CROSSCOMPILING_EMULATOR)
+        # Only check for leak sanitizer if not cross-compiling due to qemu crash
+        message(WARNING "Leak sanitizer is not supported when cross compiling")
+    else()
+        # Leak sanitizer requires address sanitizer
+        check_sanitizer_support("leak" supported_checks)
+        if(NOT ${supported_checks} STREQUAL "")
+            message(STATUS "Leak sanitizer is enabled: ${supported_checks}")
+            add_compile_options(-fsanitize=${supported_checks})
+            add_link_options(-fsanitize=${supported_checks})
+            add_common_sanitizer_flags()
+        else()
+            message(STATUS "Leak sanitizer is not supported")
+        endif()
+    endif()
+endmacro()
+
+macro(add_memory_sanitizer)
+    check_sanitizer_support("memory" supported_checks)
+    if(NOT ${supported_checks} STREQUAL "")
+        message(STATUS "Memory sanitizer is enabled: ${supported_checks}")
+        add_compile_options(-fsanitize=${supported_checks})
+        add_link_options(-fsanitize=${supported_checks})
+        add_common_sanitizer_flags()
+
+        check_c_compiler_flag(-fsanitize-memory-track-origins HAVE_MEMORY_TRACK_ORIGINS)
+        if(HAVE_MEMORY_TRACK_ORIGINS)
+            add_compile_options(-fsanitize-memory-track-origins)
+            add_link_options(-fsanitize-memory-track-origins)
+        endif()
+    else()
+        message(STATUS "Memory sanitizer is not supported")
+    endif()
+endmacro()
+
+macro(add_thread_sanitizer)
+    check_sanitizer_support("thread" supported_checks)
+    if(NOT ${supported_checks} STREQUAL "")
+        message(STATUS "Thread sanitizer is enabled: ${supported_checks}")
+        add_compile_options(-fsanitize=${supported_checks})
+        add_link_options(-fsanitize=${supported_checks})
+        add_common_sanitizer_flags()
+    else()
+        message(STATUS "Thread sanitizer is not supported")
+    endif()
+endmacro()
+
+macro(add_undefined_sanitizer)
+    set(known_checks
+        array-bounds
+        bool
+        bounds
+        builtin
+        enum
+        float-cast-overflow
+        float-divide-by-zero
+        function
+        integer-divide-by-zero
+        local-bounds
+        null
+        nonnull-attribute
+        pointer-overflow
+        return
+        returns-nonnull-attribute
+        shift
+        shift-base
+        shift-exponent
+        signed-integer-overflow
+        undefined
+        unsigned-integer-overflow
+        unsigned-shift-base
+        vla-bound
+        vptr
+        )
+
+    # Only check for alignment sanitizer flag if unaligned access is not supported
+    if(NOT WITH_UNALIGNED)
+        list(APPEND known_checks alignment)
+    endif()
+    # Object size sanitizer has no effect at -O0 and produces compiler warning if enabled
+    if(NOT CMAKE_C_FLAGS MATCHES "-O0")
+        list(APPEND known_checks object-size)
+    endif()
+
+    check_sanitizer_support("${known_checks}" supported_checks)
+
+    if(NOT ${supported_checks} STREQUAL "")
+        message(STATUS "Undefined behavior sanitizer is enabled: ${supported_checks}")
+        add_compile_options(-fsanitize=${supported_checks})
+        add_link_options(-fsanitize=${supported_checks})
+
+        # Group sanitizer flag -fsanitize=undefined will automatically add alignment, even if
+        # it is not in our sanitize flag list, so we need to explicitly disable alignment sanitizing.
+        if(WITH_UNALIGNED)
+            add_compile_options(-fno-sanitize=alignment)
+        endif()
+
+        add_common_sanitizer_flags()
+    else()
+        message(STATUS "Undefined behavior sanitizer is not supported")
+    endif()
+endmacro()
--- a/deps/zlib-ng/cmake/fallback-macros.cmake
+++ b/deps/zlib-ng/cmake/fallback-macros.cmake
@@ -0,0 +1,19 @@
+# fallback-macros.cmake -- CMake fallback macros
+# Copyright (C) 2022 Nathan Moinvaziri
+# Licensed under the Zlib license, see LICENSE.md for details
+
+# CMake less than version 3.5.2
+if(NOT COMMAND add_compile_options)
+    macro(add_compile_options options)
+        string(APPEND CMAKE_C_FLAGS ${options})
+        string(APPEND CMAKE_CXX_FLAGS ${options})
+    endmacro()
+endif()
+
+# CMake less than version 3.14
+if(NOT COMMAND add_link_options)
+    macro(add_link_options options)
+        string(APPEND CMAKE_EXE_LINKER_FLAGS ${options})
+        string(APPEND CMAKE_SHARED_LINKER_FLAGS ${options})
+    endmacro()
+endif()
--- a/deps/zlib-ng/cmake/toolchain-aarch64.cmake
+++ b/deps/zlib-ng/cmake/toolchain-aarch64.cmake
@@ -0,0 +1,24 @@
+set(CMAKE_SYSTEM_NAME Linux)
+set(CMAKE_SYSTEM_PROCESSOR aarch64)
+set(CMAKE_SYSTEM_VERSION 1)
+
+set(CMAKE_C_COMPILER_TARGET "aarch64-linux-gnu")
+set(CMAKE_CXX_COMPILER_TARGET "aarch64-linux-gnu")
+
+set(CMAKE_CROSSCOMPILING TRUE)
+set(CMAKE_CROSSCOMPILING_EMULATOR qemu-aarch64 -L /usr/${CMAKE_C_COMPILER_TARGET}/)
+
+SET(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
+SET(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
+SET(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
+
+find_program(C_COMPILER_FULL_PATH NAMES ${CMAKE_C_COMPILER_TARGET}-gcc)
+if(NOT C_COMPILER_FULL_PATH)
+    message(FATAL_ERROR "Cross-compiler for ${CMAKE_C_COMPILER_TARGET} not found")
+endif()
+set(CMAKE_C_COMPILER ${C_COMPILER_FULL_PATH})
+
+find_program(CXX_COMPILER_FULL_PATH NAMES g++-${CMAKE_CXX_COMPILER_TARGET} ${CMAKE_CXX_COMPILER_TARGET}-g++)
+if(CXX_COMPILER_FULL_PATH)
+    set(CMAKE_CXX_COMPILER ${CXX_COMPILER_FULL_PATH})
+endif()
--- a/deps/zlib-ng/cmake/toolchain-arm.cmake
+++ b/deps/zlib-ng/cmake/toolchain-arm.cmake
@@ -0,0 +1,29 @@
+set(CMAKE_SYSTEM_NAME Linux)
+set(CMAKE_SYSTEM_PROCESSOR arm)
+set(CMAKE_SYSTEM_VERSION 1)
+
+if(NOT DEFINED CMAKE_C_COMPILER_TARGET)
+    set(CMAKE_C_COMPILER_TARGET arm-linux-gnueabi)
+endif()
+if(NOT DEFINED CMAKE_CXX_COMPILER_TARGET)
+    set(CMAKE_CXX_COMPILER_TARGET arm-linux-gnueabi)
+endif()
+
+set(CMAKE_CROSSCOMPILING TRUE)
+set(CMAKE_CROSSCOMPILING_EMULATOR qemu-arm -L /usr/${CMAKE_C_COMPILER_TARGET}/)
+
+set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
+set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
+set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
+set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY)
+
+find_program(C_COMPILER_FULL_PATH NAMES ${CMAKE_C_COMPILER_TARGET}-gcc)
+if(NOT C_COMPILER_FULL_PATH)
+    message(FATAL_ERROR "Cross-compiler for ${CMAKE_C_COMPILER_TARGET} not found")
+endif()
+set(CMAKE_C_COMPILER ${C_COMPILER_FULL_PATH})
+
+find_program(CXX_COMPILER_FULL_PATH NAMES g++-${CMAKE_CXX_COMPILER_TARGET} ${CMAKE_CXX_COMPILER_TARGET}-g++)
+if(CXX_COMPILER_FULL_PATH)
+    set(CMAKE_CXX_COMPILER ${CXX_COMPILER_FULL_PATH})
+endif()
--- a/deps/zlib-ng/cmake/toolchain-armhf.cmake
+++ b/deps/zlib-ng/cmake/toolchain-armhf.cmake
@@ -0,0 +1,25 @@
+set(CMAKE_SYSTEM_NAME Linux)
+set(CMAKE_SYSTEM_PROCESSOR arm)
+set(CMAKE_SYSTEM_VERSION 1)
+
+set(CMAKE_C_COMPILER_TARGET arm-linux-gnueabihf)
+set(CMAKE_CXX_COMPILER_TARGET arm-linux-gnueabihf)
+
+set(CMAKE_CROSSCOMPILING TRUE)
+set(CMAKE_CROSSCOMPILING_EMULATOR qemu-arm -L /usr/${CMAKE_C_COMPILER_TARGET}/)
+
+set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
+set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
+set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
+set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY)
+
+find_program(C_COMPILER_FULL_PATH NAMES ${CMAKE_C_COMPILER_TARGET}-gcc)
+if(NOT C_COMPILER_FULL_PATH)
+    message(FATAL_ERROR "Cross-compiler for ${CMAKE_C_COMPILER_TARGET} not found")
+endif()
+set(CMAKE_C_COMPILER ${C_COMPILER_FULL_PATH})
+
+find_program(CXX_COMPILER_FULL_PATH NAMES g++-${CMAKE_CXX_COMPILER_TARGET} ${CMAKE_CXX_COMPILER_TARGET}-g++)
+if(CXX_COMPILER_FULL_PATH)
+    set(CMAKE_CXX_COMPILER ${CXX_COMPILER_FULL_PATH})
+endif()
--- a/deps/zlib-ng/cmake/toolchain-mingw-i686.cmake
+++ b/deps/zlib-ng/cmake/toolchain-mingw-i686.cmake
@@ -0,0 +1,35 @@
+set(CMAKE_SYSTEM_NAME Windows)
+
+set(CMAKE_C_COMPILER_TARGET i686-w64-mingw32)
+set(CMAKE_CXX_COMPILER_TARGET i686-w64-mingw32)
+set(CMAKE_RC_COMPILER_TARGET i686-w64-mingw32)
+
+set(CMAKE_CROSSCOMPILING TRUE)
+set(CMAKE_CROSSCOMPILING_EMULATOR wine)
+
+set(CMAKE_FIND_ROOT_PATH /usr/i686-w64-mingw32)
+set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
+set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
+set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
+
+# Prefer posix gcc variant for gtest pthread support
+find_program(C_COMPILER_FULL_PATH NAMES
+    ${CMAKE_C_COMPILER_TARGET}-gcc-posix
+    ${CMAKE_C_COMPILER_TARGET}-gcc)
+if(NOT C_COMPILER_FULL_PATH)
+    message(FATAL_ERROR "Cross-compiler for ${CMAKE_C_COMPILER_TARGET} not found")
+endif()
+set(CMAKE_C_COMPILER ${C_COMPILER_FULL_PATH})
+
+find_program(CXX_COMPILER_FULL_PATH NAMES
+    ${CMAKE_CXX_COMPILER_TARGET}-g++-posix
+    ${CMAKE_CXX_COMPILER_TARGET}-g++)
+if(CXX_COMPILER_FULL_PATH)
+    set(CMAKE_CXX_COMPILER ${CXX_COMPILER_FULL_PATH})
+endif()
+
+find_program(RC_COMPILER_FULL_PATH NAMES
+    ${CMAKE_RC_COMPILER_TARGET}-windres)
+if(RC_COMPILER_FULL_PATH)
+    set(CMAKE_RC_COMPILER ${RC_COMPILER_FULL_PATH})
+endif()
--- a/deps/zlib-ng/cmake/toolchain-mingw-x86_64.cmake
+++ b/deps/zlib-ng/cmake/toolchain-mingw-x86_64.cmake
@@ -0,0 +1,34 @@
+set(CMAKE_SYSTEM_NAME Windows)
+
+set(CMAKE_C_COMPILER_TARGET x86_64-w64-mingw32)
+set(CMAKE_CXX_COMPILER_TARGET x86_64-w64-mingw32)
+set(CMAKE_RC_COMPILER_TARGET x86_64-w64-mingw32)
+
+set(CMAKE_CROSSCOMPILING TRUE)
+set(CMAKE_CROSSCOMPILING_EMULATOR wine)
+
+set(CMAKE_FIND_ROOT_PATH /usr/x86_64-w64-mingw32)
+set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
+set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
+set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
+
+# Prefer posix gcc variant for gtest pthread support
+find_program(C_COMPILER_FULL_PATH NAMES
+    ${CMAKE_C_COMPILER_TARGET}-gcc-posix
+    ${CMAKE_C_COMPILER_TARGET}-gcc)
+if(NOT C_COMPILER_FULL_PATH)
+    message(FATAL_ERROR "Cross-compiler for ${CMAKE_C_COMPILER_TARGET} not found")
+endif()
+set(CMAKE_C_COMPILER ${C_COMPILER_FULL_PATH})
+
+find_program(CXX_COMPILER_FULL_PATH NAMES
+    ${CMAKE_CXX_COMPILER_TARGET}-g++-posix
+    ${CMAKE_CXX_COMPILER_TARGET}-g++)
+if(CXX_COMPILER_FULL_PATH)
+    set(CMAKE_CXX_COMPILER ${CXX_COMPILER_FULL_PATH})
+endif()
+
+find_program(RC_COMPILER_FULL_PATH NAMES ${CMAKE_RC_COMPILER_TARGET}-windres)
+if(RC_COMPILER_FULL_PATH)
+    set(CMAKE_RC_COMPILER ${RC_COMPILER_FULL_PATH})
+endif()
--- a/deps/zlib-ng/cmake/toolchain-mips.cmake
+++ b/deps/zlib-ng/cmake/toolchain-mips.cmake
@@ -0,0 +1,29 @@
+set(CMAKE_SYSTEM_NAME Linux)
+set(CMAKE_SYSTEM_PROCESSOR mips)
+set(CMAKE_SYSTEM_VERSION 1)
+
+if(NOT DEFINED CMAKE_C_COMPILER_TARGET)
+    set(CMAKE_C_COMPILER_TARGET mips-linux-gnu)
+endif()
+if(NOT DEFINED CMAKE_CXX_COMPILER_TARGET)
+    set(CMAKE_CXX_COMPILER_TARGET mips-linux-gnu)
+endif()
+
+set(CMAKE_CROSSCOMPILING TRUE)
+set(CMAKE_CROSSCOMPILING_EMULATOR qemu-mips -L /usr/${CMAKE_C_COMPILER_TARGET}/)
+
+set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
+set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
+set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
+set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY)
+
+find_program(C_COMPILER_FULL_PATH NAMES ${CMAKE_C_COMPILER_TARGET}-gcc)
+if(NOT C_COMPILER_FULL_PATH)
+    message(FATAL_ERROR "Cross-compiler for ${CMAKE_C_COMPILER_TARGET} not found")
+endif()
+set(CMAKE_C_COMPILER ${C_COMPILER_FULL_PATH})
+
+find_program(CXX_COMPILER_FULL_PATH NAMES g++-${CMAKE_CXX_COMPILER_TARGET} ${CMAKE_CXX_COMPILER_TARGET}-g++)
+if(CXX_COMPILER_FULL_PATH)
+    set(CMAKE_CXX_COMPILER ${CXX_COMPILER_FULL_PATH})
+endif()
--- a/Show More
+++ b/Show More