This commit is contained in:
Edward Thomson
2023-06-27 11:45:54 +02:00
parent c9604efc4a
commit e4847c6b03
297 changed files with 89428 additions and 5 deletions

View File

@@ -38,7 +38,7 @@ option(USE_GSSAPI "Link with libgssapi for SPNEGO auth" OFF)
set(USE_HTTP_PARSER "" CACHE STRING "Specifies the HTTP Parser implementation; either system or builtin.")
# set(USE_XDIFF "" CACHE STRING "Specifies the xdiff implementation; either system or builtin.")
set(REGEX_BACKEND "" CACHE STRING "Regular expression implementation. One of regcomp_l, pcre2, pcre, regcomp, or builtin.")
option(USE_BUNDLED_ZLIB "Use the bundled version of zlib. Can be set to one of Bundled(ON)/Chromium. The Chromium option requires a x86_64 processor with SSE4.2 and CLMUL" OFF)
# Declared as a cache STRING rather than a boolean option() so that values
# other than ON/OFF (Bundled, Chromium, zlib-ng) can be supplied.
set(USE_BUNDLED_ZLIB "" CACHE STRING "Use the bundled version of zlib. Can be set to one of Bundled(ON)/Chromium/zlib-ng. The Chromium option requires a x86_64 processor with SSE4.2 and CLMUL")
# Debugging options
option(USE_LEAK_CHECKER "Run tests with leak checker" OFF)

View File

@@ -2,11 +2,11 @@
include(SanitizeBool)
SanitizeBool(USE_BUNDLED_ZLIB)
if(USE_BUNDLED_ZLIB STREQUAL ON)
if(USE_BUNDLED_ZLIB STREQUAL "ON")
set(USE_BUNDLED_ZLIB "Bundled")
endif()
if(USE_BUNDLED_ZLIB STREQUAL "OFF")
if(USE_BUNDLED_ZLIB STREQUAL "OFF" OR USE_BUNDLED_ZLIB STREQUAL "")
find_package(ZLIB)
if(ZLIB_FOUND)
list(APPEND LIBGIT2_SYSTEM_INCLUDES ${ZLIB_INCLUDE_DIRS})
@@ -17,16 +17,26 @@ if(USE_BUNDLED_ZLIB STREQUAL "OFF")
list(APPEND LIBGIT2_PC_REQUIRES "zlib")
endif()
add_feature_info(zlib ON "using system zlib")
elseif(USE_BUNDLED_ZLIB STREQUAL "OFF")
message(FATAL_ERROR "zlib was not found")
else()
message(STATUS "zlib was not found; using bundled 3rd-party sources." )
message(WARNING "zlib was not found; using bundled 3rd-party sources." )
set(USE_BUNDLED_ZLIB "Bundled")
endif()
endif()
if(USE_BUNDLED_ZLIB STREQUAL "Chromium")
add_subdirectory("${PROJECT_SOURCE_DIR}/deps/chromium-zlib" "${PROJECT_BINARY_DIR}/deps/chromium-zlib")
list(APPEND LIBGIT2_DEPENDENCY_INCLUDES "${PROJECT_SOURCE_DIR}/deps/chromium-zlib")
list(APPEND LIBGIT2_DEPENDENCY_OBJECTS $<TARGET_OBJECTS:chromium_zlib>)
add_feature_info(zlib ON "using (Chromium) bundled zlib")
elseif(USE_BUNDLED_ZLIB OR NOT ZLIB_FOUND)
elseif(USE_BUNDLED_ZLIB STREQUAL "zlib-ng")
add_subdirectory("${PROJECT_SOURCE_DIR}/deps/zlib-ng" "${PROJECT_BINARY_DIR}/deps/zlib-ng")
# Put the vendored zlib-ng source directory on the include path. It is listed
# once; appending the identical path a second time adds nothing.
# NOTE(review): if zlib-ng generates its public headers into the build tree,
# "${PROJECT_BINARY_DIR}/deps/zlib-ng" may be what the second entry was meant
# to be -- confirm against deps/zlib-ng/CMakeLists.txt.
list(APPEND LIBGIT2_DEPENDENCY_INCLUDES "${PROJECT_SOURCE_DIR}/deps/zlib-ng")
list(APPEND LIBGIT2_DEPENDENCY_OBJECTS $<TARGET_OBJECTS:zlibstatic>)
add_feature_info(zlib ON "using bundled zlib-ng")
elseif(USE_BUNDLED_ZLIB)
add_subdirectory("${PROJECT_SOURCE_DIR}/deps/zlib" "${PROJECT_BINARY_DIR}/deps/zlib")
list(APPEND LIBGIT2_DEPENDENCY_INCLUDES "${PROJECT_SOURCE_DIR}/deps/zlib")
list(APPEND LIBGIT2_DEPENDENCY_OBJECTS $<TARGET_OBJECTS:zlib>)

1268
deps/zlib-ng/CMakeLists.txt vendored Normal file

File diff suppressed because it is too large Load Diff

374
deps/zlib-ng/FAQ.zlib vendored Normal file
View File

@@ -0,0 +1,374 @@
##
# THIS IS AN UNMAINTAINED COPY OF THE ORIGINAL FILE DISTRIBUTED WITH ZLIB 1.2.11
##
Frequently Asked Questions about zlib
If your question is not there, please check the zlib home page
https://zlib.net/ which may have more recent information.
The latest zlib FAQ is at https://zlib.net/zlib_faq.html
1. Is zlib Y2K-compliant?
Yes. zlib doesn't handle dates.
2. Where can I get a Windows DLL version?
The zlib sources can be compiled without change to produce a DLL. See the
file win32/DLL_FAQ.txt in the zlib distribution. Pointers to the
precompiled DLL are found in the zlib web site at https://zlib.net/ .
3. Where can I get a Visual Basic interface to zlib?
See
* https://marknelson.us/1997/01/01/zlib-engine/
* win32/DLL_FAQ.txt in the zlib distribution
4. compress() returns Z_BUF_ERROR.
Make sure that before the call of compress(), the length of the compressed
buffer is equal to the available size of the compressed buffer and not
zero. For Visual Basic, check that this parameter is passed by reference
("as any"), not by value ("as long").
5. deflate() or inflate() returns Z_BUF_ERROR.
Before making the call, make sure that avail_in and avail_out are not zero.
When setting the parameter flush equal to Z_FINISH, also make sure that
avail_out is big enough to allow processing all pending input. Note that a
Z_BUF_ERROR is not fatal--another call to deflate() or inflate() can be
made with more input or output space. A Z_BUF_ERROR may in fact be
unavoidable depending on how the functions are used, since it is not
possible to tell whether or not there is more output pending when
strm.avail_out returns with zero. See https://zlib.net/zlib_how.html for a
heavily annotated example.
6. Where's the zlib documentation (man pages, etc.)?
It's in zlib.h . Examples of zlib usage are in the files test/example.c
and test/minigzip.c, with more in examples/ .
7. Why don't you use GNU autoconf or libtool or ...?
Because we would like to keep zlib as a very small and simple package.
zlib is rather portable and doesn't need much configuration.
8. I found a bug in zlib.
Most of the time, such problems are due to an incorrect usage of zlib.
Please try to reproduce the problem with a small program and send the
corresponding source to us at zlib@gzip.org . Do not send multi-megabyte
data files without prior agreement.
9. Why do I get "undefined reference to gzputc"?
If "make test" produces something like
example.o(.text+0x154): undefined reference to `gzputc'
check that you don't have old files libz.* in /usr/lib, /usr/local/lib or
/usr/X11R6/lib. Remove any old versions, then do "make install".
10. I need a Delphi interface to zlib.
See the contrib/delphi directory in the zlib distribution.
11. Can zlib handle .zip archives?
Not by itself, no. See the directory contrib/minizip in the zlib
distribution.
12. Can zlib handle .Z files?
No, sorry. You have to spawn an uncompress or gunzip subprocess, or adapt
the code of uncompress on your own.
13. How can I make a Unix shared library?
By default a shared (and a static) library is built for Unix. So:
make distclean
./configure
make
14. How do I install a shared zlib library on Unix?
After the above, then:
make install
However, many flavors of Unix come with a shared zlib already installed.
Before going to the trouble of compiling a shared version of zlib and
trying to install it, you may want to check if it's already there! If you
can #include <zlib.h>, it's there. The -lz option will probably link to
it. You can check the version at the top of zlib.h or with the
ZLIB_VERSION symbol defined in zlib.h .
15. I have a question about OttoPDF.
We are not the authors of OttoPDF. The real author is on the OttoPDF web
site: Joel Hainley, jhainley@myndkryme.com.
16. Can zlib decode Flate data in an Adobe PDF file?
Yes. See https://www.pdflib.com/ . To modify PDF forms, see
https://sourceforge.net/projects/acroformtool/ .
17. Why am I getting this "register_frame_info not found" error on Solaris?
After installing zlib 1.1.4 on Solaris 2.6, running applications using zlib
generates an error such as:
ld.so.1: rpm: fatal: relocation error: file /usr/local/lib/libz.so:
symbol __register_frame_info: referenced symbol not found
The symbol __register_frame_info is not part of zlib, it is generated by
the C compiler (cc or gcc). You must recompile applications using zlib
which have this problem. This problem is specific to Solaris. See
http://www.sunfreeware.com/ for Solaris versions of zlib and applications
using zlib.
18. Why does gzip give an error on a file I make with compress/deflate?
The compress and deflate functions produce data in the zlib format, which
is different and incompatible with the gzip format. The gz* functions in
zlib on the other hand use the gzip format. Both the zlib and gzip formats
use the same compressed data format internally, but have different headers
and trailers around the compressed data.
19. Ok, so why are there two different formats?
The gzip format was designed to retain the directory information about a
single file, such as the name and last modification date. The zlib format
on the other hand was designed for in-memory and communication channel
applications, and has a much more compact header and trailer and uses a
faster integrity check than gzip.
20. Well that's nice, but how do I make a gzip file in memory?
You can request that deflate write the gzip format instead of the zlib
format using deflateInit2(). You can also request that inflate decode the
gzip format using inflateInit2(). Read zlib.h for more details.
21. Is zlib thread-safe?
Yes. However any library routines that zlib uses and any application-
provided memory allocation routines must also be thread-safe. zlib's gz*
functions use stdio library routines, and most of zlib's functions use the
library memory allocation routines by default. zlib's *Init* functions
allow for the application to provide custom memory allocation routines.
Of course, you should only operate on any given zlib or gzip stream from a
single thread at a time.
22. Can I use zlib in my commercial application?
Yes. Please read the license in zlib.h.
23. Is zlib under the GNU license?
No. Please read the license in zlib.h.
24. The license says that altered source versions must be "plainly marked". So
what exactly do I need to do to meet that requirement?
You need to change the ZLIB_VERSION and ZLIB_VERNUM #defines in zlib.h. In
particular, the final version number needs to be changed to "f", and an
identification string should be appended to ZLIB_VERSION. Version numbers
x.x.x.f are reserved for modifications to zlib by others than the zlib
maintainers. For example, if the version of the base zlib you are altering
is "1.2.3.4", then in zlib.h you should change ZLIB_VERNUM to 0x123f, and
ZLIB_VERSION to something like "1.2.3.f-zachary-mods-v3". You can also
update the version strings in deflate.c and inftrees.c.
For altered source distributions, you should also note the origin and
nature of the changes in zlib.h, as well as in ChangeLog and README, along
with the dates of the alterations. The origin should include at least your
name (or your company's name), and an email address to contact for help or
issues with the library.
Note that distributing a compiled zlib library along with zlib.h and
zconf.h is also a source distribution, and so you should change
ZLIB_VERSION and ZLIB_VERNUM and note the origin and nature of the changes
in zlib.h as you would for a full source distribution.
25. Will zlib work on a big-endian or little-endian architecture, and can I
exchange compressed data between them?
Yes and yes.
26. Will zlib work on a 64-bit machine?
Yes. It has been tested on 64-bit machines, and has no dependence on any
data types being limited to 32-bits in length. If you have any
difficulties, please provide a complete problem report to zlib@gzip.org
27. Will zlib decompress data from the PKWare Data Compression Library?
No. The PKWare DCL uses a completely different compressed data format than
does PKZIP and zlib. However, you can look in zlib's contrib/blast
directory for a possible solution to your problem.
28. Can I access data randomly in a compressed stream?
No, not without some preparation. If when compressing you periodically use
Z_FULL_FLUSH, carefully write all the pending data at those points, and
keep an index of those locations, then you can start decompression at those
points. You have to be careful to not use Z_FULL_FLUSH too often, since it
can significantly degrade compression. Alternatively, you can scan a
deflate stream once to generate an index, and then use that index for
random access. See examples/zran.c .
29. Does zlib work on MVS, OS/390, CICS, etc.?
It has in the past, but we have not heard of any recent evidence. There
were working ports of zlib 1.1.4 to MVS, but those links no longer work.
If you know of recent, successful applications of zlib on these operating
systems, please let us know. Thanks.
30. Is there some simpler, easier to read version of inflate I can look at to
understand the deflate format?
First off, you should read RFC 1951. Second, yes. Look in zlib's
contrib/puff directory.
31. Does zlib infringe on any patents?
As far as we know, no. In fact, that was originally the whole point behind
zlib. Look here for some more information:
https://www.gzip.org/#faq11
32. Can zlib work with greater than 4 GB of data?
Yes. inflate() and deflate() will process any amount of data correctly.
Each call of inflate() or deflate() is limited to input and output chunks
of the maximum value that can be stored in the compiler's "unsigned int"
type, but there is no limit to the number of chunks. Note however that the
strm.total_in and strm.total_out counters may be limited to 4 GB. These
counters are provided as a convenience and are not used internally by
inflate() or deflate(). The application can easily set up its own counters
updated after each call of inflate() or deflate() to count beyond 4 GB.
compress() and uncompress() may be limited to 4 GB, since they operate in a
single call. gzseek() and gztell() may be limited to 4 GB depending on how
zlib is compiled. See the zlibCompileFlags() function in zlib.h.
The word "may" appears several times above since there is a 4 GB limit only
if the compiler's "long" type is 32 bits. If the compiler's "long" type is
64 bits, then the limit is 16 exabytes.
33. Does zlib have any security vulnerabilities?
The only one that we are aware of is potentially in gzprintf(). If zlib is
compiled to use sprintf() or vsprintf(), then there is no protection
against a buffer overflow of an 8K string space (or other value as set by
gzbuffer()), other than the caller of gzprintf() assuring that the output
will not exceed 8K. On the other hand, if zlib is compiled to use
snprintf() or vsnprintf(), which should normally be the case, then there is
no vulnerability. The ./configure script will display warnings if an
insecure variation of sprintf() will be used by gzprintf(). Also the
zlibCompileFlags() function will return information on what variant of
sprintf() is used by gzprintf().
If you don't have snprintf() or vsnprintf() and would like one, you can
find a portable implementation here:
https://www.ijs.si/software/snprintf/
Note that you should be using the most recent version of zlib. Versions
1.1.3 and before were subject to a double-free vulnerability, and versions
1.2.1 and 1.2.2 were subject to an access exception when decompressing
invalid compressed data.
34. Is there a Java version of zlib?
Probably what you want is to use zlib in Java. zlib is already included
as part of the Java SDK in the java.util.zip package. If you really want
a version of zlib written in the Java language, look on the zlib home
page for links: https://zlib.net/ .
35. I get this or that compiler or source-code scanner warning when I crank it
up to maximally-pedantic. Can't you guys write proper code?
Many years ago, we gave up attempting to avoid warnings on every compiler
in the universe. It just got to be a waste of time, and some compilers
were downright silly as well as contradicted each other. So now, we simply
make sure that the code always works.
36. Valgrind (or some similar memory access checker) says that deflate is
performing a conditional jump that depends on an uninitialized value.
Isn't that a bug?
No. That is intentional for performance reasons, and the output of deflate
is not affected. This only started showing up recently since zlib 1.2.x
uses malloc() by default for allocations, whereas earlier versions used
calloc(), which zeros out the allocated memory. Even though the code was
correct, versions 1.2.4 and later were changed to not stimulate these
checkers.
37. Will zlib read the (insert any ancient or arcane format here) compressed
data format?
Probably not. Look in the comp.compression FAQ for pointers to various
formats and associated software.
38. How can I encrypt/decrypt zip files with zlib?
zlib doesn't support encryption. The original PKZIP encryption is very
weak and can be broken with freely available programs. To get strong
encryption, use GnuPG, https://www.gnupg.org/ , which already includes zlib
compression. For PKZIP compatible "encryption", look at
http://infozip.sourceforge.net/
39. What's the difference between the "gzip" and "deflate" HTTP 1.1 encodings?
"gzip" is the gzip format, and "deflate" is the zlib format. They should
probably have called the second one "zlib" instead to avoid confusion with
the raw deflate compressed data format. While the HTTP 1.1 RFC 2616
correctly points to the zlib specification in RFC 1950 for the "deflate"
transfer encoding, there have been reports of servers and browsers that
incorrectly produce or expect raw deflate data per the deflate
specification in RFC 1951, most notably Microsoft. So even though the
"deflate" transfer encoding using the zlib format would be the more
efficient approach (and in fact exactly what the zlib format was designed
for), using the "gzip" transfer encoding is probably more reliable due to
an unfortunate choice of name on the part of the HTTP 1.1 authors.
Bottom line: use the gzip format for HTTP 1.1 encoding.
40. Does zlib support the new "Deflate64" format introduced by PKWare?
No. PKWare has apparently decided to keep that format proprietary, since
they have not documented it as they have previous compression formats. In
any case, the compression improvements are so modest compared to other more
modern approaches, that it's not worth the effort to implement.
41. I'm having a problem with the zip functions in zlib, can you help?
There are no zip functions in zlib. You are probably using minizip by
Gilles Vollant, which is found in the contrib directory of zlib. It is not
part of zlib. In fact none of the stuff in contrib is part of zlib. The
files in there are not supported by the zlib authors. You need to contact
the authors of the respective contribution for help.
42. The match.asm code in contrib is under the GNU General Public License.
Since it's part of zlib, doesn't that mean that all of zlib falls under the
GNU GPL?
No. The files in contrib are not part of zlib. They were contributed by
other authors and are provided as a convenience to the user within the zlib
distribution. Each item in contrib has its own license.
43. Is zlib subject to export controls? What is its ECCN?
zlib is not subject to export controls, and so is classified as EAR99.
44. Can you please sign these lengthy legal documents and fax them back to us
so that we can use your software in our product?
No. Go away. Shoo.

36
deps/zlib-ng/INDEX.md vendored Normal file
View File

@@ -0,0 +1,36 @@
Contents
--------
| Name | Description |
|:-----------------|:---------------------------------------------------------------|
| arch/ | Architecture-specific code |
| doc/ | Documentation for formats and algorithms |
| test/example.c | Zlib usage examples for build testing |
| test/minigzip.c | Minimal gzip-like functionality for build testing |
| test/infcover.c | Inflate code coverage for build testing |
| win32/ | Shared library version resources for Windows |
| CMakeLists.txt | CMake build script |
| configure | Bash configure/build script |
| adler32.c | Compute the Adler-32 checksum of a data stream |
| chunkset.* | Inline functions to copy small data chunks |
| compress.c | Compress a memory buffer |
| deflate.* | Compress data using the deflate algorithm |
| deflate_fast.c | Compress data using the deflate algorithm with fast strategy |
| deflate_medium.c | Compress data using the deflate algorithm with medium strategy |
| deflate_slow.c | Compress data using the deflate algorithm with slow strategy |
| functable.* | Struct containing function pointers to optimized functions |
| gzguts.h | Internal definitions for gzip operations |
| gzlib.c | Functions common to reading and writing gzip files |
| gzread.c | Read gzip files |
| gzwrite.c | Write gzip files |
| infback.* | Inflate using a callback interface |
| inflate.* | Decompress data |
| inffast.* | Decompress data with speed optimizations |
| inffixed_tbl.h | Table for decoding fixed codes |
| inftrees.h | Generate Huffman trees for efficient decoding |
| trees.* | Output deflated data using Huffman coding |
| uncompr.c | Decompress a memory buffer |
| zconf.h.cmakein | zconf.h template for cmake |
| zendian.h | BYTE_ORDER for endian tests |
| zlib.map | Linux symbol information |
| zlib.pc.in | Pkg-config template |

19
deps/zlib-ng/LICENSE.md vendored Normal file
View File

@@ -0,0 +1,19 @@
(C) 1995-2013 Jean-loup Gailly and Mark Adler
This software is provided 'as-is', without any express or implied
warranty. In no event will the authors be held liable for any damages
arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it
freely, subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not
claim that you wrote the original software. If you use this software
in a product, an acknowledgment in the product documentation would be
appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be
misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.

395
deps/zlib-ng/Makefile.in vendored Normal file
View File

@@ -0,0 +1,395 @@
# Makefile for zlib
# Copyright (C) 1995-2016 Jean-loup Gailly, Mark Adler
# For conditions of distribution and use, see copyright notice in zlib.h
# To compile and test, type:
# ./configure; make test
# Normally configure builds both a static and a shared library.
# If you want to build just a static library, use: ./configure --static
# To install /usr/local/lib/libz.* and /usr/local/include/zlib.h, type:
# make install
# To install in $HOME instead of /usr/local, use:
# make install prefix=$HOME
CC=cc
CFLAGS=-O
#CFLAGS=-O -DMAX_WBITS=14 -DMAX_MEM_LEVEL=7
#CFLAGS=-g -DZLIB_DEBUG
#CFLAGS=-O3 -Wall -Wwrite-strings -Wpointer-arith -Wconversion \
# -Wstrict-prototypes -Wmissing-prototypes
SFLAGS=-O
LDFLAGS=-L.
LIBNAME1=libz-ng
LIBNAME2=zlib-ng
SUFFIX=-ng
TEST_LIBS=$(LIBNAME1).a
LDSHARED=$(CC)
LDSHAREDFLAGS=-shared
VER=2.1.2
VER1=2
STATICLIB=$(LIBNAME1).a
SHAREDLIB=$(LIBNAME1).so
SHAREDLIBV=$(LIBNAME1).so.$(VER)
SHAREDLIBM=$(LIBNAME1).so.$(VER1)
IMPORTLIB=
SHAREDTARGET=$(LIBNAME1).so.$(VER)
PKGFILE=$(LIBNAME2).pc
LIBS=$(STATICLIB) $(SHAREDTARGET)
AR=ar
ARFLAGS=rc
DEFFILE=
RC=
RCFLAGS=
RCOBJS=
STRIP=
RANLIB=ranlib
LDCONFIG=ldconfig
LDSHAREDLIBC=
EXE=
SRCDIR=.
INCLUDES=-I$(SRCDIR)
BUILDDIR=.
ARCHDIR=arch/generic
ARCH_STATIC_OBJS=
ARCH_SHARED_OBJS=
prefix = /usr/local
exec_prefix = ${prefix}
bindir = ${exec_prefix}/bin
libdir = ${exec_prefix}/lib
sharedlibdir = ${libdir}
includedir = ${prefix}/include
mandir = ${prefix}/share/man
man3dir = ${mandir}/man3
pkgconfigdir = ${libdir}/pkgconfig
OBJZ = \
adler32.o \
adler32_fold.o \
chunkset.o \
compare256.o \
compress.o \
cpu_features.o \
crc32_braid.o \
crc32_braid_comb.o \
crc32_fold.o \
deflate.o \
deflate_fast.o \
deflate_huff.o \
deflate_medium.o \
deflate_quick.o \
deflate_rle.o \
deflate_slow.o \
deflate_stored.o \
functable.o \
infback.o \
inflate.o \
inftrees.o \
insert_string.o \
insert_string_roll.o \
slide_hash.o \
trees.o \
uncompr.o \
zutil.o \
$(ARCH_STATIC_OBJS)
OBJG = \
gzlib.o \
gzread.o \
gzwrite.o
TESTOBJG =
OBJC = $(OBJZ) $(OBJG)
PIC_OBJZ = \
adler32.lo \
adler32_fold.lo \
chunkset.lo \
compare256.lo \
compress.lo \
cpu_features.lo \
crc32_braid.lo \
crc32_braid_comb.lo \
crc32_fold.lo \
deflate.lo \
deflate_fast.lo \
deflate_huff.lo \
deflate_medium.lo \
deflate_quick.lo \
deflate_rle.lo \
deflate_slow.lo \
deflate_stored.lo \
functable.lo \
infback.lo \
inflate.lo \
inftrees.lo \
insert_string.lo \
insert_string_roll.lo \
slide_hash.lo \
trees.lo \
uncompr.lo \
zutil.lo \
$(ARCH_SHARED_OBJS)
PIC_OBJG = \
gzlib.lo \
gzread.lo \
gzwrite.lo
PIC_TESTOBJG =
PIC_OBJC = $(PIC_OBJZ) $(PIC_OBJG)
OBJS = $(OBJC)
PIC_OBJS = $(PIC_OBJC)
all: static shared
static: example$(EXE) minigzip$(EXE) makefixed$(EXE) maketrees$(EXE) makecrct$(EXE)
shared: examplesh$(EXE) minigzipsh$(EXE)
check: test
.SECONDARY:
$(ARCHDIR)/%.o: $(SRCDIR)/$(ARCHDIR)/%.c
$(MAKE) -C $(ARCHDIR) $(notdir $@)
$(ARCHDIR)/%.lo: $(SRCDIR)/$(ARCHDIR)/%.c
$(MAKE) -C $(ARCHDIR) $(notdir $@)
%.o: $(ARCHDIR)/%.o
-cp $< $@
%.lo: $(ARCHDIR)/%.lo
-cp $< $@
test: all
$(MAKE) -C test
infcover.o: $(SRCDIR)/test/infcover.c zlib$(SUFFIX).h zconf$(SUFFIX).h zlib_name_mangling$(SUFFIX).h
$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/test/infcover.c
infcover$(EXE): infcover.o $(STATICLIB)
$(CC) $(CFLAGS) $(LDFLAGS) -o $@ infcover.o $(STATICLIB)
ifneq ($(STRIP),)
$(STRIP) $@
endif
cover: infcover$(EXE)
rm -f *.gcda
./infcover
gcov inf*.c
$(STATICLIB): $(OBJS)
$(AR) $(ARFLAGS) $@ $(OBJS)
-@ ($(RANLIB) $@ || true) >/dev/null 2>&1
example.o:
$(CC) $(CFLAGS) -DWITH_GZFILEOP $(INCLUDES) -c -o $@ $(SRCDIR)/test/example.c
minigzip.o:
$(CC) $(CFLAGS) -DWITH_GZFILEOP $(INCLUDES) -c -o $@ $(SRCDIR)/test/minigzip.c
makefixed.o:
$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/tools/makefixed.c
maketrees.o:
$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/tools/maketrees.c
makecrct.o:
$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/tools/makecrct.c
zlibrc.o: $(SRCDIR)/win32/zlib$(SUFFIX)1.rc
$(RC) $(RCFLAGS) -o $@ $(SRCDIR)/win32/zlib$(SUFFIX)1.rc
.SUFFIXES: .lo
%.o: $(SRCDIR)/%.c
$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $<
%.lo: $(SRCDIR)/%.c
$(CC) $(SFLAGS) -DPIC $(INCLUDES) -c -o $@ $<
gzlib.o: $(SRCDIR)/gzlib.c
$(CC) $(CFLAGS) -DWITH_GZFILEOP $(INCLUDES) -c -o $@ $<
gzlib.lo: $(SRCDIR)/gzlib.c
$(CC) $(SFLAGS) -DPIC -DWITH_GZFILEOP $(INCLUDES) -c -o $@ $<
gzread.o: gzread.c
$(CC) $(CFLAGS) -DWITH_GZFILEOP $(INCLUDES) -c -o $@ $<
gzread.lo: gzread.c
$(CC) $(SFLAGS) -DPIC -DWITH_GZFILEOP $(INCLUDES) -c -o $@ $<
gzwrite.o: $(SRCDIR)/gzwrite.c
$(CC) $(CFLAGS) -DWITH_GZFILEOP $(INCLUDES) -c -o $@ $<
gzwrite.lo: $(SRCDIR)/gzwrite.c
$(CC) $(SFLAGS) -DPIC -DWITH_GZFILEOP $(INCLUDES) -c -o $@ $<
$(SHAREDTARGET): $(PIC_OBJS) $(DEFFILE) $(RCOBJS)
ifneq ($(SHAREDTARGET),)
$(LDSHARED) $(CFLAGS) $(LDSHAREDFLAGS) $(LDFLAGS) -o $@ $(DEFFILE) $(PIC_OBJS) $(RCOBJS) $(LDSHAREDLIBC)
ifneq ($(STRIP),)
$(STRIP) $@
endif
ifneq ($(SHAREDLIB),$(SHAREDTARGET))
rm -f $(SHAREDLIB) $(SHAREDLIBM)
ln -s $@ $(SHAREDLIB)
ln -s $@ $(SHAREDLIBM)
endif
endif
example$(EXE): example.o $(TESTOBJG) $(STATICLIB)
$(CC) $(CFLAGS) $(LDFLAGS) -o $@ example.o $(TESTOBJG) $(TEST_LIBS) $(LDSHAREDLIBC)
ifneq ($(STRIP),)
$(STRIP) $@
endif
minigzip$(EXE): minigzip.o $(TESTOBJG) $(STATICLIB)
$(CC) $(CFLAGS) $(LDFLAGS) -o $@ minigzip.o $(TESTOBJG) $(TEST_LIBS) $(LDSHAREDLIBC)
ifneq ($(STRIP),)
$(STRIP) $@
endif
minigzipsh$(EXE): minigzip.o $(PIC_TESTOBJG) $(SHAREDTARGET)
$(CC) $(CFLAGS) $(LDFLAGS) -o $@ minigzip.o $(PIC_TESTOBJG) $(SHAREDLIB) $(LDSHAREDLIBC)
ifneq ($(STRIP),)
$(STRIP) $@
endif
examplesh$(EXE): example.o $(PIC_TESTOBJG) $(SHAREDTARGET)
$(CC) $(CFLAGS) $(LDFLAGS) -o $@ example.o $(PIC_TESTOBJG) $(SHAREDLIB) $(LDSHAREDLIBC)
ifneq ($(STRIP),)
$(STRIP) $@
endif
makefixed$(EXE): makefixed.o $(STATICLIB)
$(CC) $(CFLAGS) $(LDFLAGS) -o $@ makefixed.o $(TEST_LIBS) $(LDSHAREDLIBC)
ifneq ($(STRIP),)
$(STRIP) $@
endif
maketrees$(EXE): maketrees.o $(STATICLIB)
$(CC) $(CFLAGS) $(LDFLAGS) -o $@ maketrees.o $(TEST_LIBS) $(LDSHAREDLIBC)
ifneq ($(STRIP),)
$(STRIP) $@
endif
makecrct$(EXE): makecrct.o $(STATICLIB)
$(CC) $(CFLAGS) $(LDFLAGS) -o $@ makecrct.o $(TEST_LIBS) $(LDSHAREDLIBC)
ifneq ($(STRIP),)
$(STRIP) $@
endif
install-shared: $(SHAREDTARGET)
ifneq ($(SHAREDTARGET),)
-@if [ ! -d $(DESTDIR)$(sharedlibdir) ]; then mkdir -p $(DESTDIR)$(sharedlibdir); fi
rm -f $(DESTDIR)$(sharedlibdir)/$(SHAREDTARGET)
cp $(SHAREDTARGET) $(DESTDIR)$(sharedlibdir)
chmod 755 $(DESTDIR)$(sharedlibdir)/$(SHAREDTARGET)
ifneq ($(SHAREDLIB),$(SHAREDTARGET))
rm -f $(DESTDIR)$(sharedlibdir)/$(SHAREDLIB) $(DESTDIR)$(sharedlibdir)/$(SHAREDLIBM)
ln -s $(SHAREDLIBV) $(DESTDIR)$(sharedlibdir)/$(SHAREDLIB)
ln -s $(SHAREDLIBV) $(DESTDIR)$(sharedlibdir)/$(SHAREDLIBM)
($(LDCONFIG) || true) >/dev/null 2>&1
# ldconfig is for Linux
endif
ifneq ($(IMPORTLIB),)
cp $(IMPORTLIB) $(DESTDIR)$(sharedlibdir)
chmod 644 $(DESTDIR)$(sharedlibdir)/$(IMPORTLIB)
endif
endif
install-static: $(STATICLIB)
-@if [ ! -d $(DESTDIR)$(libdir) ]; then mkdir -p $(DESTDIR)$(libdir); fi
rm -f $(DESTDIR)$(libdir)/$(STATICLIB)
cp $(STATICLIB) $(DESTDIR)$(libdir)
chmod 644 $(DESTDIR)$(libdir)/$(STATICLIB)
-@($(RANLIB) $(DESTDIR)$(libdir)/$(STATICLIB) || true) >/dev/null 2>&1
# The ranlib in install-static is needed on NeXTSTEP which checks file times
install-libs: install-shared install-static
-@if [ ! -d $(DESTDIR)$(man3dir) ]; then mkdir -p $(DESTDIR)$(man3dir); fi
-@if [ ! -d $(DESTDIR)$(pkgconfigdir) ]; then mkdir -p $(DESTDIR)$(pkgconfigdir); fi
rm -f $(DESTDIR)$(pkgconfigdir)/$(PKGFILE)
cp $(PKGFILE) $(DESTDIR)$(pkgconfigdir)
chmod 644 $(DESTDIR)$(pkgconfigdir)/$(PKGFILE)
install: install-libs
-@if [ ! -d $(DESTDIR)$(includedir) ]; then mkdir -p $(DESTDIR)$(includedir); fi
rm -f $(DESTDIR)$(includedir)/zlib$(SUFFIX).h $(DESTDIR)$(includedir)/zconf$(SUFFIX).h $(DESTDIR)$(includedir)/zlib_name_mangling$(SUFFIX).h
cp zlib$(SUFFIX).h $(DESTDIR)$(includedir)/zlib$(SUFFIX).h
cp zconf$(SUFFIX).h $(DESTDIR)$(includedir)/zconf$(SUFFIX).h
cp zlib_name_mangling$(SUFFIX).h $(DESTDIR)$(includedir)/zlib_name_mangling$(SUFFIX).h
chmod 644 $(DESTDIR)$(includedir)/zlib$(SUFFIX).h $(DESTDIR)$(includedir)/zconf$(SUFFIX).h $(DESTDIR)$(includedir)/zlib_name_mangling$(SUFFIX).h
uninstall-static:
cd $(DESTDIR)$(libdir) && rm -f $(STATICLIB)
uninstall-shared:
ifneq ($(SHAREDLIB),)
cd $(DESTDIR)$(sharedlibdir) && rm -f $(SHAREDLIBV) $(SHAREDLIB) $(SHAREDLIBM)
endif
ifneq ($(IMPORTLIB),)
cd $(DESTDIR)$(sharedlibdir) && rm -f $(IMPORTLIB)
endif
uninstall: uninstall-static uninstall-shared
cd $(DESTDIR)$(includedir) && rm -f zlib$(SUFFIX).h zconf$(SUFFIX).h zlib_name_mangling$(SUFFIX).h
cd $(DESTDIR)$(pkgconfigdir) && rm -f $(PKGFILE)
mostlyclean: clean
clean:
@if [ -f $(ARCHDIR)/Makefile ]; then $(MAKE) -C $(ARCHDIR) clean; fi
@if [ -f test/Makefile ]; then $(MAKE) -C test clean; fi
rm -f *.o *.lo *~ \
example$(EXE) minigzip$(EXE) minigzipsh$(EXE) \
infcover makefixed$(EXE) maketrees$(EXE) makecrct$(EXE) \
$(STATICLIB) $(IMPORTLIB) $(SHAREDLIB) $(SHAREDLIBV) $(SHAREDLIBM) \
foo.gz so_locations \
_match.s maketree
rm -rf objs
rm -f *.gcda *.gcno *.gcov
rm -f a.out a.exe
rm -f *._h
rm -rf btmp1 btmp2 pkgtmp1 pkgtmp2
maintainer-clean: distclean
distclean: clean
@if [ -f $(ARCHDIR)/Makefile ]; then $(MAKE) -C $(ARCHDIR) distclean; fi
@if [ -f test/Makefile ]; then $(MAKE) -C test distclean; fi
# Remove the pkg-config file, the configure log and the generated headers.
# The name-mangling header is named zlib_name_mangling$(SUFFIX).h (see the
# install rule); a stray "}" after $(SUFFIX) previously made this entry match
# a non-existent file, so the header was left behind.
rm -f $(PKGFILE) configure.log zconf.h zconf.h.cmakein zlib$(SUFFIX).h zlib_name_mangling$(SUFFIX).h *.pc
-@rm -f .DS_Store
# Reset Makefile if building inside source tree
@if [ -f Makefile.in ]; then \
printf 'all:\n\t-@echo "Please use ./configure first. Thank you."\n' > Makefile ; \
printf '\ndistclean:\n\t$(MAKE) -f Makefile.in distclean\n' >> Makefile ; \
touch -r $(SRCDIR)/Makefile.in Makefile ; fi
# Reset zconf.h and zconf.h.cmakein if building inside source tree
@if [ -f zconf.h.in ]; then \
cp -p $(SRCDIR)/zconf.h.in zconf.h ; \
grep -v '^#cmakedefine' $(SRCDIR)/zconf.h.in > zconf.h.cmakein &&\
touch -r $(SRCDIR)/zconf.h.in zconf.h.cmakein ; fi
# Cleanup these files if building outside source tree
@if [ ! -f README.md ]; then rm -f Makefile; fi
# Remove arch and test directory if building outside source tree
@if [ ! -f $(ARCHDIR)/Makefile.in ]; then rm -rf arch; fi
@if [ ! -f test/Makefile.in ]; then rm -rf test; fi
tags:
etags $(SRCDIR)/*.[ch]

79
deps/zlib-ng/PORTING.md vendored Normal file
View File

@@ -0,0 +1,79 @@
Porting applications to use zlib-ng
===================================
Zlib-ng can be used/compiled in two different modes, that require some
consideration by the application developer.
zlib-compat mode
----------------
Zlib-ng can be compiled in zlib-compat mode, suitable for zlib-replacement
in a single application or system-wide.
Please note that zlib-ng in zlib-compat mode tries to maintain both API and
ABI compatibility with the original zlib. Any issues regarding compatibility
can be reported as bugs.
In certain instances you may not be able to simply replace the zlib library/dll
files and expect the application to work. The application may need to be
recompiled against the zlib-ng headers and libs to ensure full compatibility.
It is also possible for the deflate output stream to differ from the original
zlib due to algorithmic differences between the two libraries. Any tests or
applications that depend on the exact length of the deflate stream being a
certain value will need to be updated.
**Advantages:**
- Easy to port to, since it only requires a recompile of the application and
no changes to the application code.
**Disadvantages:**
- Can conflict with a system-installed zlib, as that can often be linked in
by another library you are linking into your application. This can cause
crashes or incorrect output.
- If your application is pre-allocating a memory buffer and you are providing
deflate/inflate init with your own allocator that allocates from that buffer
(looking at you nginx), you should be aware that zlib-ng needs to allocate
more memory than stock zlib needs. The same problem exists with Intel's and
Cloudflare's zlib forks. Doing this is not recommended since it makes it
very hard to maintain compatibility over time.
**Build Considerations:**
- Compile against the *zlib.h* provided by zlib-ng
- Configuration header is named *zconf.h*
- Static library is *libz.a* on Unix and macOS, or *zlib.lib* on Windows
- Shared library is *libz.so* on Unix, *libz.dylib* on macOS, or *zlib1.dll*
on Windows
- Type `z_size_t` is *unsigned __int64* on 64-bit Windows, and *unsigned long* on 32-bit Windows, Unix and macOS
- Type `z_uintmax_t` is *unsigned long* in zlib-compat mode, and *size_t* with zlib-ng API
zlib-ng native mode
-------------------
Zlib-ng in native mode is suitable for co-existing with the standard zlib
library, allowing applications to implement support and testing separately.
The zlib-ng native API implements some modernizations and simplifications,
intended to make life easier for application developers.
**Advantages:**
- Does not conflict with other zlib implementations, and can co-exist as a
system library along with zlib.
- In certain places zlib-ng native uses more appropriate data types, removing
the need for some workarounds in the API compared to zlib.
**Disadvantages:**
- Requires minor changes to applications to use the prefixed zlib-ng
function calls and structs. Usually this means a small prefix `zng_` has to be added.
**Build Considerations:**
- Compile against *zlib-ng.h*
- Configuration header is named *zconf-ng.h*
- Static library is *libz-ng.a* on Unix and macOS, or *zlib-ng.lib* on Windows
- Shared library is *libz-ng.so* on Unix, *libz-ng.dylib* on macOS, or
*zlib-ng2.dll* on Windows
- Type `z_size_t` is *size_t*
zlib-ng compile-time detection
------------------------------
To distinguish zlib-ng from other zlib implementations at compile-time, check for the
existence of `ZLIBNG_VERSION` defined in the zlib header.

216
deps/zlib-ng/README.md vendored Normal file
View File

@@ -0,0 +1,216 @@
| CI | Stable | Develop |
|:---|:-------|:--------|
| GitHub Actions | [![Stable Status](https://github.com/zlib-ng/zlib-ng/workflows/CI%20CMake/badge.svg?branch=stable)](https://github.com/zlib-ng/zlib-ng/actions) <br> [![Stable Status](https://github.com/zlib-ng/zlib-ng/workflows/CI%20Configure/badge.svg?branch=stable)](https://github.com/zlib-ng/zlib-ng/actions) <br> [![Stable Status](https://github.com/zlib-ng/zlib-ng/workflows/CI%20NMake/badge.svg?branch=stable)](https://github.com/zlib-ng/zlib-ng/actions) | [![Develop Status](https://github.com/zlib-ng/zlib-ng/workflows/CI%20CMake/badge.svg?branch=develop)](https://github.com/zlib-ng/zlib-ng/actions) <br> [![Develop Status](https://github.com/zlib-ng/zlib-ng/workflows/CI%20Configure/badge.svg?branch=develop)](https://github.com/zlib-ng/zlib-ng/actions) <br> [![Develop Status](https://github.com/zlib-ng/zlib-ng/workflows/CI%20NMake/badge.svg?branch=develop)](https://github.com/zlib-ng/zlib-ng/actions) |
| CodeFactor | [![CodeFactor](https://www.codefactor.io/repository/github/zlib-ng/zlib-ng/badge/stable)](https://www.codefactor.io/repository/github/zlib-ng/zlib-ng/overview/stable) | [![CodeFactor](https://www.codefactor.io/repository/github/zlib-ng/zlib-ng/badge/develop)](https://www.codefactor.io/repository/github/zlib-ng/zlib-ng/overview/develop) |
| OSS-Fuzz | [![Fuzzing Status](https://oss-fuzz-build-logs.storage.googleapis.com/badges/zlib-ng.svg)](https://bugs.chromium.org/p/oss-fuzz/issues/list?sort=-opened&can=1&q=proj:zlib-ng) | [![Fuzzing Status](https://oss-fuzz-build-logs.storage.googleapis.com/badges/zlib-ng.svg)](https://bugs.chromium.org/p/oss-fuzz/issues/list?sort=-opened&can=1&q=proj:zlib-ng) |
| Codecov | [![codecov](https://codecov.io/github/zlib-ng/zlib-ng/branch/stable/graph/badge.svg?token=uKsgK9LIuC)](https://codecov.io/github/zlib-ng/zlib-ng/tree/stable) | [![codecov](https://codecov.io/github/zlib-ng/zlib-ng/branch/develop/graph/badge.svg?token=uKsgK9LIuC)](https://codecov.io/github/zlib-ng/zlib-ng/tree/develop) |
## zlib-ng
*zlib data compression library for the next generation systems*
Maintained by Hans Kristian Rosbach
aka Dead2 (zlib-ng àt circlestorm dót org)
Features
--------
* Zlib compatible API with support for dual-linking
* Modernized native API based on zlib API for ease of porting
* Modern C11 syntax and a clean code layout
* Deflate medium and quick algorithms based on Intel's zlib fork
* Support for CPU intrinsics when available
* Adler32 implementation using SSSE3, AVX2, AVX512, AVX512-VNNI, Neon, VMX & VSX
* CRC32-B implementation using PCLMULQDQ, VPCLMULQDQ, ACLE, & IBM Z
* Hash table implementation using CRC32-C intrinsics on x86 and ARM
* Slide hash implementations using SSE2, AVX2, Neon, VMX & VSX
* Compare256 implementations using SSE2, AVX2, Neon, & POWER9
* Inflate chunk copying using SSE2, SSSE3, AVX, Neon & VSX
* Support for hardware-accelerated deflate using IBM Z DFLTCC
* Unaligned memory read/writes and large bit buffer improvements
* Includes improvements from Cloudflare and Intel forks
* Configure, CMake, and NMake build system support
* Comprehensive set of CMake unit tests
* Code sanitizers, fuzzing, and coverage
* GitHub Actions continuous integration on Windows, macOS, and Linux
* Emulated CI for ARM, AARCH64, PPC, PPC64, RISCV, SPARC64, S390x using qemu
History
-------
The motivation for this fork was seeing several 3rd party contributions with new optimizations not getting
implemented into the official zlib repository.
Mark Adler has been maintaining zlib for a very long time, and he has done a great job and hopefully he will continue
for a long time yet. The idea of zlib-ng is not to replace zlib, but to co-exist as a drop-in replacement with a
lower threshold for code change.
zlib has a long history and is incredibly portable, even supporting many systems that predate the Internet.<br>
That is great, but it can complicate further development and maintainability. The zlib code contains many workarounds
for really old compilers or to accommodate systems with limitations such as operating in a 16-bit environment.
Many of these workarounds are only maintenance burdens, some of them are pretty huge code-wise. With many workarounds
cluttered throughout the code, it makes it harder for new programmers with an idea/interest for zlib to contribute.
I decided to make a fork, merge all the Intel optimizations, some of the Cloudflare optimizations, plus a couple other
smaller patches. Then started cleaning out workarounds, various dead code, all contrib and example code.<br>
The result is a better performing and easier to maintain zlib-ng.
A lot of improvements have gone into zlib-ng since its start, and numerous people and companies have contributed both
small and big improvements, or valuable testing.
Build
-----
<sup>Please read LICENSE.md, it is very simple and very liberal.</sup>
There are two ways to build zlib-ng:
### Cmake
To build zlib-ng using CMake, the cross-platform build-system generator, run:
```
cmake .
cmake --build . --config Release
ctest --verbose -C Release
```
Alternatively, you can use the cmake configuration GUI tool ccmake:
```
ccmake .
```
### Configure
To build zlib-ng using the bash configure script:
```
./configure
make
make test
```
Build Options
-------------
| CMake | configure | Description | Default |
|:-------------------------|:-------------------------|:--------------------------------------------------------------------------------------|---------|
| ZLIB_COMPAT | --zlib-compat | Compile with zlib compatible API | OFF |
| ZLIB_ENABLE_TESTS | | Build test binaries | ON |
| WITH_GZFILEOP | --without-gzfileops | Compile with support for gzFile related functions | ON |
| WITH_OPTIM | --without-optimizations | Build with optimisations | ON |
| WITH_NEW_STRATEGIES | --without-new-strategies | Use new strategies | ON |
| WITH_NATIVE_INSTRUCTIONS | --native | Compiles with full instruction set supported on this host (gcc/clang -march=native) | OFF |
| WITH_SANITIZER | | Build with sanitizer (memory, address, undefined) | OFF |
| WITH_GTEST | | Build gtest_zlib | ON |
| WITH_FUZZERS | | Build test/fuzz | OFF |
| WITH_BENCHMARKS | | Build test/benchmarks | OFF |
| WITH_MAINTAINER_WARNINGS | | Build with project maintainer warnings | OFF |
| WITH_CODE_COVERAGE | | Enable code coverage reporting | OFF |
Install
-------
WARNING: We do not recommend manually installing unless you really know what you are doing, because this can
potentially override the system default zlib library, and any incompatibility or wrong configuration of zlib-ng
can make the whole system unusable, requiring recovery or reinstall.
If you still want a manual install, we recommend using the /opt/ path prefix.
For Linux distros, an alternative way to use zlib-ng (if compiled in zlib-compat mode) instead of zlib, is through
the use of the _LD_PRELOAD_ environment variable. If the program is dynamically linked with zlib, then the program
will temporarily attempt to use zlib-ng instead, without risking system-wide instability.
```
LD_PRELOAD=/opt/zlib-ng/libz.so.1.2.13.zlib-ng /usr/bin/program
```
### Cmake
To install zlib-ng system-wide using cmake:
```
cmake --build . --target install
```
### Configure
To install zlib-ng system-wide using the configure script:
```
make install
```
### Vcpkg
Alternatively, you can build and install zlib-ng using the [vcpkg](https://github.com/Microsoft/vcpkg/) dependency manager:
```sh or powershell
git clone https://github.com/Microsoft/vcpkg.git
cd vcpkg
./bootstrap-vcpkg.sh # "./bootstrap-vcpkg.bat" for powershell
./vcpkg integrate install
./vcpkg install zlib-ng
```
The zlib-ng port in vcpkg is kept up to date by Microsoft team members and community contributors.
If the version is out of date, please [create an issue or pull request](https://github.com/Microsoft/vcpkg) on the vcpkg repository.
Contributing
------------
Zlib-ng is aiming to be open to contributions, and we would be delighted to receive pull requests on GitHub.
Help with testing and reviewing pull requests etc is also very much appreciated.
Please check the Wiki for more info: [Contributing](https://github.com/zlib-ng/zlib-ng/wiki/Contributing)
Acknowledgments
----------------
Thanks go out to all the people and companies who have taken the time to contribute
code reviews, testing and/or patches. Zlib-ng would not have been nearly as good without you.
The deflate format used by zlib was defined by Phil Katz.<br>
The deflate and zlib specifications were written by L. Peter Deutsch.
zlib was originally created by Jean-loup Gailly (compression) and Mark Adler (decompression).
Advanced Build Options
----------------------
| CMake | configure | Description | Default |
|:--------------------------------|:----------------------|:--------------------------------------------------------------------|------------------------|
| FORCE_SSE2 | --force-sse2 | Skip runtime check for SSE2 instructions (Always on for x86_64) | OFF (x86) |
| WITH_AVX2 | | Build with AVX2 intrinsics | ON |
| WITH_AVX512 | | Build with AVX512 intrinsics | ON |
| WITH_AVX512VNNI | | Build with AVX512VNNI intrinsics | ON |
| WITH_SSE2 | | Build with SSE2 intrinsics | ON |
| WITH_SSSE3 | | Build with SSSE3 intrinsics | ON |
| WITH_SSE42 | | Build with SSE42 intrinsics | ON |
| WITH_PCLMULQDQ | | Build with PCLMULQDQ intrinsics | ON |
| WITH_VPCLMULQDQ | --without-vpclmulqdq | Build with VPCLMULQDQ intrinsics | ON |
| WITH_ACLE | --without-acle | Build with ACLE intrinsics | ON |
| WITH_NEON | --without-neon | Build with NEON intrinsics | ON |
| WITH_ALTIVEC | --without-altivec | Build with AltiVec (VMX) intrinsics | ON |
| WITH_POWER8 | --without-power8 | Build with POWER8 optimisations | ON |
| WITH_RVV | | Build with RVV intrinsics | ON |
| WITH_CRC32_VX | --without-crc32-vx | Build with vectorized CRC32 on IBM Z | ON |
| WITH_DFLTCC_DEFLATE | --with-dfltcc-deflate | Build with DFLTCC intrinsics for compression on IBM Z | OFF |
| WITH_DFLTCC_INFLATE | --with-dfltcc-inflate | Build with DFLTCC intrinsics for decompression on IBM Z | OFF |
| WITH_UNALIGNED | --without-unaligned | Allow optimizations that use unaligned reads if safe on current arch| ON |
| WITH_INFLATE_STRICT | | Build with strict inflate distance checking | OFF |
| WITH_INFLATE_ALLOW_INVALID_DIST | | Build with zero fill for inflate invalid distances | OFF |
| INSTALL_UTILS | | Copy minigzip and minideflate during install | OFF |
| ZLIBNG_ENABLE_TESTS | | Test zlib-ng specific API | ON |
Related Projects
----------------
* Fork of the popular minizip https://github.com/zlib-ng/minizip-ng
* Python tool to benchmark minigzip/minideflate https://github.com/zlib-ng/deflatebench
* Python tool to benchmark pigz https://github.com/zlib-ng/pigzbench
* 3rd party patches for zlib-ng compatibility https://github.com/zlib-ng/patches

115
deps/zlib-ng/adler32.c vendored Normal file
View File

@@ -0,0 +1,115 @@
/* adler32.c -- compute the Adler-32 checksum of a data stream
* Copyright (C) 1995-2011, 2016 Mark Adler
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "zbuild.h"
#include "functable.h"
#include "adler32_p.h"
/* ========================================================================= */
/* Portable C implementation of the Adler-32 update step.
 *
 * adler: running checksum (low 16 bits = byte sum, high 16 bits = sum of sums)
 * buf:   input bytes; a NULL buf with len != 1 yields the initial value 1
 * len:   number of bytes in buf
 *
 * Returns the checksum after folding buf into adler.
 */
Z_INTERNAL uint32_t adler32_c(uint32_t adler, const uint8_t *buf, size_t len) {
    /* Unpack the two 16-bit component sums. */
    uint32_t sum2 = (adler >> 16) & 0xffff;
    adler &= 0xffff;

    /* Single-byte updates get their own fast path; it runs before the NULL
     * test on purpose so that path stays as short as possible. */
    if (UNLIKELY(len == 1))
        return adler32_len_1(adler, buf, sum2);

    /* No buffer: hand back the initial Adler-32 value. */
    if (UNLIKELY(buf == NULL))
        return 1L;

    /* Short inputs skip the blocked loop entirely. */
    if (UNLIKELY(len < 16))
        return adler32_len_16(adler, buf, len, sum2);

    /* Consume full NMAX-byte blocks; each block needs only one modulo pair. */
    for (; len >= NMAX; len -= NMAX) {
        unsigned groups;
#ifdef UNROLL_MORE
        /* NMAX is divisible by 16 */
        for (groups = NMAX / 16; groups != 0; --groups) {
            DO16(adler, sum2, buf);
            buf += 16;
        }
#else
        /* NMAX is divisible by 8 */
        for (groups = NMAX / 8; groups != 0; --groups) {
            DO8(adler, sum2, buf, 0);
            buf += 8;
        }
#endif
        adler %= BASE;
        sum2 %= BASE;
    }

    /* Fewer than NMAX bytes remain: still just one modulo, done by the helper. */
    return adler32_len_64(adler, buf, len, sum2);
}
/* Exported adler32_z: a thin wrapper that forwards to functable.adler32.
 * With ZLIB_COMPAT the checksum travels as unsigned long and is narrowed to
 * uint32_t for the call; otherwise the checksum is uint32_t throughout.
 * The length is a size_t in both variants. */
#ifdef ZLIB_COMPAT
unsigned long Z_EXPORT PREFIX(adler32_z)(unsigned long adler, const unsigned char *buf, size_t len) {
return (unsigned long)functable.adler32((uint32_t)adler, buf, len);
}
#else
uint32_t Z_EXPORT PREFIX(adler32_z)(uint32_t adler, const unsigned char *buf, size_t len) {
return functable.adler32(adler, buf, len);
}
#endif
/* ========================================================================= */
/* Exported adler32: same forwarding wrapper as adler32_z, but the length is
 * a 32-bit quantity (unsigned int with ZLIB_COMPAT, uint32_t otherwise).
 * With ZLIB_COMPAT the checksum is unsigned long at the API boundary. */
#ifdef ZLIB_COMPAT
unsigned long Z_EXPORT PREFIX(adler32)(unsigned long adler, const unsigned char *buf, unsigned int len) {
return (unsigned long)functable.adler32((uint32_t)adler, buf, len);
}
#else
uint32_t Z_EXPORT PREFIX(adler32)(uint32_t adler, const unsigned char *buf, uint32_t len) {
return functable.adler32(adler, buf, len);
}
#endif
/* ========================================================================= */
/* Combine two Adler-32 checksums.
 *
 * adler1: checksum of the first byte sequence
 * adler2: checksum of the second byte sequence
 * len2:   length of the second sequence
 *
 * Returns the checksum of the concatenation, or 0xffffffff when len2 is
 * negative (an invalid value, meant as a debugging hint).
 */
static uint32_t adler32_combine_(uint32_t adler1, uint32_t adler2, z_off64_t len2) {
    /* for negative len, return invalid adler32 as a clue for debugging */
    if (len2 < 0)
        return 0xffffffff;

    /* Only len2 modulo BASE matters; len2 is known to be non-negative here. */
    const unsigned rem = (unsigned)(len2 % BASE);

    /* Split both inputs into their 16-bit halves. */
    const uint32_t low1 = adler1 & 0xffff;
    const uint32_t high1 = (adler1 >> 16) & 0xffff;
    const uint32_t low2 = adler2 & 0xffff;
    const uint32_t high2 = (adler2 >> 16) & 0xffff;
    const unsigned long twice_base = (unsigned long)BASE << 1;

    /* the derivation of this formula is left as an exercise for the reader */
    uint32_t sum1 = low1 + (low2 + BASE - 1);
    uint32_t sum2 = (rem * low1) % BASE;
    sum2 += high1 + high2 + BASE - rem;

    /* Bring both sums back below BASE using conditional subtractions only. */
    if (sum1 >= BASE)
        sum1 -= BASE;
    if (sum1 >= BASE)
        sum1 -= BASE;
    if (sum2 >= twice_base)
        sum2 -= twice_base;
    if (sum2 >= BASE)
        sum2 -= BASE;

    return (sum2 << 16) | sum1;
}
/* ========================================================================= */
/* Exported wrappers around adler32_combine_.
 * With ZLIB_COMPAT two entry points are emitted: PREFIX(adler32_combine)
 * taking a z_off_t length and PREFIX4(adler32_combine) taking a z_off64_t
 * length, both using unsigned long checksums narrowed to uint32_t.
 * Without ZLIB_COMPAT only the PREFIX4 variant exists and it uses uint32_t
 * checksums directly. */
#ifdef ZLIB_COMPAT
unsigned long Z_EXPORT PREFIX(adler32_combine)(unsigned long adler1, unsigned long adler2, z_off_t len2) {
return (unsigned long)adler32_combine_((uint32_t)adler1, (uint32_t)adler2, len2);
}
unsigned long Z_EXPORT PREFIX4(adler32_combine)(unsigned long adler1, unsigned long adler2, z_off64_t len2) {
return (unsigned long)adler32_combine_((uint32_t)adler1, (uint32_t)adler2, len2);
}
#else
uint32_t Z_EXPORT PREFIX4(adler32_combine)(uint32_t adler1, uint32_t adler2, z_off64_t len2) {
return adler32_combine_(adler1, adler2, len2);
}
#endif

16
deps/zlib-ng/adler32_fold.c vendored Normal file
View File

@@ -0,0 +1,16 @@
/* adler32_fold.c -- adler32 folding interface
* Copyright (C) 2022 Adam Stylinski
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "zbuild.h"
#include "functable.h"
#include "adler32_fold.h"
#include <limits.h>
/* Generic "checksum while copying" helper.
 *
 * Folds len bytes of src into adler through functable.adler32, then copies
 * those bytes to dst. Returns the updated checksum.
 */
Z_INTERNAL uint32_t adler32_fold_copy_c(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
    const uint32_t updated = functable.adler32(adler, src, len);

    memcpy(dst, src, len);
    return updated;
}

11
deps/zlib-ng/adler32_fold.h vendored Normal file
View File

@@ -0,0 +1,11 @@
/* adler32_fold.h -- adler32 folding interface
* Copyright (C) 2022 Adam Stylinski
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifndef ADLER32_FOLD_H_
#define ADLER32_FOLD_H_
/* NOTE(review): this header includes nothing itself; Z_INTERNAL, uint32_t,
 * uint8_t and size_t must already be visible when it is included — confirm
 * that every includer pulls in zbuild.h first. */
/* Updates adler with len bytes from src, copies those bytes to dst, and
 * returns the updated checksum. */
Z_INTERNAL uint32_t adler32_fold_copy_c(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
#endif

70
deps/zlib-ng/adler32_p.h vendored Normal file
View File

@@ -0,0 +1,70 @@
/* adler32_p.h -- Private inline functions and macros shared with
* different computation of the Adler-32 checksum
* of a data stream.
* Copyright (C) 1995-2011, 2016 Mark Adler
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifndef ADLER32_P_H
#define ADLER32_P_H

#define BASE 65521U /* largest prime smaller than 65536 */
#define NMAX 5552
/* NMAX is the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 */

/* DOn folds n consecutive bytes, starting at buf[i], into the running byte
 * sum (sum1) and the running sum of sums (sum2). No modulo is applied. */
#define DO1(sum1, sum2, buf, i) {(sum1) += buf[(i)]; (sum2) += (sum1);}
#define DO2(sum1, sum2, buf, i) {DO1(sum1, sum2, buf, i); DO1(sum1, sum2, buf, i+1);}
#define DO4(sum1, sum2, buf, i) {DO2(sum1, sum2, buf, i); DO2(sum1, sum2, buf, i+2);}
#define DO8(sum1, sum2, buf, i) {DO4(sum1, sum2, buf, i); DO4(sum1, sum2, buf, i+4);}
#define DO16(sum1, sum2, buf) {DO8(sum1, sum2, buf, 0); DO8(sum1, sum2, buf, 8);}

/* Fold exactly one byte (buf[0]) into the split sums and return the
 * recombined 32-bit checksum. */
static inline uint32_t adler32_len_1(uint32_t adler, const uint8_t *buf, uint32_t sum2) {
    const uint32_t low = (adler + buf[0]) % BASE;
    const uint32_t high = (sum2 + low) % BASE;

    return (high << 16) | low;
}

/* Fold len bytes one at a time, reducing modulo BASE only once at the end,
 * and return the recombined checksum. Callers in this file use it for short
 * runs (fewer than 16 bytes). */
static inline uint32_t adler32_len_16(uint32_t adler, const uint8_t *buf, size_t len, uint32_t sum2) {
    for (size_t i = 0; i < len; ++i) {
        adler += buf[i];
        sum2 += adler;
    }

    /* only added so many BASE's, so a single reduction each is enough */
    return ((sum2 % BASE) << 16) | (adler % BASE);
}

/* Same as adler32_len_16, but every byte read from buf is also stored to dst. */
static inline uint32_t adler32_copy_len_16(uint32_t adler, const uint8_t *buf, uint8_t *dst, size_t len, uint32_t sum2) {
    for (size_t i = 0; i < len; ++i) {
        const uint8_t byte = buf[i];

        dst[i] = byte;
        adler += byte;
        sum2 += adler;
    }

    /* only added so many BASE's, so a single reduction each is enough */
    return ((sum2 % BASE) << 16) | (adler % BASE);
}

/* Fold len bytes using the unrolled DO8/DO16 macros for the bulk and
 * adler32_len_16 for whatever is left; the only modulo happens in that
 * final helper call. */
static inline uint32_t adler32_len_64(uint32_t adler, const uint8_t *buf, size_t len, uint32_t sum2) {
#ifdef UNROLL_MORE
    for (; len >= 16; len -= 16, buf += 16) {
        DO16(adler, sum2, buf);
    }
#else
    for (; len >= 8; len -= 8, buf += 8) {
        DO8(adler, sum2, buf, 0);
    }
#endif
    /* Process tail (len < 16). */
    return adler32_len_16(adler, buf, len, sum2);
}

#endif /* ADLER32_P_H */

2
deps/zlib-ng/arch/.gitignore vendored Normal file
View File

@@ -0,0 +1,2 @@
# ignore Makefiles; they're all automatically generated
Makefile

77
deps/zlib-ng/arch/arm/Makefile.in vendored Normal file
View File

@@ -0,0 +1,77 @@
# Makefile for zlib
# Copyright (C) 1995-2013 Jean-loup Gailly, Mark Adler
# For conditions of distribution and use, see copyright notice in zlib.h
# Build settings. All of them are empty in this template; presumably the
# configure step fills them in when it generates the real Makefile — TODO
# confirm against the configure script.
#   CFLAGS / SFLAGS  flags for the .o and the .lo objects respectively
#   ACLEFLAG         extra flags for the *_acle.c sources
#   NEONFLAG         extra flags for the *_neon.c sources
#   NOLTOFLAG        appended to every ACLE/NEON compile line below
CC=
CFLAGS=
SFLAGS=
INCLUDES=
SUFFIX=
ACLEFLAG=
NEONFLAG=
NOLTOFLAG=
# SRCDIR is where the sources of this directory live; SRCTOP/TOPDIR point two
# levels up, at the top of the tree.
SRCDIR=.
SRCTOP=../..
TOPDIR=$(SRCTOP)
# Default target: build both object flavours (.o and .lo) of every ARM source.
all: \
adler32_neon.o adler32_neon.lo \
arm_features.o arm_features.lo \
chunkset_neon.o chunkset_neon.lo \
compare256_neon.o compare256_neon.lo \
crc32_acle.o crc32_acle.lo \
slide_hash_neon.o slide_hash_neon.lo \
insert_string_acle.o insert_string_acle.lo
# One explicit rule per object. Every source is compiled twice: the .o rule
# uses $(CFLAGS) and the .lo rule uses $(SFLAGS). NEON sources add
# $(NEONFLAG), ACLE sources add $(ACLEFLAG), and both add $(NOLTOFLAG);
# arm_features.c is built with the plain flags only.
# NOTE(review): the rules list no prerequisites, so an existing object is not
# rebuilt when its source changes — confirm this is intended.
adler32_neon.o:
$(CC) $(CFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_neon.c
adler32_neon.lo:
$(CC) $(SFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_neon.c
arm_features.o:
$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/arm_features.c
arm_features.lo:
$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/arm_features.c
chunkset_neon.o:
$(CC) $(CFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_neon.c
chunkset_neon.lo:
$(CC) $(SFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_neon.c
compare256_neon.o:
$(CC) $(CFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_neon.c
compare256_neon.lo:
$(CC) $(SFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_neon.c
crc32_acle.o:
$(CC) $(CFLAGS) $(ACLEFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_acle.c
crc32_acle.lo:
$(CC) $(SFLAGS) $(ACLEFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_acle.c
slide_hash_neon.o:
$(CC) $(CFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_neon.c
slide_hash_neon.lo:
$(CC) $(SFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_neon.c
insert_string_acle.o:
$(CC) $(CFLAGS) $(ACLEFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_acle.c
insert_string_acle.lo:
$(CC) $(SFLAGS) $(ACLEFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_acle.c
# mostlyclean only depends on clean.
mostlyclean: clean
# Remove objects, editor backups, the objs directory and gcov data files.
clean:
rm -f *.o *.lo *~
rm -rf objs
rm -f *.gcda *.gcno *.gcov
# distclean additionally removes the Makefile in this directory.
distclean: clean
rm -f Makefile

215
deps/zlib-ng/arch/arm/adler32_neon.c vendored Normal file
View File

@@ -0,0 +1,215 @@
/* Copyright (C) 1995-2011, 2016 Mark Adler
* Copyright (C) 2017 ARM Holdings Inc.
* Authors:
* Adenilson Cavalcanti <adenilson.cavalcanti@arm.com>
* Adam Stylinski <kungfujesus06@gmail.com>
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifdef ARM_NEON
#include "neon_intrins.h"
#include "../../zbuild.h"
#include "../../adler32_p.h"
/* Fold `len` 16-byte blocks starting at buf into the two running sums.
 *
 * s:   s[0] is the running byte sum, s[1] the running sum of sums; both are
 *      read on entry and overwritten on exit. No modulo reduction happens
 *      here.
 * buf: input bytes; 16 * len bytes are read.
 * len: number of 16-byte blocks, NOT a byte count. The caller in this file
 *      passes n >> 4.
 */
static void NEON_accum32(uint32_t *s, const uint8_t *buf, size_t len) {
/* Per-byte weights for one 64-byte group: the first byte counts 64 times
 * in the sum of sums, the last byte once. */
static const uint16_t ALIGNED_(16) taps[64] = {
64, 63, 62, 61, 60, 59, 58, 57,
56, 55, 54, 53, 52, 51, 50, 49,
48, 47, 46, 45, 44, 43, 42, 41,
40, 39, 38, 37, 36, 35, 34, 33,
32, 31, 30, 29, 28, 27, 26, 25,
24, 23, 22, 21, 20, 19, 18, 17,
16, 15, 14, 13, 12, 11, 10, 9,
8, 7, 6, 5, 4, 3, 2, 1 };
uint32x4_t adacc = vdupq_n_u32(0);
uint32x4_t s2acc = vdupq_n_u32(0);
uint32x4_t s2acc_0 = vdupq_n_u32(0);
uint32x4_t s2acc_1 = vdupq_n_u32(0);
uint32x4_t s2acc_2 = vdupq_n_u32(0);
/* Seed lane 0 of the accumulators with the incoming sums. */
adacc = vsetq_lane_u32(s[0], adacc, 0);
s2acc = vsetq_lane_u32(s[1], s2acc, 0);
/* s3acc collects the byte-sum value as it stood before each group; it is
 * scaled by the group size after the loop. */
uint32x4_t s3acc = vdupq_n_u32(0);
uint32x4_t adacc_prev = adacc;
/* Eight 16-bit accumulators, one per 8-byte slice of a 64-byte group. */
uint16x8_t s2_0, s2_1, s2_2, s2_3;
s2_0 = s2_1 = s2_2 = s2_3 = vdupq_n_u16(0);
uint16x8_t s2_4, s2_5, s2_6, s2_7;
s2_4 = s2_5 = s2_6 = s2_7 = vdupq_n_u16(0);
/* Four 16-byte blocks make one 64-byte iteration; rem counts the 0..3
 * blocks left over. */
size_t num_iter = len >> 2;
int rem = len & 3;
for (size_t i = 0; i < num_iter; ++i) {
uint8x16x4_t d0_d3 = vld1q_u8_x4(buf);
/* Unfortunately it doesn't look like there's a direct sum 8 bit to 32
* bit instruction, we'll have to make do summing to 16 bits first */
uint16x8x2_t hsum, hsum_fold;
hsum.val[0] = vpaddlq_u8(d0_d3.val[0]);
hsum.val[1] = vpaddlq_u8(d0_d3.val[1]);
hsum_fold.val[0] = vpadalq_u8(hsum.val[0], d0_d3.val[2]);
hsum_fold.val[1] = vpadalq_u8(hsum.val[1], d0_d3.val[3]);
adacc = vpadalq_u16(adacc, hsum_fold.val[0]);
s3acc = vaddq_u32(s3acc, adacc_prev);
adacc = vpadalq_u16(adacc, hsum_fold.val[1]);
/* If we do straight widening additions to the 16 bit values, we don't incur
* the usual penalties of a pairwise add. We can defer the multiplications
* until the very end. These will not overflow because we are incurring at
* most 408 loop iterations (NMAX / 64), and a given lane is only going to be
* summed into once. This means for the maximum input size, the largest value
* we will see is 255 * 102 = 26010, safely under uint16 max */
/* NOTE(review): the figures above look inconsistent (NMAX / 64 is 86, and
 * neither 408 nor 102 follows from it). With the caller in this file passing
 * at most NMAX >> 4 blocks, a lane sees at most 86 + 3 additions of a value
 * <= 255, which still fits in 16 bits — confirm and correct the numbers. */
s2_0 = vaddw_u8(s2_0, vget_low_u8(d0_d3.val[0]));
s2_1 = vaddw_high_u8(s2_1, d0_d3.val[0]);
s2_2 = vaddw_u8(s2_2, vget_low_u8(d0_d3.val[1]));
s2_3 = vaddw_high_u8(s2_3, d0_d3.val[1]);
s2_4 = vaddw_u8(s2_4, vget_low_u8(d0_d3.val[2]));
s2_5 = vaddw_high_u8(s2_5, d0_d3.val[2]);
s2_6 = vaddw_u8(s2_6, vget_low_u8(d0_d3.val[3]));
s2_7 = vaddw_high_u8(s2_7, d0_d3.val[3]);
adacc_prev = adacc;
buf += 64;
}
/* Each saved byte-sum contributes once per byte of its 64-byte group. */
s3acc = vshlq_n_u32(s3acc, 6);
if (rem) {
/* Leftover 16-byte blocks: their bytes go into s2_6/s2_7, which pair
 * with the weights 16..1 at the end of taps. */
uint32x4_t s3acc_0 = vdupq_n_u32(0);
while (rem--) {
uint8x16_t d0 = vld1q_u8(buf);
uint16x8_t adler;
adler = vpaddlq_u8(d0);
s2_6 = vaddw_u8(s2_6, vget_low_u8(d0));
s2_7 = vaddw_high_u8(s2_7, d0);
adacc = vpadalq_u16(adacc, adler);
s3acc_0 = vaddq_u32(s3acc_0, adacc_prev);
adacc_prev = adacc;
buf += 16;
}
/* Same scaling as above, but for 16-byte groups. */
s3acc_0 = vshlq_n_u32(s3acc_0, 4);
s3acc = vaddq_u32(s3acc_0, s3acc);
}
/* Deferred multiplications: weight every 16-bit slice accumulator by its
 * taps and add the products into the 32-bit sum-of-sums accumulators. */
uint16x8x4_t t0_t3 = vld1q_u16_x4(taps);
uint16x8x4_t t4_t7 = vld1q_u16_x4(taps + 32);
s2acc = vmlal_high_u16(s2acc, t0_t3.val[0], s2_0);
s2acc_0 = vmlal_u16(s2acc_0, vget_low_u16(t0_t3.val[0]), vget_low_u16(s2_0));
s2acc_1 = vmlal_high_u16(s2acc_1, t0_t3.val[1], s2_1);
s2acc_2 = vmlal_u16(s2acc_2, vget_low_u16(t0_t3.val[1]), vget_low_u16(s2_1));
s2acc = vmlal_high_u16(s2acc, t0_t3.val[2], s2_2);
s2acc_0 = vmlal_u16(s2acc_0, vget_low_u16(t0_t3.val[2]), vget_low_u16(s2_2));
s2acc_1 = vmlal_high_u16(s2acc_1, t0_t3.val[3], s2_3);
s2acc_2 = vmlal_u16(s2acc_2, vget_low_u16(t0_t3.val[3]), vget_low_u16(s2_3));
s2acc = vmlal_high_u16(s2acc, t4_t7.val[0], s2_4);
s2acc_0 = vmlal_u16(s2acc_0, vget_low_u16(t4_t7.val[0]), vget_low_u16(s2_4));
s2acc_1 = vmlal_high_u16(s2acc_1, t4_t7.val[1], s2_5);
s2acc_2 = vmlal_u16(s2acc_2, vget_low_u16(t4_t7.val[1]), vget_low_u16(s2_5));
s2acc = vmlal_high_u16(s2acc, t4_t7.val[2], s2_6);
s2acc_0 = vmlal_u16(s2acc_0, vget_low_u16(t4_t7.val[2]), vget_low_u16(s2_6));
s2acc_1 = vmlal_high_u16(s2acc_1, t4_t7.val[3], s2_7);
s2acc_2 = vmlal_u16(s2acc_2, vget_low_u16(t4_t7.val[3]), vget_low_u16(s2_7));
/* Merge the four partial sum-of-sums accumulators into one. */
s2acc = vaddq_u32(s2acc_0, s2acc);
s2acc_2 = vaddq_u32(s2acc_1, s2acc_2);
s2acc = vaddq_u32(s2acc, s2acc_2);
uint32x2_t adacc2, s2acc2, as;
s2acc = vaddq_u32(s2acc, s3acc);
/* Horizontal reduction: lane 0 of `as` ends up holding the byte sum and
 * lane 1 the sum of sums. */
adacc2 = vpadd_u32(vget_low_u32(adacc), vget_high_u32(adacc));
s2acc2 = vpadd_u32(vget_low_u32(s2acc), vget_high_u32(s2acc));
as = vpadd_u32(adacc2, s2acc2);
s[0] = vget_lane_u32(as, 0);
s[1] = vget_lane_u32(as, 1);
}
/* Scalar Adler-32 update for a short run of bytes.
 *
 * pair: pair[0] is the running byte sum, pair[1] the running sum of sums;
 *       both are updated in place. No modulo reduction is applied, so the
 *       caller must keep len small enough that the sums cannot overflow.
 * buf:  input bytes
 * len:  number of bytes to fold in; 0 leaves pair untouched
 */
static void NEON_handle_tail(uint32_t *pair, const uint8_t *buf, size_t len) {
    /* The index has the same type as len: a narrower unsigned int index
     * could never reach a len above UINT_MAX and would loop forever. */
    size_t i;

    for (i = 0; i < len; ++i) {
        pair[0] += buf[i];
        pair[1] += pair[0];
    }
}
/* NEON implementation of the Adler-32 update step.
 *
 * adler: running checksum (low 16 bits = byte sum, high 16 bits = sum of sums)
 * buf:   input bytes; a NULL buf with len != 1 yields the initial value 1
 * len:   number of bytes in buf
 *
 * Returns the checksum after folding buf into adler.
 *
 * All byte counters are size_t. Narrower types (int / unsigned int) truncate
 * for inputs of 2 GiB and more, which makes the vector loop stop early and
 * pushes the whole buffer through the unreduced scalar tail.
 */
Z_INTERNAL uint32_t adler32_neon(uint32_t adler, const uint8_t *buf, size_t len) {
    /* split Adler-32 into component sums */
    uint32_t sum2 = (adler >> 16) & 0xffff;
    adler &= 0xffff;

    /* in case user likes doing a byte at a time, keep it fast */
    if (len == 1)
        return adler32_len_1(adler, buf, sum2);

    /* initial Adler-32 value (deferred check for len == 1 speed) */
    if (buf == NULL)
        return 1L;

    /* in case short lengths are provided, keep it somewhat fast */
    if (len < 16)
        return adler32_len_16(adler, buf, len, sum2);

    uint32_t pair[2];
    size_t n = NMAX;   /* byte budget of the current block before a modulo is due */
    size_t done = 0;   /* bytes consumed so far */

    /* Split Adler-32 into component sums, it can be supplied by
     * the caller sites (e.g. in a PNG file).
     */
    pair[0] = adler;
    pair[1] = sum2;

    /* If memory is not SIMD aligned, do scalar sums to an aligned
     * offset, provided that doing so doesn't completely eliminate
     * SIMD operation. Aligned loads are still faster on ARM, even
     * though there's no explicit aligned load instruction */
    unsigned int align_offset = ((uintptr_t)buf & 15);
    unsigned int align_adj = (align_offset) ? 16 - align_offset : 0;

    if (align_offset && len >= (16 + align_adj)) {
        NEON_handle_tail(pair, buf, align_adj);
        /* The scalar prefix is not reduced, so it counts against the first
         * block's NMAX budget. */
        n -= align_adj;
        done += align_adj;
    } else {
        /* If here, we failed the len criteria test, it wouldn't be
         * worthwhile to do scalar aligning sums */
        align_adj = 0;
    }

    while (done < len) {
        size_t remaining = len - done;

        /* Only the first block (done == align_adj) uses the reduced budget. */
        n = MIN(remaining, (done == align_adj) ? n : NMAX);
        if (n < 16)
            break;

        /* NEON_accum32 takes a count of 16-byte blocks, not bytes. */
        NEON_accum32(pair, buf + done, n >> 4);
        pair[0] %= BASE;
        pair[1] %= BASE;

        /* Advance by the bytes actually consumed (n rounded down to 16). */
        done += (n >> 4) << 4;
    }

    /* Handle the tail elements. */
    if (done < len) {
        NEON_handle_tail(pair, (buf + done), len - done);
        pair[0] %= BASE;
        pair[1] %= BASE;
    }

    /* D = B * 65536 + A, see: https://en.wikipedia.org/wiki/Adler-32. */
    return (pair[1] << 16) | pair[0];
}
#endif

82
deps/zlib-ng/arch/arm/arm_features.c vendored Normal file
View File

@@ -0,0 +1,82 @@
#include "../../zbuild.h"
#include "arm_features.h"
#if defined(__linux__) && defined(HAVE_SYS_AUXV_H)
# include <sys/auxv.h>
# ifdef ARM_ASM_HWCAP
# include <asm/hwcap.h>
# endif
#elif defined(__FreeBSD__) && defined(__aarch64__)
# include <machine/armreg.h>
# ifndef ID_AA64ISAR0_CRC32_VAL
# define ID_AA64ISAR0_CRC32_VAL ID_AA64ISAR0_CRC32
# endif
#elif defined(__APPLE__)
# if !defined(_DARWIN_C_SOURCE)
# define _DARWIN_C_SOURCE /* enable types aliases (eg u_int) */
# endif
# include <sys/sysctl.h>
#elif defined(_WIN32)
# include <windows.h>
#endif
/* Runtime check for the ARM CRC32 instructions.
 *
 * Returns non-zero when the instructions are reported as available and 0
 * otherwise. The detection method is chosen at compile time:
 *   - Linux with ARM_AUXV_HAS_CRC32: getauxval() hardware capability bits
 *   - FreeBSD/aarch64: the id_aa64isar0_el1 register, skipped when the
 *     QEMU_EMULATING environment variable is set
 *   - Apple: the hw.optional.armv8_crc32 sysctl
 *   - Windows: IsProcessorFeaturePresent()
 *   - ARM_NOCHECK_ACLE: assume available
 *   - anything else: assume unavailable
 *
 * Declared with (void): before C23 an empty parameter list is not a prototype.
 */
static int arm_has_crc32(void) {
#if defined(__linux__) && defined(ARM_AUXV_HAS_CRC32)
#  ifdef HWCAP_CRC32
    return (getauxval(AT_HWCAP) & HWCAP_CRC32) != 0 ? 1 : 0;
#  else
    return (getauxval(AT_HWCAP2) & HWCAP2_CRC32) != 0 ? 1 : 0;
#  endif
#elif defined(__FreeBSD__) && defined(__aarch64__)
    return getenv("QEMU_EMULATING") == NULL
      && ID_AA64ISAR0_CRC32_VAL(READ_SPECIALREG(id_aa64isar0_el1)) >= ID_AA64ISAR0_CRC32_BASE;
#elif defined(__APPLE__)
    int hascrc32;
    size_t size = sizeof(hascrc32);
    /* hascrc32 is only read when sysctlbyname() succeeded (short-circuit). */
    return sysctlbyname("hw.optional.armv8_crc32", &hascrc32, &size, NULL, 0) == 0
      && hascrc32 == 1;
#elif defined(_WIN32)
    return IsProcessorFeaturePresent(PF_ARM_V8_CRC32_INSTRUCTIONS_AVAILABLE);
#elif defined(ARM_NOCHECK_ACLE)
    return 1;
#else
    return 0;
#endif
}
/* AArch64 has neon. */
#if !defined(__aarch64__) && !defined(_M_ARM64)
/* Runtime check for NEON on 32-bit ARM builds.
 *
 * Returns 1 when NEON is reported as available and 0 otherwise:
 *   - Linux with ARM_AUXV_HAS_NEON: getauxval() hardware capability bits
 *   - Apple: the hw.optional.neon sysctl
 *   - Windows phone-app partition: always 1
 * When none of those branches returns, the result is 1 if ARM_NOCHECK_NEON
 * is defined and 0 if it is not.
 *
 * Declared with (void): before C23 an empty parameter list is not a prototype.
 */
static inline int arm_has_neon(void) {
#if defined(__linux__) && defined(ARM_AUXV_HAS_NEON)
#  ifdef HWCAP_ARM_NEON
    return (getauxval(AT_HWCAP) & HWCAP_ARM_NEON) != 0 ? 1 : 0;
#  else
    return (getauxval(AT_HWCAP) & HWCAP_NEON) != 0 ? 1 : 0;
#  endif
#elif defined(__APPLE__)
    int hasneon;
    size_t size = sizeof(hasneon);
    /* hasneon is only read when sysctlbyname() succeeded (short-circuit). */
    return sysctlbyname("hw.optional.neon", &hasneon, &size, NULL, 0) == 0
      && hasneon == 1;
#elif defined(_M_ARM) && defined(WINAPI_FAMILY_PARTITION)
#  if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_PHONE_APP)
    return 1; /* Always supported */
#  endif
#endif

    /* Fallback for platforms without a detection branch above. */
#if defined(ARM_NOCHECK_NEON)
    return 1;
#else
    return 0;
#endif
}
#endif
/* Fill *features with the NEON and CRC32 capabilities of the running CPU.
 * On AArch64 targets NEON is recorded as present without probing; on other
 * targets arm_has_neon() decides. CRC32 is always probed at run time. */
void Z_INTERNAL arm_check_features(struct arm_cpu_features *features) {
#if defined(__aarch64__) || defined(_M_ARM64)
    const int neon_present = 1; /* always available */
#else
    const int neon_present = arm_has_neon();
#endif

    features->has_neon = neon_present;
    features->has_crc32 = arm_has_crc32();
}

15
deps/zlib-ng/arch/arm/arm_features.h vendored Normal file
View File

@@ -0,0 +1,15 @@
/* arm_features.h -- check for ARM features.
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifndef ARM_H_
#define ARM_H_
/* Result of ARM CPU feature detection; populated by arm_check_features(). */
struct arm_cpu_features {
int has_neon; /* non-zero when NEON is available */
int has_crc32; /* non-zero when the CRC32 instructions are available */
};
/* Detect the features of the running CPU and store them in *features. */
void Z_INTERNAL arm_check_features(struct arm_cpu_features *features);
#endif /* ARM_H_ */

101
deps/zlib-ng/arch/arm/chunkset_neon.c vendored Normal file
View File

@@ -0,0 +1,101 @@
/* chunkset_neon.c -- NEON inline functions to copy small data chunks.
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifdef ARM_NEON
#include "neon_intrins.h"
#include "../../zbuild.h"
#include "../generic/chunk_permute_table.h"
typedef uint8x16_t chunk_t;
#define CHUNK_SIZE 16
#define HAVE_CHUNKMEMSET_2
#define HAVE_CHUNKMEMSET_4
#define HAVE_CHUNKMEMSET_8
#define HAVE_CHUNK_MAG
/* Lookup table used by GET_CHUNK_MAG, indexed by (dist - 3), i.e. one entry for
 * each dist in 3..15. The first member of each entry is a byte offset into
 * permute_table (rows there are 32 bytes apart, hence the "* 32"); the second
 * is the value GET_CHUNK_MAG stores through its chunk_rem argument.
 * Entries marked "don't care" correspond to dist 4 and 8 -- presumably those
 * distances are served by chunkmemset_4/chunkmemset_8 instead; confirm in
 * chunkset_tpl.h. */
static const lut_rem_pair perm_idx_lut[13] = {
{0, 1}, /* 3 */
{0, 0}, /* don't care */
{1 * 32, 1}, /* 5 */
{2 * 32, 4}, /* 6 */
{3 * 32, 2}, /* 7 */
{0 * 32, 0}, /* don't care */
{4 * 32, 7}, /* 9 */
{5 * 32, 6}, /* 10 */
{6 * 32, 5}, /* 11 */
{7 * 32, 4}, /* 12 */
{8 * 32, 3}, /* 13 */
{9 * 32, 2}, /* 14 */
{10 * 32, 1},/* 15 */
};
/* Broadcast the 2-byte pattern found at `from` into every 16-bit lane of *chunk.
 * The pattern is fetched with memcpy rather than through a cast pointer. */
static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
    uint16_t pattern;

    memcpy(&pattern, from, sizeof(pattern));
    *chunk = vreinterpretq_u8_u16(vdupq_n_u16(pattern));
}

/* Broadcast the 4-byte pattern found at `from` into every 32-bit lane of *chunk. */
static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
    uint32_t pattern;

    memcpy(&pattern, from, sizeof(pattern));
    *chunk = vreinterpretq_u8_u32(vdupq_n_u32(pattern));
}

/* Broadcast the 8-byte pattern found at `from` into both 64-bit lanes of *chunk. */
static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
    uint64_t pattern;

    memcpy(&pattern, from, sizeof(pattern));
    *chunk = vreinterpretq_u8_u64(vdupq_n_u64(pattern));
}
/* NEON-specific names for the generic CHUNK* identifiers; presumably consumed
 * by chunkset_tpl.h, which is included further down in this file. */
#define CHUNKSIZE chunksize_neon
#define CHUNKCOPY chunkcopy_neon
#define CHUNKUNROLL chunkunroll_neon
#define CHUNKMEMSET chunkmemset_neon
#define CHUNKMEMSET_SAFE chunkmemset_safe_neon
/* Read one 16-byte chunk from `s` into *chunk. */
static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
    const chunk_t loaded = vld1q_u8(s);

    *chunk = loaded;
}

/* Write the 16-byte chunk *chunk to `out`. */
static inline void storechunk(uint8_t *out, chunk_t *chunk) {
    const chunk_t pending = *chunk;

    vst1q_u8(out, pending);
}
/* Build a 16-byte chunk from the bytes at `buf`, rearranged by the permutation
 * row that perm_idx_lut selects for `dist`, and store the table's remainder
 * value for that distance in *chunk_rem.
 *
 * perm_idx_lut has 13 entries and is indexed with (dist - 3) without a range
 * check, so dist must be in 3..15 -- presumably guaranteed by the caller in
 * chunkset_tpl.h; confirm there. A full 16 bytes are always loaded from buf,
 * regardless of dist. */
static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t dist) {
lut_rem_pair lut_rem = perm_idx_lut[dist - 3];
*chunk_rem = lut_rem.remval;
#ifdef Z_MEMORY_SANITIZER
/* See note in chunkset_ssse3.c for why this is ok */
__msan_unpoison(buf + dist, 16 - dist);
#endif
/* This version of table is only available on aarch64 */
#if defined(_M_ARM64) || defined(__aarch64__)
/* Single 16-byte table lookup: source bytes indexed by the permutation row. */
uint8x16_t ret_vec = vld1q_u8(buf);
uint8x16_t perm_vec = vld1q_u8(permute_table + lut_rem.idx);
return vqtbl1q_u8(ret_vec, perm_vec);
#else
/* 32-bit path: the lookup is done as two 8-byte halves. The low half indexes
 * only the first 8 source bytes (a); the high half indexes the 16-byte pair
 * {a, b}. */
uint8x8_t ret0, ret1, a, b, perm_vec0, perm_vec1;
perm_vec0 = vld1_u8(permute_table + lut_rem.idx);
perm_vec1 = vld1_u8(permute_table + lut_rem.idx + 8);
a = vld1_u8(buf);
b = vld1_u8(buf + 8);
ret0 = vtbl1_u8(a, perm_vec0);
uint8x8x2_t ab = {{a, b}};
ret1 = vtbl2_u8(ab, perm_vec1);
return vcombine_u8(ret0, ret1);
#endif
}
#include "chunkset_tpl.h"
#define INFLATE_FAST inflate_fast_neon
#include "inffast_tpl.h"
#endif

59
deps/zlib-ng/arch/arm/compare256_neon.c vendored Normal file
View File

@@ -0,0 +1,59 @@
/* compare256_neon.c - NEON version of compare256
* Copyright (C) 2022 Nathan Moinvaziri
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "../../zbuild.h"
#include "fallback_builtins.h"
#if defined(ARM_NEON) && defined(HAVE_BUILTIN_CTZLL)
#include "neon_intrins.h"
static inline uint32_t compare256_neon_static(const uint8_t *src0, const uint8_t *src1) {
uint32_t len = 0;
do {
uint8x16_t a, b, cmp;
uint64_t lane;
a = vld1q_u8(src0);
b = vld1q_u8(src1);
cmp = veorq_u8(a, b);
lane = vgetq_lane_u64(vreinterpretq_u64_u8(cmp), 0);
if (lane) {
uint32_t match_byte = (uint32_t)__builtin_ctzll(lane) / 8;
return len + match_byte;
}
len += 8;
lane = vgetq_lane_u64(vreinterpretq_u64_u8(cmp), 1);
if (lane) {
uint32_t match_byte = (uint32_t)__builtin_ctzll(lane) / 8;
return len + match_byte;
}
len += 8;
src0 += 16, src1 += 16;
} while (len < 256);
return 256;
}
/* Externally visible (library-internal) entry point; forwards to the static
 * inline implementation above. */
Z_INTERNAL uint32_t compare256_neon(const uint8_t *src0, const uint8_t *src1) {
return compare256_neon_static(src0, src1);
}
/* match_tpl.h is included twice, each time with a different LONGEST_MATCH name
 * and with COMPARE256 bound to the static NEON comparer; the second inclusion
 * additionally defines LONGEST_MATCH_SLOW. Presumably each inclusion emits one
 * longest-match function -- see match_tpl.h. */
#define LONGEST_MATCH longest_match_neon
#define COMPARE256 compare256_neon_static
#include "match_tpl.h"
#define LONGEST_MATCH_SLOW
#define LONGEST_MATCH longest_match_slow_neon
#define COMPARE256 compare256_neon_static
#include "match_tpl.h"
#endif

98
deps/zlib-ng/arch/arm/crc32_acle.c vendored Normal file
View File

@@ -0,0 +1,98 @@
/* crc32_acle.c -- compute the CRC-32 of a data stream
* Copyright (C) 1995-2006, 2010, 2011, 2012 Mark Adler
* Copyright (C) 2016 Yang Zhang
* For conditions of distribution and use, see copyright notice in zlib.h
*
*/
#ifdef ARM_ACLE
#ifdef _MSC_VER
# include <intrin.h>
#else
# include <arm_acle.h>
#endif
#include "../../zbuild.h"
/* Compute the CRC-32 of buf[0..len) with the ARMv8 CRC32 instructions.
 *
 * crc is the running checksum (it is inverted on entry and again on exit).
 * The buffer is consumed in three phases:
 *   1. a byte / halfword (and, on AArch64, a word) head that brings the read
 *      pointer up to the natural alignment of the widest access,
 *   2. a main loop of 8-byte (AArch64) or 4-byte (32-bit ARM) steps,
 *   3. a word / halfword / byte tail for whatever is left.
 * Returns the updated checksum.
 */
Z_INTERNAL uint32_t crc32_acle(uint32_t crc, const uint8_t *buf, size_t len) {
    Z_REGISTER uint32_t c;
    Z_REGISTER const uint16_t *buf2;
    Z_REGISTER const uint32_t *buf4;

    c = ~crc;

    /* Head: one byte if the pointer is odd. */
    if (len && ((ptrdiff_t)buf & 1)) {
        c = __crc32b(c, *buf++);
        len--;
    }

    /* Head: one halfword if the pointer is 2- but not 4-byte aligned.
     * Note that only buf2/buf4 advance here; buf itself is left behind. */
    if ((len >= sizeof(uint16_t)) && ((ptrdiff_t)buf & sizeof(uint16_t))) {
        buf2 = (const uint16_t *) buf;
        c = __crc32h(c, *buf2++);
        len -= sizeof(uint16_t);
        buf4 = (const uint32_t *) buf2;
    } else {
        buf4 = (const uint32_t *) buf;
    }

#if defined(__aarch64__) || defined(_M_ARM64)
    /* Head: one word if the *current* position is 4- but not 8-byte aligned.
     * The test must look at buf4: buf is stale when the halfword step above
     * ran, and testing it would leave the 64-bit loop on a misaligned pointer. */
    if ((len >= sizeof(uint32_t)) && ((ptrdiff_t)buf4 & sizeof(uint32_t))) {
        c = __crc32w(c, *buf4++);
        len -= sizeof(uint32_t);
    }

    if (len == 0) {
        c = ~c;
        return c;
    }

    /* Main loop: 8 bytes per step. */
    const uint64_t *buf8 = (const uint64_t *) buf4;

    while (len >= sizeof(uint64_t)) {
        c = __crc32d(c, *buf8++);
        len -= sizeof(uint64_t);
    }

    /* Tail: at most one word, then at most one halfword. */
    if (len >= sizeof(uint32_t)) {
        buf4 = (const uint32_t *) buf8;
        c = __crc32w(c, *buf4++);
        len -= sizeof(uint32_t);
        buf2 = (const uint16_t *) buf4;
    } else {
        buf2 = (const uint16_t *) buf8;
    }

    if (len >= sizeof(uint16_t)) {
        c = __crc32h(c, *buf2++);
        len -= sizeof(uint16_t);
    }

    buf = (const unsigned char *) buf2;
#else /* __aarch64__ */

    if (len == 0) {
        c = ~c;
        return c;
    }

    /* Main loop: 4 bytes per step. */
    while (len >= sizeof(uint32_t)) {
        c = __crc32w(c, *buf4++);
        len -= sizeof(uint32_t);
    }

    /* Tail: at most one halfword. */
    if (len >= sizeof(uint16_t)) {
        buf2 = (const uint16_t *) buf4;
        c = __crc32h(c, *buf2++);
        len -= sizeof(uint16_t);
        buf = (const unsigned char *) buf2;
    } else {
        buf = (const unsigned char *) buf4;
    }
#endif /* __aarch64__ */

    /* Tail: at most one byte. */
    if (len) {
        c = __crc32b(c, *buf);
    }

    c = ~c;
    return c;
}
#endif

View File

@@ -0,0 +1,26 @@
/* insert_string_acle.c -- insert_string integer hash variant using ACLE's CRC instructions
*
* Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler
* For conditions of distribution and use, see copyright notice in zlib.h
*
*/
#ifdef ARM_ACLE
#ifndef _MSC_VER
# include <arm_acle.h>
#endif
#include "../../zbuild.h"
#include "../../deflate.h"
/* Configuration for insert_string_tpl.h: the hash of a value is its CRC32
 * (word form, initial CRC 0), computed with the ACLE __crc32w intrinsic. The
 * `s` argument of HASH_CALC is not used by this variant. */
#define HASH_CALC(s, h, val) \
h = __crc32w(0, val)
/* Name and declaration of the local variable that receives the hash. */
#define HASH_CALC_VAR h
#define HASH_CALC_VAR_INIT uint32_t h = 0
/* ACLE-specific names for the generic identifiers used by the template. */
#define UPDATE_HASH update_hash_acle
#define INSERT_STRING insert_string_acle
#define QUICK_INSERT_STRING quick_insert_string_acle
#include "../../insert_string_tpl.h"
#endif

57
deps/zlib-ng/arch/arm/neon_intrins.h vendored Normal file
View File

@@ -0,0 +1,57 @@
#ifndef ARM_NEON_INTRINS_H
#define ARM_NEON_INTRINS_H
#ifdef _M_ARM64
# include <arm64_neon.h>
#else
# include <arm_neon.h>
#endif
#if defined(ARM_NEON) && !defined(__aarch64__) && !defined(_M_ARM64)
/* Compatibility shim for the _high family of functions */
#define vmull_high_u8(a, b) vmull_u8(vget_high_u8(a), vget_high_u8(b))
#define vmlal_high_u8(a, b, c) vmlal_u8(a, vget_high_u8(b), vget_high_u8(c))
#define vmlal_high_u16(a, b, c) vmlal_u16(a, vget_high_u16(b), vget_high_u16(c))
#define vaddw_high_u8(a, b) vaddw_u8(a, vget_high_u8(b))
#endif
#ifdef ARM_NEON
/* Saturating subtract of the single vector `b` from each of the four vectors
 * in `a` (a uint16x8x4_t), writing the four results to `out`. `out` and `a`
 * are expanded as lvalues, so they must be plain variables; `out` may be the
 * same variable as `a`. */
#define vqsubq_u16_x4_x1(out, a, b) do { \
out.val[0] = vqsubq_u16(a.val[0], b); \
out.val[1] = vqsubq_u16(a.val[1], b); \
out.val[2] = vqsubq_u16(a.val[2], b); \
out.val[3] = vqsubq_u16(a.val[3], b); \
} while (0)
# ifndef ARM_NEON_HASLD4
static inline uint16x8x4_t vld1q_u16_x4(uint16_t const *a) {
uint16x8x4_t ret = (uint16x8x4_t) {{
vld1q_u16(a),
vld1q_u16(a+8),
vld1q_u16(a+16),
vld1q_u16(a+24)}};
return ret;
}
static inline uint8x16x4_t vld1q_u8_x4(uint8_t const *a) {
uint8x16x4_t ret = (uint8x16x4_t) {{
vld1q_u8(a),
vld1q_u8(a+16),
vld1q_u8(a+32),
vld1q_u8(a+48)}};
return ret;
}
/* Fallback for toolchains without a native vst1q_u16_x4 (ARM_NEON_HASLD4 not
 * defined): store the four 8-lane vectors of `a` to 32 consecutive uint16_t
 * slots starting at `p`. */
static inline void vst1q_u16_x4(uint16_t *p, uint16x8x4_t a) {
    int part;

    for (part = 0; part < 4; ++part)
        vst1q_u16(p + 8 * part, a.val[part]);
}
# endif // HASLD4 check
#endif
#endif // include guard ARM_NEON_INTRINS_H

46
deps/zlib-ng/arch/arm/slide_hash_neon.c vendored Normal file
View File

@@ -0,0 +1,46 @@
/* slide_hash_neon.c -- Optimized hash table shifting for ARM with support for NEON instructions
* Copyright (C) 2017-2020 Mika T. Lindqvist
*
* Authors:
* Mika T. Lindqvist <postmaster@raasu.org>
* Jun He <jun.he@arm.com>
*
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifdef ARM_NEON
#include "neon_intrins.h"
#include "../../zbuild.h"
#include "../../deflate.h"
/* SIMD version of hash_chain rebase.
 *
 * Subtracts wsize, with unsigned saturation (entries smaller than wsize become
 * 0), from each of the `entries` 16-bit positions in `table`.
 *
 * Every loop pass rewrites 64 entries (eight 128-bit vectors), and the loop
 * body runs before its counter is tested, so the table must hold a non-zero
 * multiple of 64 entries.
 */
static inline void slide_hash_chain(Pos *table, uint32_t entries, uint16_t wsize) {
    Z_REGISTER uint16x8_t v;
    uint16x8x4_t p0, p1;
    Z_REGISTER size_t n;

    size_t size = entries*sizeof(table[0]);

    /* The divisor is the whole per-pass byte count. Without the inner
     * parentheses `%` binds before `*`, and the check degrades to "multiple of
     * one vector" instead of "multiple of eight vectors". */
    Assert((size % (sizeof(uint16x8_t) * 8) == 0), "hash table size err");

    Assert(sizeof(Pos) == 2, "Wrong Pos size");
    v = vdupq_n_u16(wsize);

    n = size / (sizeof(uint16x8_t) * 8);
    do {
        p0 = vld1q_u16_x4(table);
        p1 = vld1q_u16_x4(table+32);
        vqsubq_u16_x4_x1(p0, p0, v);
        vqsubq_u16_x4_x1(p1, p1, v);
        vst1q_u16_x4(table, p0);
        vst1q_u16_x4(table+32, p1);
        table += 64;
    } while (--n);
}
/* Rebase both hash tables of the deflate state by the window size: first the
 * head table (HASH_SIZE entries), then the prev table (w_size entries). */
Z_INTERNAL void slide_hash_neon(deflate_state *s) {
    const unsigned int window = s->w_size;

    slide_hash_chain(s->head, HASH_SIZE, window);
    slide_hash_chain(s->prev, window, window);
}
#endif

24
deps/zlib-ng/arch/generic/Makefile.in vendored Normal file
View File

@@ -0,0 +1,24 @@
# Makefile for zlib
# Copyright (C) 1995-2013 Jean-loup Gailly, Mark Adler
# For conditions of distribution and use, see copyright notice in zlib.h
CC=
CFLAGS=
SFLAGS=
INCLUDES=

SRCDIR=.
SRCTOP=../..
TOPDIR=$(SRCTOP)

# This directory contributes no objects of its own: `all` has neither
# prerequisites nor a recipe.
all:

mostlyclean: clean

# Each rm is a separate command. A trailing backslash on the first line would
# join the next line to it and turn "rm", "-rf" and "objs" into operands of the
# first rm.
clean:
	rm -f *.o *.lo *~
	rm -rf objs
	rm -f *.gcda *.gcno *.gcov

distclean: clean
	rm -f Makefile

View File

@@ -0,0 +1,53 @@
/* chunk_permute_table.h - shared permutation table for the chunkmemset family of functions (used by the AVX, SSSE3 and NEON variants).
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifndef CHUNK_PERMUTE_TABLE_H_
#define CHUNK_PERMUTE_TABLE_H_
#include "zbuild.h"
/* Need entries for all numbers not an even modulus for 1, 2, 4, 8, 16 & 32 */
/* Byte-index patterns, one row per distance. Rows for dist 3..15 are 32 bytes
 * long and simply repeat 0..dist-1; rows for dist 17..31 are 16 bytes long.
 * Fewer initializers are listed than the declared 26*32 elements; the
 * remainder is zero-initialized as for any partially initialized array. */
static const ALIGNED_(32) uint8_t permute_table[26*32] = {
0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, /* dist 3 */
0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, /* dist 5 */
0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, /* dist 6 */
0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, /* dist 7 */
0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, /* dist 9 */
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, /* dist 10 */
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, /* dist 11 */
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, /* dist 12 */
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 0, 1, 2, 3, 4, 5, /* dist 13 */
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0, 1, 2, 3, /* dist 14 */
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0, 1, /* dist 15 */
/* Beyond dists of 15 means we have to permute from a vector > len(m128i). Because AVX couldn't permute
* beyond 128 bit lanes until AVX512 for sub 4-byte sequences, we have to do some math here for an eventual
* blend with a comparison. That means we need to wrap the indices with yet another derived table. For simplicity,
* we'll use absolute indexing here to derive a blend vector. This is actually a lot simpler with ARM's TBL, but,
* this is what we're dealt.
*/
16, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, /* dist 17 */
16, 17, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, /* dist 18 */
16, 17, 18, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, /* dist 19 */
16, 17, 18, 19, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, /* dist 20 */
16, 17, 18, 19, 20, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, /* dist 21 */
16, 17, 18, 19, 20, 21, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, /* dist 22 */
16, 17, 18, 19, 20, 21, 22, 0, 1, 2, 3, 4, 5, 6, 7, 8, /* dist 23 */
16, 17, 18, 19, 20, 21, 22, 23, 0, 1, 2, 3, 4, 5, 6, 7, /* dist 24 */
16, 17, 18, 19, 20, 21, 22, 23, 24, 0, 1, 2, 3, 4, 5, 6, /* dist 25 */
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 0, 1, 2, 3, 4, 5, /* dist 26 */
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 0, 1, 2, 3, 4, /* dist 27 */
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 0, 1, 2, 3, /* dist 28 */
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 0, 1, 2, /* dist 29 */
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 0, 1, /* dist 30 */
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 0, /* dist 31 */
};
/* One entry of a per-distance lookup table that accompanies permute_table. */
typedef struct lut_rem_pair_s {
uint16_t idx; /* byte offset of the distance's row within permute_table */
uint16_t remval; /* remainder value reported for that distance (handed back via chunk_rem by GET_CHUNK_MAG) */
} lut_rem_pair;
#endif

93
deps/zlib-ng/arch/power/Makefile.in vendored Normal file
View File

@@ -0,0 +1,93 @@
# Makefile for POWER-specific files
# Copyright (C) 2020 Matheus Castanho <msc@linux.ibm.com>, IBM
# Copyright (C) 2021 Mika T. Lindqvist <postmaster@raasu.org>
# For conditions of distribution and use, see copyright notice in zlib.h
# Toolchain settings. They are empty in this template and are presumably
# substituted by the configure step -- confirm against the configure script.
CC=
CFLAGS=
SFLAGS=
INCLUDES=
SUFFIX=
# Extra per-file compiler flags selecting the instruction set each source needs.
P8FLAGS=-mcpu=power8
P9FLAGS=-mcpu=power9
PPCFLAGS=-maltivec
NOLTOFLAG=
SRCDIR=.
SRCTOP=../..
TOPDIR=$(SRCTOP)
# Every source is compiled twice: a .o built with CFLAGS and a .lo built with
# SFLAGS.
all: power_features.o \
power_features.lo \
adler32_power8.o \
adler32_power8.lo \
adler32_vmx.o \
adler32_vmx.lo \
chunkset_power8.o \
chunkset_power8.lo \
compare256_power9.o \
compare256_power9.lo \
crc32_power8.o \
crc32_power8.lo \
slide_hash_power8.o \
slide_hash_power8.lo \
slide_hash_vmx.o \
slide_hash_vmx.lo
# Feature detection is built without any ISA-specific flags.
power_features.o:
$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/power_features.c
power_features.lo:
$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/power_features.c
# POWER8 (VSX) sources: built with P8FLAGS.
adler32_power8.o:
$(CC) $(CFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_power8.c
adler32_power8.lo:
$(CC) $(SFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_power8.c
# VMX (AltiVec) sources: built with PPCFLAGS.
adler32_vmx.o:
$(CC) $(CFLAGS) $(PPCFLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_vmx.c
adler32_vmx.lo:
$(CC) $(SFLAGS) $(PPCFLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_vmx.c
chunkset_power8.o:
$(CC) $(CFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_power8.c
chunkset_power8.lo:
$(CC) $(SFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_power8.c
# POWER9 sources: built with P9FLAGS.
compare256_power9.o:
$(CC) $(CFLAGS) $(P9FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_power9.c
compare256_power9.lo:
$(CC) $(SFLAGS) $(P9FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_power9.c
crc32_power8.o:
$(CC) $(CFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_power8.c
crc32_power8.lo:
$(CC) $(SFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_power8.c
slide_hash_power8.o:
$(CC) $(CFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_power8.c
slide_hash_power8.lo:
$(CC) $(SFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_power8.c
# ${PPCFLAGS} and $(PPCFLAGS) are the same variable reference in make.
slide_hash_vmx.o:
$(CC) $(CFLAGS) ${PPCFLAGS} $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_vmx.c
slide_hash_vmx.lo:
$(CC) $(SFLAGS) ${PPCFLAGS} $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_vmx.c
mostlyclean: clean
# Remove objects, editor backups, the objs directory and coverage data.
clean:
rm -f *.o *.lo *~
rm -rf objs
rm -f *.gcda *.gcno *.gcov
# Additionally remove the generated Makefile.
distclean: clean
rm -f Makefile

153
deps/zlib-ng/arch/power/adler32_power8.c vendored Normal file
View File

@@ -0,0 +1,153 @@
/* Adler32 for POWER8 using VSX instructions.
* Copyright (C) 2020 IBM Corporation
* Author: Rogerio Alves <rcardoso@linux.ibm.com>
* For conditions of distribution and use, see copyright notice in zlib.h
*
* Calculate adler32 checksum for 16 bytes at once using POWER8+ VSX (vector)
* instructions.
*
* If adler32 do 1 byte at time on the first iteration s1 is s1_0 (_n means
* iteration n) is the initial value of adler - at start _0 is 1 unless
* adler initial value is different than 1. So s1_1 = s1_0 + c[0] after
* the first calculation. For the iteration s1_2 = s1_1 + c[1] and so on.
* Hence, for iteration N, s1_N = s1_(N-1) + c[N] is the value of s1 on
* after iteration N.
*
* Therefore, for s2 and iteration N, s2_N = s2_0 + N*s1_N + N*c[0] +
* N-1*c[1] + ... + c[N]
*
* In a more general way:
*
* s1_N = s1_0 + sum(i=1 to N)c[i]
* s2_N = s2_0 + N*s1 + sum (i=1 to N)(N-i+1)*c[i]
*
* Where s1_N, s2_N are the values for s1, s2 after N iterations. So if we
* can process N bytes at a time we can do this at once.
*
* Since VSX vectors hold 16 bytes, we can process 16 bytes at a time;
* using N = 16 we have:
*
* s1 = s1_16 = s1_(16-1) + c[16] = s1_0 + sum(i=1 to 16)c[i]
* s2 = s2_16 = s2_0 + 16*s1 + sum(i=1 to 16)(16-i+1)*c[i]
*
* After the first iteration we calculate the adler32 checksum for 16 bytes.
*
* For more background about adler32 please check the RFC:
* https://www.ietf.org/rfc/rfc1950.txt
*/
#ifdef POWER8_VSX
#include <altivec.h>
#include "zbuild.h"
#include "adler32_p.h"
/* Sum the four 32-bit lanes of __a and return the total in every lane.
 * The vector is added to itself rotated by 8 bytes, and that result to itself
 * rotated by 4 bytes; plain vec_add is used for both additions. The incoming
 * value of __b is ignored. */
static inline vector unsigned int vec_sumsu(vector unsigned int __a, vector unsigned int __b) {
    const vector unsigned int pair_sums = vec_add(vec_sld(__a, __a, 8), __a);

    (void)__b;
    return vec_add(vec_sld(pair_sums, pair_sums, 4), pair_sums);
}
/* Update the Adler-32 checksum `adler` with buf[0..len) and return the result.
 *
 * Short inputs are delegated to the scalar helpers (len == 1, len < 64). Longer
 * inputs are consumed 16 bytes per step with vector instructions: first in
 * blocks of NMAX bytes, each followed by a reduction modulo BASE, then in the
 * remaining 16-byte groups, and finally a scalar tail of fewer than 16 bytes.
 * A NULL buf (checked after the len == 1 shortcut) yields the initial value 1.
 */
Z_INTERNAL uint32_t adler32_power8(uint32_t adler, const uint8_t *buf, size_t len) {
/* Split the checksum into its two 16-bit running sums. */
uint32_t s1 = adler & 0xffff;
uint32_t s2 = (adler >> 16) & 0xffff;
/* in case user likes doing a byte at a time, keep it fast */
if (UNLIKELY(len == 1))
return adler32_len_1(s1, buf, s2);
/* If buffer is empty or len=0 we need to return adler initial value. */
if (UNLIKELY(buf == NULL))
return 1;
/* This is faster than VSX code for len < 64. */
if (len < 64)
return adler32_len_64(s1, buf, len, s2);
/* Use POWER VSX instructions for len >= 64. */
const vector unsigned int v_zeros = { 0 };
/* Weights 16..1: byte i of a group contributes (16 - i) times to s2. */
const vector unsigned char v_mul = {16, 15, 14, 13, 12, 11, 10, 9, 8, 7,
6, 5, 4, 3, 2, 1};
/* Shift count 4, used below to multiply the saved s1 totals by 16. */
const vector unsigned char vsh = vec_splat_u8(4);
/* Keeps element 0 only; applied after each per-block modulo. */
const vector unsigned int vmask = {0xffffffff, 0x0, 0x0, 0x0};
vector unsigned int vs1 = { 0 };
vector unsigned int vs2 = { 0 };
/* Running total of the s1 values seen before each 16-byte group. */
vector unsigned int vs1_save = { 0 };
vector unsigned int vsum1, vsum2;
vector unsigned char vbuf;
int n;
/* Seed element 0 of the accumulators with the incoming sums. */
vs1[0] = s1;
vs2[0] = s2;
/* Do length bigger than NMAX in blocks of NMAX size. */
while (len >= NMAX) {
len -= NMAX;
n = NMAX / 16;
do {
vbuf = vec_xl(0, (unsigned char *) buf);
vsum1 = vec_sum4s(vbuf, v_zeros); /* sum(i=1 to 16) buf[i]. */
/* sum(i=1 to 16) buf[i]*(16-i+1). */
vsum2 = vec_msum(vbuf, v_mul, v_zeros);
/* Save vs1. */
vs1_save = vec_add(vs1_save, vs1);
/* Accumulate the sums. */
vs1 = vec_add(vsum1, vs1);
vs2 = vec_add(vsum2, vs2);
buf += 16;
} while (--n);
/* Once each block of NMAX size. */
vs1 = vec_sumsu(vs1, vsum1);
vs1_save = vec_sll(vs1_save, vsh); /* 16*vs1_save. */
vs2 = vec_add(vs1_save, vs2);
vs2 = vec_sumsu(vs2, vsum2);
/* vs1[0] = (s1_i + sum(i=1 to 16)buf[i]) mod 65521. */
vs1[0] = vs1[0] % BASE;
/* vs2[0] = s2_i + 16*s1_save +
sum(i=1 to 16)(16-i+1)*buf[i] mod 65521. */
vs2[0] = vs2[0] % BASE;
/* Discard the other elements so only the reduced totals carry over. */
vs1 = vec_and(vs1, vmask);
vs2 = vec_and(vs2, vmask);
vs1_save = v_zeros;
}
/* len is less than NMAX one modulo is needed. */
if (len >= 16) {
while (len >= 16) {
len -= 16;
vbuf = vec_xl(0, (unsigned char *) buf);
vsum1 = vec_sum4s(vbuf, v_zeros); /* sum(i=1 to 16) buf[i]. */
/* sum(i=1 to 16) buf[i]*(16-i+1). */
vsum2 = vec_msum(vbuf, v_mul, v_zeros);
/* Save vs1. */
vs1_save = vec_add(vs1_save, vs1);
/* Accumulate the sums. */
vs1 = vec_add(vsum1, vs1);
vs2 = vec_add(vsum2, vs2);
buf += 16;
}
/* Since the size will be always less than NMAX we do this once. */
vs1 = vec_sumsu(vs1, vsum1);
vs1_save = vec_sll(vs1_save, vsh); /* 16*vs1_save. */
vs2 = vec_add(vs1_save, vs2);
vs2 = vec_sumsu(vs2, vsum2);
}
/* Copy result back to s1, s2 (mod 65521). */
s1 = vs1[0] % BASE;
s2 = vs2[0] % BASE;
/* Process tail (len < 16). */
return adler32_len_16(s1, buf, len, s2);
}
#endif /* POWER8_VSX */

181
deps/zlib-ng/arch/power/adler32_vmx.c vendored Normal file
View File

@@ -0,0 +1,181 @@
/* adler32_vmx.c -- compute the Adler-32 checksum of a data stream
* Copyright (C) 1995-2011 Mark Adler
* Copyright (C) 2017-2021 Mika T. Lindqvist <postmaster@raasu.org>
* Copyright (C) 2021 Adam Stylinski <kungfujesus06@gmail.com>
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifdef PPC_VMX
#include <altivec.h>
#include "zbuild.h"
#include "adler32_p.h"
#define vmx_zero() (vec_splat_u32(0))
/* Scalar Adler-32 step over buf[0..len): each byte is added to pair[0], and
 * the updated pair[0] is then added to pair[1]. No modulo reduction is applied
 * here. */
static inline void vmx_handle_head_or_tail(uint32_t *pair, const uint8_t *buf, size_t len) {
    size_t remaining = len;

    while (remaining != 0) {
        pair[0] += *buf++;
        pair[1] += pair[0];
        --remaining;
    }
}
/* Vectorized Adler-32 accumulation.
 *
 * s   - the two running sums: s[0] (byte sum) and s[1] (sum of sums). They are
 *       fetched with one 16-byte vec_ld, so s[2] and s[3] are read as well; the
 *       results are written back to s[0] and s[1].
 * buf - input data, read with vec_ld in 16-byte units.
 * len - number of 16-byte blocks to consume (not bytes): the main loop takes
 *       four blocks (64 bytes) per pass, the remainder loop one block.
 *
 * No modulo reduction happens here; the caller reduces s[] afterwards.
 * NOTE(review): vec_ld is presumably relied on to be given 16-byte aligned
 * addresses for both s and buf -- confirm against the caller.
 */
static void vmx_accum32(uint32_t *s, const uint8_t *buf, size_t len) {
/* Different taps for the separable components of sums */
const vector unsigned char t0 = {64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49};
const vector unsigned char t1 = {48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33};
const vector unsigned char t2 = {32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17};
const vector unsigned char t3 = {16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1};
/* As silly and inefficient as it seems, creating 1 permutation vector to permute
* a 2 element vector from a single load + a subsequent shift is just barely faster
* than doing 2 indexed insertions into zero initialized vectors from unaligned memory. */
const vector unsigned char s0_perm = {0, 1, 2, 3, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
/* 8 << 2 in every byte, i.e. a shift amount of 32 for vec_slo below. */
const vector unsigned char shift_vec = vec_sl(vec_splat_u8(8), vec_splat_u8(2));
vector unsigned int adacc, s2acc;
vector unsigned int pair_vec = vec_ld(0, s);
/* Seed the accumulators from the loaded pair: adacc from its first word via
 * the permute, s2acc from the vector shifted by shift_vec. */
adacc = vec_perm(pair_vec, pair_vec, s0_perm);
s2acc = vec_slo(pair_vec, shift_vec);
vector unsigned int zero = vmx_zero();
vector unsigned int s3acc = zero;
vector unsigned int s3acc_0 = zero;
vector unsigned int adacc_prev = adacc;
vector unsigned int adacc_prev_0 = zero;
vector unsigned int s2acc_0 = zero;
vector unsigned int s2acc_1 = zero;
vector unsigned int s2acc_2 = zero;
/* Maintain a running sum of a second half, this might help use break yet another
* data dependency bubble in the sum */
vector unsigned int adacc_0 = zero;
/* Four 16-byte blocks per main-loop pass; rem is the 0..3 blocks left over. */
int num_iter = len / 4;
int rem = len & 3;
for (int i = 0; i < num_iter; ++i) {
vector unsigned char d0 = vec_ld(0, buf);
vector unsigned char d1 = vec_ld(16, buf);
vector unsigned char d2 = vec_ld(32, buf);
vector unsigned char d3 = vec_ld(48, buf);
/* The core operation of the loop, basically
* what is being unrolled below */
adacc = vec_sum4s(d0, adacc);
s3acc = vec_add(s3acc, adacc_prev);
s3acc_0 = vec_add(s3acc_0, adacc_prev_0);
s2acc = vec_msum(t0, d0, s2acc);
/* interleave dependent sums in here */
adacc_0 = vec_sum4s(d1, adacc_0);
s2acc_0 = vec_msum(t1, d1, s2acc_0);
adacc = vec_sum4s(d2, adacc);
s2acc_1 = vec_msum(t2, d2, s2acc_1);
s2acc_2 = vec_msum(t3, d3, s2acc_2);
adacc_0 = vec_sum4s(d3, adacc_0);
adacc_prev = adacc;
adacc_prev_0 = adacc_0;
buf += 64;
}
/* Merge the split accumulators; the shift by 6 scales s3acc by the 64 bytes
 * consumed per main-loop pass. */
adacc = vec_add(adacc, adacc_0);
s3acc = vec_add(s3acc, s3acc_0);
s3acc = vec_sl(s3acc, vec_splat_u32(6));
if (rem) {
/* Leftover blocks are handled one at a time, so the carried byte sum is
 * scaled by 16 (shift by 4) instead of 64. */
adacc_prev = vec_add(adacc_prev_0, adacc_prev);
adacc_prev = vec_sl(adacc_prev, vec_splat_u32(4));
while (rem--) {
vector unsigned char d0 = vec_ld(0, buf);
adacc = vec_sum4s(d0, adacc);
s3acc = vec_add(s3acc, adacc_prev);
s2acc = vec_msum(t3, d0, s2acc);
adacc_prev = vec_sl(adacc, vec_splat_u32(4));
buf += 16;
}
}
/* Sum up independent second sums */
s2acc = vec_add(s2acc, s2acc_0);
s2acc_2 = vec_add(s2acc_1, s2acc_2);
s2acc = vec_add(s2acc, s2acc_2);
s2acc = vec_add(s2acc, s3acc);
/* Horizontal reduction: add each vector to itself rotated by 8 and then by 4
 * bytes, so every lane holds the total. */
adacc = vec_add(adacc, vec_sld(adacc, adacc, 8));
s2acc = vec_add(s2acc, vec_sld(s2acc, s2acc, 8));
adacc = vec_add(adacc, vec_sld(adacc, adacc, 4));
s2acc = vec_add(s2acc, vec_sld(s2acc, s2acc, 4));
/* Write one element of each total back to s[0] and s[1]. */
vec_ste(adacc, 0, s);
vec_ste(s2acc, 0, s+1);
}
/* Update the Adler-32 checksum `adler` with buf[0..len) and return the result.
 *
 * Inputs of length 1 or shorter than 16 bytes go to the scalar helpers. For
 * longer inputs the unaligned head is summed byte-wise, the 16-byte-aligned
 * middle is handed to vmx_accum32() in rounds of at most NMAX bytes (with a
 * reduction modulo BASE after each round), and any remaining bytes are summed
 * byte-wise. A NULL buf (checked after the len == 1 shortcut) yields 1.
 */
Z_INTERNAL uint32_t adler32_vmx(uint32_t adler, const uint8_t *buf, size_t len) {
    uint32_t sum2;
    uint32_t pair[16] ALIGNED_(16);
    /* Zero every element after the two running sums. memset counts bytes, so
     * the element count has to be scaled by the element size. */
    memset(&pair[2], 0, 14 * sizeof(pair[0]));
    int n = NMAX;
    unsigned int done = 0, i;

    /* Split Adler-32 into component sums, it can be supplied by
     * the caller sites (e.g. in a PNG file).
     */
    sum2 = (adler >> 16) & 0xffff;
    adler &= 0xffff;
    pair[0] = adler;
    pair[1] = sum2;

    /* in case user likes doing a byte at a time, keep it fast */
    if (UNLIKELY(len == 1))
        return adler32_len_1(adler, buf, sum2);

    /* initial Adler-32 value (deferred check for len == 1 speed) */
    if (UNLIKELY(buf == NULL))
        return 1L;

    /* in case short lengths are provided, keep it somewhat fast */
    if (UNLIKELY(len < 16))
        return adler32_len_16(adler, buf, len, sum2);

    // Align buffer
    unsigned int al = 0;
    if ((uintptr_t)buf & 0xf) {
        al = 16-((uintptr_t)buf & 0xf);
        if (al > len) {
            al=len;
        }
        vmx_handle_head_or_tail(pair, buf, al);

        done += al;
        /* Rather than rebasing, we can reduce the max sums for the
         * first round only */
        n -= al;
    }

    /* NOTE(review): i advances by n while done advances by (n / 16) * 16, so the
     * two can differ by up to 15 after a first round with al != 0. This looks
     * like it relies on vec_ld ignoring the low four address bits inside
     * vmx_accum32 -- confirm before restructuring this loop. */
    for (i = al; i < len; i += n) {
        int remaining = (int)(len-i);
        n = MIN(remaining, (i == al) ? n : NMAX);

        if (n < 16)
            break;

        vmx_accum32(pair, buf + i, n / 16);
        pair[0] %= BASE;
        pair[1] %= BASE;

        done += (n / 16) * 16;
    }

    /* Handle the tail elements. */
    if (done < len) {
        vmx_handle_head_or_tail(pair, (buf + done), len - done);
        pair[0] %= BASE;
        pair[1] %= BASE;
    }

    /* D = B * 65536 + A, see: https://en.wikipedia.org/wiki/Adler-32. */
    return (pair[1] << 16) | pair[0];
}
#endif

View File

@@ -0,0 +1,55 @@
/* chunkset_power8.c -- VSX inline functions to copy small data chunks.
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifdef POWER8_VSX
#include <altivec.h>
#include "../../zbuild.h"
typedef vector unsigned char chunk_t;
#define CHUNK_SIZE 16
#define HAVE_CHUNKMEMSET_2
#define HAVE_CHUNKMEMSET_4
#define HAVE_CHUNKMEMSET_8
/* Replicate the 2-byte pattern found at `from` across the whole of *chunk.
 * The pattern is fetched with memcpy rather than through a cast pointer. */
static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
    uint16_t pattern;

    memcpy(&pattern, from, sizeof(pattern));
    *chunk = (vector unsigned char)vec_splats(pattern);
}

/* Replicate the 4-byte pattern found at `from` across the whole of *chunk. */
static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
    uint32_t pattern;

    memcpy(&pattern, from, sizeof(pattern));
    *chunk = (vector unsigned char)vec_splats(pattern);
}

/* Replicate the 8-byte pattern found at `from` across the whole of *chunk. */
static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
    uint64_t pattern;

    memcpy(&pattern, from, sizeof(pattern));
    *chunk = (vector unsigned char)vec_splats(pattern);
}
/* Read one 16-byte chunk from `s` into *chunk. */
static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
    const chunk_t loaded = vec_xl(0, s);

    *chunk = loaded;
}

/* Write the 16-byte chunk *chunk to `out`. */
static inline void storechunk(uint8_t *out, chunk_t *chunk) {
    const chunk_t pending = *chunk;

    vec_xst(pending, 0, out);
}
/* POWER8-specific names for the generic CHUNK* identifiers; presumably
 * consumed by chunkset_tpl.h, included right below. */
#define CHUNKSIZE chunksize_power8
#define CHUNKCOPY chunkcopy_power8
#define CHUNKUNROLL chunkunroll_power8
#define CHUNKMEMSET chunkmemset_power8
#define CHUNKMEMSET_SAFE chunkmemset_safe_power8
#include "chunkset_tpl.h"
/* Likewise, the name given to the function that inffast_tpl.h presumably emits. */
#define INFLATE_FAST inflate_fast_power8
#include "inffast_tpl.h"
#endif

View File

@@ -0,0 +1,66 @@
/* compare256_power9.c - Power9 version of compare256
* Copyright (C) 2019 Matheus Castanho <msc@linux.ibm.com>, IBM
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifdef POWER9
#include <altivec.h>
#include "../../zbuild.h"
#include "../../zendian.h"
/* Older versions of GCC misimplemented semantics for these bit counting builtins.
* https://gcc.gnu.org/git/gitweb.cgi?p=gcc.git;h=3f30f2d1dbb3228b8468b26239fe60c2974ce2ac */
#if defined(__GNUC__) && (__GNUC__ < 12)
# define zng_vec_vctzlsbb(vc, len) __asm__ volatile("vctzlsbb %0, %1\n\t" : "=r" (len) : "v" (vc))
# define zng_vec_vclzlsbb(vc, len) __asm__ volatile("vclzlsbb %0, %1\n\t" : "=r" (len) : "v" (vc))
#else
# define zng_vec_vctzlsbb(vc, len) len = __builtin_vec_vctzlsbb(vc)
# define zng_vec_vclzlsbb(vc, len) len = __builtin_vec_vclzlsbb(vc)
#endif
static inline uint32_t compare256_power9_static(const uint8_t *src0, const uint8_t *src1) {
uint32_t len = 0, cmplen;
do {
vector unsigned char vsrc0, vsrc1, vc;
vsrc0 = *((vector unsigned char *)src0);
vsrc1 = *((vector unsigned char *)src1);
/* Compare 16 bytes at a time. Each byte of vc will be either
* all ones or all zeroes, depending on the result of the comparison. */
vc = (vector unsigned char)vec_cmpne(vsrc0, vsrc1);
/* Since the index of matching bytes will contain only zeroes
* on vc (since we used cmpne), counting the number of consecutive
* bytes where LSB == 0 is the same as counting the length of the match. */
#if BYTE_ORDER == LITTLE_ENDIAN
zng_vec_vctzlsbb(vc, cmplen);
#else
zng_vec_vclzlsbb(vc, cmplen);
#endif
if (cmplen != 16)
return len + cmplen;
src0 += 16, src1 += 16, len += 16;
} while (len < 256);
return 256;
}
/* Externally visible (library-internal) entry point; forwards to the static
 * inline implementation above. */
Z_INTERNAL uint32_t compare256_power9(const uint8_t *src0, const uint8_t *src1) {
return compare256_power9_static(src0, src1);
}
/* match_tpl.h is included twice, each time with a different LONGEST_MATCH name
 * and with COMPARE256 bound to the static POWER9 comparer; the second
 * inclusion additionally defines LONGEST_MATCH_SLOW. Presumably each inclusion
 * emits one longest-match function -- see match_tpl.h. */
#define LONGEST_MATCH longest_match_power9
#define COMPARE256 compare256_power9_static
#include "match_tpl.h"
#define LONGEST_MATCH_SLOW
#define LONGEST_MATCH longest_match_slow_power9
#define COMPARE256 compare256_power9_static
#include "match_tpl.h"
#endif

1123
deps/zlib-ng/arch/power/crc32_constants.h vendored Normal file

File diff suppressed because it is too large Load Diff

589
deps/zlib-ng/arch/power/crc32_power8.c vendored Normal file
View File

@@ -0,0 +1,589 @@
/* crc32 for POWER8 using VSX instructions
* Copyright (C) 2021 IBM Corporation
*
* Author: Rogerio Alves <rogealve@br.ibm.com>
*
* For conditions of distribution and use, see copyright notice in zlib.h
*
* Calculate the checksum of data that is 16 byte aligned and a multiple of
* 16 bytes.
*
* The first step is to reduce it to 1024 bits. We do this in 8 parallel
* chunks in order to mask the latency of the vpmsum instructions. If we
* have more than 32 kB of data to checksum we repeat this step multiple
* times, passing in the previous 1024 bits.
*
* The next step is to reduce the 1024 bits to 64 bits. This step adds
* 32 bits of 0s to the end - this matches what a CRC does. We just
* calculate constants that land the data in this 32 bits.
*
* We then use fixed point Barrett reduction to compute a mod n over GF(2)
* for n = CRC using POWER8 instructions. We use x = 32.
*
* http://en.wikipedia.org/wiki/Barrett_reduction
*
* This code uses gcc vector builtins instead using assembly directly.
*/
#include <altivec.h>
#include "zendian.h"
#include "zbuild.h"
#include "crc32_constants.h"
#include "crc32_braid_tbl.h"
#if defined (__clang__)
#include "fallback_builtins.h"
#endif
#define MAX_SIZE 32768
#define VMX_ALIGN 16
#define VMX_ALIGN_MASK (VMX_ALIGN-1)
/* Table-driven, byte-at-a-time CRC update over p[0..len). The crc argument and
 * the return value are the raw running value: no inversion is applied here. */
static unsigned int crc32_align(unsigned int crc, const unsigned char *p, unsigned long len) {
    unsigned long k;

    for (k = 0; k < len; ++k)
        crc = crc_table[(crc ^ p[k]) & 0xff] ^ (crc >> 8);

    return crc;
}
static unsigned int ALIGNED_(32) __crc32_vpmsum(unsigned int crc, const void* p, unsigned long len);
/* Update the CRC-32 `crc` with p[0.._len) and return the result; a NULL p
 * yields 0.
 *
 * Inputs shorter than VMX_ALIGN + VMX_ALIGN_MASK bytes are handled entirely by
 * the byte-wise helper. Otherwise the bytes before the first 16-byte boundary
 * and the bytes after the last whole 16-byte block go through the byte-wise
 * helper, and the aligned middle goes through __crc32_vpmsum(). */
Z_INTERNAL uint32_t crc32_power8(uint32_t crc, const unsigned char *p, size_t _len) {
    unsigned long remaining = (unsigned long) _len;

    if (p == (const unsigned char *) 0x0)
        return 0;

    crc ^= 0xffffffff;

    if (remaining < VMX_ALIGN + VMX_ALIGN_MASK) {
        crc = crc32_align(crc, p, remaining);
    } else {
        unsigned long vector_bytes;
        unsigned int tail;

        if ((unsigned long)p & VMX_ALIGN_MASK) {
            /* Bytes needed to reach the next 16-byte boundary. */
            unsigned int head = VMX_ALIGN - ((unsigned long)p & VMX_ALIGN_MASK);

            crc = crc32_align(crc, p, head);
            remaining -= head;
            p += head;
        }

        vector_bytes = remaining & ~VMX_ALIGN_MASK;
        crc = __crc32_vpmsum(crc, p, vector_bytes);

        tail = remaining & VMX_ALIGN_MASK;
        if (tail) {
            p += vector_bytes;
            crc = crc32_align(crc, p, tail);
        }
    }

    crc ^= 0xffffffff;
    return crc;
}
/* When we have a load-store in a single-dispatch group and address overlap
* such that forward is not allowed (load-hit-store) the group must be flushed.
* A group ending NOP prevents the flush.
*/
#define GROUP_ENDING_NOP __asm__("ori 2,2,0" ::: "memory")
/* Input bytes are permuted after loading only on big-endian builds. */
#if BYTE_ORDER == BIG_ENDIAN
#define BYTESWAP_DATA
#endif
/* VEC_PERM(vr, va, vb, vc): when BYTESWAP_DATA is defined, assigns
 * vec_perm(va, vb, vc) to vr; otherwise it expands to nothing, leaving the
 * loaded vector as it is.
 */
#ifdef BYTESWAP_DATA
#define VEC_PERM(vr, va, vb, vc) vr = vec_perm(va, vb, (__vector unsigned char) vc)
/* NOTE(review): in this file BYTESWAP_DATA is only defined for BIG_ENDIAN, so
 * the LITTLE_ENDIAN branch below looks unreachable unless BYTESWAP_DATA is
 * also supplied from outside (e.g. on the compiler command line) - confirm.
 */
#if BYTE_ORDER == LITTLE_ENDIAN
/* Byte reverse permute constant LE. */
static const __vector unsigned long long vperm_const ALIGNED_(16) = { 0x08090A0B0C0D0E0FUL, 0x0001020304050607UL };
#else
/* Permute constant used when BYTE_ORDER is not LITTLE_ENDIAN. */
static const __vector unsigned long long vperm_const ALIGNED_(16) = { 0x0F0E0D0C0B0A0908UL, 0X0706050403020100UL };
#endif
#else
#define VEC_PERM(vr, va, vb, vc)
#endif
/* Vector CRC-32 core built on the vpmsum (polynomial multiply-sum) builtins.
 *
 * crc - running CRC value; the caller (crc32_power8) has already inverted it
 *       and inverts the returned value again.
 * p   - input data; per the file header it must be 16 byte aligned.
 * len - number of bytes; per the file header a multiple of 16. The caller
 *       passes at least 16 bytes.
 *
 * Two paths:
 *   - len < 256: every 16-byte chunk is multiplied by its constant from
 *     vcrc_short_const and accumulated into v0.
 *   - otherwise: the data is consumed in blocks of at most MAX_SIZE bytes,
 *     128 bytes per step, with eight independent accumulators (v0-v7) fed
 *     from vcrc_const; the result and the remaining (len & 127) bytes are
 *     then reduced with vcrc_short_const and xor-ed into v0.
 * Both paths end in the Barrett reduction using v_Barrett_const, which
 * produces the 32-bit value that is returned.
 */
static unsigned int ALIGNED_(32) __crc32_vpmsum(unsigned int crc, const void* p, unsigned long len) {
const __vector unsigned long long vzero = {0,0};
const __vector unsigned long long vones = {0xffffffffffffffffUL, 0xffffffffffffffffUL};
const __vector unsigned long long vmask_32bit =
(__vector unsigned long long)vec_sld((__vector unsigned char)vzero, (__vector unsigned char)vones, 4);
const __vector unsigned long long vmask_64bit =
(__vector unsigned long long)vec_sld((__vector unsigned char)vzero, (__vector unsigned char)vones, 8);
__vector unsigned long long vcrc;
__vector unsigned long long vconst1, vconst2;
/* vdata0-vdata7 will contain our data (p). */
__vector unsigned long long vdata0, vdata1, vdata2, vdata3, vdata4, vdata5, vdata6, vdata7;
/* v0-v7 will contain our checksums */
__vector unsigned long long v0 = {0,0};
__vector unsigned long long v1 = {0,0};
__vector unsigned long long v2 = {0,0};
__vector unsigned long long v3 = {0,0};
__vector unsigned long long v4 = {0,0};
__vector unsigned long long v5 = {0,0};
__vector unsigned long long v6 = {0,0};
__vector unsigned long long v7 = {0,0};
/* Vector auxiliary variables. */
__vector unsigned long long va0, va1, va2, va3, va4, va5, va6, va7;
unsigned int offset; /* Constant table offset. */
unsigned long i; /* Counter. */
unsigned long chunks;
unsigned long block_size;
int next_block = 0;
/* Round len down to a multiple of 128 bytes. The remaining (len & 127)
 * bytes are reduced separately after the block loop. */
unsigned long length = len & 0xFFFFFFFFFFFFFF80UL;
vcrc = (__vector unsigned long long)__builtin_pack_vector_int128(0UL, crc);
/* Short version. */
if (len < 256) {
/* Calculate where in the constant table we need to start. */
offset = 256 - len;
vconst1 = vec_ld(offset, vcrc_short_const);
vdata0 = vec_ld(0, (__vector unsigned long long*) p);
VEC_PERM(vdata0, vdata0, vconst1, vperm_const);
/* xor initial value */
vdata0 = vec_xor(vdata0, vcrc);
vdata0 = (__vector unsigned long long) __builtin_crypto_vpmsumw(
(__vector unsigned int)vdata0, (__vector unsigned int)vconst1);
v0 = vec_xor(v0, vdata0);
/* Remaining 16-byte chunks, each paired with the next table constant. */
for (i = 16; i < len; i += 16) {
vconst1 = vec_ld(offset + i, vcrc_short_const);
vdata0 = vec_ld(i, (__vector unsigned long long*) p);
VEC_PERM(vdata0, vdata0, vconst1, vperm_const);
vdata0 = (__vector unsigned long long) __builtin_crypto_vpmsumw(
(__vector unsigned int)vdata0, (__vector unsigned int)vconst1);
v0 = vec_xor(v0, vdata0);
}
} else {
/* Load initial values. */
vdata0 = vec_ld(0, (__vector unsigned long long*) p);
vdata1 = vec_ld(16, (__vector unsigned long long*) p);
VEC_PERM(vdata0, vdata0, vdata0, vperm_const);
VEC_PERM(vdata1, vdata1, vdata1, vperm_const);
vdata2 = vec_ld(32, (__vector unsigned long long*) p);
vdata3 = vec_ld(48, (__vector unsigned long long*) p);
VEC_PERM(vdata2, vdata2, vdata2, vperm_const);
VEC_PERM(vdata3, vdata3, vdata3, vperm_const);
vdata4 = vec_ld(64, (__vector unsigned long long*) p);
vdata5 = vec_ld(80, (__vector unsigned long long*) p);
VEC_PERM(vdata4, vdata4, vdata4, vperm_const);
VEC_PERM(vdata5, vdata5, vdata5, vperm_const);
vdata6 = vec_ld(96, (__vector unsigned long long*) p);
vdata7 = vec_ld(112, (__vector unsigned long long*) p);
VEC_PERM(vdata6, vdata6, vdata6, vperm_const);
VEC_PERM(vdata7, vdata7, vdata7, vperm_const);
/* xor in initial value */
vdata0 = vec_xor(vdata0, vcrc);
p = (char *)p + 128;
do {
/* Checksum in blocks of MAX_SIZE. */
block_size = length;
if (block_size > MAX_SIZE) {
block_size = MAX_SIZE;
}
length = length - block_size;
/*
* Work out the offset into the constants table to start at. Each
* constant is 16 bytes, and it is used against 128 bytes of input
* data - 128 / 16 = 8
*/
offset = (MAX_SIZE/8) - (block_size/8);
/* We reduce our final 128 bytes in a separate step */
chunks = (block_size/128)-1;
vconst1 = vec_ld(offset, vcrc_const);
va0 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata0,
(__vector unsigned long long)vconst1);
va1 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata1,
(__vector unsigned long long)vconst1);
va2 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata2,
(__vector unsigned long long)vconst1);
va3 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata3,
(__vector unsigned long long)vconst1);
va4 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata4,
(__vector unsigned long long)vconst1);
va5 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata5,
(__vector unsigned long long)vconst1);
va6 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata6,
(__vector unsigned long long)vconst1);
va7 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata7,
(__vector unsigned long long)vconst1);
/* The guard also keeps the unsigned expression chunks-2 in the loop
 * bound below from wrapping when chunks is 0 or 1. */
if (chunks > 1) {
offset += 16;
vconst2 = vec_ld(offset, vcrc_const);
GROUP_ENDING_NOP;
vdata0 = vec_ld(0, (__vector unsigned long long*) p);
VEC_PERM(vdata0, vdata0, vdata0, vperm_const);
vdata1 = vec_ld(16, (__vector unsigned long long*) p);
VEC_PERM(vdata1, vdata1, vdata1, vperm_const);
vdata2 = vec_ld(32, (__vector unsigned long long*) p);
VEC_PERM(vdata2, vdata2, vdata2, vperm_const);
vdata3 = vec_ld(48, (__vector unsigned long long*) p);
VEC_PERM(vdata3, vdata3, vdata3, vperm_const);
vdata4 = vec_ld(64, (__vector unsigned long long*) p);
VEC_PERM(vdata4, vdata4, vdata4, vperm_const);
vdata5 = vec_ld(80, (__vector unsigned long long*) p);
VEC_PERM(vdata5, vdata5, vdata5, vperm_const);
vdata6 = vec_ld(96, (__vector unsigned long long*) p);
VEC_PERM(vdata6, vdata6, vdata6, vperm_const);
vdata7 = vec_ld(112, (__vector unsigned long long*) p);
VEC_PERM(vdata7, vdata7, vdata7, vperm_const);
p = (char *)p + 128;
/*
* main loop. Each iteration calculates the CRC for a 128-byte
* block.
*/
for (i = 0; i < chunks-2; i++) {
vconst1 = vec_ld(offset, vcrc_const);
offset += 16;
GROUP_ENDING_NOP;
v0 = vec_xor(v0, va0);
va0 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata0,
(__vector unsigned long long)vconst2);
vdata0 = vec_ld(0, (__vector unsigned long long*) p);
VEC_PERM(vdata0, vdata0, vdata0, vperm_const);
GROUP_ENDING_NOP;
v1 = vec_xor(v1, va1);
va1 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata1,
(__vector unsigned long long)vconst2);
vdata1 = vec_ld(16, (__vector unsigned long long*) p);
VEC_PERM(vdata1, vdata1, vdata1, vperm_const);
GROUP_ENDING_NOP;
v2 = vec_xor(v2, va2);
va2 = __builtin_crypto_vpmsumd((__vector unsigned long long)
vdata2, (__vector unsigned long long)vconst2);
vdata2 = vec_ld(32, (__vector unsigned long long*) p);
VEC_PERM(vdata2, vdata2, vdata2, vperm_const);
GROUP_ENDING_NOP;
v3 = vec_xor(v3, va3);
va3 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata3,
(__vector unsigned long long)vconst2);
vdata3 = vec_ld(48, (__vector unsigned long long*) p);
VEC_PERM(vdata3, vdata3, vdata3, vperm_const);
vconst2 = vec_ld(offset, vcrc_const);
GROUP_ENDING_NOP;
v4 = vec_xor(v4, va4);
va4 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata4,
(__vector unsigned long long)vconst1);
vdata4 = vec_ld(64, (__vector unsigned long long*) p);
VEC_PERM(vdata4, vdata4, vdata4, vperm_const);
GROUP_ENDING_NOP;
v5 = vec_xor(v5, va5);
va5 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata5,
(__vector unsigned long long)vconst1);
vdata5 = vec_ld(80, (__vector unsigned long long*) p);
VEC_PERM(vdata5, vdata5, vdata5, vperm_const);
GROUP_ENDING_NOP;
v6 = vec_xor(v6, va6);
va6 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata6,
(__vector unsigned long long)vconst1);
vdata6 = vec_ld(96, (__vector unsigned long long*) p);
VEC_PERM(vdata6, vdata6, vdata6, vperm_const);
GROUP_ENDING_NOP;
v7 = vec_xor(v7, va7);
va7 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata7,
(__vector unsigned long long)vconst1);
vdata7 = vec_ld(112, (__vector unsigned long long*) p);
VEC_PERM(vdata7, vdata7, vdata7, vperm_const);
p = (char *)p + 128;
}
/* First cool down */
vconst1 = vec_ld(offset, vcrc_const);
offset += 16;
v0 = vec_xor(v0, va0);
va0 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata0,
(__vector unsigned long long)vconst1);
GROUP_ENDING_NOP;
v1 = vec_xor(v1, va1);
va1 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata1,
(__vector unsigned long long)vconst1);
GROUP_ENDING_NOP;
v2 = vec_xor(v2, va2);
va2 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata2,
(__vector unsigned long long)vconst1);
GROUP_ENDING_NOP;
v3 = vec_xor(v3, va3);
va3 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata3,
(__vector unsigned long long)vconst1);
GROUP_ENDING_NOP;
v4 = vec_xor(v4, va4);
va4 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata4,
(__vector unsigned long long)vconst1);
GROUP_ENDING_NOP;
v5 = vec_xor(v5, va5);
va5 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata5,
(__vector unsigned long long)vconst1);
GROUP_ENDING_NOP;
v6 = vec_xor(v6, va6);
va6 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata6,
(__vector unsigned long long)vconst1);
GROUP_ENDING_NOP;
v7 = vec_xor(v7, va7);
va7 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata7,
(__vector unsigned long long)vconst1);
}/* end of if (chunks > 1) */
/* Second cool down. */
v0 = vec_xor(v0, va0);
v1 = vec_xor(v1, va1);
v2 = vec_xor(v2, va2);
v3 = vec_xor(v3, va3);
v4 = vec_xor(v4, va4);
v5 = vec_xor(v5, va5);
v6 = vec_xor(v6, va6);
v7 = vec_xor(v7, va7);
/*
* vpmsumd produces a 96 bit result in the least significant bits
* of the register. Since we are bit reflected we have to shift it
* left 32 bits so it occupies the least significant bits in the
* bit reflected domain.
*/
v0 = (__vector unsigned long long)vec_sld((__vector unsigned char)v0,
(__vector unsigned char)vzero, 4);
v1 = (__vector unsigned long long)vec_sld((__vector unsigned char)v1,
(__vector unsigned char)vzero, 4);
v2 = (__vector unsigned long long)vec_sld((__vector unsigned char)v2,
(__vector unsigned char)vzero, 4);
v3 = (__vector unsigned long long)vec_sld((__vector unsigned char)v3,
(__vector unsigned char)vzero, 4);
v4 = (__vector unsigned long long)vec_sld((__vector unsigned char)v4,
(__vector unsigned char)vzero, 4);
v5 = (__vector unsigned long long)vec_sld((__vector unsigned char)v5,
(__vector unsigned char)vzero, 4);
v6 = (__vector unsigned long long)vec_sld((__vector unsigned char)v6,
(__vector unsigned char)vzero, 4);
v7 = (__vector unsigned long long)vec_sld((__vector unsigned char)v7,
(__vector unsigned char)vzero, 4);
/* xor with the last 1024 bits. */
va0 = vec_ld(0, (__vector unsigned long long*) p);
VEC_PERM(va0, va0, va0, vperm_const);
va1 = vec_ld(16, (__vector unsigned long long*) p);
VEC_PERM(va1, va1, va1, vperm_const);
va2 = vec_ld(32, (__vector unsigned long long*) p);
VEC_PERM(va2, va2, va2, vperm_const);
va3 = vec_ld(48, (__vector unsigned long long*) p);
VEC_PERM(va3, va3, va3, vperm_const);
va4 = vec_ld(64, (__vector unsigned long long*) p);
VEC_PERM(va4, va4, va4, vperm_const);
va5 = vec_ld(80, (__vector unsigned long long*) p);
VEC_PERM(va5, va5, va5, vperm_const);
va6 = vec_ld(96, (__vector unsigned long long*) p);
VEC_PERM(va6, va6, va6, vperm_const);
va7 = vec_ld(112, (__vector unsigned long long*) p);
VEC_PERM(va7, va7, va7, vperm_const);
p = (char *)p + 128;
vdata0 = vec_xor(v0, va0);
vdata1 = vec_xor(v1, va1);
vdata2 = vec_xor(v2, va2);
vdata3 = vec_xor(v3, va3);
vdata4 = vec_xor(v4, va4);
vdata5 = vec_xor(v5, va5);
vdata6 = vec_xor(v6, va6);
vdata7 = vec_xor(v7, va7);
/* Check if we have more blocks to process */
next_block = 0;
if (length != 0) {
next_block = 1;
/* zero v0-v7 */
v0 = vec_xor(v0, v0);
v1 = vec_xor(v1, v1);
v2 = vec_xor(v2, v2);
v3 = vec_xor(v3, v3);
v4 = vec_xor(v4, v4);
v5 = vec_xor(v5, v5);
v6 = vec_xor(v6, v6);
v7 = vec_xor(v7, v7);
}
length = length + 128;
} while (next_block);
/* Calculate how many bytes we have left. */
length = (len & 127);
/* Calculate where in (short) constant table we need to start. */
offset = 128 - length;
v0 = vec_ld(offset, vcrc_short_const);
v1 = vec_ld(offset + 16, vcrc_short_const);
v2 = vec_ld(offset + 32, vcrc_short_const);
v3 = vec_ld(offset + 48, vcrc_short_const);
v4 = vec_ld(offset + 64, vcrc_short_const);
v5 = vec_ld(offset + 80, vcrc_short_const);
v6 = vec_ld(offset + 96, vcrc_short_const);
v7 = vec_ld(offset + 112, vcrc_short_const);
offset += 128;
v0 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
(__vector unsigned int)vdata0, (__vector unsigned int)v0);
v1 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
(__vector unsigned int)vdata1, (__vector unsigned int)v1);
v2 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
(__vector unsigned int)vdata2, (__vector unsigned int)v2);
v3 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
(__vector unsigned int)vdata3, (__vector unsigned int)v3);
v4 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
(__vector unsigned int)vdata4, (__vector unsigned int)v4);
v5 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
(__vector unsigned int)vdata5, (__vector unsigned int)v5);
v6 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
(__vector unsigned int)vdata6, (__vector unsigned int)v6);
v7 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
(__vector unsigned int)vdata7, (__vector unsigned int)v7);
/* Now reduce the tail (0-112 bytes). */
for (i = 0; i < length; i+=16) {
vdata0 = vec_ld(i,(__vector unsigned long long*)p);
VEC_PERM(vdata0, vdata0, vdata0, vperm_const);
va0 = vec_ld(offset + i,vcrc_short_const);
va0 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
(__vector unsigned int)vdata0, (__vector unsigned int)va0);
v0 = vec_xor(v0, va0);
}
/* xor all parallel chunks together. */
v0 = vec_xor(v0, v1);
v2 = vec_xor(v2, v3);
v4 = vec_xor(v4, v5);
v6 = vec_xor(v6, v7);
v0 = vec_xor(v0, v2);
v4 = vec_xor(v4, v6);
v0 = vec_xor(v0, v4);
}
/* Barrett Reduction */
vconst1 = vec_ld(0, v_Barrett_const);
vconst2 = vec_ld(16, v_Barrett_const);
v1 = (__vector unsigned long long)vec_sld((__vector unsigned char)v0,
(__vector unsigned char)v0, 8);
v0 = vec_xor(v1,v0);
/* shift left one bit */
__vector unsigned char vsht_splat = vec_splat_u8 (1);
v0 = (__vector unsigned long long)vec_sll((__vector unsigned char)v0, vsht_splat);
v0 = vec_and(v0, vmask_64bit);
/*
* The reflected version of Barrett reduction. Instead of bit
* reflecting our data (which is expensive to do), we bit reflect our
* constants and our algorithm, which means the intermediate data in
* our vector registers goes from 0-63 instead of 63-0. We can reflect
* the algorithm because we don't carry in mod 2 arithmetic.
*/
/* bottom 32 bits of a */
v1 = vec_and(v0, vmask_32bit);
/* ma */
v1 = __builtin_crypto_vpmsumd((__vector unsigned long long)v1,
(__vector unsigned long long)vconst1);
/* bottom 32bits of ma */
v1 = vec_and(v1, vmask_32bit);
/* qn */
v1 = __builtin_crypto_vpmsumd((__vector unsigned long long)v1,
(__vector unsigned long long)vconst2);
/* a - qn, subtraction is xor in GF(2) */
v0 = vec_xor (v0, v1);
/*
* Since we are bit reflected, the result (ie the low 32 bits) is in
* the high 32 bits. We just need to shift it left 4 bytes
* V0 [ 0 1 X 3 ]
* V0 [ 0 X 2 3 ]
*/
/* shift result into top 64 bits of */
v0 = (__vector unsigned long long)vec_sld((__vector unsigned char)v0,
(__vector unsigned char)vzero, 4);
/* The doubleword that holds the result depends on the byte order. */
#if BYTE_ORDER == BIG_ENDIAN
return v0[0];
#else
return v0[1];
#endif
}

View File

@@ -0,0 +1,31 @@
/* Helper functions to work around issues with clang builtins
* Copyright (C) 2021 IBM Corporation
*
* Authors:
* Daniel Black <daniel@linux.vnet.ibm.com>
* Rogerio Alves <rogealve@br.ibm.com>
* Tulio Magno Quites Machado Filho <tuliom@linux.ibm.com>
*
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifndef POWER_BUILTINS_H
#define POWER_BUILTINS_H
/*
* These stubs fix clang incompatibilities with GCC builtins.
*
* When no macro of the same name exists, the word and doubleword vpmsum
* builtin names are redirected to __builtin_crypto_vpmsumb.
* NOTE(review): #ifndef only sees preprocessor macros, not compiler
* builtins, so these guards presumably always fire unless the names were
* #defined earlier - confirm this is the intent.
*/
#ifndef __builtin_crypto_vpmsumw
#define __builtin_crypto_vpmsumw __builtin_crypto_vpmsumb
#endif
#ifndef __builtin_crypto_vpmsumd
#define __builtin_crypto_vpmsumd __builtin_crypto_vpmsumb
#endif
/* vec_ld() overload for pointers to `__vector unsigned long long`.
 * Forwards offset `__a` and base pointer `__b` to __builtin_altivec_lvx and
 * casts the loaded vector to `__vector unsigned long long`.
 */
static inline __vector unsigned long long __attribute__((overloadable))
vec_ld(int __a, const __vector unsigned long long* __b) {
    __vector unsigned long long loaded;

    loaded = (__vector unsigned long long)__builtin_altivec_lvx(__a, __b);
    return loaded;
}
#endif

View File

@@ -0,0 +1,42 @@
/* power_features.c - POWER feature check
* Copyright (C) 2020 Matheus Castanho <msc@linux.ibm.com>, IBM
* Copyright (C) 2021-2022 Mika T. Lindqvist <postmaster@raasu.org>
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifdef HAVE_SYS_AUXV_H
# include <sys/auxv.h>
#endif
#ifdef __FreeBSD__
# include <machine/cpu.h>
#endif
#include "../../zbuild.h"
#include "power_features.h"
/* Probe the POWER CPU capabilities reported in the ELF auxiliary vector.
 *
 * features - output structure. A flag is set to 1 when the matching
 *            capability bit is present; flags are never cleared here, so
 *            the caller is expected to pass a zero-initialised structure.
 *
 * With PPC_FEATURES defined, AT_HWCAP is checked for AltiVec. With
 * POWER_FEATURES defined, AT_HWCAP2 is checked for ISA 2.07 and ISA 3.00.
 * On FreeBSD the values are read with elf_aux_info(), elsewhere with
 * getauxval().
 */
void Z_INTERNAL power_check_features(struct power_cpu_features *features) {
#ifdef PPC_FEATURES
    /* Start from 0: the elf_aux_info() result is not checked, and on failure
     * it is not guaranteed to have written the buffer. A failed query then
     * reports "no features" instead of reading an indeterminate value. */
    unsigned long hwcap = 0;
#ifdef __FreeBSD__
    elf_aux_info(AT_HWCAP, &hwcap, sizeof(hwcap));
#else
    hwcap = getauxval(AT_HWCAP);
#endif
    if (hwcap & PPC_FEATURE_HAS_ALTIVEC)
        features->has_altivec = 1;
#endif
#ifdef POWER_FEATURES
    /* Same reasoning as for hwcap above. */
    unsigned long hwcap2 = 0;
#ifdef __FreeBSD__
    elf_aux_info(AT_HWCAP2, &hwcap2, sizeof(hwcap2));
#else
    hwcap2 = getauxval(AT_HWCAP2);
#endif
    if (hwcap2 & PPC_FEATURE2_ARCH_2_07)
        features->has_arch_2_07 = 1;
    if (hwcap2 & PPC_FEATURE2_ARCH_3_00)
        features->has_arch_3_00 = 1;
#endif
}

View File

@@ -0,0 +1,18 @@
/* power_features.h -- check for POWER CPU features
* Copyright (C) 2020 Matheus Castanho <msc@linux.ibm.com>, IBM
* Copyright (C) 2021 Mika T. Lindqvist <postmaster@raasu.org>
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifndef POWER_H_
#define POWER_H_
/* Capability flags filled in by power_check_features(); each is set to 1
 * when the corresponding auxiliary-vector bit is present. */
struct power_cpu_features {
int has_altivec;   /* AT_HWCAP reports PPC_FEATURE_HAS_ALTIVEC */
int has_arch_2_07; /* AT_HWCAP2 reports PPC_FEATURE2_ARCH_2_07 */
int has_arch_3_00; /* AT_HWCAP2 reports PPC_FEATURE2_ARCH_3_00 */
};
/* Sets the flags above for the running CPU. Flags are only ever set, never
 * cleared. NOTE(review): Z_INTERNAL is not defined in this header, so it
 * presumably relies on zbuild.h being included first - confirm. */
void Z_INTERNAL power_check_features(struct power_cpu_features *features);
#endif /* POWER_H_ */

View File

@@ -0,0 +1,12 @@
/* Optimized slide_hash for POWER processors
* Copyright (C) 2019-2020 IBM Corporation
* Author: Matheus Castanho <msc@linux.ibm.com>
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifdef POWER8_VSX
/* slide_ppc_tpl.h defines a function named SLIDE_PPC; setting the macro here
 * makes that function slide_hash_power8. */
#define SLIDE_PPC slide_hash_power8
#include "slide_ppc_tpl.h"
#endif /* POWER8_VSX */

View File

@@ -0,0 +1,10 @@
/* Optimized slide_hash for PowerPC processors with VMX instructions
* Copyright (C) 2017-2021 Mika T. Lindqvist <postmaster@raasu.org>
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifdef PPC_VMX
/* slide_ppc_tpl.h defines a function named SLIDE_PPC; setting the macro here
 * makes that function slide_hash_vmx. */
#define SLIDE_PPC slide_hash_vmx
#include "slide_ppc_tpl.h"
#endif /* PPC_VMX */

31
deps/zlib-ng/arch/power/slide_ppc_tpl.h vendored Normal file
View File

@@ -0,0 +1,31 @@
/* Optimized slide_hash for PowerPC processors
* Copyright (C) 2017-2021 Mika T. Lindqvist <postmaster@raasu.org>
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include <altivec.h>
#include "zbuild.h"
#include "deflate.h"
/* Subtract `wsize` (with unsigned saturation) from every entry of `table`.
 *
 * Entries are processed eight at a time with vector load / saturating
 * subtract / store. The first group is always processed, and the loop runs
 * until the remaining count reaches zero after stepping down by 8.
 */
static inline void slide_hash_chain(Pos *table, uint32_t entries, uint16_t wsize) {
    const vector unsigned short wsize_splat = vec_splats(wsize);
    Pos *cursor = table;
    uint32_t remaining = entries;

    for (;;) {
        vector unsigned short slid = vec_subs(vec_ld(0, cursor), wsize_splat);

        vec_st(slid, 0, cursor);
        cursor += 8;
        remaining -= 8;
        if (remaining == 0)
            break;
    }
}
/* slide_hash implementation for PowerPC; the including file picks the
 * function name by defining SLIDE_PPC.
 *
 * Slides both hash tables of `s` by the window size: `head` over HASH_SIZE
 * entries and `prev` over w_size entries.
 */
void Z_INTERNAL SLIDE_PPC(deflate_state *s) {
    const uint16_t window = s->w_size;

    slide_hash_chain(s->head, HASH_SIZE, window);
    slide_hash_chain(s->prev, window, window);
}

45
deps/zlib-ng/arch/riscv/README.md vendored Normal file
View File

@@ -0,0 +1,45 @@
# Building RISC-V Target with CMake #
> **Warning**
> RVV support cannot be detected at runtime, so running the RVV code on a target without RVV is unsafe. Users should disable RVV when the target does not support it.
>
> We will have a better solution once the kernel supports `hwcap` or `hwprobe` for RISC-V.
## Prerequisite: Build RISC-V Clang Toolchain and QEMU ##
If you don't have prebuilt clang and riscv64 qemu, you can refer to the [script](https://github.com/sifive/prepare-riscv-toolchain-qemu/blob/main/prepare_riscv_toolchain_qemu.sh) to get the source. Copy the script to the zlib-ng root directory, and run it to download the source and build them. Modify the content according to your conditions (e.g., toolchain version).
```bash
./prepare_riscv_toolchain_qemu.sh
```
After running the script, clang and qemu are built in `build-toolchain-qemu/riscv-clang/` and `build-toolchain-qemu/riscv-qemu/`.
`build-toolchain-qemu/riscv-clang/` is your `TOOLCHAIN_PATH`.
`build-toolchain-qemu/riscv-qemu/bin/qemu-riscv64` is your `QEMU_PATH`.
You can also download the prebuilt toolchain & qemu from [the release page](https://github.com/sifive/prepare-riscv-toolchain-qemu/releases), and enjoy using them.
## Cross-Compile for RISC-V Target ##
```bash
cmake -G Ninja -B ./build-riscv \
-D CMAKE_TOOLCHAIN_FILE=./cmake/toolchain-riscv.cmake \
-D CMAKE_INSTALL_PREFIX=./build-riscv/install \
-D TOOLCHAIN_PATH={TOOLCHAIN_PATH} \
-D QEMU_PATH={QEMU_PATH} \
.
cmake --build ./build-riscv
```
Disable the option if there is no RVV support:
```
-D WITH_RVV=OFF
```
## Run Unit Tests on User-Mode QEMU ##
```bash
cd ./build-riscv && ctest --verbose
```

View File

@@ -0,0 +1,15 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "../../zbuild.h"
#include "riscv_features.h"
/* TODO: detect risc-v cpu info at runtime when the kernel updates hwcap or hwprobe for risc-v */
/* Report RISC-V vector (RVV) availability.
 *
 * There is no runtime probe here: has_rvv is 1 when the build defines both
 * __riscv_v and __linux__, and 0 otherwise.
 */
void Z_INTERNAL riscv_check_features(struct riscv_cpu_features *features) {
    int rvv_available = 0;

#if defined(__riscv_v) && defined(__linux__)
    rvv_available = 1;
#endif
    features->has_rvv = rvv_available;
}

View File

@@ -0,0 +1,18 @@
/* riscv_features.h -- check for riscv features.
*
* Copyright (C) 2023 SiFive, Inc. All rights reserved.
* Contributed by Alex Chiang <alex.chiang@sifive.com>
*
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifndef RISCV_H_
#define RISCV_H_
/* Capability flags filled in by riscv_check_features(). */
struct riscv_cpu_features {
int has_rvv; /* 1 when the build defines __riscv_v and __linux__, else 0 */
};
/* Writes has_rvv in *features; the value is decided at compile time. */
void Z_INTERNAL riscv_check_features(struct riscv_cpu_features *features);
#endif /* RISCV_H_ */

54
deps/zlib-ng/arch/s390/Makefile.in vendored Normal file
View File

@@ -0,0 +1,54 @@
# Makefile for zlib-ng
# Copyright (C) 1995-2013 Jean-loup Gailly, Mark Adler
# For conditions of distribution and use, see copyright notice in zlib.h
# Tool and flag placeholders; left empty in this template.
# NOTE(review): presumably filled in by the configure script - confirm.
CC=
CFLAGS=
SFLAGS=
INCLUDES=
SUFFIX=
VGFMAFLAG=
NOLTOFLAG=

SRCDIR=.
SRCTOP=../..
TOPDIR=$(SRCTOP)

# The housekeeping targets never create a file of their own name. Declaring
# them phony keeps them working even if a file called "clean" (etc.) exists.
.PHONY: mostlyclean clean distclean

# Each source is built twice: .o with CFLAGS and .lo with SFLAGS.
s390_features.o:
	$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/s390_features.c

s390_features.lo:
	$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/s390_features.c

dfltcc_common.o:
	$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/dfltcc_common.c

dfltcc_common.lo:
	$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/dfltcc_common.c

dfltcc_deflate.o:
	$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/dfltcc_deflate.c

dfltcc_deflate.lo:
	$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/dfltcc_deflate.c

dfltcc_inflate.o:
	$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/dfltcc_inflate.c

dfltcc_inflate.lo:
	$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/dfltcc_inflate.c

# crc32-vx additionally receives VGFMAFLAG and NOLTOFLAG.
crc32-vx.o:
	$(CC) $(CFLAGS) $(VGFMAFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32-vx.c

crc32-vx.lo:
	$(CC) $(SFLAGS) $(VGFMAFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32-vx.c

mostlyclean: clean

clean:
	rm -f *.o *.lo *~
	rm -rf objs
	rm -f *.gcda *.gcno *.gcov

distclean: clean
	rm -f Makefile

284
deps/zlib-ng/arch/s390/README.md vendored Normal file
View File

@@ -0,0 +1,284 @@
# Introduction
This directory contains SystemZ deflate hardware acceleration support.
It can be enabled using the following build commands:
$ ./configure --with-dfltcc-deflate --with-dfltcc-inflate
$ make
or
$ cmake -DWITH_DFLTCC_DEFLATE=1 -DWITH_DFLTCC_INFLATE=1 .
$ make
When built like this, zlib-ng would compress using hardware on level 1,
and using software on all other levels. Decompression will always happen
in hardware. In order to enable hardware compression for levels 1-6
(i.e. to make it used by default) one could add
`-DDFLTCC_LEVEL_MASK=0x7e` to CFLAGS when building zlib-ng.
SystemZ deflate hardware acceleration is available on [IBM z15](
https://www.ibm.com/products/z15) and newer machines under the name [
"Integrated Accelerator for zEnterprise Data Compression"](
https://www.ibm.com/support/z-content-solutions/compression/). The
programming interface to it is a machine instruction called DEFLATE
CONVERSION CALL (DFLTCC). It is documented in Chapter 26 of [Principles
of Operation](https://publibfp.dhe.ibm.com/epubs/pdf/a227832c.pdf). Both
the code and the rest of this document refer to this feature simply as
"DFLTCC".
# Performance
Performance figures are published [here](
https://github.com/iii-i/zlib-ng/wiki/Performance-with-dfltcc-patch-applied-and-dfltcc-support-built-on-dfltcc-enabled-machine
). The compression speed-up can be as high as 110x and the decompression
speed-up can be as high as 15x.
# Limitations
Two DFLTCC compression calls with identical inputs are not guaranteed to
produce identical outputs. Therefore care should be taken when using
hardware compression when reproducible results are desired. In
particular, the zlib-ng-specific `zng_deflateSetParams` call allows setting
the `Z_DEFLATE_REPRODUCIBLE` parameter, which disables DFLTCC support for a
particular stream.
DFLTCC does not support every single zlib-ng feature, in particular:
* `inflate(Z_BLOCK)` and `inflate(Z_TREES)`
* `inflateMark()`
* `inflatePrime()`
* `inflateSyncPoint()`
When used, these functions will either switch to software, or, in case
this is not possible, gracefully fail.
# Code structure
All SystemZ-specific code lives in `arch/s390` directory and is
integrated with the rest of zlib-ng using hook macros.
## Hook macros
DFLTCC takes as arguments a parameter block, an input buffer, an output
buffer and a window. `ZALLOC_DEFLATE_STATE()`, `ZALLOC_INFLATE_STATE()`,
`ZFREE_STATE()`, `ZCOPY_DEFLATE_STATE()`, `ZCOPY_INFLATE_STATE()`,
`ZALLOC_WINDOW()`, `ZCOPY_WINDOW()` and `TRY_FREE_WINDOW()` macros encapsulate
allocation details for the parameter block (which is allocated alongside
zlib-ng state) and the window (which must be page-aligned and large enough).
Software and hardware window formats do not match, therefore,
`deflateSetDictionary()`, `deflateGetDictionary()`, `inflateSetDictionary()`
and `inflateGetDictionary()` need special handling, which is triggered using
`DEFLATE_SET_DICTIONARY_HOOK()`, `DEFLATE_GET_DICTIONARY_HOOK()`,
`INFLATE_SET_DICTIONARY_HOOK()` and `INFLATE_GET_DICTIONARY_HOOK()` macros.
`deflateResetKeep()` and `inflateResetKeep()` update the DFLTCC
parameter block using `DEFLATE_RESET_KEEP_HOOK()` and
`INFLATE_RESET_KEEP_HOOK()` macros.
`INFLATE_PRIME_HOOK()`, `INFLATE_MARK_HOOK()` and
`INFLATE_SYNC_POINT_HOOK()` macros make the respective unsupported
calls gracefully fail.
`DEFLATE_PARAMS_HOOK()` implements switching between hardware and
software compression mid-stream using `deflateParams()`. Switching
normally entails flushing the current block, which might not be possible
in low memory situations. `deflateParams()` uses `DEFLATE_DONE()` hook
in order to detect and gracefully handle such situations.
The algorithm implemented in hardware has a different compression ratio
than the one implemented in software. The `DEFLATE_BOUND_ADJUST_COMPLEN()`
and `DEFLATE_NEED_CONSERVATIVE_BOUND()` macros make `deflateBound()`
return the correct results for the hardware implementation.
Actual compression and decompression are handled by `DEFLATE_HOOK()` and
`INFLATE_TYPEDO_HOOK()` macros. Since inflation with DFLTCC manages the
window on its own, calling `updatewindow()` is suppressed using
`INFLATE_NEED_UPDATEWINDOW()` macro.
In addition to compression, DFLTCC computes CRC-32 and Adler-32
checksums, therefore, whenever it's used, software checksumming is
suppressed using `DEFLATE_NEED_CHECKSUM()` and `INFLATE_NEED_CHECKSUM()`
macros.
While software always produces reproducible compression results, this
is not the case for DFLTCC. Therefore, zlib-ng users are given the
ability to specify whether or not reproducible compression results
are required. While it is always possible to specify this setting
before the compression begins, it is not always possible to do so in
the middle of a deflate stream - the exact conditions for that are
determined by `DEFLATE_CAN_SET_REPRODUCIBLE()` macro.
## SystemZ-specific code
When zlib-ng is built with DFLTCC, the hooks described above are
converted to calls to functions, which are implemented in
`arch/s390/dfltcc_*` files. The functions can be grouped in three broad
categories:
* Base DFLTCC support, e.g. wrapping the machine instruction -
`dfltcc()` and allocating aligned memory - `dfltcc_alloc_state()`.
* Translating between software and hardware data formats, e.g.
`dfltcc_deflate_set_dictionary()`.
* Translating between software and hardware state machines, e.g.
`dfltcc_deflate()` and `dfltcc_inflate()`.
The functions from the first two categories are fairly simple, however,
various quirks in both software and hardware state machines make the
functions from the third category quite complicated.
### `dfltcc_deflate()` function
This function is called by `deflate()` and has the following
responsibilities:
* Checking whether DFLTCC can be used with the current stream. If this
is not the case, then it returns `0`, making `deflate()` use some
other function in order to compress in software. Otherwise it returns
`1`.
* Block management and Huffman table generation. DFLTCC ends blocks only
when explicitly instructed to do so by the software. Furthermore,
whether to use fixed or dynamic Huffman tables must also be determined
by the software. Since looking at data in order to gather statistics
would negate performance benefits, the following approach is used: the
first `DFLTCC_FIRST_FHT_BLOCK_SIZE` bytes are placed into a fixed
block, and every next `DFLTCC_BLOCK_SIZE` bytes are placed into
dynamic blocks.
* Writing EOBS. Block Closing Control bit in the parameter block
instructs DFLTCC to write EOBS, however, certain conditions need to be
met: input data length must be non-zero or Continuation Flag must be
set. To put this in simpler terms, DFLTCC will silently refuse to
write EOBS if this is the only thing that it is asked to do. Since the
code has to be able to emit EOBS in software anyway, in order to avoid
tricky corner cases Block Closing Control is never used. Whether to
write EOBS is instead controlled by `soft_bcc` variable.
* Triggering block post-processing. Depending on flush mode, `deflate()`
must perform various additional actions when a block or a stream ends.
`dfltcc_deflate()` informs `deflate()` about this using
`block_state *result` parameter.
* Converting software state fields into hardware parameter block fields,
and vice versa. For example, `wrap` and Check Value Type or `bi_valid`
and Sub-Byte Boundary. Certain fields cannot be translated and must
persist untouched in the parameter block between calls, for example,
Continuation Flag or Continuation State Buffer.
* Handling flush modes and low-memory situations. These aspects are
quite intertwined and pervasive. The general idea here is that the
code must not do anything in software - whether explicitly by e.g.
calling `send_eobs()`, or implicitly - by returning to `deflate()`
with certain return and `*result` values, when Continuation Flag is
set.
* Ending streams. When a new block is started and flush mode is
`Z_FINISH`, Block Header Final parameter block bit is used to mark
this block as final. However, sometimes an empty final block is
needed, and, unfortunately, just like with EOBS, DFLTCC will silently
refuse to do this. The general idea of DFLTCC implementation is to
rely as much as possible on the existing code. Here in order to do
this, the code pretends that it does not support DFLTCC, which makes
`deflate()` call a software compression function, which writes an
empty final block. Whether this is required is controlled by
`need_empty_block` variable.
* Error handling. This is simply converting
Operation-Ending-Supplemental Code to string. Errors can only happen
due to things like memory corruption, and therefore they don't affect
the `deflate()` return code.
### `dfltcc_inflate()` function
This function is called by `inflate()` from the `TYPEDO` state (that is,
when all the metadata is parsed and the stream is positioned at the type
bits of deflate block header) and it's responsible for the following:
* Falling back to software when flush mode is `Z_BLOCK` or `Z_TREES`.
Unfortunately, there is no way to ask DFLTCC to stop decompressing on
block or tree boundary.
* `inflate()` decompression loop management. This is controlled using
the return value, which can be either `DFLTCC_INFLATE_BREAK` or
`DFLTCC_INFLATE_CONTINUE`.
* Converting software state fields into hardware parameter block fields,
and vice versa. For example, `whave` and History Length or `wnext` and
History Offset.
* Ending streams. This instructs `inflate()` to return `Z_STREAM_END`
and is controlled by `last` state field.
* Error handling. Like deflate, error handling comprises
Operation-Ending-Supplemental Code to string conversion. Unlike
deflate, errors may happen due to bad inputs, therefore they are
propagated to `inflate()` by setting `mode` field to `MEM` or `BAD`.
# Testing
Given the complexity of the DFLTCC machine instruction, it is not clear
whether QEMU TCG will ever support it. At the time of writing, one has to
have access to an IBM z15+ VM or LPAR in order to test DFLTCC support.
Since DFLTCC is a non-privileged instruction, neither a special VM/LPAR
configuration nor root access is required.
zlib-ng CI uses an IBM-provided z15 self-hosted builder for the DFLTCC
testing. There are no IBM Z builds of GitHub Actions runner, and
stable qemu-user has problems with .NET apps, so the builder runs the
x86_64 runner version with qemu-user built from the master branch.
## Configuring the builder.
### Install prerequisites.
```
$ sudo dnf install docker
```
### Add services.
```
$ sudo cp self-hosted-builder/*.service /etc/systemd/system/
$ sudo systemctl daemon-reload
```
### Create a config file.
```
$ sudo tee /etc/actions-runner
repo=<owner>/<name>
access_token=<ghp_***>
```
Access token should have the repo scope, consult
https://docs.github.com/en/rest/reference/actions#create-a-registration-token-for-a-repository
for details.
### Autostart the x86_64 emulation support.
```
$ sudo systemctl enable --now qemu-user-static
```
### Autostart the runner.
```
$ sudo systemctl enable --now actions-runner
```
## Rebuilding the image
In order to update the `iiilinuxibmcom/actions-runner` image, e.g. to get the
latest OS security fixes, use the following commands:
```
$ sudo docker build \
--pull \
-f self-hosted-builder/actions-runner.Dockerfile \
-t iiilinuxibmcom/actions-runner
$ sudo systemctl restart actions-runner
```
## Removing persistent data
The `actions-runner` service stores various temporary data, such as runner
registration information, work directories and logs, in the `actions-runner`
volume. In order to remove it and start from scratch, e.g. when switching the
runner to a different repository, use the following commands:
```
$ sudo systemctl stop actions-runner
$ sudo docker rm -f actions-runner
$ sudo docker volume rm actions-runner
```

222
deps/zlib-ng/arch/s390/crc32-vx.c vendored Normal file
View File

@@ -0,0 +1,222 @@
/*
* Hardware-accelerated CRC-32 variants for Linux on z Systems
*
* Use the z/Architecture Vector Extension Facility to accelerate the
* computing of bitreflected CRC-32 checksums.
*
* This CRC-32 implementation algorithm is bitreflected and processes
* the least-significant bit first (Little-Endian).
*
* This code was originally written by Hendrik Brueckner
* <brueckner@linux.vnet.ibm.com> for use in the Linux kernel and has been
* relicensed under the zlib license.
*/
#include "../../zbuild.h"
#include "crc32_braid_p.h"
#include <vecintrin.h>
typedef unsigned char uv16qi __attribute__((vector_size(16)));
typedef unsigned int uv4si __attribute__((vector_size(16)));
typedef unsigned long long uv2di __attribute__((vector_size(16)));
/*
 * Fold `len` bytes at `buf` into the bitreflected CRC-32 state `crc` using
 * vector Galois-field multiply-sum intrinsics, and return the reduced 32-bit
 * value. No pre- or post-inversion is applied here; crc32_s390_vx() in this
 * file XORs the value with 0xffffffff before and after the call.
 *
 * Preconditions visible in the code below:
 *  - len must be at least 64: the first 64 bytes are loaded unconditionally
 *    and `len -= 64` is performed on an unsigned size_t.
 *  - Only whole 16-byte chunks are consumed (`while (len >= 16)`); any
 *    remainder shorter than 16 bytes is ignored, so callers must pass a
 *    multiple of 16 and handle the tail themselves.
 *  - buf is dereferenced through 16-byte vector pointer types; the only caller
 *    in this file passes a 16-byte aligned pointer.
 *    NOTE(review): whether unaligned pointers would also work is not visible
 *    here - confirm before relaxing the caller.
 */
static uint32_t crc32_le_vgfm_16(uint32_t crc, const uint8_t *buf, size_t len) {
/*
* The CRC-32 constant block contains reduction constants to fold and
* process particular chunks of the input data stream in parallel.
*
* For the CRC-32 variants, the constants are precomputed according to
* these definitions:
*
* R1 = [(x4*128+32 mod P'(x) << 32)]' << 1
* R2 = [(x4*128-32 mod P'(x) << 32)]' << 1
* R3 = [(x128+32 mod P'(x) << 32)]' << 1
* R4 = [(x128-32 mod P'(x) << 32)]' << 1
* R5 = [(x64 mod P'(x) << 32)]' << 1
* R6 = [(x32 mod P'(x) << 32)]' << 1
*
* The bitreflected Barrett reduction constant, u', is defined as
* the bit reversal of floor(x**64 / P(x)).
*
* where P(x) is the polynomial in the normal domain and the P'(x) is the
* polynomial in the reversed (bitreflected) domain.
*
* CRC-32 (IEEE 802.3 Ethernet, ...) polynomials:
*
* P(x) = 0x04C11DB7
* P'(x) = 0xEDB88320
*/
const uv16qi perm_le2be = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}; /* BE->LE mask */
const uv2di r2r1 = {0x1C6E41596, 0x154442BD4}; /* R2, R1 */
const uv2di r4r3 = {0x0CCAA009E, 0x1751997D0}; /* R4, R3 */
const uv2di r5 = {0, 0x163CD6124}; /* R5 */
const uv2di ru_poly = {0, 0x1F7011641}; /* u' */
const uv2di crc_poly = {0, 0x1DB710641}; /* P'(x) << 1 */
/*
* Load the initial CRC value.
*
* The CRC value is loaded into the rightmost word of the
* vector register and is later XORed with the LSB portion
* of the loaded input data.
*/
uv2di v0 = {0, 0};
v0 = (uv2di)vec_insert(crc, (uv4si)v0, 3);
/* Load a 64-byte data chunk and XOR with CRC */
uv2di v1 = vec_perm(((uv2di *)buf)[0], ((uv2di *)buf)[0], perm_le2be);
uv2di v2 = vec_perm(((uv2di *)buf)[1], ((uv2di *)buf)[1], perm_le2be);
uv2di v3 = vec_perm(((uv2di *)buf)[2], ((uv2di *)buf)[2], perm_le2be);
uv2di v4 = vec_perm(((uv2di *)buf)[3], ((uv2di *)buf)[3], perm_le2be);
v1 ^= v0;
buf += 64;
len -= 64;
/* Main loop: fold four 16-byte lanes (64 bytes) per iteration. */
while (len >= 64) {
/* Load the next 64-byte data chunk */
uv16qi part1 = vec_perm(((uv16qi *)buf)[0], ((uv16qi *)buf)[0], perm_le2be);
uv16qi part2 = vec_perm(((uv16qi *)buf)[1], ((uv16qi *)buf)[1], perm_le2be);
uv16qi part3 = vec_perm(((uv16qi *)buf)[2], ((uv16qi *)buf)[2], perm_le2be);
uv16qi part4 = vec_perm(((uv16qi *)buf)[3], ((uv16qi *)buf)[3], perm_le2be);
/*
* Perform a GF(2) multiplication of the doublewords in V1 with
* the R1 and R2 reduction constants in V0. The intermediate result
* is then folded (accumulated) with the next data chunk in PART1 and
* stored in V1. Repeat this step for the register contents
* in V2, V3, and V4 respectively.
*/
v1 = (uv2di)vec_gfmsum_accum_128(r2r1, v1, part1);
v2 = (uv2di)vec_gfmsum_accum_128(r2r1, v2, part2);
v3 = (uv2di)vec_gfmsum_accum_128(r2r1, v3, part3);
v4 = (uv2di)vec_gfmsum_accum_128(r2r1, v4, part4);
buf += 64;
len -= 64;
}
/*
* Fold V1 to V4 into a single 128-bit value in V1. Multiply V1 with R3
* and R4 and accumulating the next 128-bit chunk until a single 128-bit
* value remains.
*/
v1 = (uv2di)vec_gfmsum_accum_128(r4r3, v1, (uv16qi)v2);
v1 = (uv2di)vec_gfmsum_accum_128(r4r3, v1, (uv16qi)v3);
v1 = (uv2di)vec_gfmsum_accum_128(r4r3, v1, (uv16qi)v4);
/* Consume the remaining whole 16-byte chunks (at most three). */
while (len >= 16) {
/* Load next data chunk */
v2 = vec_perm(*(uv2di *)buf, *(uv2di *)buf, perm_le2be);
/* Fold next data chunk */
v1 = (uv2di)vec_gfmsum_accum_128(r4r3, v1, (uv16qi)v2);
buf += 16;
len -= 16;
}
/*
* Set up a vector register for byte shifts. The shift value must
* be loaded in bits 1-4 in byte element 7 of a vector register.
* Shift by 8 bytes: 0x40
* Shift by 4 bytes: 0x20
*/
uv16qi v9 = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
v9 = vec_insert((unsigned char)0x40, v9, 7);
/*
* Prepare V0 for the next GF(2) multiplication: shift V0 by 8 bytes
* to move R4 into the rightmost doubleword and set the leftmost
* doubleword to 0x1.
*/
v0 = vec_srb(r4r3, (uv2di)v9);
v0[0] = 1;
/*
* Compute GF(2) product of V1 and V0. The rightmost doubleword
* of V1 is multiplied with R4. The leftmost doubleword of V1 is
* multiplied by 0x1 and is then XORed with rightmost product.
* Implicitly, the intermediate leftmost product becomes padded
*/
v1 = (uv2di)vec_gfmsum_128(v0, v1);
/*
* Now do the final 32-bit fold by multiplying the rightmost word
* in V1 with R5 and XOR the result with the remaining bits in V1.
*
* To achieve this by a single VGFMAG, right shift V1 by a word
* and store the result in V2 which is then accumulated. Use the
* vector unpack instruction to load the rightmost half of the
* doubleword into the rightmost doubleword element of V1; the other
* half is loaded in the leftmost doubleword.
* The vector register with CONST_R5 contains the R5 constant in the
* rightmost doubleword and the leftmost doubleword is zero to ignore
* the leftmost product of V1.
*/
v9 = vec_insert((unsigned char)0x20, v9, 7);
v2 = vec_srb(v1, (uv2di)v9);
v1 = vec_unpackl((uv4si)v1); /* Split rightmost doubleword */
v1 = (uv2di)vec_gfmsum_accum_128(r5, v1, (uv16qi)v2);
/*
* Apply a Barrett reduction to compute the final 32-bit CRC value.
*
* The input values to the Barrett reduction are the degree-63 polynomial
* in V1 (R(x)), degree-32 generator polynomial, and the reduction
* constant u. The Barrett reduction result is the CRC value of R(x) mod
* P(x).
*
* The Barrett reduction algorithm is defined as:
*
* 1. T1(x) = floor( R(x) / x^32 ) GF2MUL u
* 2. T2(x) = floor( T1(x) / x^32 ) GF2MUL P(x)
* 3. C(x) = R(x) XOR T2(x) mod x^32
*
* Note: The leftmost doubleword of vector register containing
* CONST_RU_POLY is zero and, thus, the intermediate GF(2) product
* is zero and does not contribute to the final result.
*/
/* T1(x) = floor( R(x) / x^32 ) GF2MUL u */
v2 = vec_unpackl((uv4si)v1);
v2 = (uv2di)vec_gfmsum_128(ru_poly, v2);
/*
* Compute the GF(2) product of the CRC polynomial with T1(x) in
* V2 and XOR the intermediate result, T2(x), with the value in V1.
* The final result is stored in word element 2 of V2.
*/
v2 = vec_unpackl((uv4si)v2);
v2 = (uv2di)vec_gfmsum_accum_128(crc_poly, v2, (uv16qi)v1);
return ((uv4si)v2)[2];
}
#define VX_MIN_LEN 64
#define VX_ALIGNMENT 16L
#define VX_ALIGN_MASK (VX_ALIGNMENT - 1)

/*
 * CRC-32 entry point for the vector-extension implementation.
 *
 * Inputs too short to contain an aligned run of at least VX_MIN_LEN bytes
 * are handed to crc32_braid as a whole. Otherwise the buffer is split into
 * three pieces: an unaligned head and a sub-16-byte tail, both processed by
 * crc32_braid, and a 16-byte aligned middle processed by crc32_le_vgfm_16()
 * with the CRC inverted on the way in and on the way out.
 */
uint32_t Z_INTERNAL crc32_s390_vx(uint32_t crc, const unsigned char *buf, size_t len) {
    size_t head, body, tail;

    /* Worst case the head eats VX_ALIGN_MASK bytes; the rest must still reach VX_MIN_LEN. */
    if (len < VX_MIN_LEN + VX_ALIGN_MASK)
        return PREFIX(crc32_braid)(crc, buf, len);

    /* Bring buf up to the next 16-byte boundary. */
    if (((uintptr_t)buf & VX_ALIGN_MASK) != 0) {
        head = VX_ALIGNMENT - ((uintptr_t)buf & VX_ALIGN_MASK);
        crc = PREFIX(crc32_braid)(crc, buf, head);
        buf += head;
        len -= head;
    }

    tail = len & VX_ALIGN_MASK;
    body = len & ~VX_ALIGN_MASK;

    crc = crc32_le_vgfm_16(crc ^ 0xffffffff, buf, body) ^ 0xffffffff;

    if (tail != 0)
        crc = PREFIX(crc32_braid)(crc, buf + body, tail);

    return crc;
}

40
deps/zlib-ng/arch/s390/dfltcc_common.c vendored Normal file
View File

@@ -0,0 +1,40 @@
/* dfltcc_common.c - IBM Z DEFLATE CONVERSION CALL general support. */
#include "zbuild.h"
#include "dfltcc_common.h"
#include "dfltcc_detail.h"
/*
Memory management.
DFLTCC requires parameter blocks and window to be aligned. zlib-ng allows
users to specify their own allocation functions, so using e.g.
`posix_memalign' is not an option. Thus, we overallocate and take the
aligned portion of the buffer.
*/
static const int PAGE_ALIGN = 0x1000;

/*
 * Allocate a PAGE_ALIGN-aligned window through the stream's allocator.
 *
 * The request is padded by one pointer plus PAGE_ALIGN bytes so that an
 * aligned address can always be carved out of it; the address returned by
 * the allocator is stashed in the pointer-sized slot immediately below the
 * window so that dfltcc_free_window() can recover it. At least HB_SIZE
 * bytes of window are always requested.
 *
 * Returns the aligned window, or NULL if the allocator fails.
 */
void Z_INTERNAL *PREFIX(dfltcc_alloc_window)(PREFIX3(streamp) strm, uInt items, uInt size) {
    void *raw;
    void *window;
    void **backlink;

    raw = ZALLOC(strm, sizeof(void *) + MAX(items * size, HB_SIZE) + PAGE_ALIGN, sizeof(unsigned char));
    if (raw == NULL)
        return NULL;

    /* Skip the back-pointer slot first, then round up to the alignment. */
    window = ALIGN_UP((char *)raw + sizeof(void *), PAGE_ALIGN);
    backlink = (void **)((char *)window - sizeof(void *));
    *backlink = raw;
    return window;
}
/*
 * Copy a window allocated by dfltcc_alloc_window(). At least HB_SIZE bytes
 * are copied regardless of n, matching the minimum size that
 * dfltcc_alloc_window() requests.
 */
void Z_INTERNAL PREFIX(dfltcc_copy_window)(void *dest, const void *src, size_t n) {
    const size_t nbytes = MAX(n, HB_SIZE);

    memcpy(dest, src, nbytes);
}
/*
 * Release a window obtained from dfltcc_alloc_window(). The pointer that the
 * allocator originally returned is read back from the slot just below the
 * window. A NULL window is ignored.
 */
void Z_INTERNAL PREFIX(dfltcc_free_window)(PREFIX3(streamp) strm, void *w) {
    void *raw;

    if (!w)
        return;

    raw = *(void **)((unsigned char *)w - sizeof(void *));
    ZFREE(strm, raw);
}

44
deps/zlib-ng/arch/s390/dfltcc_common.h vendored Normal file
View File

@@ -0,0 +1,44 @@
#ifndef DFLTCC_COMMON_H
#define DFLTCC_COMMON_H
#include "zutil.h"
/* Window management helpers implemented in dfltcc_common.c. */
void Z_INTERNAL *PREFIX(dfltcc_alloc_window)(PREFIX3(streamp) strm, uInt items, uInt size);
void Z_INTERNAL PREFIX(dfltcc_copy_window)(void *dest, const void *src, size_t n);
void Z_INTERNAL PREFIX(dfltcc_free_window)(PREFIX3(streamp) strm, void *w);
/* State objects are freed with the plain ZFREE. */
#define ZFREE_STATE ZFREE
/* Window allocation, copying and freeing are routed to the DFLTCC helpers
 * declared above. */
#define ZALLOC_WINDOW PREFIX(dfltcc_alloc_window)
#define ZCOPY_WINDOW PREFIX(dfltcc_copy_window)
#define ZFREE_WINDOW PREFIX(dfltcc_free_window)
#define TRY_FREE_WINDOW PREFIX(dfltcc_free_window)
/* Bit counts used by DEFLATE_BOUND_COMPLEN below. */
#define DFLTCC_BLOCK_HEADER_BITS 3
#define DFLTCC_HLITS_COUNT_BITS 5
#define DFLTCC_HDISTS_COUNT_BITS 5
#define DFLTCC_HCLENS_COUNT_BITS 4
#define DFLTCC_MAX_HCLENS 19
#define DFLTCC_HCLEN_BITS 3
#define DFLTCC_MAX_HLITS 286
#define DFLTCC_MAX_HDISTS 30
#define DFLTCC_MAX_HLIT_HDIST_BITS 7
#define DFLTCC_MAX_SYMBOL_BITS 16
#define DFLTCC_MAX_EOBS_BITS 15
#define DFLTCC_MAX_PADDING_BITS 7
/* Compressed-length bound, in bytes, for source_len input bytes: the sum of
 * the header, table, per-symbol (DFLTCC_MAX_SYMBOL_BITS per input byte),
 * end-of-block and padding bit counts above, divided by 8 via `>> 3`.
 * NOTE(review): `(source_len) * DFLTCC_MAX_SYMBOL_BITS` is evaluated in the
 * type of source_len; callers presumably keep it small enough not to wrap -
 * confirm at the call sites.
 */
#define DEFLATE_BOUND_COMPLEN(source_len) \
((DFLTCC_BLOCK_HEADER_BITS + \
DFLTCC_HLITS_COUNT_BITS + \
DFLTCC_HDISTS_COUNT_BITS + \
DFLTCC_HCLENS_COUNT_BITS + \
DFLTCC_MAX_HCLENS * DFLTCC_HCLEN_BITS + \
(DFLTCC_MAX_HLITS + DFLTCC_MAX_HDISTS) * DFLTCC_MAX_HLIT_HDIST_BITS + \
(source_len) * DFLTCC_MAX_SYMBOL_BITS + \
DFLTCC_MAX_EOBS_BITS + \
DFLTCC_MAX_PADDING_BITS) >> 3)
#endif

404
deps/zlib-ng/arch/s390/dfltcc_deflate.c vendored Normal file
View File

@@ -0,0 +1,404 @@
/* dfltcc_deflate.c - IBM Z DEFLATE CONVERSION CALL compression support. */
/*
Use the following commands to build zlib-ng with DFLTCC compression support:
$ ./configure --with-dfltcc-deflate
or
$ cmake -DWITH_DFLTCC_DEFLATE=1 .
and then
$ make
*/
#include "zbuild.h"
#include "deflate.h"
#include "trees_emit.h"
#include "dfltcc_deflate.h"
#include "dfltcc_detail.h"
/* Deflate-specific DFLTCC state: the common DFLTCC state followed by the
 * tuning parameters that dfltcc_reset_deflate_state() initializes from the
 * DFLTCC_* compile-time defaults.
 */
struct dfltcc_deflate_state {
struct dfltcc_state common;
uint16_t level_mask; /* Levels on which to use DFLTCC; bit N is tested as (1 << level) */
uint32_t block_size; /* New block each X bytes */
size_t block_threshold; /* New block after total_in > X */
uint32_t dht_threshold; /* New block only if avail_in >= X */
};
/* View the DFLTCC state attached to a deflate_state as the deflate-specific
 * variant above. */
#define GET_DFLTCC_DEFLATE_STATE(state) ((struct dfltcc_deflate_state *)GET_DFLTCC_STATE(state))
/* Allocate a deflate_state together with its trailing DFLTCC deflate state
 * by delegating to dfltcc_alloc_state() with both sizes. */
void Z_INTERNAL *PREFIX(dfltcc_alloc_deflate_state)(PREFIX3(streamp) strm) {
    const size_t base_size = sizeof(deflate_state);
    const size_t ext_size = sizeof(struct dfltcc_deflate_state);

    return dfltcc_alloc_state(strm, base_size, ext_size);
}
/* Reset the DFLTCC part of a deflate stream: first the common state, then
 * the tuning parameters, which are reloaded from their compile-time
 * defaults. */
void Z_INTERNAL PREFIX(dfltcc_reset_deflate_state)(PREFIX3(streamp) strm) {
    deflate_state *s = (deflate_state *)strm->state;
    struct dfltcc_deflate_state *tuning = GET_DFLTCC_DEFLATE_STATE(s);

    dfltcc_reset_state(&tuning->common);

    /* Tuning parameters */
    tuning->level_mask      = DFLTCC_LEVEL_MASK;
    tuning->block_size      = DFLTCC_BLOCK_SIZE;
    tuning->block_threshold = DFLTCC_FIRST_FHT_BLOCK_SIZE;
    tuning->dht_threshold   = DFLTCC_DHT_MIN_SAMPLE_SIZE;
}
/* Copy a deflate_state and its trailing DFLTCC deflate state from src to dst
 * by delegating to dfltcc_copy_state() with both sizes. */
void Z_INTERNAL PREFIX(dfltcc_copy_deflate_state)(void *dst, const void *src) {
    const size_t base_size = sizeof(deflate_state);
    const size_t ext_size = sizeof(struct dfltcc_deflate_state);

    dfltcc_copy_state(dst, src, base_size, ext_size);
}
/*
 * Decide whether DFLTCC can compress a stream with the given settings.
 *
 * Returns 1 only if all of the following hold, 0 otherwise:
 *  - the level's bit is set in the stream's level_mask,
 *  - the window size is HB_BITS,
 *  - the strategy is Z_FIXED or Z_DEFAULT_STRATEGY,
 *  - reproducible output was not requested,
 *  - the queried facility bits report GDHT, CMPR and format 0.
 */
static inline int dfltcc_can_deflate_with_params(PREFIX3(streamp) strm, int level, uInt window_bits, int strategy,
                                                 int reproducible) {
    deflate_state *s = (deflate_state *)strm->state;
    struct dfltcc_deflate_state *ds = GET_DFLTCC_DEFLATE_STATE(s);

    /* Compression settings that DFLTCC does not handle */
    const int settings_ok = (ds->level_mask & (1 << level)) != 0 &&
                            window_bits == HB_BITS &&
                            (strategy == Z_FIXED || strategy == Z_DEFAULT_STRATEGY) &&
                            !reproducible;
    if (!settings_ok)
        return 0;

    /* Functions and formats the hardware must report */
    return is_bit_set(ds->common.af.fns, DFLTCC_GDHT) &&
           is_bit_set(ds->common.af.fns, DFLTCC_CMPR) &&
           is_bit_set(ds->common.af.fmts, DFLTCC_FMT0);
}
/* Tell whether DFLTCC can be used with the stream's current level, window
 * size, strategy and reproducibility setting. */
int Z_INTERNAL PREFIX(dfltcc_can_deflate)(PREFIX3(streamp) strm) {
    deflate_state *s = (deflate_state *)strm->state;

    return dfltcc_can_deflate_with_params(strm,
                                          s->level,
                                          s->w_bits,
                                          s->strategy,
                                          s->reproducible);
}
/* Issue the DFLTCC_GDHT function over the currently available input, using
 * the stream's parameter block. The available-input count is passed through
 * a local copy, so strm->avail_in itself is left untouched. */
static inline void dfltcc_gdht(PREFIX3(streamp) strm) {
    deflate_state *s = (deflate_state *)strm->state;
    size_t sample_len = strm->avail_in;

    dfltcc(DFLTCC_GDHT, &GET_DFLTCC_STATE(s)->param, NULL, NULL, &strm->next_in, &sample_len, NULL);
}
/*
 * Run one DFLTCC_CMPR call (circular history buffer) on the stream's current
 * input and output buffers, then fold the progress reported by the call back
 * into total_in/total_out and avail_in/avail_out.
 *
 * Returns the condition code produced by dfltcc().
 */
static inline dfltcc_cc dfltcc_cmpr(PREFIX3(streamp) strm) {
    deflate_state *s = (deflate_state *)strm->state;
    size_t in_left = strm->avail_in;
    size_t out_left = strm->avail_out;
    const dfltcc_cc rc = dfltcc(DFLTCC_CMPR | HBT_CIRCULAR,
                                &GET_DFLTCC_STATE(s)->param,
                                &strm->next_out, &out_left,
                                &strm->next_in, &in_left,
                                s->window);

    /* Totals grow by the difference between the old and the new counts. */
    strm->total_in += (strm->avail_in - in_left);
    strm->total_out += (strm->avail_out - out_left);
    strm->avail_in = in_left;
    strm->avail_out = out_left;
    return rc;
}
/* Emit the End-of-block Symbol in software.
 *
 * The symbol (param->eobs) and its bit length (param->eobl) are taken from
 * the parameter block; the symbol is shifted down by (15 - eobl),
 * bit-reversed and appended to the bit buffer, after which pending output is
 * flushed to the stream. Callers in this file clear param->bcf themselves
 * after calling this function.
 */
static inline void send_eobs(PREFIX3(streamp) strm, const struct dfltcc_param_v0 *param) {
deflate_state *state = (deflate_state *)strm->state;
send_bits(state, PREFIX(bi_reverse)(param->eobs >> (15 - param->eobl), param->eobl), param->eobl, state->bi_buf, state->bi_valid);
PREFIX(flush_pending)(strm);
if (state->pending != 0) {
/* The remaining data is located in pending_out[0:pending]. If someone
* calls put_byte() - this might happen in deflate() - the byte will be
* placed into pending_buf[pending], which is incorrect. Move the
* remaining data to the beginning of pending_buf so that put_byte() is
* usable again.
*/
memmove(state->pending_buf, state->pending_out, state->pending);
state->pending_out = state->pending_buf;
}
#ifdef ZLIB_DEBUG
/* Debug-only accounting of the emitted bits. */
state->compressed_len += param->eobl;
#endif
}
/* Compress with DFLTCC on behalf of deflate().
 *
 * Parameters:
 *   strm   - the stream; its buffers, totals and check value are updated.
 *   flush  - the flush mode passed to deflate().
 *   result - out parameter, written whenever 1 is returned; tells deflate()
 *            which post-processing to perform (need_more, block_done or
 *            finish_done).
 *
 * Returns 1 when the call was handled here. Returns 0 when it was not, in
 * which case deflate() falls back to a software compression function; this
 * happens when
 *   - dfltcc_can_deflate() rejects the stream,
 *   - flush is Z_FINISH and there is neither input nor buffered hardware
 *     output, so that software writes the trailing empty block, or
 *   - flush is Z_FINISH and an open non-final block had to be closed first
 *     (need_empty_block).
 *
 * The body is executed repeatedly (label `again`) until either the input or
 * the output buffer is exhausted.
 */
int Z_INTERNAL PREFIX(dfltcc_deflate)(PREFIX3(streamp) strm, int flush, block_state *result) {
deflate_state *state = (deflate_state *)strm->state;
struct dfltcc_deflate_state *dfltcc_state = GET_DFLTCC_DEFLATE_STATE(state);
struct dfltcc_param_v0 *param = &dfltcc_state->common.param;
uInt masked_avail_in;
dfltcc_cc cc;
int need_empty_block;
int soft_bcc;
int no_flush;
if (!PREFIX(dfltcc_can_deflate)(strm)) {
/* Clear history. */
if (flush == Z_FULL_FLUSH)
param->hl = 0;
return 0;
}
again:
/* Per-iteration state: input hidden from the hardware, whether to emit EOBS
 * in software after this call, and the effective "no flush" flag (which may
 * be cleared below even when flush == Z_NO_FLUSH).
 */
masked_avail_in = 0;
soft_bcc = 0;
no_flush = flush == Z_NO_FLUSH;
/* No input data. Return, except when Continuation Flag is set, which means
* that DFLTCC has buffered some output in the parameter block and needs to
* be called again in order to flush it.
*/
if (strm->avail_in == 0 && !param->cf) {
/* A block is still open, and the hardware does not support closing
* blocks without adding data. Thus, close it manually.
*/
if (!no_flush && param->bcf) {
send_eobs(strm, param);
param->bcf = 0;
}
/* Let one of deflate_* functions write a trailing empty block. */
if (flush == Z_FINISH)
return 0;
/* Clear history. */
if (flush == Z_FULL_FLUSH)
param->hl = 0;
/* Trigger block post-processing if necessary. */
*result = no_flush ? need_more : block_done;
return 1;
}
/* There is an open non-BFINAL block, we are not going to close it just
* yet, we have compressed more than DFLTCC_BLOCK_SIZE bytes and we see
* more than DFLTCC_DHT_MIN_SAMPLE_SIZE bytes. Open a new block with a new
* DHT in order to adapt to a possibly changed input data distribution.
*/
if (param->bcf && no_flush &&
strm->total_in > dfltcc_state->block_threshold &&
strm->avail_in >= dfltcc_state->dht_threshold) {
if (param->cf) {
/* We need to flush the DFLTCC buffer before writing the
* End-of-block Symbol. Mask the input data and proceed as usual.
*/
masked_avail_in += strm->avail_in;
strm->avail_in = 0;
no_flush = 0;
} else {
/* DFLTCC buffer is empty, so we can manually write the
* End-of-block Symbol right away.
*/
send_eobs(strm, param);
param->bcf = 0;
dfltcc_state->block_threshold = strm->total_in + dfltcc_state->block_size;
}
}
/* No space for compressed data. If we proceed, dfltcc_cmpr() will return
* DFLTCC_CC_OP1_TOO_SHORT without buffering header bits, but we will still
* set BCF=1, which is wrong. Avoid complications and return early.
*/
if (strm->avail_out == 0) {
*result = need_more;
return 1;
}
/* The caller gave us too much data. Pass only one block worth of
* uncompressed data to DFLTCC and mask the rest, so that on the next
* iteration we start a new block.
*/
if (no_flush && strm->avail_in > dfltcc_state->block_size) {
masked_avail_in += (strm->avail_in - dfltcc_state->block_size);
strm->avail_in = dfltcc_state->block_size;
}
/* When we have an open non-BFINAL deflate block and caller indicates that
* the stream is ending, we need to close an open deflate block and open a
* BFINAL one.
*/
need_empty_block = flush == Z_FINISH && param->bcf && !param->bhf;
/* Translate stream to parameter block */
param->cvt = state->wrap == 2 ? CVT_CRC32 : CVT_ADLER32;
if (!no_flush)
/* We need to close a block. Always do this in software - when there is
* no input data, the hardware will not honor BCC. */
soft_bcc = 1;
if (flush == Z_FINISH && !param->bcf)
/* We are about to open a BFINAL block, set Block Header Final bit
* until the stream ends.
*/
param->bhf = 1;
/* DFLTCC-CMPR will write to next_out, so make sure that buffers with
* higher precedence are empty.
*/
Assert(state->pending == 0, "There must be no pending bytes");
Assert(state->bi_valid < 8, "There must be less than 8 pending bits");
/* Hand the pending sub-byte bits to the hardware through the first output
 * byte and the Sub-Byte Boundary field. */
param->sbb = (unsigned int)state->bi_valid;
if (param->sbb > 0)
*strm->next_out = (unsigned char)state->bi_buf;
/* Honor history and check value */
param->nt = 0;
if (state->wrap == 1)
param->cv = strm->adler;
else if (state->wrap == 2)
param->cv = ZSWAP32(state->crc_fold.value);
/* When opening a block, choose a Huffman-Table Type */
if (!param->bcf) {
if (state->strategy == Z_FIXED || (strm->total_in == 0 && dfltcc_state->block_threshold > 0))
param->htt = HTT_FIXED;
else {
param->htt = HTT_DYNAMIC;
dfltcc_gdht(strm);
}
}
/* Deflate */
do {
cc = dfltcc_cmpr(strm);
if (strm->avail_in < 4096 && masked_avail_in > 0)
/* We are about to call DFLTCC with a small input buffer, which is
* inefficient. Since there is masked data, there will be at least
* one more DFLTCC call, so skip the current one and make the next
* one handle more data.
*/
break;
} while (cc == DFLTCC_CC_AGAIN);
/* Translate parameter block to stream */
strm->msg = oesc_msg(dfltcc_state->common.msg, param->oesc);
state->bi_valid = param->sbb;
if (state->bi_valid == 0)
state->bi_buf = 0; /* Avoid accessing next_out */
else
state->bi_buf = *strm->next_out & ((1 << state->bi_valid) - 1);
if (state->wrap == 1)
strm->adler = param->cv;
else if (state->wrap == 2)
state->crc_fold.value = ZSWAP32(param->cv);
/* Unmask the input data */
strm->avail_in += masked_avail_in;
masked_avail_in = 0;
/* If we encounter an error, it means there is a bug in DFLTCC call */
Assert(cc != DFLTCC_CC_OP2_CORRUPT || param->oesc == 0, "BUG");
/* Update Block-Continuation Flag. It will be used to check whether to call
* GDHT the next time.
*/
if (cc == DFLTCC_CC_OK) {
if (soft_bcc) {
send_eobs(strm, param);
param->bcf = 0;
dfltcc_state->block_threshold = strm->total_in + dfltcc_state->block_size;
} else
param->bcf = 1;
if (flush == Z_FINISH) {
if (need_empty_block)
/* Make the current deflate() call also close the stream */
return 0;
else {
bi_windup(state);
*result = finish_done;
}
} else {
if (flush == Z_FULL_FLUSH)
param->hl = 0; /* Clear history */
*result = flush == Z_NO_FLUSH ? need_more : block_done;
}
} else {
/* The call did not complete: the block stays open and more input or output
 * space is needed. */
param->bcf = 1;
*result = need_more;
}
if (strm->avail_in != 0 && strm->avail_out != 0)
goto again; /* deflate() must use all input or all output */
return 1;
}
/*
Switching between hardware and software compression.
DFLTCC does not support all zlib settings, e.g. generation of non-compressed
blocks or alternative window sizes. When such settings are applied on the
fly with deflateParams, we need to convert between hardware and software
window formats.
*/
/* Report whether the stream shows any sign of prior activity: input has
 * been consumed, the New Task bit has been cleared, or the history is
 * non-empty. Returns 1 if so, 0 otherwise. */
static int dfltcc_was_deflate_used(PREFIX3(streamp) strm) {
    deflate_state *s = (deflate_state *)strm->state;
    struct dfltcc_param_v0 *pb = &GET_DFLTCC_STATE(s)->param;

    if (strm->total_in > 0)
        return 1;
    if (pb->nt == 0)
        return 1;
    return pb->hl > 0;
}
/*
 * Hook for deflateParams(): decide whether changing level/strategy moves the
 * stream between hardware and software compression.
 *
 * When the mode changes and the stream has already been used, *flush is set
 * to Z_FULL_FLUSH: window formats are not converted, the old data is simply
 * dropped. In every other case *flush is left alone. Always returns Z_OK.
 */
int Z_INTERNAL PREFIX(dfltcc_deflate_params)(PREFIX3(streamp) strm, int level, int strategy, int *flush) {
    deflate_state *s = (deflate_state *)strm->state;
    const int hw_before = PREFIX(dfltcc_can_deflate)(strm);
    const int hw_after = dfltcc_can_deflate_with_params(strm, level, s->w_bits, strategy, s->reproducible);

    /* Same mode as before, or nothing compressed yet: nothing to do. */
    if (hw_after != hw_before && dfltcc_was_deflate_used(strm))
        *flush = Z_FULL_FLUSH;

    return Z_OK;
}
/*
 * Tell deflate() whether a flush has been carried out completely.
 *
 * Returns 0 ("not done") when
 *  - flush is Z_FULL_FLUSH and the output buffer is full, because the block
 *    may have been closed without the compression state being reset, or
 *  - DFLTCC is in use and either still holds buffered data (Continuation
 *    Flag) or has not written EOBS yet (Block-Continuation Flag).
 * Returns 1 otherwise.
 */
int Z_INTERNAL PREFIX(dfltcc_deflate_done)(PREFIX3(streamp) strm, int flush) {
    deflate_state *s = (deflate_state *)strm->state;
    struct dfltcc_param_v0 *pb = &GET_DFLTCC_STATE(s)->param;

    if (flush == Z_FULL_FLUSH && strm->avail_out == 0)
        return 0;

    /* Software compression has nothing buffered in the parameter block. */
    if (!PREFIX(dfltcc_can_deflate)(strm))
        return 1;

    return !pb->cf && !pb->bcf;
}
/* Returns 1 when `reproducible` differs from the stream's current setting
 * and the stream has not been used yet (see dfltcc_was_deflate_used());
 * returns 0 otherwise. */
int Z_INTERNAL PREFIX(dfltcc_can_set_reproducible)(PREFIX3(streamp) strm, int reproducible) {
    deflate_state *s = (deflate_state *)strm->state;

    if (reproducible == s->reproducible)
        return 0;

    return !dfltcc_was_deflate_used(strm);
}
/*
Preloading history.
*/
/*
 * Preload the history with a dictionary by appending it to the window via
 * append_history(), then adjust strstart and block_start in the software
 * state. Always returns Z_OK.
 */
int Z_INTERNAL PREFIX(dfltcc_deflate_set_dictionary)(PREFIX3(streamp) strm,
                                                     const unsigned char *dictionary, uInt dict_length) {
    deflate_state *s = (deflate_state *)strm->state;
    struct dfltcc_param_v0 *pb = &GET_DFLTCC_STATE(s)->param;

    append_history(pb, s->window, dictionary, dict_length);

    /* A non-zero strstart makes the zlib header carry FDICT. */
    s->strstart = 1;
    /* Keep block_start in step with strstart so deflate_stored stays happy. */
    s->block_start = s->strstart;

    return Z_OK;
}
/*
 * Retrieve the current history. Both output arguments are optional: when
 * `dictionary` is non-NULL the history is copied into it with get_history(),
 * and when `dict_length` is non-NULL it receives the History Length from the
 * parameter block. Always returns Z_OK.
 */
int Z_INTERNAL PREFIX(dfltcc_deflate_get_dictionary)(PREFIX3(streamp) strm, unsigned char *dictionary, uInt *dict_length) {
    deflate_state *s = (deflate_state *)strm->state;
    struct dfltcc_param_v0 *pb = &GET_DFLTCC_STATE(s)->param;

    if (dictionary != NULL)
        get_history(pb, s->window, dictionary);

    if (dict_length != NULL)
        *dict_length = pb->hl;

    return Z_OK;
}

60
deps/zlib-ng/arch/s390/dfltcc_deflate.h vendored Normal file
View File

@@ -0,0 +1,60 @@
#ifndef DFLTCC_DEFLATE_H
#define DFLTCC_DEFLATE_H
#include "dfltcc_common.h"
/* Functions implemented in dfltcc_deflate.c. */
void Z_INTERNAL *PREFIX(dfltcc_alloc_deflate_state)(PREFIX3(streamp));
void Z_INTERNAL PREFIX(dfltcc_reset_deflate_state)(PREFIX3(streamp));
void Z_INTERNAL PREFIX(dfltcc_copy_deflate_state)(void *dst, const void *src);
int Z_INTERNAL PREFIX(dfltcc_can_deflate)(PREFIX3(streamp) strm);
int Z_INTERNAL PREFIX(dfltcc_deflate)(PREFIX3(streamp) strm, int flush, block_state *result);
int Z_INTERNAL PREFIX(dfltcc_deflate_params)(PREFIX3(streamp) strm, int level, int strategy, int *flush);
int Z_INTERNAL PREFIX(dfltcc_deflate_done)(PREFIX3(streamp) strm, int flush);
int Z_INTERNAL PREFIX(dfltcc_can_set_reproducible)(PREFIX3(streamp) strm, int reproducible);
int Z_INTERNAL PREFIX(dfltcc_deflate_set_dictionary)(PREFIX3(streamp) strm,
const unsigned char *dictionary, uInt dict_length);
int Z_INTERNAL PREFIX(dfltcc_deflate_get_dictionary)(PREFIX3(streamp) strm, unsigned char *dictionary, uInt* dict_length);
/* State allocation and copying go through the DFLTCC-aware variants. */
#define ZALLOC_DEFLATE_STATE PREFIX(dfltcc_alloc_deflate_state)
#define ZCOPY_DEFLATE_STATE PREFIX(dfltcc_copy_deflate_state)
/* If DFLTCC can handle the stream, this macro RETURNS from the enclosing
 * function with the result of dfltcc_deflate_set_dictionary(); otherwise it
 * falls through and the caller's own code continues. */
#define DEFLATE_SET_DICTIONARY_HOOK(strm, dict, dict_len) \
do { \
if (PREFIX(dfltcc_can_deflate)((strm))) \
return PREFIX(dfltcc_deflate_set_dictionary)((strm), (dict), (dict_len)); \
} while (0)
/* Same early-return pattern as DEFLATE_SET_DICTIONARY_HOOK, for reading the
 * dictionary back. */
#define DEFLATE_GET_DICTIONARY_HOOK(strm, dict, dict_len) \
do { \
if (PREFIX(dfltcc_can_deflate)((strm))) \
return PREFIX(dfltcc_deflate_get_dictionary)((strm), (dict), (dict_len)); \
} while (0)
#define DEFLATE_RESET_KEEP_HOOK PREFIX(dfltcc_reset_deflate_state)
/* Calls dfltcc_deflate_params(), which may overwrite *hook_flush, and
 * RETURNS from the enclosing function only if the result is
 * Z_STREAM_ERROR. */
#define DEFLATE_PARAMS_HOOK(strm, level, strategy, hook_flush) \
do { \
int err; \
\
err = PREFIX(dfltcc_deflate_params)((strm), (level), (strategy), (hook_flush)); \
if (err == Z_STREAM_ERROR) \
return err; \
} while (0)
#define DEFLATE_DONE PREFIX(dfltcc_deflate_done)
/* Replaces complen with DEFLATE_BOUND_COMPLEN(source_len) when
 * deflateStateCheck() returns non-zero or DFLTCC can handle the stream.
 * NOTE(review): presumably a non-zero deflateStateCheck() means the stream
 * state is unusable, so the conservative bound is chosen - confirm. */
#define DEFLATE_BOUND_ADJUST_COMPLEN(strm, complen, source_len) \
do { \
if (deflateStateCheck((strm)) || PREFIX(dfltcc_can_deflate)((strm))) \
(complen) = DEFLATE_BOUND_COMPLEN(source_len); \
} while (0)
#define DEFLATE_NEED_CONSERVATIVE_BOUND(strm) (PREFIX(dfltcc_can_deflate)((strm)))
#define DEFLATE_HOOK PREFIX(dfltcc_deflate)
/* Evaluates to false when DFLTCC handles the stream; dfltcc_deflate()
 * exchanges the check value with the hardware through the parameter block. */
#define DEFLATE_NEED_CHECKSUM(strm) (!PREFIX(dfltcc_can_deflate)((strm)))
#define DEFLATE_CAN_SET_REPRODUCIBLE PREFIX(dfltcc_can_set_reproducible)
#endif

312
deps/zlib-ng/arch/s390/dfltcc_detail.h vendored Normal file
View File

@@ -0,0 +1,312 @@
#include "../../zbuild.h"
#include <stdio.h>
#ifdef HAVE_SYS_SDT_H
#include <sys/sdt.h>
#endif
/*
Tuning parameters.
*/
#ifndef DFLTCC_LEVEL_MASK
#define DFLTCC_LEVEL_MASK 0x2
#endif
#ifndef DFLTCC_BLOCK_SIZE
#define DFLTCC_BLOCK_SIZE 1048576
#endif
#ifndef DFLTCC_FIRST_FHT_BLOCK_SIZE
#define DFLTCC_FIRST_FHT_BLOCK_SIZE 4096
#endif
#ifndef DFLTCC_DHT_MIN_SAMPLE_SIZE
#define DFLTCC_DHT_MIN_SAMPLE_SIZE 4096
#endif
#ifndef DFLTCC_RIBM
#define DFLTCC_RIBM 0
#endif
/*
Parameter Block for Query Available Functions.
*/
/* Minimal compile-time assertion: declares an unused static char array whose
 * size is 1 when `c` holds and -1 (a compile error) when it does not. `msg`
 * must be an identifier fragment; it becomes part of the array name so that
 * each assertion gets a distinct symbol.
 *
 * `c` is parenthesized so that the whole argument is used as the condition.
 * Without the parentheses an argument that is itself a conditional
 * expression, e.g. `a ? 0 : 1`, would be parsed as `a ? 0 : (1 ? 1 : -1)`
 * and could yield a zero-sized array, which GCC accepts silently.
 */
#define static_assert(c, msg) __attribute__((unused)) static char static_assert_failed_ ## msg[(c) ? 1 : -1]

/* Result layout of Query Available Functions. The `fns` and `fmts` bitmaps
 * are tested with is_bit_set() (bit 0 is the most significant bit of byte
 * 0), e.g. for DFLTCC_GDHT, DFLTCC_CMPR and DFLTCC_FMT0. */
struct dfltcc_qaf_param {
    char fns[16];      /* available-function bitmap */
    char reserved1[8];
    char fmts[2];      /* available-format bitmap */
    char reserved2[6];
};

/* The structure must be exactly this many bytes. */
#define DFLTCC_SIZEOF_QAF 32
static_assert(sizeof(struct dfltcc_qaf_param) == DFLTCC_SIZEOF_QAF, qaf);
/* Test bit `n` of the bitmap `bits`, where bit 0 is the most significant
 * bit of bits[0]. Returns the masked byte value: non-zero if the bit is
 * set, 0 if it is clear. */
static inline int is_bit_set(const char *bits, int n) {
    const int byte_index = n / 8;
    const int bit_in_byte = n % 8;
    const int mask = 1 << (7 - bit_in_byte);

    return bits[byte_index] & mask;
}
/* Clear bit `n` of the bitmap `bits`, where bit 0 is the most significant
 * bit of bits[0]. All other bits are left unchanged. */
static inline void clear_bit(char *bits, int n) {
    const int byte_index = n / 8;
    const int bit_in_byte = n % 8;
    const int mask = 1 << (7 - bit_in_byte);

    bits[byte_index] &= ~mask;
}
/* Facility bit number that is tested to detect DFLTCC. */
#define DFLTCC_FACILITY 151
/* Query the facility list with STFLE and return non-zero if facility bit
 * DFLTCC_FACILITY is set, 0 otherwise. Bits are numbered as in is_bit_set(),
 * i.e. bit 0 is the most significant bit of the first byte.
 */
static inline int is_dfltcc_enabled(void) {
/* Just enough doublewords to contain bit DFLTCC_FACILITY: 151 / 64 + 1 = 3. */
uint64_t facilities[(DFLTCC_FACILITY / 64) + 1];
/* Variable bound to general register r0, which is passed to the asm below. */
Z_REGISTER uint8_t r0 __asm__("r0");
memset(facilities, 0, sizeof(facilities));
/* Number of doublewords in the buffer, minus one.
 * NOTE(review): presumably the count encoding STFLE expects in r0 - confirm
 * against the architecture documentation. */
r0 = sizeof(facilities) / sizeof(facilities[0]) - 1;
/* STFLE is supported since z9-109 and only in z/Architecture mode. When
* compiling with -m31, gcc defaults to ESA mode, however, since the kernel
* is 64-bit, it's always z/Architecture mode at runtime.
*/
__asm__ volatile(
#ifndef __clang__
".machinemode push\n"
".machinemode zarch\n"
#endif
"stfle %[facilities]\n"
#ifndef __clang__
".machinemode pop\n"
#endif
: [facilities] "=Q" (facilities), [r0] "+r" (r0) :: "cc");
return is_bit_set((const char *)facilities, DFLTCC_FACILITY);
}
/* Bit number tested in dfltcc_qaf_param.fmts to see whether format 0 is available. */
#define DFLTCC_FMT0 0
/*
Parameter Block for Generate Dynamic-Huffman Table, Compress and Expand.
*/
/* Values for the cvt (Check Value Type) field. */
#define CVT_CRC32 0
#define CVT_ADLER32 1
/* Values for the htt (Huffman-Table Type) field. */
#define HTT_FIXED 0
#define HTT_DYNAMIC 1
/* Version-0 parameter block passed to dfltcc() in register r1.
   The exact layout matters: the two assertions after the struct require csb to
   start at byte 384 (DFLTCC_SIZEOF_GDHT_V0) and the whole block to be 1536
   bytes (DFLTCC_SIZEOF_CMPR_XPND_V0). Do not reorder or resize fields.
   NOTE(review): the number in each reservedN name looks like the field's bit
   offset within the block - confirm against the architecture documentation. */
struct dfltcc_param_v0 {
uint16_t pbvn; /* Parameter-Block-Version Number */
uint8_t mvn; /* Model-Version Number */
uint8_t ribm; /* Reserved for IBM use */
uint32_t reserved32 : 31;
uint32_t cf : 1; /* Continuation Flag */
uint8_t reserved64[8];
uint32_t nt : 1; /* New Task */
uint32_t reserved129 : 1;
uint32_t cvt : 1; /* Check Value Type */
uint32_t reserved131 : 1;
uint32_t htt : 1; /* Huffman-Table Type */
uint32_t bcf : 1; /* Block-Continuation Flag */
uint32_t bcc : 1; /* Block Closing Control */
uint32_t bhf : 1; /* Block Header Final */
uint32_t reserved136 : 1;
uint32_t reserved137 : 1;
uint32_t dhtgc : 1; /* DHT Generation Control */
uint32_t reserved139 : 5;
uint32_t reserved144 : 5;
uint32_t sbb : 3; /* Sub-Byte Boundary */
uint8_t oesc; /* Operation-Ending-Supplemental Code */
uint32_t reserved160 : 12;
uint32_t ifs : 4; /* Incomplete-Function Status */
uint16_t ifl; /* Incomplete-Function Length */
uint8_t reserved192[8];
uint8_t reserved256[8];
uint8_t reserved320[4];
uint16_t hl; /* History Length */
uint32_t reserved368 : 1;
uint16_t ho : 15; /* History Offset */
uint32_t cv; /* Check Value */
uint32_t eobs : 15; /* End-of-block Symbol */
uint32_t reserved431: 1;
uint8_t eobl : 4; /* End-of-block Length */
uint32_t reserved436 : 12;
uint32_t reserved448 : 4;
uint16_t cdhtl : 12; /* Compressed-Dynamic-Huffman Table
Length */
uint8_t reserved464[6];
uint8_t cdht[288]; /* Compressed-Dynamic-Huffman Table */
uint8_t reserved[24];
uint8_t ribm2[8]; /* Reserved for IBM use */
uint8_t csb[1152]; /* Continuation-State Buffer */
};
/* Sizes that dfltcc() hands to __msan_unpoison for the GDHT and CMPR/XPND functions. */
#define DFLTCC_SIZEOF_GDHT_V0 384
#define DFLTCC_SIZEOF_CMPR_XPND_V0 1536
static_assert(offsetof(struct dfltcc_param_v0, csb) == DFLTCC_SIZEOF_GDHT_V0, gdht_v0);
static_assert(sizeof(struct dfltcc_param_v0) == DFLTCC_SIZEOF_CMPR_XPND_V0, cmpr_xpnd_v0);
/* Turn an Operation-Ending-Supplemental Code into a message.
   Returns NULL for code 0x00 (successful completion); otherwise formats the
   code into buf and returns buf. The caller owns buf and must make it large
   enough for the formatted text. */
static inline z_const char *oesc_msg(char *buf, int oesc) {
    if (oesc != 0x00) {
        sprintf(buf, "Operation-Ending-Supplemental Code is 0x%.2X", oesc);
        return buf;
    }
    return NULL; /* Successful completion */
}
/*
C wrapper for the DEFLATE CONVERSION CALL instruction.
*/
/* Condition codes returned by dfltcc(); the wrapper returns a 2-bit value, so
   the range is 0..3. */
typedef enum {
DFLTCC_CC_OK = 0,
DFLTCC_CC_OP1_TOO_SHORT = 1,
/* The next two names share the value 2 on purpose: callers tell them apart by
   also checking the parameter block's oesc field (non-zero together with
   DFLTCC_CC_OP2_CORRUPT is treated as a corrupted stream by dfltcc_inflate). */
DFLTCC_CC_OP2_TOO_SHORT = 2,
DFLTCC_CC_OP2_CORRUPT = 2,
DFLTCC_CC_AGAIN = 3,
} dfltcc_cc;
/* Function codes passed as the fn argument of dfltcc(). */
#define DFLTCC_QAF 0
#define DFLTCC_GDHT 1
#define DFLTCC_CMPR 2
#define DFLTCC_XPND 4
/* Flag OR-ed into fn (see dfltcc_xpnd); it sits above the bits kept by DFLTCC_FN_MASK. */
#define HBT_CIRCULAR (1 << 7)
/* Mask that isolates the function code from fn. */
#define DFLTCC_FN_MASK ((1 << 7) - 1)
/* History buffer size: 1 << 15 = 32768 bytes. */
#define HB_BITS 15
#define HB_SIZE (1 << HB_BITS)
/* Issue one DEFLATE CONVERSION CALL.
   fn     - function code, optionally OR-ed with HBT_CIRCULAR; loaded into r0.
   param  - parameter block; loaded into r1.
   op1/len1 - first operand address and length (r2/r3); may be NULL.
   op2/len2 - second operand address and length (r4/r5); may be NULL.
   hist   - passed to the instruction as its third register operand.
   On return every non-NULL op/len pointer is updated with the register value
   left by the instruction. Returns the 2-bit condition code. */
static inline dfltcc_cc dfltcc(int fn, void *param,
unsigned char **op1, size_t *len1,
z_const unsigned char **op2, size_t *len2, void *hist) {
/* Snapshot the operands; a NULL pointer becomes a NULL address / zero length. */
unsigned char *t2 = op1 ? *op1 : NULL;
#ifdef Z_MEMORY_SANITIZER
/* Remember where output started so the written range can be unpoisoned below. */
unsigned char *orig_t2 = t2;
#endif
size_t t3 = len1 ? *len1 : 0;
z_const unsigned char *t4 = op2 ? *op2 : NULL;
size_t t5 = len2 ? *len2 : 0;
/* Pin each value to the specific general register named in its __asm__ label. */
Z_REGISTER int r0 __asm__("r0") = fn;
Z_REGISTER void *r1 __asm__("r1") = param;
Z_REGISTER unsigned char *r2 __asm__("r2") = t2;
Z_REGISTER size_t r3 __asm__("r3") = t3;
Z_REGISTER z_const unsigned char *r4 __asm__("r4") = t4;
Z_REGISTER size_t r5 __asm__("r5") = t5;
int cc;
/* The instruction is emitted through .insn with opcode 0xb939; ipm then
   copies the program mask (including the condition code) into cc. When
   <sys/sdt.h> is available, SystemTap probes bracket the instruction. */
__asm__ volatile(
#ifdef HAVE_SYS_SDT_H
STAP_PROBE_ASM(zlib, dfltcc_entry, STAP_PROBE_ASM_TEMPLATE(5))
#endif
".insn rrf,0xb9390000,%[r2],%[r4],%[hist],0\n"
#ifdef HAVE_SYS_SDT_H
STAP_PROBE_ASM(zlib, dfltcc_exit, STAP_PROBE_ASM_TEMPLATE(5))
#endif
"ipm %[cc]\n"
: [r2] "+r" (r2)
, [r3] "+r" (r3)
, [r4] "+r" (r4)
, [r5] "+r" (r5)
, [cc] "=r" (cc)
: [r0] "r" (r0)
, [r1] "r" (r1)
, [hist] "r" (hist)
#ifdef HAVE_SYS_SDT_H
, STAP_PROBE_ASM_OPERANDS(5, r2, r3, r4, r5, hist)
#endif
: "cc", "memory");
/* Read back the operand registers as updated by the instruction. */
t2 = r2; t3 = r3; t4 = r4; t5 = r5;
#ifdef Z_MEMORY_SANITIZER
/* MemorySanitizer cannot see stores made by the instruction, so mark the
   parameter block (sized per function) and any produced output as initialized. */
switch (fn & DFLTCC_FN_MASK) {
case DFLTCC_QAF:
__msan_unpoison(param, DFLTCC_SIZEOF_QAF);
break;
case DFLTCC_GDHT:
__msan_unpoison(param, DFLTCC_SIZEOF_GDHT_V0);
break;
case DFLTCC_CMPR:
__msan_unpoison(param, DFLTCC_SIZEOF_CMPR_XPND_V0);
/* One extra byte is unpoisoned when sbb is non-zero (a partially filled byte). */
__msan_unpoison(orig_t2, t2 - orig_t2 + (((struct dfltcc_param_v0 *)param)->sbb == 0 ? 0 : 1));
break;
case DFLTCC_XPND:
__msan_unpoison(param, DFLTCC_SIZEOF_CMPR_XPND_V0);
__msan_unpoison(orig_t2, t2 - orig_t2);
break;
}
#endif
/* Publish the updated operands to the caller. */
if (op1)
*op1 = t2;
if (len1)
*len1 = t3;
if (op2)
*op2 = t4;
if (len2)
*len2 = t5;
/* The condition code is taken from bits 28-29 of the value stored by ipm. */
return (cc >> 28) & 3;
}
/*
Extension of inflate_state and deflate_state. Must be doubleword-aligned.
*/
/* Located by GET_DFLTCC_STATE() directly after the inflate/deflate state,
   rounded up to an 8-byte boundary. */
struct dfltcc_state {
struct dfltcc_param_v0 param; /* Parameter block. */
/* Filled by dfltcc_reset_state(): the QAF result when the facility is enabled, all zeroes otherwise. */
struct dfltcc_qaf_param af; /* Available functions. */
/* Storage that oesc_msg() formats into before the result is assigned to strm->msg. */
char msg[64]; /* Buffer for strm->msg */
};
/* Round p up to the next multiple of size and yield it in the type of p.
   size must be a power of two, because the rounding is done with a bit mask.
   The mask is computed in uintptr_t so that a 32-bit unsigned size cannot
   clear the upper bits of a 64-bit value, and the whole expansion is
   parenthesised so it is safe inside any surrounding expression.
   Relies on the GNU __typeof__ extension. */
#define ALIGN_UP(p, size) \
    ((__typeof__(p))(((uintptr_t)(p) + ((uintptr_t)(size) - 1)) & ~((uintptr_t)(size) - 1)))
/* Locate the struct dfltcc_state extension stored right after *state, at the
   next 8-byte boundary past sizeof(*state). */
#define GET_DFLTCC_STATE(state) \
    ((struct dfltcc_state *)((char *)(state) + ALIGN_UP(sizeof(*(state)), 8)))
/* Allocate one object through the stream's ZALLOC hook that holds `size` bytes
   of base state, padded to an 8-byte boundary, followed by `extension_size`
   bytes for the DFLTCC extension. Returns whatever ZALLOC returns. */
static inline void *dfltcc_alloc_state(PREFIX3(streamp) strm, uInt size, uInt extension_size) {
    const uInt total = ALIGN_UP(size, 8) + extension_size;

    return ZALLOC(strm, 1, total);
}
static inline void dfltcc_reset_state(struct dfltcc_state *dfltcc_state) {
/* Initialize available functions */
if (is_dfltcc_enabled()) {
dfltcc(DFLTCC_QAF, &dfltcc_state->param, NULL, NULL, NULL, NULL, NULL);
memmove(&dfltcc_state->af, &dfltcc_state->param, sizeof(dfltcc_state->af));
} else
memset(&dfltcc_state->af, 0, sizeof(dfltcc_state->af));
/* Initialize parameter block */
memset(&dfltcc_state->param, 0, sizeof(dfltcc_state->param));
dfltcc_state->param.nt = 1;
dfltcc_state->param.ribm = DFLTCC_RIBM;
}
/* Copy a base state of `size` bytes (padded to an 8-byte boundary) together
   with its `extension_size`-byte DFLTCC extension from src to dst. */
static inline void dfltcc_copy_state(void *dst, const void *src, uInt size, uInt extension_size) {
    uInt nbytes = ALIGN_UP(size, 8);

    nbytes += extension_size;
    memcpy(dst, src, nbytes);
}
/* Append `count` bytes from buf to the circular history buffer described by
   param->ho (offset of the oldest byte) and param->hl (number of valid bytes).
   Only the last HB_SIZE bytes of the input are kept; when the buffer
   overflows, the oldest bytes are dropped by advancing ho. */
static inline void append_history(struct dfltcc_param_v0 *param, unsigned char *history,
                                  const unsigned char *buf, uInt count) {
    size_t write_pos;
    size_t total;

    /* Anything before the trailing HB_SIZE bytes would be overwritten anyway */
    if (count > HB_SIZE) {
        buf += count - HB_SIZE;
        count = HB_SIZE;
    }

    /* New data goes right after the newest byte currently stored */
    write_pos = (param->ho + param->hl) % HB_SIZE;
    if (write_pos + count > HB_SIZE) {
        /* Wraps: fill up to the end of the buffer, then continue at the start */
        size_t head = HB_SIZE - write_pos;

        memcpy(history + write_pos, buf, head);
        memcpy(history, buf + head, count - head);
    } else {
        /* Fits without wrapping */
        memcpy(history + write_pos, buf, count);
    }

    total = param->hl + count;
    if (total > HB_SIZE) {
        /* Overflow: drop the oldest bytes by moving the start forward */
        param->ho = (param->ho + (total - HB_SIZE)) % HB_SIZE;
        param->hl = HB_SIZE;
    } else {
        param->hl = total;
    }
}
/* Copy the param->hl valid bytes of the circular history buffer into buf in
   oldest-to-newest order, starting at offset param->ho. */
static inline void get_history(struct dfltcc_param_v0 *param, const unsigned char *history,
                               unsigned char *buf) {
    if (param->ho + param->hl > HB_SIZE) {
        /* Wrapped: the tail of the buffer comes first, then the part at the start */
        memcpy(buf, history + param->ho, HB_SIZE - param->ho);
        memcpy(buf + HB_SIZE - param->ho, history, param->ho + param->hl - HB_SIZE);
        return;
    }

    /* Contiguous: a single copy is enough */
    memcpy(buf, history + param->ho, param->hl);
}

205
deps/zlib-ng/arch/s390/dfltcc_inflate.c vendored Normal file
View File

@@ -0,0 +1,205 @@
/* dfltcc_inflate.c - IBM Z DEFLATE CONVERSION CALL decompression support. */
/*
Use the following commands to build zlib-ng with DFLTCC decompression support:
$ ./configure --with-dfltcc-inflate
or
$ cmake -DWITH_DFLTCC_INFLATE=1 .
and then
$ make
*/
#include "zbuild.h"
#include "zutil.h"
#include "inftrees.h"
#include "inflate.h"
#include "dfltcc_inflate.h"
#include "dfltcc_detail.h"
/* Allocate an inflate_state with room for the trailing dfltcc_state extension. */
struct inflate_state Z_INTERNAL *PREFIX(dfltcc_alloc_inflate_state)(PREFIX3(streamp) strm) {
    void *mem = dfltcc_alloc_state(strm, sizeof(struct inflate_state), sizeof(struct dfltcc_state));

    return (struct inflate_state *)mem;
}
/* Reset the DFLTCC extension that follows the stream's inflate state. */
void Z_INTERNAL PREFIX(dfltcc_reset_inflate_state)(PREFIX3(streamp) strm) {
    struct inflate_state *inf = (struct inflate_state *)strm->state;

    dfltcc_reset_state(GET_DFLTCC_STATE(inf));
}
/* Copy an inflate_state together with its trailing dfltcc_state extension. */
void Z_INTERNAL PREFIX(dfltcc_copy_inflate_state)(struct inflate_state *dst, const struct inflate_state *src) {
    const uInt base_size = sizeof(struct inflate_state);
    const uInt ext_size = sizeof(struct dfltcc_state);

    dfltcc_copy_state(dst, src, base_size, ext_size);
}
/* Return 1 when the recorded available-functions block reports both the XPND
   function and parameter-block format 0, and 0 otherwise. */
int Z_INTERNAL PREFIX(dfltcc_can_inflate)(PREFIX3(streamp) strm) {
    struct inflate_state *inf = (struct inflate_state *)strm->state;
    const struct dfltcc_qaf_param *avail = &GET_DFLTCC_STATE(inf)->af;

    /* Expand function missing */
    if (!is_bit_set(avail->fns, DFLTCC_XPND))
        return 0;
    /* Parameter block format 0 missing */
    if (!is_bit_set(avail->fmts, DFLTCC_FMT0))
        return 0;
    return 1;
}
/* Run one XPND call with a circular history buffer on the stream's current
   input and output, then write the remaining byte counts back to the stream.
   Returns the condition code of the call. */
static inline dfltcc_cc dfltcc_xpnd(PREFIX3(streamp) strm) {
    struct inflate_state *inf = (struct inflate_state *)strm->state;
    /* dfltcc() needs size_t lengths, so go through locals */
    size_t in_left = strm->avail_in;
    size_t out_left = strm->avail_out;
    dfltcc_cc rc;

    rc = dfltcc(DFLTCC_XPND | HBT_CIRCULAR, &GET_DFLTCC_STATE(inf)->param,
                &strm->next_out, &out_left,
                &strm->next_in, &in_left, inf->window);
    strm->avail_in = in_left;
    strm->avail_out = out_left;
    return rc;
}
/* Hardware inflate step, invoked through INFLATE_TYPEDO_HOOK.
   Returns an action telling the caller what to do next:
   DFLTCC_INFLATE_CONTINUE, DFLTCC_INFLATE_BREAK or DFLTCC_INFLATE_SOFTWARE.
   *ret is only written (to Z_STREAM_ERROR) when a block-boundary flush is
   requested after hardware decompression has already been used. */
dfltcc_inflate_action Z_INTERNAL PREFIX(dfltcc_inflate)(PREFIX3(streamp) strm, int flush, int *ret) {
struct inflate_state *state = (struct inflate_state *)strm->state;
struct dfltcc_state *dfltcc_state = GET_DFLTCC_STATE(state);
struct dfltcc_param_v0 *param = &dfltcc_state->param;
dfltcc_cc cc;
if (flush == Z_BLOCK || flush == Z_TREES) {
/* DFLTCC does not support stopping on block boundaries */
if (PREFIX(dfltcc_inflate_disable)(strm)) {
/* Hardware was already used, so switching to software is impossible. */
*ret = Z_STREAM_ERROR;
return DFLTCC_INFLATE_BREAK;
} else
return DFLTCC_INFLATE_SOFTWARE;
}
/* A previous call finished the deflate data: hand over to the CHECK mode. */
if (state->last) {
if (state->bits != 0) {
/* Step past the partially consumed input byte. */
strm->next_in++;
strm->avail_in--;
state->bits = 0;
}
state->mode = CHECK;
return DFLTCC_INFLATE_CONTINUE;
}
/* No input and the continuation flag is clear: nothing to do for now. */
if (strm->avail_in == 0 && !param->cf)
return DFLTCC_INFLATE_BREAK;
/* The window doubles as the history buffer passed to the instruction. */
if (PREFIX(inflate_ensure_window)(state)) {
state->mode = MEM;
return DFLTCC_INFLATE_CONTINUE;
}
/* Translate stream to parameter block */
param->cvt = ((state->wrap & 4) && state->flags) ? CVT_CRC32 : CVT_ADLER32;
param->sbb = state->bits;
if (param->hl)
param->nt = 0; /* Honor history for the first block */
if (state->wrap & 4)
param->cv = state->flags ? ZSWAP32(state->check) : state->check;
/* Inflate */
/* Re-issue the call for as long as the condition code asks for it. */
do {
cc = dfltcc_xpnd(strm);
} while (cc == DFLTCC_CC_AGAIN);
/* Translate parameter block to stream */
strm->msg = oesc_msg(dfltcc_state->msg, param->oesc);
state->last = cc == DFLTCC_CC_OK;
state->bits = param->sbb;
if (state->wrap & 4)
strm->adler = state->check = state->flags ? ZSWAP32(param->cv) : param->cv;
/* Condition code 2 means "corrupt" only when oesc is also non-zero; with a
   zero oesc it means the second operand was too short. */
if (cc == DFLTCC_CC_OP2_CORRUPT && param->oesc != 0) {
/* Report an error if stream is corrupted */
state->mode = BAD;
return DFLTCC_INFLATE_CONTINUE;
}
state->mode = TYPEDO;
/* Break if operands are exhausted, otherwise continue looping */
return (cc == DFLTCC_CC_OP1_TOO_SHORT || cc == DFLTCC_CC_OP2_TOO_SHORT) ?
DFLTCC_INFLATE_BREAK : DFLTCC_INFLATE_CONTINUE;
}
/* Return 1 once the parameter block's New Task flag has been cleared, and 0
   while it is still set. */
int Z_INTERNAL PREFIX(dfltcc_was_inflate_used)(PREFIX3(streamp) strm) {
    struct inflate_state *inf = (struct inflate_state *)strm->state;

    return GET_DFLTCC_STATE(inf)->param.nt == 0;
}
/*
   Rotates the byte range [start, end) left in place so that the byte at pivot
   becomes the first byte and the bytes that preceded it follow the old tail.
   pivot must lie within [start, end]. When pivot equals start or end the range
   is already in its final order and is left untouched.
   The implementation is based on https://cplusplus.com/reference/algorithm/rotate/
*/
static void rotate(unsigned char *start, unsigned char *pivot, unsigned char *end) {
    unsigned char *p = pivot;
    unsigned char tmp;

    /* Degenerate pivots need no work. Without this guard, pivot == end would
     * make the loop below read and write *end and then run past the buffer.
     */
    if (pivot == start || pivot == end)
        return;

    while (p != start) {
        tmp = *start;
        *start = *p;
        *p = tmp;
        start++;
        p++;
        if (p == end)
            p = pivot;
        else if (start == pivot)
            pivot = p;
    }
}
/* Switch the stream from hardware to software inflate.
   Returns 0 when software inflate can proceed (the hardware was unavailable
   to begin with, or was available but not used yet) and 1 when the hardware
   has already decompressed data, in which case the switch is impossible. */
int Z_INTERNAL PREFIX(dfltcc_inflate_disable)(PREFIX3(streamp) strm) {
    struct inflate_state *state = (struct inflate_state *)strm->state;
    struct dfltcc_state *ext = GET_DFLTCC_STATE(state);

    if (PREFIX(dfltcc_can_inflate)(strm)) {
        /* DFLTCC has already decompressed some data. Since there is not
         * enough information to resume decompression in software, the call
         * must fail.
         */
        if (PREFIX(dfltcc_was_inflate_used)(strm))
            return 1;

        /* Not used yet: forget the hardware so every later check picks software */
        memset(&ext->af, 0, sizeof(ext->af));

        /* Turn the circular hardware history into the linear software window */
        rotate(state->window, state->window + ext->param.ho, state->window + HB_SIZE);
        state->wnext = MIN(ext->param.hl, state->wsize);
        state->whave = state->wnext;
    }
    return 0;
}
/*
Preloading history.
*/
/* Preload the hardware history with a dictionary.
   Returns Z_MEM_ERROR (and sets the state's mode to MEM) when the window
   cannot be provided, Z_OK otherwise. */
int Z_INTERNAL PREFIX(dfltcc_inflate_set_dictionary)(PREFIX3(streamp) strm,
                                                     const unsigned char *dictionary, uInt dict_length) {
    struct inflate_state *state = (struct inflate_state *)strm->state;

    if (PREFIX(inflate_ensure_window)(state) != 0) {
        state->mode = MEM;
        return Z_MEM_ERROR;
    }

    /* The window is the history buffer handed to the instruction */
    append_history(&GET_DFLTCC_STATE(state)->param, state->window, dictionary, dict_length);
    state->havedict = 1;
    return Z_OK;
}
/* Read back the current history.
   When both `dictionary` and the window exist, the history bytes are copied to
   `dictionary`; when `dict_length` is non-NULL it receives the history length.
   Always returns Z_OK. */
int Z_INTERNAL PREFIX(dfltcc_inflate_get_dictionary)(PREFIX3(streamp) strm,
                                                     unsigned char *dictionary, uInt *dict_length) {
    struct inflate_state *state = (struct inflate_state *)strm->state;
    struct dfltcc_param_v0 *pb = &GET_DFLTCC_STATE(state)->param;

    if (dictionary != NULL && state->window != NULL)
        get_history(pb, state->window, dictionary);
    if (dict_length != NULL)
        *dict_length = pb->hl;
    return Z_OK;
}

70
deps/zlib-ng/arch/s390/dfltcc_inflate.h vendored Normal file
View File

@@ -0,0 +1,70 @@
#ifndef DFLTCC_INFLATE_H
#define DFLTCC_INFLATE_H
#include "dfltcc_common.h"
struct inflate_state Z_INTERNAL *PREFIX(dfltcc_alloc_inflate_state)(PREFIX3(streamp) strm);
void Z_INTERNAL PREFIX(dfltcc_reset_inflate_state)(PREFIX3(streamp) strm);
void Z_INTERNAL PREFIX(dfltcc_copy_inflate_state)(struct inflate_state *dst, const struct inflate_state *src);
int Z_INTERNAL PREFIX(dfltcc_can_inflate)(PREFIX3(streamp) strm);
/* What INFLATE_TYPEDO_HOOK does after dfltcc_inflate() returns. */
typedef enum {
/* The hook executes `break`. */
DFLTCC_INFLATE_CONTINUE,
/* The hook jumps to the inf_leave label. */
DFLTCC_INFLATE_BREAK,
/* The hook takes neither branch and falls through to the code that follows it. */
DFLTCC_INFLATE_SOFTWARE,
} dfltcc_inflate_action;
dfltcc_inflate_action Z_INTERNAL PREFIX(dfltcc_inflate)(PREFIX3(streamp) strm, int flush, int *ret);
int Z_INTERNAL PREFIX(dfltcc_was_inflate_used)(PREFIX3(streamp) strm);
int Z_INTERNAL PREFIX(dfltcc_inflate_disable)(PREFIX3(streamp) strm);
int Z_INTERNAL PREFIX(dfltcc_inflate_set_dictionary)(PREFIX3(streamp) strm,
const unsigned char *dictionary, uInt dict_length);
int Z_INTERNAL PREFIX(dfltcc_inflate_get_dictionary)(PREFIX3(streamp) strm,
unsigned char *dictionary, uInt* dict_length);
/* Hook macros that map generic inflate extension points onto the DFLTCC
   functions declared above. Several of them contain `return`, `break` or
   `goto`, so they act on the control flow of the function that expands them. */
#define ZALLOC_INFLATE_STATE PREFIX(dfltcc_alloc_inflate_state)
#define ZCOPY_INFLATE_STATE PREFIX(dfltcc_copy_inflate_state)
#define INFLATE_RESET_KEEP_HOOK PREFIX(dfltcc_reset_inflate_state)
/* Returns Z_STREAM_ERROR from the enclosing function when hardware inflate
   cannot be disabled. The bits and value arguments are not used. */
#define INFLATE_PRIME_HOOK(strm, bits, value) \
do { if (PREFIX(dfltcc_inflate_disable)((strm))) return Z_STREAM_ERROR; } while (0)
/* Requires RESTORE(), LOAD(), a variable `ret` and a label `inf_leave` in the
   expanding scope, and an enclosing construct for `break` to leave. */
#define INFLATE_TYPEDO_HOOK(strm, flush) \
if (PREFIX(dfltcc_can_inflate)((strm))) { \
dfltcc_inflate_action action; \
\
RESTORE(); \
action = PREFIX(dfltcc_inflate)((strm), (flush), &ret); \
LOAD(); \
if (action == DFLTCC_INFLATE_CONTINUE) \
break; \
else if (action == DFLTCC_INFLATE_BREAK) \
goto inf_leave; \
}
/* Both evaluate to non-zero only when hardware inflate is not available for the stream. */
#define INFLATE_NEED_CHECKSUM(strm) (!PREFIX(dfltcc_can_inflate)((strm)))
#define INFLATE_NEED_UPDATEWINDOW(strm) (!PREFIX(dfltcc_can_inflate)((strm)))
/* Returns -(1L << 16) from the enclosing function once hardware inflate has been used. */
#define INFLATE_MARK_HOOK(strm) \
do { \
if (PREFIX(dfltcc_was_inflate_used)((strm))) return -(1L << 16); \
} while (0)
/* Returns Z_STREAM_ERROR from the enclosing function once hardware inflate has been used. */
#define INFLATE_SYNC_POINT_HOOK(strm) \
do { \
if (PREFIX(dfltcc_was_inflate_used)((strm))) return Z_STREAM_ERROR; \
} while (0)
/* When hardware inflate is available, the two dictionary hooks return the
   result of the DFLTCC implementation from the enclosing function. */
#define INFLATE_SET_DICTIONARY_HOOK(strm, dict, dict_len) \
do { \
if (PREFIX(dfltcc_can_inflate)((strm))) \
return PREFIX(dfltcc_inflate_set_dictionary)((strm), (dict), (dict_len)); \
} while (0)
#define INFLATE_GET_DICTIONARY_HOOK(strm, dict, dict_len) \
do { \
if (PREFIX(dfltcc_can_inflate)((strm))) \
return PREFIX(dfltcc_inflate_get_dictionary)((strm), (dict), (dict_len)); \
} while (0)
#endif

14
deps/zlib-ng/arch/s390/s390_features.c vendored Normal file
View File

@@ -0,0 +1,14 @@
#include "../../zbuild.h"
#include "s390_features.h"
#ifdef HAVE_SYS_AUXV_H
# include <sys/auxv.h>
#endif
#ifndef HWCAP_S390_VXRS
#define HWCAP_S390_VXRS HWCAP_S390_VX
#endif
/* Fill in *features from the hardware capability word reported by the kernel.
   has_vx is non-zero when the HWCAP_S390_VXRS bit is set in AT_HWCAP.
   <sys/auxv.h> is only included when HAVE_SYS_AUXV_H is defined, so the lookup
   is guarded the same way; without the header the feature is reported as
   absent instead of breaking the build. */
void Z_INTERNAL s390_check_features(struct s390_cpu_features *features) {
#ifdef HAVE_SYS_AUXV_H
    features->has_vx = getauxval(AT_HWCAP) & HWCAP_S390_VXRS;
#else
    /* No way to query the kernel: take the safe answer */
    features->has_vx = 0;
#endif
}

10
deps/zlib-ng/arch/s390/s390_features.h vendored Normal file
View File

@@ -0,0 +1,10 @@
#ifndef S390_FEATURES_H_
#define S390_FEATURES_H_
/* CPU capabilities filled in by s390_check_features(). */
struct s390_cpu_features {
/* Non-zero when the HWCAP_S390_VXRS bit is set in the AT_HWCAP auxiliary value. */
int has_vx;
};
void Z_INTERNAL s390_check_features(struct s390_cpu_features *features);
#endif

View File

@@ -0,0 +1,45 @@
# Self-Hosted IBM Z Github Actions Runner.

# Temporary image: amd64 dependencies.
FROM amd64/ubuntu:20.04 AS ld-prefix
ENV DEBIAN_FRONTEND=noninteractive
# The whole filesystem of this stage is copied into the main image below, so
# drop the apt package lists to keep them out of the copy.
RUN apt-get update \
    && apt-get -y install ca-certificates libicu66 libssl1.1 \
    && rm -rf /var/lib/apt/lists/*

# Main image.
FROM s390x/ubuntu:20.04

# Packages for zlib-ng testing.
ENV DEBIAN_FRONTEND=noninteractive
RUN apt-get update \
    && apt-get -y install \
        clang-11 \
        cmake \
        curl \
        gcc \
        git \
        jq \
        libxml2-dev \
        libxslt-dev \
        llvm-11-tools \
        ninja-build \
        python-is-python3 \
        python3 \
        python3-dev \
        python3-pip \
    && rm -rf /var/lib/apt/lists/*

# amd64 dependencies.
COPY --from=ld-prefix / /usr/x86_64-linux-gnu/
RUN ln -fs ../lib/x86_64-linux-gnu/ld-linux-x86-64.so.2 /usr/x86_64-linux-gnu/lib64/
RUN ln -fs /etc/resolv.conf /usr/x86_64-linux-gnu/etc/
ENV QEMU_LD_PREFIX=/usr/x86_64-linux-gnu

# amd64 Github Actions Runner.
RUN useradd -m actions-runner
USER actions-runner
WORKDIR /home/actions-runner
# --fail stops curl from piping an HTTP error page into tar.
RUN curl --fail -L https://github.com/actions/runner/releases/download/v2.287.1/actions-runner-linux-x64-2.287.1.tar.gz | tar -xz
VOLUME /home/actions-runner

# Scripts.
COPY fs/ /
ENTRYPOINT ["/usr/bin/entrypoint"]
CMD ["/usr/bin/actions-runner"]

View File

@@ -0,0 +1,24 @@
[Unit]
Description=Self-Hosted IBM Z Github Actions Runner
# Pull in and order after the qemu-user-static unit.
Wants=qemu-user-static
After=qemu-user-static
# 0 disables start rate limiting, so Restart=always below is never throttled.
StartLimitIntervalSec=0
[Service]
Type=simple
Restart=always
# Remove a leftover container from a previous run; the leading "-" makes
# systemd ignore a failure of this command.
ExecStartPre=-/usr/bin/docker rm --force actions-runner
# NOTE(review): /etc/actions-runner presumably provides the repo and
# access_token variables that the runner startup script expects - confirm.
ExecStart=/usr/bin/docker run \
--env-file=/etc/actions-runner \
--init \
--interactive \
--name=actions-runner \
--rm \
--volume=actions-runner:/home/actions-runner \
iiilinuxibmcom/actions-runner
# Stop sequence: send SIGINT to every process in the container, wait for the
# container to exit, then remove it.
ExecStop=/bin/sh -c "docker exec actions-runner kill -INT -- -1"
ExecStop=/bin/sh -c "docker wait actions-runner"
ExecStop=/bin/sh -c "docker rm actions-runner"
[Install]
WantedBy=multi-user.target

View File

@@ -0,0 +1,40 @@
#!/bin/bash
#
# Ephemeral runner startup script.
#
# Expects the following environment variables:
#
# - repo=<owner>/<name>
# - access_token=<ghp_***>
#

set -e -u

# Check the cached registration token.
token_file=registration-token.json
set +e
expires_at=$(jq --raw-output .expires_at "$token_file" 2>/dev/null)
status=$?
set -e

if [[ $status -ne 0 || $(date +%s) -ge $(date -d "$expires_at" +%s) ]]; then
    # Refresh the cached registration token.
    # --fail makes curl exit non-zero on an HTTP error instead of saving the
    # error body as if it were a token; "set -e" then aborts the script.
    curl \
        --fail \
        -X POST \
        -H "Accept: application/vnd.github.v3+json" \
        -H "Authorization: token $access_token" \
        "https://api.github.com/repos/$repo/actions/runners/registration-token" \
        -o "$token_file"
fi

# (Re-)register the runner.
# --exit-status makes jq fail when .token is missing or null, so the runner is
# never configured with the literal string "null".
registration_token=$(jq --raw-output --exit-status .token "$token_file")
./config.sh remove --token "$registration_token" || true
./config.sh \
    --url "https://github.com/$repo" \
    --token "$registration_token" \
    --labels z15 \
    --ephemeral

# Run one job.
./run.sh

View File

@@ -0,0 +1,30 @@
#!/bin/bash
#
# Container entrypoint that waits for all spawned processes.
#
# Usage: entrypoint [command [args...]]
# Runs the given command, or an interactive bash when no command is given, and
# exits with the command's exit status.
#
set -e -u
# Create a FIFO and start reading from its read end.
tempdir=$(mktemp -d "/tmp/done.XXXXXXXXXX")
# Remove the temporary directory (and the FIFO inside it) when this script exits.
trap 'rm -r "$tempdir"' EXIT
done="$tempdir/pipe"
mkfifo "$done"
# The background cat only finishes once every writer of the FIFO has closed it.
cat "$done" & waiter=$!
# Start the workload. Its descendants will inherit the FIFO's write end.
status=0
# The write end is opened as file descriptor 9 of the workload only. The
# "|| status=$?" keeps "set -e" from aborting on a failing workload.
if [ "$#" -eq 0 ]; then
bash 9>"$done" || status=$?
else
"$@" 9>"$done" || status=$?
fi
# When the workload and all of its descendants exit, the FIFO's write end will
# be closed and `cat "$done"` will exit. Wait until it happens. This is needed
# in order to handle SelfUpdater, which the workload may start in background
# before exiting.
wait "$waiter"
exit "$status"

View File

@@ -0,0 +1,11 @@
[Unit]
Description=Support for transparent execution of non-native binaries with QEMU user emulation
[Service]
# Runs the container once to completion; nothing stays running afterwards.
Type=oneshot
# The source code for iiilinuxibmcom/qemu-user-static is at https://github.com/iii-i/qemu-user-static/tree/v6.1.0-1
# TODO: replace it with multiarch/qemu-user-static once version >6.1 is available
# NOTE(review): --privileged is presumably needed so the container can register
# binfmt handlers on the host - confirm before tightening.
ExecStart=/usr/bin/docker run --rm --interactive --privileged iiilinuxibmcom/qemu-user-static --reset -p yes
[Install]
WantedBy=multi-user.target

147
deps/zlib-ng/arch/x86/Makefile.in vendored Normal file
View File

@@ -0,0 +1,147 @@
# Makefile for zlib
# Copyright (C) 1995-2013 Jean-loup Gailly, Mark Adler
# For conditions of distribution and use, see copyright notice in zlib.h
# Toolchain settings. They are empty in this template.
# NOTE(review): presumably filled in when Makefile.in is turned into Makefile
# (distclean removes the generated Makefile) - confirm against the configure script.
CC=
CFLAGS=
SFLAGS=
INCLUDES=
SUFFIX=
# Per-feature compiler flags, added only to the objects that need them.
AVX512FLAG=-mavx512f -mavx512dq -mavx512vl -mavx512bw
AVX512VNNIFLAG=-mavx512vnni
AVX2FLAG=-mavx2
SSE2FLAG=-msse2
SSSE3FLAG=-mssse3
SSE42FLAG=-msse4.2
PCLMULFLAG=-mpclmul
VPCLMULFLAG=-mvpclmulqdq
XSAVEFLAG=-mxsave
NOLTOFLAG=
# Source locations: SRCDIR is this directory, SRCTOP the project root two levels up.
SRCDIR=.
SRCTOP=../..
TOPDIR=$(SRCTOP)
TOPDIR=$(SRCTOP)
# "all" names no file; declare it phony so a stray file called "all" in this
# directory can never make the target look up to date.
.PHONY: all

# Build every x86-specific object in both variants: .o is compiled with
# CFLAGS, .lo with SFLAGS.
all: \
	x86_features.o x86_features.lo \
	adler32_avx2.o adler32_avx2.lo \
	adler32_avx512.o adler32_avx512.lo \
	adler32_avx512_vnni.o adler32_avx512_vnni.lo \
	adler32_sse42.o adler32_sse42.lo \
	adler32_ssse3.o adler32_ssse3.lo \
	chunkset_avx2.o chunkset_avx2.lo \
	chunkset_sse2.o chunkset_sse2.lo \
	chunkset_ssse3.o chunkset_ssse3.lo \
	compare256_avx2.o compare256_avx2.lo \
	compare256_sse2.o compare256_sse2.lo \
	insert_string_sse42.o insert_string_sse42.lo \
	crc32_pclmulqdq.o crc32_pclmulqdq.lo \
	crc32_vpclmulqdq.o crc32_vpclmulqdq.lo \
	slide_hash_avx2.o slide_hash_avx2.lo \
	slide_hash_sse2.o slide_hash_sse2.lo
# Every rule lists its source file as a prerequisite, matching the adler32
# rules in this file, so that editing a source file rebuilds its objects.

# NOTE(review): unlike every other .lo rule, x86_features.lo is built without
# -DPIC - confirm this is intentional.
x86_features.o: $(SRCDIR)/x86_features.c
	$(CC) $(CFLAGS) $(XSAVEFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/x86_features.c

x86_features.lo: $(SRCDIR)/x86_features.c
	$(CC) $(SFLAGS) $(XSAVEFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/x86_features.c

chunkset_avx2.o: $(SRCDIR)/chunkset_avx2.c
	$(CC) $(CFLAGS) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_avx2.c

chunkset_avx2.lo: $(SRCDIR)/chunkset_avx2.c
	$(CC) $(SFLAGS) $(AVX2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_avx2.c

chunkset_sse2.o: $(SRCDIR)/chunkset_sse2.c
	$(CC) $(CFLAGS) $(SSE2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_sse2.c

chunkset_sse2.lo: $(SRCDIR)/chunkset_sse2.c
	$(CC) $(SFLAGS) $(SSE2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_sse2.c

chunkset_ssse3.o: $(SRCDIR)/chunkset_ssse3.c
	$(CC) $(CFLAGS) $(SSSE3FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_ssse3.c

chunkset_ssse3.lo: $(SRCDIR)/chunkset_ssse3.c
	$(CC) $(SFLAGS) $(SSSE3FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_ssse3.c

compare256_avx2.o: $(SRCDIR)/compare256_avx2.c
	$(CC) $(CFLAGS) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_avx2.c

compare256_avx2.lo: $(SRCDIR)/compare256_avx2.c
	$(CC) $(SFLAGS) $(AVX2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_avx2.c

compare256_sse2.o: $(SRCDIR)/compare256_sse2.c
	$(CC) $(CFLAGS) $(SSE2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_sse2.c

compare256_sse2.lo: $(SRCDIR)/compare256_sse2.c
	$(CC) $(SFLAGS) $(SSE2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_sse2.c

insert_string_sse42.o: $(SRCDIR)/insert_string_sse42.c
	$(CC) $(CFLAGS) $(SSE42FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_sse42.c

insert_string_sse42.lo: $(SRCDIR)/insert_string_sse42.c
	$(CC) $(SFLAGS) $(SSE42FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_sse42.c

crc32_pclmulqdq.o: $(SRCDIR)/crc32_pclmulqdq.c
	$(CC) $(CFLAGS) $(PCLMULFLAG) $(SSE42FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_pclmulqdq.c

crc32_pclmulqdq.lo: $(SRCDIR)/crc32_pclmulqdq.c
	$(CC) $(SFLAGS) $(PCLMULFLAG) $(SSE42FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_pclmulqdq.c

crc32_vpclmulqdq.o: $(SRCDIR)/crc32_vpclmulqdq.c
	$(CC) $(CFLAGS) $(PCLMULFLAG) $(SSE42FLAG) $(VPCLMULFLAG) $(AVX512FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_vpclmulqdq.c

crc32_vpclmulqdq.lo: $(SRCDIR)/crc32_vpclmulqdq.c
	$(CC) $(SFLAGS) $(PCLMULFLAG) $(SSE42FLAG) $(VPCLMULFLAG) $(AVX512FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_vpclmulqdq.c

slide_hash_avx2.o: $(SRCDIR)/slide_hash_avx2.c
	$(CC) $(CFLAGS) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_avx2.c

slide_hash_avx2.lo: $(SRCDIR)/slide_hash_avx2.c
	$(CC) $(SFLAGS) $(AVX2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_avx2.c

slide_hash_sse2.o: $(SRCDIR)/slide_hash_sse2.c
	$(CC) $(CFLAGS) $(SSE2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_sse2.c

slide_hash_sse2.lo: $(SRCDIR)/slide_hash_sse2.c
	$(CC) $(SFLAGS) $(SSE2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_sse2.c
# Adler-32 objects. Each rule names its source file as a prerequisite, so the
# object is rebuilt when that file changes. The .o variant is compiled with
# CFLAGS, the .lo variant with SFLAGS plus -DPIC.
adler32_avx2.o: $(SRCDIR)/adler32_avx2.c
$(CC) $(CFLAGS) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx2.c
adler32_avx2.lo: $(SRCDIR)/adler32_avx2.c
$(CC) $(SFLAGS) $(AVX2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx2.c
adler32_avx512.o: $(SRCDIR)/adler32_avx512.c
$(CC) $(CFLAGS) $(AVX512FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx512.c
adler32_avx512.lo: $(SRCDIR)/adler32_avx512.c
$(CC) $(SFLAGS) $(AVX512FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx512.c
# NOTE(review): the VNNI objects are built with AVX512VNNIFLAG only, not
# AVX512FLAG - confirm -mavx512vnni alone enables everything the source needs.
adler32_avx512_vnni.o: $(SRCDIR)/adler32_avx512_vnni.c
$(CC) $(CFLAGS) $(AVX512VNNIFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx512_vnni.c
adler32_avx512_vnni.lo: $(SRCDIR)/adler32_avx512_vnni.c
$(CC) $(SFLAGS) $(AVX512VNNIFLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx512_vnni.c
adler32_ssse3.o: $(SRCDIR)/adler32_ssse3.c
$(CC) $(CFLAGS) $(SSSE3FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_ssse3.c
adler32_ssse3.lo: $(SRCDIR)/adler32_ssse3.c
$(CC) $(SFLAGS) $(SSSE3FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_ssse3.c
adler32_sse42.o: $(SRCDIR)/adler32_sse42.c
$(CC) $(CFLAGS) $(SSE42FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_sse42.c
adler32_sse42.lo: $(SRCDIR)/adler32_sse42.c
$(CC) $(SFLAGS) $(SSE42FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_sse42.c
# None of these targets produces a file of its own name; declare them phony so
# that a stray file called "clean" (or similar) cannot turn them into no-ops.
.PHONY: mostlyclean clean distclean

mostlyclean: clean

# Remove objects, editor backups and coverage data.
clean:
	rm -f *.o *.lo *~
	rm -rf objs
	rm -f *.gcda *.gcno *.gcov

# Additionally remove the generated Makefile.
distclean: clean
	rm -f Makefile

17
deps/zlib-ng/arch/x86/adler32_avx2.c vendored Normal file
View File

@@ -0,0 +1,17 @@
/* adler32_avx2.c -- compute the Adler-32 checksum of a data stream
* Copyright (C) 1995-2011 Mark Adler
* Authors:
* Brian Bockelman <bockelman@gmail.com>
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include <immintrin.h>
/* The template header is included twice on purpose. Without COPY it defines
   adler32_avx2(); with COPY defined it defines adler32_fold_copy_avx2(), which
   also stores every input byte to dst. */
#ifdef X86_AVX2
#include "adler32_avx2_tpl.h"
#define COPY
#include "adler32_avx2_tpl.h"
#endif

32
deps/zlib-ng/arch/x86/adler32_avx2_p.h vendored Normal file
View File

@@ -0,0 +1,32 @@
/* adler32_avx2_p.h -- adler32 avx2 utility functions
* Copyright (C) 2022 Adam Stylinski
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifndef ADLER32_AVX2_P_H_
#define ADLER32_AVX2_P_H_
#if defined(X86_AVX2) || defined(X86_AVX512VNNI)
/* 32 bit horizontal sum, adapted from Agner Fog's vector library. */
/* Horizontal sum of the eight 32-bit lanes of x (wrapping at 32 bits).
   Adapted from Agner Fog's vector library. */
static inline uint32_t hsum256(__m256i x) {
    /* 8 lanes -> 4: add the upper 128-bit half onto the lower half */
    __m128i upper = _mm256_extracti128_si256(x, 1);
    __m128i lower = _mm256_castsi256_si128(x);
    __m128i quad = _mm_add_epi32(upper, lower);
    /* 4 lanes -> 2: add the upper 64 bits onto the lower 64 bits */
    __m128i pair = _mm_add_epi32(quad, _mm_unpackhi_epi64(quad, quad));
    /* 2 lanes -> 1: add lane 1 onto lane 0 */
    __m128i total = _mm_add_epi32(pair, _mm_shuffle_epi32(pair, 1));

    return (uint32_t)_mm_cvtsi128_si32(total);
}
/* Sum of the even-numbered 32-bit lanes (0, 2, 4 and 6) of x, wrapping at
   32 bits. The odd lanes are ignored; the callers expect them to be zero. */
static inline uint32_t partial_hsum256(__m256i x) {
    /* Gather lanes 0, 2, 4, 6 into the lower 128 bits; the upper half of the
     * permuted vector is never read */
    const __m256i even_lanes = _mm256_setr_epi32(0, 2, 4, 6, 1, 1, 1, 1);
    __m256i gathered = _mm256_permutevar8x32_epi32(x, even_lanes);
    __m128i quad = _mm256_castsi256_si128(gathered);
    /* 4 lanes -> 2 -> 1 */
    __m128i pair = _mm_add_epi32(quad, _mm_unpackhi_epi64(quad, quad));
    __m128i total = _mm_add_epi32(pair, _mm_shuffle_epi32(pair, 1));

    return (uint32_t)_mm_cvtsi128_si32(total);
}
#endif
#endif

141
deps/zlib-ng/arch/x86/adler32_avx2_tpl.h vendored Normal file
View File

@@ -0,0 +1,141 @@
/* adler32_avx2_tpl.h -- adler32 avx2 vectorized function templates
* Copyright (C) 2022 Adam Stylinski
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "../../zbuild.h"
#include <immintrin.h>
#include "../../adler32_fold.h"
#include "../../adler32_p.h"
#include "../../fallback_builtins.h"
#include "adler32_avx2_p.h"
/* Helpers for inputs of 16 to 31 bytes, which are too short for a 32-byte
   vector iteration. With X86_SSE42 they forward to the SSE4.2 / SSSE3
   implementations. Otherwise they fall back to the scalar *_len_16 helpers; in
   that case the first macro argument is ignored and adler0 / adler1 are taken
   from the scope where the macro is expanded. */
#ifdef X86_SSE42
extern uint32_t adler32_fold_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
extern uint32_t adler32_ssse3(uint32_t adler, const uint8_t *src, size_t len);
#define copy_sub32(a, b, c, d) adler32_fold_copy_sse42(a, b, c, d)
#define sub32(a, b, c) adler32_ssse3(a, b, c)
#else
#define copy_sub32(a, b, c, d) adler32_copy_len_16(adler0, c, b, d, adler1)
#define sub32(a, b, c) adler32_len_16(adler0, b, c, adler1)
#endif
#ifdef COPY
Z_INTERNAL uint32_t adler32_fold_copy_avx2(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
#else
Z_INTERNAL uint32_t adler32_avx2(uint32_t adler, const uint8_t *src, size_t len) {
#endif
if (src == NULL) return 1L;
if (len == 0) return adler;
uint32_t adler0, adler1;
adler1 = (adler >> 16) & 0xffff;
adler0 = adler & 0xffff;
rem_peel:
if (len < 16) {
#ifdef COPY
return adler32_copy_len_16(adler0, src, dst, len, adler1);
#else
return adler32_len_16(adler0, src, len, adler1);
#endif
} else if (len < 32) {
#ifdef COPY
return copy_sub32(adler, dst, src, len);
#else
return sub32(adler, src, len);
#endif
}
__m256i vs1, vs2;
const __m256i dot2v = _mm256_setr_epi8(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15,
14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
const __m256i dot3v = _mm256_set1_epi16(1);
const __m256i zero = _mm256_setzero_si256();
while (len >= 32) {
vs1 = _mm256_zextsi128_si256(_mm_cvtsi32_si128(adler0));
vs2 = _mm256_zextsi128_si256(_mm_cvtsi32_si128(adler1));
__m256i vs1_0 = vs1;
__m256i vs3 = _mm256_setzero_si256();
size_t k = MIN(len, NMAX);
k -= k % 32;
len -= k;
while (k >= 32) {
/*
vs1 = adler + sum(c[i])
vs2 = sum2 + 32 vs1 + sum( (32-i+1) c[i] )
*/
__m256i vbuf = _mm256_loadu_si256((__m256i*)src);
src += 32;
k -= 32;
__m256i vs1_sad = _mm256_sad_epu8(vbuf, zero); // Sum of abs diff, resulting in 2 x int32's
//
#ifdef COPY
_mm256_storeu_si256((__m256i*)dst, vbuf);
dst += 32;
#endif
vs1 = _mm256_add_epi32(vs1, vs1_sad);
vs3 = _mm256_add_epi32(vs3, vs1_0);
__m256i v_short_sum2 = _mm256_maddubs_epi16(vbuf, dot2v); // sum 32 uint8s to 16 shorts
__m256i vsum2 = _mm256_madd_epi16(v_short_sum2, dot3v); // sum 16 shorts to 8 uint32s
vs2 = _mm256_add_epi32(vsum2, vs2);
vs1_0 = vs1;
}
/* Defer the multiplication with 32 to outside of the loop */
vs3 = _mm256_slli_epi32(vs3, 5);
vs2 = _mm256_add_epi32(vs2, vs3);
/* The compiler is generating the following sequence for this integer modulus
* when done the scalar way, in GPRs:
adler = (s1_unpack[0] % BASE) + (s1_unpack[1] % BASE) + (s1_unpack[2] % BASE) + (s1_unpack[3] % BASE) +
(s1_unpack[4] % BASE) + (s1_unpack[5] % BASE) + (s1_unpack[6] % BASE) + (s1_unpack[7] % BASE);
mov $0x80078071,%edi // move magic constant into 32 bit register %edi
...
vmovd %xmm1,%esi // move vector lane 0 to 32 bit register %esi
mov %rsi,%rax // zero-extend this value to 64 bit precision in %rax
imul %rdi,%rsi // do a signed multiplication with magic constant and vector element
shr $0x2f,%rsi // shift right by 47
imul $0xfff1,%esi,%esi // do a signed multiplication with value truncated to 32 bits with 0xfff1
sub %esi,%eax // subtract lower 32 bits of original vector value from modified one above
...
// repeats for each element with vpextract instructions
This is tricky with AVX2 for a number of reasons:
1.) There's no 64 bit multiplication instruction, but there is a sequence to get there
2.) There's ways to extend vectors to 64 bit precision, but no simple way to truncate
back down to 32 bit precision later (there is in AVX512)
3.) Full width integer multiplications aren't cheap
We can, however, do a relatively cheap sequence for horizontal sums.
Then, we simply do the integer modulus on the resulting 64 bit GPR, on a scalar value. It was
previously thought that casting to 64 bit precision was needed prior to the horizontal sum, but
that is simply not the case, as NMAX is defined as the maximum number of scalar sums that can be
performed on the maximum possible inputs before overflow
*/
/* In AVX2-land, this trip through GPRs will probably be unavoidable, as there's no cheap and easy
* conversion from 64 bit integer to 32 bit (needed for the inexpensive modulus with a constant).
* This casting to 32 bit is cheap through GPRs (just register aliasing). See above for exactly
* what the compiler is doing to avoid integer divisions. */
adler0 = partial_hsum256(vs1) % BASE;
adler1 = hsum256(vs2) % BASE;
}
adler = adler0 | (adler1 << 16);
if (len) {
goto rem_peel;
}
return adler;
}

16
deps/zlib-ng/arch/x86/adler32_avx512.c vendored Normal file
View File

@@ -0,0 +1,16 @@
/* adler32_avx512.c -- compute the Adler-32 checksum of a data stream
* Copyright (C) 1995-2011 Mark Adler
* Authors:
* Adam Stylinski <kungfujesus06@gmail.com>
* Brian Bockelman <bockelman@gmail.com>
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifdef X86_AVX512
/* The template header is deliberately included twice. The first pass (COPY
 * undefined) emits adler32_avx512(); the second pass (COPY defined) emits
 * adler32_fold_copy_avx512(), which also stores every loaded block to dst. */
#include "adler32_avx512_tpl.h"
#define COPY
#include "adler32_avx512_tpl.h"
#endif

View File

@@ -0,0 +1,46 @@
#ifndef AVX512_FUNCS_H
#define AVX512_FUNCS_H
#include <immintrin.h>
#include <stdint.h>
/* Written because *_add_epi32(a) sets off ubsan */
static inline uint32_t _mm512_reduce_add_epu32(__m512i x) {
__m256i a = _mm512_extracti64x4_epi64(x, 1);
__m256i b = _mm512_extracti64x4_epi64(x, 0);
__m256i a_plus_b = _mm256_add_epi32(a, b);
__m128i c = _mm256_extracti128_si256(a_plus_b, 1);
__m128i d = _mm256_extracti128_si256(a_plus_b, 0);
__m128i c_plus_d = _mm_add_epi32(c, d);
__m128i sum1 = _mm_unpackhi_epi64(c_plus_d, c_plus_d);
__m128i sum2 = _mm_add_epi32(sum1, c_plus_d);
__m128i sum3 = _mm_shuffle_epi32(sum2, 0x01);
__m128i sum4 = _mm_add_epi32(sum2, sum3);
return _mm_cvtsi128_si32(sum4);
}
static inline uint32_t partial_hsum(__m512i x) {
/* We need a permutation vector to extract every other integer. The
* rest are going to be zeros. Marking this const so the compiler stands
* a better chance of keeping this resident in a register through entire
* loop execution. We certainly have enough zmm registers (32) */
const __m512i perm_vec = _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14,
1, 1, 1, 1, 1, 1, 1, 1);
__m512i non_zero = _mm512_permutexvar_epi32(perm_vec, x);
/* From here, it's a simple 256 bit wide reduction sum */
__m256i non_zero_avx = _mm512_castsi512_si256(non_zero);
/* See Agner Fog's vectorclass for a decent reference. Essentially, phadd is
* pretty slow, much slower than the longer instruction sequence below */
__m128i sum1 = _mm_add_epi32(_mm256_extracti128_si256(non_zero_avx, 1),
_mm256_castsi256_si128(non_zero_avx));
__m128i sum2 = _mm_add_epi32(sum1,_mm_unpackhi_epi64(sum1, sum1));
__m128i sum3 = _mm_add_epi32(sum2,_mm_shuffle_epi32(sum2, 1));
return (uint32_t)_mm_cvtsi128_si32(sum3);
}
#endif

View File

@@ -0,0 +1,106 @@
/* adler32_avx512_tpl.h -- adler32 avx512 vectorized function templates
* Copyright (C) 2022 Adam Stylinski
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "../../zbuild.h"
#include "../../adler32_p.h"
#include "../../adler32_fold.h"
#include "../../cpu_features.h"
#include "../../fallback_builtins.h"
#include <immintrin.h>
#include "adler32_avx512_p.h"
#ifdef X86_AVX512
/* Adler-32 over `len` bytes at `src`, continuing from the running checksum
 * `adler`, consuming 64 bytes per vector step.
 *
 * The COPY switch selects which function this template defines: without COPY
 * it is adler32_avx512(); with COPY it is adler32_fold_copy_avx512(), which
 * additionally writes every byte it reads from `src` to `dst`.
 *
 * Returns 1 when src is NULL, `adler` unchanged when len is 0, otherwise the
 * updated checksum packed as (high sum << 16) | low sum. */
#ifdef COPY
Z_INTERNAL uint32_t adler32_fold_copy_avx512(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
#else
Z_INTERNAL uint32_t adler32_avx512(uint32_t adler, const uint8_t *src, size_t len) {
#endif
if (src == NULL) return 1L;
if (len == 0) return adler;
/* Split the packed checksum: adler0 is the low 16 bits, adler1 the high 16 bits */
uint32_t adler0, adler1;
adler1 = (adler >> 16) & 0xffff;
adler0 = adler & 0xffff;
/* Re-entered from the bottom of the function when fewer than 64 bytes are left */
rem_peel:
if (len < 64) {
/* This handles the remaining copies, just call normal adler checksum after this */
#ifdef COPY
/* len is in 1..63 here (len == 0 returned earlier and the goto is guarded by
 * `if (len)`), so the shift count stays in 1..63 and the mask has exactly
 * `len` low bits set: only the remaining bytes are loaded and stored */
__mmask64 storemask = (0xFFFFFFFFFFFFFFFFUL >> (64 - len));
__m512i copy_vec = _mm512_maskz_loadu_epi8(storemask, src);
_mm512_mask_storeu_epi8(dst, storemask, copy_vec);
#endif
/* Checksum the short tail with the widest narrower implementation compiled in */
#ifdef X86_AVX2
return adler32_avx2(adler, src, len);
#elif defined(X86_SSSE3)
return adler32_ssse3(adler, src, len);
#else
return adler32_len_16(adler0, src, len, adler1);
#endif
}
__m512i vbuf, vs1_0, vs3;
/* Per-byte weights for the second sum. _mm512_set_epi8 lists the highest byte
 * first, so byte 0 of a block is weighted 64 and byte 63 is weighted 1 */
const __m512i dot2v = _mm512_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37,
38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55,
56, 57, 58, 59, 60, 61, 62, 63, 64);
/* All-ones multiplier: madd_epi16 against it just adds adjacent 16-bit pairs */
const __m512i dot3v = _mm512_set1_epi16(1);
const __m512i zero = _mm512_setzero_si512();
size_t k;
while (len >= 64) {
/* Seed element 0 of each accumulator with the running scalar sums */
__m512i vs1 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(adler0));
__m512i vs2 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(adler1));
vs1_0 = vs1;
vs3 = _mm512_setzero_si512();
/* Take at most NMAX bytes before reducing, rounded down to whole 64-byte blocks */
k = MIN(len, NMAX);
k -= k % 64;
len -= k;
while (k >= 64) {
/*
vs1 = adler + sum(c[i])
vs2 = sum2 + 64 vs1 + sum( (64-i+1) c[i] )
*/
vbuf = _mm512_loadu_si512(src);
#ifdef COPY
_mm512_storeu_si512(dst, vbuf);
dst += 64;
#endif
src += 64;
k -= 64;
/* Byte sums of the block (sum of absolute differences against zero) */
__m512i vs1_sad = _mm512_sad_epu8(vbuf, zero);
/* Weighted byte products, pairwise-added into 16-bit lanes */
__m512i v_short_sum2 = _mm512_maddubs_epi16(vbuf, dot2v);
vs1 = _mm512_add_epi32(vs1_sad, vs1);
/* Accumulate the vs1 value from before this block; it is scaled by 64 after the loop */
vs3 = _mm512_add_epi32(vs3, vs1_0);
/* Widen the 16-bit weighted sums to 32 bits */
__m512i vsum2 = _mm512_madd_epi16(v_short_sum2, dot3v);
vs2 = _mm512_add_epi32(vsum2, vs2);
vs1_0 = vs1;
}
/* Deferred multiplication by 64 (the block size) */
vs3 = _mm512_slli_epi32(vs3, 6);
vs2 = _mm512_add_epi32(vs2, vs3);
/* Horizontal reduction back to scalars, then reduce modulo BASE */
adler0 = partial_hsum(vs1) % BASE;
adler1 = _mm512_reduce_add_epu32(vs2) % BASE;
}
adler = adler0 | (adler1 << 16);
/* Process tail (len < 64). */
if (len) {
goto rem_peel;
}
return adler;
}
#endif

View File

@@ -0,0 +1,225 @@
/* adler32_avx512_vnni.c -- compute the Adler-32 checksum of a data stream
* Based on Brian Bockelman's AVX2 version
* Copyright (C) 1995-2011 Mark Adler
* Authors:
* Adam Stylinski <kungfujesus06@gmail.com>
* Brian Bockelman <bockelman@gmail.com>
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifdef X86_AVX512VNNI
#include "../../zbuild.h"
#include "../../adler32_p.h"
#include "../../cpu_features.h"
#include "../../fallback_builtins.h"
#include <immintrin.h>
#include "../../adler32_fold.h"
#include "adler32_avx512_p.h"
#include "adler32_avx2_p.h"
/* Adler-32 over `len` bytes at `src`, continuing from the running checksum
 * `adler`, using AVX-512 VNNI dot-product instructions on 64-byte blocks.
 *
 * Inputs shorter than 64 bytes (and the tail left after the vector loop) are
 * handed to the widest narrower implementation that was compiled in.
 *
 * Returns 1 when src is NULL, `adler` unchanged when len is 0, otherwise the
 * updated checksum packed as (high sum << 16) | low sum. */
Z_INTERNAL uint32_t adler32_avx512_vnni(uint32_t adler, const uint8_t *src, size_t len) {
    if (src == NULL) return 1L;
    if (len == 0) return adler;

    /* Split the packed checksum: adler0 is the low 16 bits, adler1 the high 16 bits */
    uint32_t adler0, adler1;
    adler1 = (adler >> 16) & 0xffff;
    adler0 = adler & 0xffff;

/* Re-entered from the bottom of the function when fewer than 64 bytes are left */
rem_peel:
    if (len < 32) {
#if defined(X86_SSSE3)
        return adler32_ssse3(adler, src, len);
#else
        return adler32_len_16(adler0, src, len, adler1);
#endif
    }

    if (len < 64) {
#ifdef X86_AVX2
        return adler32_avx2(adler, src, len);
#elif defined(X86_SSSE3)
        /* This guard used to test X86_SSE3, a macro nothing else here checks,
         * so builds without AVX2 always fell through to the scalar loop. */
        return adler32_ssse3(adler, src, len);
#else
        return adler32_len_16(adler0, src, len, adler1);
#endif
    }

    /* Per-byte weights for the second sum. _mm512_set_epi8 lists the highest
     * byte first, so byte 0 of a block is weighted 64 and byte 63 is weighted 1 */
    const __m512i dot2v = _mm512_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
                                          20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37,
                                          38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55,
                                          56, 57, 58, 59, 60, 61, 62, 63, 64);

    const __m512i zero = _mm512_setzero_si512();
    __m512i vs1, vs2;

    while (len >= 64) {
        /* Seed element 0 of each accumulator with the running scalar sums */
        vs1 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(adler0));
        vs2 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(adler1));
        /* Take at most NMAX bytes before reducing, rounded down to whole 64-byte blocks */
        size_t k = MIN(len, NMAX);
        k -= k % 64;
        len -= k;
        __m512i vs1_0 = vs1;
        __m512i vs3 = _mm512_setzero_si512();
        /* We might get a tad bit more ILP here if we sum to a second register in the loop */
        __m512i vs2_1 = _mm512_setzero_si512();
        __m512i vbuf0, vbuf1;

        /* Remainder peeling: consume one block so the unrolled loop sees a multiple of 128 */
        if (k % 128) {
            vbuf1 = _mm512_loadu_si512((__m512i*)src);

            src += 64;
            k -= 64;

            __m512i vs1_sad = _mm512_sad_epu8(vbuf1, zero);
            vs1 = _mm512_add_epi32(vs1, vs1_sad);
            vs3 = _mm512_add_epi32(vs3, vs1_0);
            vs2 = _mm512_dpbusd_epi32(vs2, vbuf1, dot2v);
            vs1_0 = vs1;
        }

        /* Manually unrolled this loop by 2 for a decent amount of ILP */
        while (k >= 128) {
            /*
               vs1 = adler + sum(c[i])
               vs2 = sum2 + 64 vs1 + sum( (64-i+1) c[i] )
            */
            vbuf0 = _mm512_loadu_si512((__m512i*)src);
            vbuf1 = _mm512_loadu_si512((__m512i*)(src + 64));
            src += 128;
            k -= 128;

            __m512i vs1_sad = _mm512_sad_epu8(vbuf0, zero);
            vs1 = _mm512_add_epi32(vs1, vs1_sad);
            /* vs3 collects the vs1 value seen before each block; scaled by 64 below */
            vs3 = _mm512_add_epi32(vs3, vs1_0);
            /* multiply-add, resulting in 16 ints. Fuse with sum stage from prior versions, as we now have the dp
             * instructions to eliminate them */
            vs2 = _mm512_dpbusd_epi32(vs2, vbuf0, dot2v);

            vs3 = _mm512_add_epi32(vs3, vs1);
            vs1_sad = _mm512_sad_epu8(vbuf1, zero);
            vs1 = _mm512_add_epi32(vs1, vs1_sad);
            vs2_1 = _mm512_dpbusd_epi32(vs2_1, vbuf1, dot2v);
            vs1_0 = vs1;
        }

        /* Deferred multiplication by 64 (the block size), then merge both vs2 accumulators */
        vs3 = _mm512_slli_epi32(vs3, 6);
        vs2 = _mm512_add_epi32(vs2, vs3);
        vs2 = _mm512_add_epi32(vs2, vs2_1);

        /* Horizontal reduction back to scalars, then reduce modulo BASE */
        adler0 = partial_hsum(vs1) % BASE;
        adler1 = _mm512_reduce_add_epu32(vs2) % BASE;
    }

    adler = adler0 | (adler1 << 16);

    /* Process tail (len < 64). */
    if (len) {
        goto rem_peel;
    }

    return adler;
}
/* Copy `len` bytes from `src` to `dst` while computing their Adler-32,
 * continuing from the running checksum `adler`. Unlike adler32_avx512_vnni()
 * this variant works on 256-bit vectors, 32 bytes per block.
 *
 * Returns 1 when src is NULL, `adler` unchanged when len is 0, otherwise the
 * updated checksum packed as (high sum << 16) | low sum. */
Z_INTERNAL uint32_t adler32_fold_copy_avx512_vnni(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
if (src == NULL) return 1L;
if (len == 0) return adler;
/* Split the packed checksum: adler0 is the low 16 bits, adler1 the high 16 bits */
uint32_t adler0, adler1;
adler1 = (adler >> 16) & 0xffff;
adler0 = adler & 0xffff;
/* Re-entered from the bottom of the function when fewer than 32 bytes are left */
rem_peel_copy:
if (len < 32) {
/* This handles the remaining copies, just call normal adler checksum after this */
/* len is in 1..31 here (len == 0 returned earlier and the goto is guarded by
 * `if (len)`), so the mask has exactly `len` low bits set */
__mmask32 storemask = (0xFFFFFFFFUL >> (32 - len));
__m256i copy_vec = _mm256_maskz_loadu_epi8(storemask, src);
_mm256_mask_storeu_epi8(dst, storemask, copy_vec);
#if defined(X86_SSSE3)
return adler32_ssse3(adler, src, len);
#else
return adler32_len_16(adler0, src, len, adler1);
#endif
}
/* Per-byte weights for the second sum. _mm256_set_epi8 lists the highest byte
 * first, so byte 0 of a block is weighted 32 and byte 31 is weighted 1 */
const __m256i dot2v = _mm256_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32);
const __m256i zero = _mm256_setzero_si256();
__m256i vs1, vs2;
while (len >= 32) {
/* Seed element 0 of each accumulator with the running scalar sums */
vs1 = _mm256_zextsi128_si256(_mm_cvtsi32_si128(adler0));
vs2 = _mm256_zextsi128_si256(_mm_cvtsi32_si128(adler1));
/* Take at most NMAX bytes before reducing, rounded down to whole 32-byte blocks */
size_t k = MIN(len, NMAX);
k -= k % 32;
len -= k;
__m256i vs1_0 = vs1;
__m256i vs3 = _mm256_setzero_si256();
/* We might get a tad bit more ILP here if we sum to a second register in the loop */
__m256i vs2_1 = _mm256_setzero_si256();
__m256i vbuf0, vbuf1;
/* Remainder peeling: consume one block so the unrolled loop sees a multiple of 64 */
if (k % 64) {
vbuf1 = _mm256_loadu_si256((__m256i*)src);
_mm256_storeu_si256((__m256i*)dst, vbuf1);
dst += 32;
src += 32;
k -= 32;
__m256i vs1_sad = _mm256_sad_epu8(vbuf1, zero);
vs1 = _mm256_add_epi32(vs1, vs1_sad);
vs3 = _mm256_add_epi32(vs3, vs1_0);
vs2 = _mm256_dpbusd_epi32(vs2, vbuf1, dot2v);
vs1_0 = vs1;
}
/* Manually unrolled this loop by 2 for a decent amount of ILP */
while (k >= 64) {
/*
vs1 = adler + sum(c[i])
vs2 = sum2 + 32 vs1 + sum( (32-i+1) c[i] )
*/
vbuf0 = _mm256_loadu_si256((__m256i*)src);
vbuf1 = _mm256_loadu_si256((__m256i*)(src + 32));
_mm256_storeu_si256((__m256i*)dst, vbuf0);
_mm256_storeu_si256((__m256i*)(dst + 32), vbuf1);
dst += 64;
src += 64;
k -= 64;
__m256i vs1_sad = _mm256_sad_epu8(vbuf0, zero);
vs1 = _mm256_add_epi32(vs1, vs1_sad);
/* vs3 collects the vs1 value seen before each block; scaled by 32 below */
vs3 = _mm256_add_epi32(vs3, vs1_0);
/* multiply-add, resulting in 8 ints. Fuse with sum stage from prior versions, as we now have the dp
* instructions to eliminate them */
vs2 = _mm256_dpbusd_epi32(vs2, vbuf0, dot2v);
vs3 = _mm256_add_epi32(vs3, vs1);
vs1_sad = _mm256_sad_epu8(vbuf1, zero);
vs1 = _mm256_add_epi32(vs1, vs1_sad);
vs2_1 = _mm256_dpbusd_epi32(vs2_1, vbuf1, dot2v);
vs1_0 = vs1;
}
/* Deferred multiplication by 32 (the block size), then merge both vs2 accumulators */
vs3 = _mm256_slli_epi32(vs3, 5);
vs2 = _mm256_add_epi32(vs2, vs3);
vs2 = _mm256_add_epi32(vs2, vs2_1);
/* Horizontal reduction back to scalars, then reduce modulo BASE */
adler0 = partial_hsum256(vs1) % BASE;
adler1 = hsum256(vs2) % BASE;
}
adler = adler0 | (adler1 << 16);
/* Process tail (len < 32). */
if (len) {
goto rem_peel_copy;
}
return adler;
}
#endif

121
deps/zlib-ng/arch/x86/adler32_sse42.c vendored Normal file
View File

@@ -0,0 +1,121 @@
/* adler32_sse42.c -- compute the Adler-32 checksum of a data stream
* Copyright (C) 1995-2011 Mark Adler
* Authors:
* Adam Stylinski <kungfujesus06@gmail.com>
* Brian Bockelman <bockelman@gmail.com>
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "../../zbuild.h"
#include "../../adler32_p.h"
#include "../../adler32_fold.h"
#include "adler32_ssse3_p.h"
#include <immintrin.h>
#ifdef X86_SSE42
/* Copy `len` bytes from `src` to `dst` while computing their Adler-32,
 * continuing from the running checksum `adler`. Works on 128-bit vectors:
 * 32 bytes per step in the main loop, then at most one 16-byte step, and
 * anything shorter than 16 bytes goes to adler32_copy_len_16().
 *
 * Returns the updated checksum packed as (high sum << 16) | low sum. */
Z_INTERNAL uint32_t adler32_fold_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
/* Split the packed checksum: adler0 is the low 16 bits, adler1 the high 16 bits */
uint32_t adler0, adler1;
adler1 = (adler >> 16) & 0xffff;
adler0 = adler & 0xffff;
/* Re-entered from the bottom of the function when fewer than 16 bytes are left */
rem_peel:
if (len < 16) {
return adler32_copy_len_16(adler0, src, dst, len, adler1);
}
__m128i vbuf, vbuf_0;
__m128i vs1_0, vs3, vs1, vs2, vs2_0, v_sad_sum1, v_short_sum2, v_short_sum2_0,
v_sad_sum2, vsum2, vsum2_0;
__m128i zero = _mm_setzero_si128();
/* Per-byte weights for the second sum. _mm_setr_epi8 lists the lowest byte
 * first: dot2v weights the first 16 bytes of a 32-byte step (32..17) and
 * dot2v_0 the second 16 bytes (16..1). dot2v_0 alone serves the 16-byte step */
const __m128i dot2v = _mm_setr_epi8(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17);
const __m128i dot2v_0 = _mm_setr_epi8(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
/* All-ones multiplier: madd_epi16 against it just adds adjacent 16-bit pairs */
const __m128i dot3v = _mm_set1_epi16(1);
size_t k;
while (len >= 16) {
/* Take at most NMAX bytes before reducing, rounded down to whole 16-byte blocks */
k = MIN(len, NMAX);
k -= k % 16;
len -= k;
/* Seed element 0 of each accumulator with the running scalar sums */
vs1 = _mm_cvtsi32_si128(adler0);
vs2 = _mm_cvtsi32_si128(adler1);
vs3 = _mm_setzero_si128();
vs2_0 = _mm_setzero_si128();
vs1_0 = vs1;
while (k >= 32) {
/*
vs1 = adler + sum(c[i])
vs2 = sum2 + 32 vs1 + sum( (32-i+1) c[i] )
*/
vbuf = _mm_loadu_si128((__m128i*)src);
vbuf_0 = _mm_loadu_si128((__m128i*)(src + 16));
src += 32;
k -= 32;
/* Byte sums of each half (sum of absolute differences against zero) */
v_sad_sum1 = _mm_sad_epu8(vbuf, zero);
v_sad_sum2 = _mm_sad_epu8(vbuf_0, zero);
/* The copy part of the fold: write both halves through to dst */
_mm_storeu_si128((__m128i*)dst, vbuf);
_mm_storeu_si128((__m128i*)(dst + 16), vbuf_0);
dst += 32;
v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v);
v_short_sum2_0 = _mm_maddubs_epi16(vbuf_0, dot2v_0);
vs1 = _mm_add_epi32(v_sad_sum1, vs1);
/* Accumulate the vs1 value from before this 32-byte step; scaled by 32 after the loop */
vs3 = _mm_add_epi32(vs1_0, vs3);
vsum2 = _mm_madd_epi16(v_short_sum2, dot3v);
vsum2_0 = _mm_madd_epi16(v_short_sum2_0, dot3v);
vs1 = _mm_add_epi32(v_sad_sum2, vs1);
vs2 = _mm_add_epi32(vsum2, vs2);
vs2_0 = _mm_add_epi32(vsum2_0, vs2_0);
vs1_0 = vs1;
}
/* Merge the second accumulator and apply the deferred multiplication by 32 */
vs2 = _mm_add_epi32(vs2_0, vs2);
vs3 = _mm_slli_epi32(vs3, 5);
vs2 = _mm_add_epi32(vs3, vs2);
/* Restart vs3 for the 16-byte step, which is scaled by 16 instead */
vs3 = _mm_setzero_si128();
while (k >= 16) {
/*
vs1 = adler + sum(c[i])
vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] )
*/
vbuf = _mm_loadu_si128((__m128i*)src);
src += 16;
k -= 16;
v_sad_sum1 = _mm_sad_epu8(vbuf, zero);
v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v_0);
vs1 = _mm_add_epi32(v_sad_sum1, vs1);
vs3 = _mm_add_epi32(vs1_0, vs3);
vsum2 = _mm_madd_epi16(v_short_sum2, dot3v);
vs2 = _mm_add_epi32(vsum2, vs2);
vs1_0 = vs1;
_mm_storeu_si128((__m128i*)dst, vbuf);
dst += 16;
}
/* Deferred multiplication by 16 for the 16-byte step */
vs3 = _mm_slli_epi32(vs3, 4);
vs2 = _mm_add_epi32(vs2, vs3);
/* Horizontal reduction back to scalars, then reduce modulo BASE */
adler0 = partial_hsum(vs1) % BASE;
adler1 = hsum(vs2) % BASE;
}
/* If this is true, there's fewer than 16 elements remaining */
if (len) {
goto rem_peel;
}
return adler0 | (adler1 << 16);
}
#endif

156
deps/zlib-ng/arch/x86/adler32_ssse3.c vendored Normal file
View File

@@ -0,0 +1,156 @@
/* adler32_ssse3.c -- compute the Adler-32 checksum of a data stream
* Copyright (C) 1995-2011 Mark Adler
* Authors:
* Adam Stylinski <kungfujesus06@gmail.com>
* Brian Bockelman <bockelman@gmail.com>
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "../../zbuild.h"
#include "../../adler32_p.h"
#include "adler32_ssse3_p.h"
#ifdef X86_SSSE3
#include <immintrin.h>
/* Adler-32 over `len` bytes at `buf`, continuing from the running checksum
 * `adler`, using 128-bit vectors. The buffer is first brought to 16-byte
 * alignment with scalar sums (or, for short inputs, a single unaligned load)
 * so that the vector loops can use aligned loads.
 *
 * Returns 1 when buf is NULL (and len != 1), otherwise the updated checksum
 * packed as (sum2 << 16) | adler. */
Z_INTERNAL uint32_t adler32_ssse3(uint32_t adler, const uint8_t *buf, size_t len) {
uint32_t sum2;
/* split Adler-32 into component sums */
sum2 = (adler >> 16) & 0xffff;
adler &= 0xffff;
/* in case user likes doing a byte at a time, keep it fast */
if (UNLIKELY(len == 1))
return adler32_len_1(adler, buf, sum2);
/* initial Adler-32 value (deferred check for len == 1 speed) */
if (UNLIKELY(buf == NULL))
return 1L;
/* in case short lengths are provided, keep it somewhat fast */
if (UNLIKELY(len < 16))
return adler32_len_16(adler, buf, len, sum2);
/* Per-byte weights for the second sum. _mm_setr_epi8 lists the lowest byte
 * first: dot2v weights the first 16 bytes of a 32-byte step (32..17) and
 * dot2v_0 the second 16 bytes (16..1). dot2v_0 alone serves the 16-byte step */
const __m128i dot2v = _mm_setr_epi8(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17);
const __m128i dot2v_0 = _mm_setr_epi8(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
/* All-ones multiplier: madd_epi16 against it just adds adjacent 16-bit pairs */
const __m128i dot3v = _mm_set1_epi16(1);
const __m128i zero = _mm_setzero_si128();
__m128i vbuf, vs1_0, vs3, vs1, vs2, vs2_0, v_sad_sum1, v_short_sum2, v_short_sum2_0,
vbuf_0, v_sad_sum2, vsum2, vsum2_0;
/* If our buffer is unaligned (likely), make the determination whether
* or not there's enough of a buffer to consume to make the scalar, aligning
* additions worthwhile or if it's worth it to just eat the cost of an unaligned
* load. This is a pretty simple test: check whether len is smaller than
* 16 + (16 - the remainder), i.e. whether no full aligned block would be left */
size_t max_iters = NMAX;
size_t rem = (uintptr_t)buf & 15;
size_t align_offset = 16 - rem;
size_t k = 0;
if (rem) {
if (len < 16 + align_offset) {
/* Let's eat the cost of this one unaligned load so that
* we don't completely skip over the vectorization. Doing
* 16 bytes at a time unaligned is better than 16 + <= 15
* sums */
vbuf = _mm_loadu_si128((__m128i*)buf);
len -= 16;
buf += 16;
vs1 = _mm_cvtsi32_si128(adler);
vs2 = _mm_cvtsi32_si128(sum2);
vs3 = _mm_setzero_si128();
vs1_0 = vs1;
/* Jumps into the body of the 16-byte loop below with k == 0, so exactly
 * this one block is processed before the loop condition fails */
goto unaligned_jmp;
}
/* Scalar sums up to the next 16-byte boundary */
for (size_t i = 0; i < align_offset; ++i) {
adler += *(buf++);
sum2 += adler;
}
/* lop off the max number of sums based on the scalar sums done
* above */
len -= align_offset;
max_iters -= align_offset;
}
while (len >= 16) {
/* Seed element 0 of each accumulator with the running scalar sums */
vs1 = _mm_cvtsi32_si128(adler);
vs2 = _mm_cvtsi32_si128(sum2);
vs3 = _mm_setzero_si128();
vs2_0 = _mm_setzero_si128();
vs1_0 = vs1;
/* Take at most max_iters bytes before reducing, rounded down to whole 16-byte blocks */
k = (len < max_iters ? len : max_iters);
k -= k % 16;
len -= k;
while (k >= 32) {
/*
vs1 = adler + sum(c[i])
vs2 = sum2 + 32 vs1 + sum( (32-i+1) c[i] )
*/
vbuf = _mm_load_si128((__m128i*)buf);
vbuf_0 = _mm_load_si128((__m128i*)(buf + 16));
buf += 32;
k -= 32;
/* Byte sums of each half (sum of absolute differences against zero) */
v_sad_sum1 = _mm_sad_epu8(vbuf, zero);
v_sad_sum2 = _mm_sad_epu8(vbuf_0, zero);
vs1 = _mm_add_epi32(v_sad_sum1, vs1);
/* Accumulate the vs1 value from before this 32-byte step; scaled by 32 after the loop */
vs3 = _mm_add_epi32(vs1_0, vs3);
vs1 = _mm_add_epi32(v_sad_sum2, vs1);
v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v);
vsum2 = _mm_madd_epi16(v_short_sum2, dot3v);
v_short_sum2_0 = _mm_maddubs_epi16(vbuf_0, dot2v_0);
vs2 = _mm_add_epi32(vsum2, vs2);
vsum2_0 = _mm_madd_epi16(v_short_sum2_0, dot3v);
vs2_0 = _mm_add_epi32(vsum2_0, vs2_0);
vs1_0 = vs1;
}
/* Merge the second accumulator and apply the deferred multiplication by 32 */
vs2 = _mm_add_epi32(vs2_0, vs2);
vs3 = _mm_slli_epi32(vs3, 5);
vs2 = _mm_add_epi32(vs3, vs2);
/* Restart vs3 for the 16-byte step, which is scaled by 16 instead */
vs3 = _mm_setzero_si128();
while (k >= 16) {
/*
vs1 = adler + sum(c[i])
vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] )
*/
vbuf = _mm_load_si128((__m128i*)buf);
buf += 16;
k -= 16;
/* Entry point for the single unaligned block loaded above */
unaligned_jmp:
v_sad_sum1 = _mm_sad_epu8(vbuf, zero);
vs1 = _mm_add_epi32(v_sad_sum1, vs1);
vs3 = _mm_add_epi32(vs1_0, vs3);
v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v_0);
vsum2 = _mm_madd_epi16(v_short_sum2, dot3v);
vs2 = _mm_add_epi32(vsum2, vs2);
vs1_0 = vs1;
}
/* Deferred multiplication by 16 for the 16-byte step */
vs3 = _mm_slli_epi32(vs3, 4);
vs2 = _mm_add_epi32(vs2, vs3);
/* We don't actually need to do a full horizontal sum, since psadbw is actually doing
* a partial reduction sum implicitly and only summing to integers in vector positions
* 0 and 2. This saves us some contention on the shuffle port(s) */
adler = partial_hsum(vs1) % BASE;
sum2 = hsum(vs2) % BASE;
/* The alignment prologue only shortens the first batch; later batches get the full NMAX */
max_iters = NMAX;
}
/* Process tail (len < 16). */
return adler32_len_16(adler, buf, len, sum2);
}
#endif

29
deps/zlib-ng/arch/x86/adler32_ssse3_p.h vendored Normal file
View File

@@ -0,0 +1,29 @@
/* adler32_ssse3_p.h -- adler32 ssse3 utility functions
* Copyright (C) 2022 Adam Stylinski
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifndef ADLER32_SSSE3_P_H_
#define ADLER32_SSSE3_P_H_
#ifdef X86_SSSE3
#include <immintrin.h>
#include <stdint.h>
/* Add 32-bit elements 0 and 2 of x; elements 1 and 3 are ignored. */
static inline uint32_t partial_hsum(__m128i x) {
    /* Shifting right by 8 bytes moves element 2 down into element 0 */
    const __m128i upper_to_lower = _mm_srli_si128(x, 8);
    return _mm_cvtsi128_si32(_mm_add_epi32(x, upper_to_lower));
}
/* Horizontal sum of all four 32-bit elements of x (wrapping 32-bit adds). */
static inline uint32_t hsum(__m128i x) {
    /* 128 -> 64 bits: add the high quadword onto the low one */
    __m128i acc = _mm_add_epi32(x, _mm_unpackhi_epi64(x, x));
    /* 64 -> 32 bits: add element 1 onto element 0 */
    acc = _mm_add_epi32(acc, _mm_shuffle_epi32(acc, 0x01));
    return _mm_cvtsi128_si32(acc);
}
#endif
#endif

135
deps/zlib-ng/arch/x86/chunkset_avx2.c vendored Normal file
View File

@@ -0,0 +1,135 @@
/* chunkset_avx2.c -- AVX2 inline functions to copy small data chunks.
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "zbuild.h"
#ifdef X86_AVX2
#include <immintrin.h>
#include "../generic/chunk_permute_table.h"
typedef __m256i chunk_t;
#define CHUNK_SIZE 32
#define HAVE_CHUNKMEMSET_2
#define HAVE_CHUNKMEMSET_4
#define HAVE_CHUNKMEMSET_8
#define HAVE_CHUNK_MAG
/* Lookup table indexed by (dist - 3): GET_CHUNK_MAG never sees a dist of 0 - 2,
 * so the input is offset by 3. "Don't care" rows (dist 4, 8 and 16) are
 * populated only so that this stays a direct lookup.
 *
 * Each row is { byte offset into permute_table, value reported through
 * *chunk_rem }. For most rows the second field equals 32 % dist.
 * NOTE(review): rows 10, 11 and 12 hold 32 - dist (22, 21, 20) rather than
 * 32 % dist (2, 10, 8) - presumably intentional, confirm against the consumer
 * of chunk_rem before changing. */
static const lut_rem_pair perm_idx_lut[29] = {
{ 0, 2}, /* 3 */
{ 0, 0}, /* don't care */
{ 1 * 32, 2}, /* 5 */
{ 2 * 32, 2}, /* 6 */
{ 3 * 32, 4}, /* 7 */
{ 0 * 32, 0}, /* don't care */
{ 4 * 32, 5}, /* 9 */
{ 5 * 32, 22}, /* 10 */
{ 6 * 32, 21}, /* 11 */
{ 7 * 32, 20}, /* 12 */
{ 8 * 32, 6}, /* 13 */
{ 9 * 32, 4}, /* 14 */
{10 * 32, 2}, /* 15 */
{ 0 * 32, 0}, /* don't care */
/* From dist 17 on the rows are 16 bytes apart rather than 32: GET_CHUNK_MAG
 * loads only a 128-bit permute vector for these distances */
{11 * 32, 15}, /* 17 */
{11 * 32 + 16, 14}, /* 18 */
{11 * 32 + 16 * 2, 13}, /* 19 */
{11 * 32 + 16 * 3, 12}, /* 20 */
{11 * 32 + 16 * 4, 11}, /* 21 */
{11 * 32 + 16 * 5, 10}, /* 22 */
{11 * 32 + 16 * 6, 9}, /* 23 */
{11 * 32 + 16 * 7, 8}, /* 24 */
{11 * 32 + 16 * 8, 7}, /* 25 */
{11 * 32 + 16 * 9, 6}, /* 26 */
{11 * 32 + 16 * 10, 5}, /* 27 */
{11 * 32 + 16 * 11, 4}, /* 28 */
{11 * 32 + 16 * 12, 3}, /* 29 */
{11 * 32 + 16 * 13, 2}, /* 30 */
{11 * 32 + 16 * 14, 1} /* 31 */
};
/* Fill *chunk with the 2-byte pattern found at `from`, repeated in every 16-bit lane. */
static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
    int16_t pattern;
    /* memcpy keeps the read valid for any alignment of `from` */
    memcpy(&pattern, from, sizeof(pattern));
    *chunk = _mm256_set1_epi16(pattern);
}
/* Fill *chunk with the 4-byte pattern found at `from`, repeated in every 32-bit lane. */
static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
    int32_t pattern;
    /* memcpy keeps the read valid for any alignment of `from` */
    memcpy(&pattern, from, sizeof(pattern));
    *chunk = _mm256_set1_epi32(pattern);
}
/* Fill *chunk with the 8-byte pattern found at `from`, repeated in every 64-bit lane. */
static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
    int64_t pattern;
    /* memcpy keeps the read valid for any alignment of `from` */
    memcpy(&pattern, from, sizeof(pattern));
    *chunk = _mm256_set1_epi64x(pattern);
}
/* Read one 32-byte chunk from `s` into *chunk; `s` need not be aligned. */
static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
    const __m256i *unaligned_src = (const __m256i *)s;
    *chunk = _mm256_loadu_si256(unaligned_src);
}
/* Write the 32-byte *chunk to `out`; `out` need not be aligned. */
static inline void storechunk(uint8_t *out, chunk_t *chunk) {
    __m256i *unaligned_dst = (__m256i *)out;
    _mm256_storeu_si256(unaligned_dst, *chunk);
}
/* Build a 32-byte chunk from the `dist` bytes at `buf` by repeating that
 * pattern across the vector, and report the table's remainder value for this
 * dist through *chunk_rem.
 *
 * dist indexes perm_idx_lut as (dist - 3), so it must lie in 3..31; the table
 * has no range check. NOTE(review): dist 4 and 8 are "don't care" rows in the
 * table - presumably callers route them to the chunkmemset_N helpers; confirm. */
static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t dist) {
lut_rem_pair lut_rem = perm_idx_lut[dist - 3];
__m256i ret_vec;
/* While technically we only need to read 4 or 8 bytes into this vector register for a lot of cases, GCC is
* compiling this to a shared load for all branches, preferring the simpler code. Given that the buf value isn't in
* GPRs to begin with the 256 bit load is _probably_ just as inexpensive */
*chunk_rem = lut_rem.remval;
#ifdef Z_MEMORY_SANITIZER
/* See note in chunkset_ssse3.c for why this is ok */
__msan_unpoison(buf + dist, 32 - dist);
#endif
if (dist < 16) {
/* This simpler case still requires us to shuffle in 128 bit lanes, so we must apply a static offset after
* broadcasting the first vector register to both halves. This is _marginally_ faster than doing two separate
* shuffles and combining the halves later */
const __m256i permute_xform =
_mm256_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16);
/* Aligned 256-bit load: relies on permute_table + idx being 32-byte aligned */
__m256i perm_vec = _mm256_load_si256((__m256i*)(permute_table+lut_rem.idx));
__m128i ret_vec0 = _mm_loadu_si128((__m128i*)buf);
perm_vec = _mm256_add_epi8(perm_vec, permute_xform);
/* Duplicate the 16 source bytes into both 128-bit lanes, then shuffle per lane */
ret_vec = _mm256_inserti128_si256(_mm256_castsi128_si256(ret_vec0), ret_vec0, 1);
ret_vec = _mm256_shuffle_epi8(ret_vec, perm_vec);
} else if (dist == 16) {
/* The pattern is exactly one lane wide: just duplicate it into both lanes */
__m128i ret_vec0 = _mm_loadu_si128((__m128i*)buf);
return _mm256_inserti128_si256(_mm256_castsi128_si256(ret_vec0), ret_vec0, 1);
} else {
__m128i ret_vec0 = _mm_loadu_si128((__m128i*)buf);
__m128i ret_vec1 = _mm_loadu_si128((__m128i*)(buf + 16));
/* Take advantage of the fact that only the latter half of the 256 bit vector will actually differ */
__m128i perm_vec1 = _mm_load_si128((__m128i*)(permute_table + lut_rem.idx));
/* Mask of positions whose permute index is below 16, i.e. positions that
 * take a byte from the first 16 source bytes */
__m128i xlane_permutes = _mm_cmpgt_epi8(_mm_set1_epi8(16), perm_vec1);
__m128i xlane_res = _mm_shuffle_epi8(ret_vec0, perm_vec1);
/* Since we can't wrap twice, we can simply keep the later half exactly how it is instead of having to _also_
* shuffle those values */
__m128i latter_half = _mm_blendv_epi8(ret_vec1, xlane_res, xlane_permutes);
ret_vec = _mm256_inserti128_si256(_mm256_castsi128_si256(ret_vec0), latter_half, 1);
}
return ret_vec;
}
#define CHUNKSIZE chunksize_avx2
#define CHUNKCOPY chunkcopy_avx2
#define CHUNKUNROLL chunkunroll_avx2
#define CHUNKMEMSET chunkmemset_avx2
#define CHUNKMEMSET_SAFE chunkmemset_safe_avx2
#include "chunkset_tpl.h"
#define INFLATE_FAST inflate_fast_avx2
#include "inffast_tpl.h"
#endif

56
deps/zlib-ng/arch/x86/chunkset_sse2.c vendored Normal file
View File

@@ -0,0 +1,56 @@
/* chunkset_sse2.c -- SSE2 inline functions to copy small data chunks.
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "zbuild.h"
#ifdef X86_SSE2
#include <immintrin.h>
typedef __m128i chunk_t;
#define CHUNK_SIZE 16
#define HAVE_CHUNKMEMSET_2
#define HAVE_CHUNKMEMSET_4
#define HAVE_CHUNKMEMSET_8
/* Fill *chunk with the 2-byte pattern found at `from`, repeated in every 16-bit lane. */
static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
    int16_t pattern;
    /* memcpy keeps the read valid for any alignment of `from` */
    memcpy(&pattern, from, sizeof(pattern));
    *chunk = _mm_set1_epi16(pattern);
}
/* Fill *chunk with the 4-byte pattern found at `from`, repeated in every 32-bit lane. */
static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
    int32_t pattern;
    /* memcpy keeps the read valid for any alignment of `from` */
    memcpy(&pattern, from, sizeof(pattern));
    *chunk = _mm_set1_epi32(pattern);
}
/* Fill *chunk with the 8-byte pattern found at `from`, repeated in both 64-bit lanes. */
static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
    int64_t pattern;
    /* memcpy keeps the read valid for any alignment of `from` */
    memcpy(&pattern, from, sizeof(pattern));
    *chunk = _mm_set1_epi64x(pattern);
}
/* Read one 16-byte chunk from `s` into *chunk; `s` need not be aligned. */
static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
    const __m128i *unaligned_src = (const __m128i *)s;
    *chunk = _mm_loadu_si128(unaligned_src);
}
/* Write the 16-byte *chunk to `out`; `out` need not be aligned. */
static inline void storechunk(uint8_t *out, chunk_t *chunk) {
    __m128i *unaligned_dst = (__m128i *)out;
    _mm_storeu_si128(unaligned_dst, *chunk);
}
#define CHUNKSIZE chunksize_sse2
#define CHUNKCOPY chunkcopy_sse2
#define CHUNKUNROLL chunkunroll_sse2
#define CHUNKMEMSET chunkmemset_sse2
#define CHUNKMEMSET_SAFE chunkmemset_safe_sse2
#include "chunkset_tpl.h"
#define INFLATE_FAST inflate_fast_sse2
#include "inffast_tpl.h"
#endif

103
deps/zlib-ng/arch/x86/chunkset_ssse3.c vendored Normal file
View File

@@ -0,0 +1,103 @@
/* chunkset_ssse3.c -- SSSE3 inline functions to copy small data chunks.
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "zbuild.h"
/* This requires SSE2 support. While it's implicit with SSSE3, we can minimize
* code size by sharing the chunkcopy functions, which will certainly compile
* to identical machine code */
#if defined(X86_SSSE3) && defined(X86_SSE2)
#include <immintrin.h>
#include "../generic/chunk_permute_table.h"
typedef __m128i chunk_t;
#define CHUNK_SIZE 16
#define HAVE_CHUNKMEMSET_2
#define HAVE_CHUNKMEMSET_4
#define HAVE_CHUNKMEMSET_8
#define HAVE_CHUNK_MAG
#define HAVE_CHUNKCOPY
#define HAVE_CHUNKUNROLL
/* Lookup table indexed by (dist - 3), covering dist 3..15. Each row is
 * { byte offset into permute_table, value reported through *chunk_rem };
 * for every populated row the second field equals 16 % dist. The rows for
 * dist 4 and 8 are placeholders that keep the table a direct lookup. */
static const lut_rem_pair perm_idx_lut[13] = {
{0, 1}, /* 3 */
{0, 0}, /* don't care */
{1 * 32, 1}, /* 5 */
{2 * 32, 4}, /* 6 */
{3 * 32, 2}, /* 7 */
{0 * 32, 0}, /* don't care */
{4 * 32, 7}, /* 9 */
{5 * 32, 6}, /* 10 */
{6 * 32, 5}, /* 11 */
{7 * 32, 4}, /* 12 */
{8 * 32, 3}, /* 13 */
{9 * 32, 2}, /* 14 */
{10 * 32, 1},/* 15 */
};
/* Broadcast the 16-bit value stored at `from` across all lanes of *chunk. */
static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
    int16_t seed;
    /* `from` may be unaligned, so go through memcpy instead of a pointer cast */
    memcpy(&seed, from, sizeof(seed));
    *chunk = _mm_set1_epi16(seed);
}
/* Broadcast the 32-bit value stored at `from` across all lanes of *chunk. */
static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
    int32_t seed;
    /* `from` may be unaligned, so go through memcpy instead of a pointer cast */
    memcpy(&seed, from, sizeof(seed));
    *chunk = _mm_set1_epi32(seed);
}
/* Broadcast the 64-bit value stored at `from` across both lanes of *chunk. */
static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
    int64_t seed;
    /* `from` may be unaligned, so go through memcpy instead of a pointer cast */
    memcpy(&seed, from, sizeof(seed));
    *chunk = _mm_set1_epi64x(seed);
}
/* Unaligned 16-byte load from `s` into *chunk. */
static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
    const __m128i *vec_src = (const __m128i *)s;
    *chunk = _mm_loadu_si128(vec_src);
}
/* Unaligned 16-byte store of *chunk to `out`. */
static inline void storechunk(uint8_t *out, chunk_t *chunk) {
    __m128i *vec_dst = (__m128i *)out;
    _mm_storeu_si128(vec_dst, *chunk);
}
/* Build a 16-byte chunk by shuffling the bytes at `buf` with the permute
 * vector selected for `dist`, and report the table's remainder value for this
 * dist through *chunk_rem. dist indexes perm_idx_lut as (dist - 3). */
static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t dist) {
    const lut_rem_pair entry = perm_idx_lut[dist - 3];
#ifdef Z_MEMORY_SANITIZER
    /* Important to note:
     * This is _not_ to subvert the memory sanitizer but to instead unpoison some
     * bytes we willingly and purposefully load uninitialized that we swizzle over
     * in a vector register, anyway. If what we assume is wrong about what is used,
     * the memory sanitizer will still usefully flag it */
    __msan_unpoison(buf + dist, 16 - dist);
#endif
    const __m128i window = _mm_loadu_si128((__m128i*)buf);
    *chunk_rem = entry.remval;
    /* Aligned load: relies on permute_table + idx being 16-byte aligned */
    const __m128i selector = _mm_load_si128((__m128i*)(permute_table + entry.idx));
    return _mm_shuffle_epi8(window, selector);
}
extern uint8_t* chunkcopy_sse2(uint8_t *out, uint8_t const *from, unsigned len);
extern uint8_t* chunkunroll_sse2(uint8_t *out, unsigned *dist, unsigned *len);
#define CHUNKSIZE chunksize_ssse3
#define CHUNKMEMSET chunkmemset_ssse3
#define CHUNKMEMSET_SAFE chunkmemset_safe_ssse3
#define CHUNKCOPY chunkcopy_sse2
#define CHUNKUNROLL chunkunroll_sse2
#include "chunkset_tpl.h"
#define INFLATE_FAST inflate_fast_ssse3
#include "inffast_tpl.h"
#endif

63
deps/zlib-ng/arch/x86/compare256_avx2.c vendored Normal file
View File

@@ -0,0 +1,63 @@
/* compare256_avx2.c -- AVX2 version of compare256
* Copyright Mika T. Lindqvist <postmaster@raasu.org>
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "../../zbuild.h"
#include "fallback_builtins.h"
#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
#include <immintrin.h>
#ifdef _MSC_VER
# include <nmmintrin.h>
#endif
static inline uint32_t compare256_avx2_static(const uint8_t *src0, const uint8_t *src1) {
uint32_t len = 0;
do {
__m256i ymm_src0, ymm_src1, ymm_cmp;
ymm_src0 = _mm256_loadu_si256((__m256i*)src0);
ymm_src1 = _mm256_loadu_si256((__m256i*)src1);
ymm_cmp = _mm256_cmpeq_epi8(ymm_src0, ymm_src1); /* non-identical bytes = 00, identical bytes = FF */
unsigned mask = (unsigned)_mm256_movemask_epi8(ymm_cmp);
if (mask != 0xFFFFFFFF) {
uint32_t match_byte = (uint32_t)__builtin_ctz(~mask); /* Invert bits so identical = 0 */
return len + match_byte;
}
src0 += 32, src1 += 32, len += 32;
ymm_src0 = _mm256_loadu_si256((__m256i*)src0);
ymm_src1 = _mm256_loadu_si256((__m256i*)src1);
ymm_cmp = _mm256_cmpeq_epi8(ymm_src0, ymm_src1);
mask = (unsigned)_mm256_movemask_epi8(ymm_cmp);
if (mask != 0xFFFFFFFF) {
uint32_t match_byte = (uint32_t)__builtin_ctz(~mask);
return len + match_byte;
}
src0 += 32, src1 += 32, len += 32;
} while (len < 256);
return 256;
}
/* Non-static entry point for the AVX2 compare256: forwards to the static
 * inline implementation and returns its result (the number of leading equal
 * bytes, at most 256). */
Z_INTERNAL uint32_t compare256_avx2(const uint8_t *src0, const uint8_t *src1) {
return compare256_avx2_static(src0, src1);
}
#define LONGEST_MATCH longest_match_avx2
#define COMPARE256 compare256_avx2_static
#include "match_tpl.h"
#define LONGEST_MATCH_SLOW
#define LONGEST_MATCH longest_match_slow_avx2
#define COMPARE256 compare256_avx2_static
#include "match_tpl.h"
#endif

96
deps/zlib-ng/arch/x86/compare256_sse2.c vendored Normal file
View File

@@ -0,0 +1,96 @@
/* compare256_sse2.c -- SSE2 version of compare256
* Copyright Adam Stylinski <kungfujesus06@gmail.com>
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "../../zbuild.h"
#include "fallback_builtins.h"
#if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ)
#include <emmintrin.h>
/* Count how many leading bytes of src0 and src1 are equal, looking at no more
 * than 256 bytes. Returns the index of the first differing byte, or 256 when
 * all 256 bytes match.
 *
 * The first block is compared unaligned; the cursor then moves to the next
 * 16-byte boundary of src0 so the main loop can use aligned loads on that
 * side. If src0 was misaligned, a final unaligned compare of the last 16
 * bytes covers what the aligned loop could not reach. */
static inline uint32_t compare256_sse2_static(const uint8_t *src0, const uint8_t *src1) {
    const int misalign = ((uintptr_t)src0) & 15;
    const uint8_t *last0 = src0 + 256 - 16;
    const uint8_t *last1 = src1 + 256 - 16;
    uint32_t len;
    unsigned eq_bits;

    /* First block, both sides unaligned. Aligning both loads is probably unrealistic */
    eq_bits = (unsigned)_mm_movemask_epi8(_mm_cmpeq_epi8(_mm_loadu_si128((__m128i*)src0),
                                                         _mm_loadu_si128((__m128i*)src1)));
    /* Compiler _may_ turn this branch into a ptest + movemask,
     * since a lot of those uops are shared and fused */
    if (eq_bits != 0xFFFF)
        return (uint32_t)__builtin_ctz(~eq_bits);

    /* Step forward to where src0 becomes 16-byte aligned (a full 16 if it already was) */
    len = (uint32_t)(16 - misalign);
    src0 += len;
    src1 += len;

    /* Whole aligned blocks that still fit; flooring division (should just be a shift right) */
    for (int blocks_left = (256 - len) / 16; blocks_left > 0; --blocks_left) {
        const __m128i lhs = _mm_load_si128((__m128i*)src0);
        const __m128i rhs = _mm_loadu_si128((__m128i*)src1);

        eq_bits = (unsigned)_mm_movemask_epi8(_mm_cmpeq_epi8(lhs, rhs));
        if (eq_bits != 0xFFFF)
            return len + (uint32_t)__builtin_ctz(~eq_bits);

        len += 16;
        src0 += 16;
        src1 += 16;
    }

    if (misalign != 0) {
        /* Re-check the final 16 bytes; this overlaps bytes already known to match */
        eq_bits = (unsigned)_mm_movemask_epi8(_mm_cmpeq_epi8(_mm_loadu_si128((__m128i*)last0),
                                                             _mm_loadu_si128((__m128i*)last1)));
        if (eq_bits != 0xFFFF)
            return (256 - 16) + (uint32_t)__builtin_ctz(~eq_bits);
    }

    return 256;
}
/* Non-static entry point for the SSE2 comparer: forwards both pointers
 * unchanged to compare256_sse2_static() and returns its result (the index of
 * the first differing byte, or 256 when all 256 bytes match). */
Z_INTERNAL uint32_t compare256_sse2(const uint8_t *src0, const uint8_t *src1) {
return compare256_sse2_static(src0, src1);
}
/* match_tpl.h is included twice with COMPARE256 bound to the static SSE2
 * comparer: first with LONGEST_MATCH = longest_match_sse2, then, with
 * LONGEST_MATCH_SLOW additionally defined, with
 * LONGEST_MATCH = longest_match_slow_sse2.
 * NOTE(review): the macros are re-#defined here without an #undef, so
 * match_tpl.h presumably undefines them at its end -- confirm. */
#define LONGEST_MATCH longest_match_sse2
#define COMPARE256 compare256_sse2_static
#include "match_tpl.h"
#define LONGEST_MATCH_SLOW
#define LONGEST_MATCH longest_match_slow_sse2
#define COMPARE256 compare256_sse2_static
#include "match_tpl.h"
#endif

View File

@@ -0,0 +1,186 @@
/*
* Compute the CRC32 using a parallelized folding approach with the PCLMULQDQ
* instruction.
*
* A white paper describing this algorithm can be found at:
* doc/crc-pclmulqdq.pdf
*
* Copyright (C) 2013 Intel Corporation. All rights reserved.
* Copyright (C) 2016 Marian Beermann (support for initial value)
* Authors:
* Wajdi Feghali <wajdi.k.feghali@intel.com>
* Jim Guilford <james.guilford@intel.com>
* Vinodh Gopal <vinodh.gopal@intel.com>
* Erdinc Ozturk <erdinc.ozturk@intel.com>
* Jim Kukunas <james.t.kukunas@linux.intel.com>
*
* For conditions of distribution and use, see copyright notice in zlib.h
*/
/* Fold `len` bytes starting at `src` into the four 128-bit lanes stored in
 * crc->fold.
 *
 * This body is a template that is compiled in two flavours:
 *   - COPY defined:   CRC32_FOLD_COPY(crc, dst, src, len); every byte that is
 *                     folded is also written to `dst`.
 *   - COPY undefined: CRC32_FOLD(crc, src, len, init_crc); `init_crc` is XORed
 *                     into the first vector of data that gets folded (tracked
 *                     by the local `first`, which the XOR_INITIAL128 macro
 *                     clears after its first use).
 *
 * Nothing is returned; the updated lanes are written back to crc->fold on
 * every path except the COPY flavour's early return for len == 0, where the
 * state is left untouched.
 */
#ifdef COPY
Z_INTERNAL void CRC32_FOLD_COPY(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len) {
#else
Z_INTERNAL void CRC32_FOLD(crc32_fold *crc, const uint8_t *src, size_t len, uint32_t init_crc) {
#endif
/* Number of bytes from src up to the next 16-byte boundary (0 if aligned). */
unsigned long algn_diff;
__m128i xmm_t0, xmm_t1, xmm_t2, xmm_t3;
__m128i xmm_crc0, xmm_crc1, xmm_crc2, xmm_crc3;
/* Holds the bytes of a partial (fewer than 16 bytes) vector. */
__m128i xmm_crc_part = _mm_setzero_si128();
#ifdef COPY
/* Aligned bounce buffer used to move partial vectors between memory and xmm_crc_part. */
char ALIGNED_(16) partial_buf[16] = { 0 };
#else
__m128i xmm_initial = _mm_cvtsi32_si128(init_crc);
/* Non-zero while the initial CRC still has to be mixed in; consumed by XOR_INITIAL128. */
int32_t first = init_crc != 0;
/* Technically the CRC functions don't even call this for input < 64, but a bare minimum of 31
* bytes of input is needed for the aligning load that occurs. If there's an initial CRC, to
* carry it forward through the folded CRC there must be 16 - src % 16 + 16 bytes available, which
* by definition can be up to 15 bytes + one full vector load. */
assert(len >= 31 || first == 0);
#endif
crc32_fold_load((__m128i *)crc->fold, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
/* Less than one vector of input: skip alignment and the bulk loops and go
 * straight to the partial fold at the end. */
if (len < 16) {
#ifdef COPY
if (len == 0)
return;
memcpy(partial_buf, src, len);
xmm_crc_part = _mm_load_si128((const __m128i *)partial_buf);
memcpy(dst, partial_buf, len);
#endif
goto partial;
}
/* Consume the leading algn_diff bytes so that the aligned loads below are valid. */
algn_diff = ((uintptr_t)16 - ((uintptr_t)src & 0xF)) & 0xF;
if (algn_diff) {
xmm_crc_part = _mm_loadu_si128((__m128i *)src);
#ifdef COPY
/* A full 16 bytes are stored but dst only advances by algn_diff; the
 * following stores overwrite the excess with the same source bytes. */
_mm_storeu_si128((__m128i *)dst, xmm_crc_part);
dst += algn_diff;
#else
XOR_INITIAL128(xmm_crc_part);
/* With fewer than 4 leading bytes and a non-zero initial CRC, the whole
 * first vector (already XORed with the initial CRC) is folded as a full
 * block, and the partial fold below takes its algn_diff bytes from the
 * following vector instead. Presumably this is because the 32-bit initial
 * value would not fit inside a partial fold of fewer than 4 bytes. */
if (algn_diff < 4 && init_crc != 0) {
xmm_t0 = xmm_crc_part;
xmm_crc_part = _mm_loadu_si128((__m128i*)src + 1);
fold_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0);
src += 16;
len -= 16;
}
#endif
partial_fold(algn_diff, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, &xmm_crc_part);
src += algn_diff;
len -= algn_diff;
}
#ifdef X86_VPCLMULQDQ
/* Bulk path: let the 512-bit helper consume whole 256-byte blocks; it
 * returns the number of bytes it processed. */
if (len >= 256) {
#ifdef COPY
size_t n = fold_16_vpclmulqdq_copy(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, dst, src, len);
dst += n;
#else
size_t n = fold_16_vpclmulqdq(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, src, len,
xmm_initial, first);
/* `first` is passed by value, so it is cleared here after the call. */
first = 0;
#endif
len -= n;
src += n;
}
#endif
/* Main loop: fold four 16-byte vectors (64 bytes) per iteration. */
while (len >= 64) {
len -= 64;
xmm_t0 = _mm_load_si128((__m128i *)src);
xmm_t1 = _mm_load_si128((__m128i *)src + 1);
xmm_t2 = _mm_load_si128((__m128i *)src + 2);
xmm_t3 = _mm_load_si128((__m128i *)src + 3);
src += 64;
fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
#ifdef COPY
_mm_storeu_si128((__m128i *)dst, xmm_t0);
_mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
_mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
_mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
dst += 64;
#else
XOR_INITIAL128(xmm_t0);
#endif
xmm_crc0 = _mm_xor_si128(xmm_crc0, xmm_t0);
xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t1);
xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t2);
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t3);
}
/*
* The 64-byte loop has exited, so fewer than 64 bytes remain. Fold the
* remaining whole vectors (three, two or one of them); whatever is left
* after that is under 16 bytes and is handled at `partial`.
*/
if (len >= 48) {
len -= 48;
xmm_t0 = _mm_load_si128((__m128i *)src);
xmm_t1 = _mm_load_si128((__m128i *)src + 1);
xmm_t2 = _mm_load_si128((__m128i *)src + 2);
src += 48;
#ifdef COPY
_mm_storeu_si128((__m128i *)dst, xmm_t0);
_mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
_mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
dst += 48;
#else
XOR_INITIAL128(xmm_t0);
#endif
fold_3(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t0);
xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t1);
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t2);
} else if (len >= 32) {
len -= 32;
xmm_t0 = _mm_load_si128((__m128i *)src);
xmm_t1 = _mm_load_si128((__m128i *)src + 1);
src += 32;
#ifdef COPY
_mm_storeu_si128((__m128i *)dst, xmm_t0);
_mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
dst += 32;
#else
XOR_INITIAL128(xmm_t0);
#endif
fold_2(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t0);
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t1);
} else if (len >= 16) {
len -= 16;
xmm_t0 = _mm_load_si128((__m128i *)src);
src += 16;
#ifdef COPY
_mm_storeu_si128((__m128i *)dst, xmm_t0);
dst += 16;
#else
XOR_INITIAL128(xmm_t0);
#endif
fold_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0);
}
partial:
/* Trailing bytes (fewer than 16): overwrite the low `len` bytes of
 * xmm_crc_part with them and fold that partial vector in. */
if (len) {
memcpy(&xmm_crc_part, src, len);
#ifdef COPY
_mm_storeu_si128((__m128i *)partial_buf, xmm_crc_part);
memcpy(dst, partial_buf, len);
#endif
partial_fold(len, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, &xmm_crc_part);
}
crc32_fold_save((__m128i *)crc->fold, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
}

View File

@@ -0,0 +1,107 @@
/* crc32_fold_vpclmulqdq_tpl.h -- VPCLMULQDQ-based CRC32 folding template.
* Copyright Wangyang Guo (wangyang.guo@intel.com)
* For conditions of distribution and use, see copyright notice in zlib.h
*/
/* 512-bit bulk folding helper, compiled in two flavours:
 *   - COPY defined:   fold_16_vpclmulqdq_copy(), which also stores every
 *                     consumed 64-byte vector to `dst`;
 *   - COPY undefined: fold_16_vpclmulqdq(), which XORs `init_crc` into the
 *                     first loaded vector when `first` is non-zero (through
 *                     the XOR_INITIAL512 macro).
 *
 * On entry the four 128-bit lanes *xmm_crc0..*xmm_crc3 hold the running state;
 * on exit they hold the state after the consumed input has been folded in.
 * The first 256 bytes are loaded unconditionally, so `len` must be at least
 * 256 on entry. Input is consumed in 256-byte blocks only.
 *
 * Returns the number of bytes consumed (a multiple of 256).
 */
#ifdef COPY
static size_t fold_16_vpclmulqdq_copy(__m128i *xmm_crc0, __m128i *xmm_crc1,
__m128i *xmm_crc2, __m128i *xmm_crc3, uint8_t *dst, const uint8_t *src, size_t len) {
#else
static size_t fold_16_vpclmulqdq(__m128i *xmm_crc0, __m128i *xmm_crc1,
__m128i *xmm_crc2, __m128i *xmm_crc3, const uint8_t *src, size_t len,
__m128i init_crc, int32_t first) {
/* init_crc zero-extended to 512 bits; read by XOR_INITIAL512. */
__m512i zmm_initial = _mm512_zextsi128_si512(init_crc);
#endif
__m512i zmm_t0, zmm_t1, zmm_t2, zmm_t3;
__m512i zmm_crc0, zmm_crc1, zmm_crc2, zmm_crc3;
__m512i z0, z1, z2, z3;
/* Remember the starting length so the consumed byte count can be returned. */
size_t len_tmp = len;
/* Same constant pattern as the 128-bit fold_N helpers, repeated in every 128-bit lane. */
const __m512i zmm_fold4 = _mm512_set4_epi32(
0x00000001, 0x54442bd4, 0x00000001, 0xc6e41596);
/* Constants used by the 256-byte-per-iteration loop below. */
const __m512i zmm_fold16 = _mm512_set4_epi32(
0x00000001, 0x1542778a, 0x00000001, 0x322d1430);
// zmm register init
zmm_crc0 = _mm512_setzero_si512();
zmm_t0 = _mm512_loadu_si512((__m512i *)src);
#ifndef COPY
XOR_INITIAL512(zmm_t0);
#endif
/* The next three 64-byte vectors seed accumulators 1..3 directly. */
zmm_crc1 = _mm512_loadu_si512((__m512i *)src + 1);
zmm_crc2 = _mm512_loadu_si512((__m512i *)src + 2);
zmm_crc3 = _mm512_loadu_si512((__m512i *)src + 3);
/* already have intermediate CRC in xmm registers
* fold4 with 4 xmm_crc to get zmm_crc0
*/
zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc0, 0);
zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc1, 1);
zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc2, 2);
zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc3, 3);
z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
/* Truth table 0x96 is a three-way XOR: both products ^ first data vector. */
zmm_crc0 = _mm512_ternarylogic_epi32(zmm_crc0, z0, zmm_t0, 0x96);
#ifdef COPY
/* zmm_crc1..3 still hold the raw input at this point, so they can be stored as-is. */
_mm512_storeu_si512((__m512i *)dst, zmm_t0);
_mm512_storeu_si512((__m512i *)dst + 1, zmm_crc1);
_mm512_storeu_si512((__m512i *)dst + 2, zmm_crc2);
_mm512_storeu_si512((__m512i *)dst + 3, zmm_crc3);
dst += 256;
#endif
len -= 256;
src += 256;
// fold-16 loops
while (len >= 256) {
zmm_t0 = _mm512_loadu_si512((__m512i *)src);
zmm_t1 = _mm512_loadu_si512((__m512i *)src + 1);
zmm_t2 = _mm512_loadu_si512((__m512i *)src + 2);
zmm_t3 = _mm512_loadu_si512((__m512i *)src + 3);
z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold16, 0x01);
z1 = _mm512_clmulepi64_epi128(zmm_crc1, zmm_fold16, 0x01);
z2 = _mm512_clmulepi64_epi128(zmm_crc2, zmm_fold16, 0x01);
z3 = _mm512_clmulepi64_epi128(zmm_crc3, zmm_fold16, 0x01);
zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold16, 0x10);
zmm_crc1 = _mm512_clmulepi64_epi128(zmm_crc1, zmm_fold16, 0x10);
zmm_crc2 = _mm512_clmulepi64_epi128(zmm_crc2, zmm_fold16, 0x10);
zmm_crc3 = _mm512_clmulepi64_epi128(zmm_crc3, zmm_fold16, 0x10);
zmm_crc0 = _mm512_ternarylogic_epi32(zmm_crc0, z0, zmm_t0, 0x96);
zmm_crc1 = _mm512_ternarylogic_epi32(zmm_crc1, z1, zmm_t1, 0x96);
zmm_crc2 = _mm512_ternarylogic_epi32(zmm_crc2, z2, zmm_t2, 0x96);
zmm_crc3 = _mm512_ternarylogic_epi32(zmm_crc3, z3, zmm_t3, 0x96);
#ifdef COPY
_mm512_storeu_si512((__m512i *)dst, zmm_t0);
_mm512_storeu_si512((__m512i *)dst + 1, zmm_t1);
_mm512_storeu_si512((__m512i *)dst + 2, zmm_t2);
_mm512_storeu_si512((__m512i *)dst + 3, zmm_t3);
dst += 256;
#endif
len -= 256;
src += 256;
}
// zmm_crc[0,1,2,3] -> zmm_crc0
/* Three rounds: multiply the accumulator by the fold4 constants and XOR in
 * the next accumulator (zmm_crc1, then zmm_crc2, then zmm_crc3). */
z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
zmm_crc0 = _mm512_ternarylogic_epi32(zmm_crc0, z0, zmm_crc1, 0x96);
z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
zmm_crc0 = _mm512_ternarylogic_epi32(zmm_crc0, z0, zmm_crc2, 0x96);
z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
zmm_crc0 = _mm512_ternarylogic_epi32(zmm_crc0, z0, zmm_crc3, 0x96);
// zmm_crc0 -> xmm_crc[0, 1, 2, 3]
*xmm_crc0 = _mm512_extracti32x4_epi32(zmm_crc0, 0);
*xmm_crc1 = _mm512_extracti32x4_epi32(zmm_crc0, 1);
*xmm_crc2 = _mm512_extracti32x4_epi32(zmm_crc0, 2);
*xmm_crc3 = _mm512_extracti32x4_epi32(zmm_crc0, 3);
return (len_tmp - len); // return n bytes processed
}

30
deps/zlib-ng/arch/x86/crc32_pclmulqdq.c vendored Normal file
View File

@@ -0,0 +1,30 @@
/*
* Compute the CRC32 using a parallelized folding approach with the PCLMULQDQ
* instruction.
*
* A white paper describing this algorithm can be found at:
* doc/crc-pclmulqdq.pdf
*
* Copyright (C) 2013 Intel Corporation. All rights reserved.
* Copyright (C) 2016 Marian Beermann (support for initial value)
* Authors:
* Wajdi Feghali <wajdi.k.feghali@intel.com>
* Jim Guilford <james.guilford@intel.com>
* Vinodh Gopal <vinodh.gopal@intel.com>
* Erdinc Ozturk <erdinc.ozturk@intel.com>
* Jim Kukunas <james.t.kukunas@linux.intel.com>
*
* For conditions of distribution and use, see copyright notice in zlib.h
*/
/* When X86_PCLMULQDQ_CRC is defined, bind the generic names used inside
 * crc32_pclmulqdq_tpl.h to the *_pclmulqdq symbols and include the template
 * once. X86_VPCLMULQDQ is not defined in this file. */
#ifdef X86_PCLMULQDQ_CRC
#define CRC32_FOLD_COPY crc32_fold_pclmulqdq_copy
#define CRC32_FOLD crc32_fold_pclmulqdq
#define CRC32_FOLD_RESET crc32_fold_pclmulqdq_reset
#define CRC32_FOLD_FINAL crc32_fold_pclmulqdq_final
#define CRC32 crc32_pclmulqdq
#include "crc32_pclmulqdq_tpl.h"
#endif

View File

@@ -0,0 +1,363 @@
/*
* Compute the CRC32 using a parallelized folding approach with the PCLMULQDQ
* instruction.
*
* A white paper describing this algorithm can be found at:
* doc/crc-pclmulqdq.pdf
*
* Copyright (C) 2013 Intel Corporation. All rights reserved.
* Copyright (C) 2016 Marian Beermann (support for initial value)
* Authors:
* Wajdi Feghali <wajdi.k.feghali@intel.com>
* Jim Guilford <james.guilford@intel.com>
* Vinodh Gopal <vinodh.gopal@intel.com>
* Erdinc Ozturk <erdinc.ozturk@intel.com>
* Jim Kukunas <james.t.kukunas@linux.intel.com>
*
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "../../zbuild.h"
#include <immintrin.h>
#include <wmmintrin.h>
#include <smmintrin.h> // _mm_extract_epi32
#ifdef X86_VPCLMULQDQ
# include <immintrin.h>
#endif
#include "../../crc32_fold.h"
#include "../../crc32_braid_p.h"
#include "../../fallback_builtins.h"
#include <assert.h>
#ifdef X86_VPCLMULQDQ
static size_t fold_16_vpclmulqdq(__m128i *xmm_crc0, __m128i *xmm_crc1,
__m128i *xmm_crc2, __m128i *xmm_crc3, const uint8_t *src, size_t len, __m128i init_crc,
int32_t first);
static size_t fold_16_vpclmulqdq_copy(__m128i *xmm_crc0, __m128i *xmm_crc1,
__m128i *xmm_crc2, __m128i *xmm_crc3, uint8_t *dst, const uint8_t *src, size_t len);
#endif
static void fold_1(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3) {
const __m128i xmm_fold4 = _mm_set_epi32( 0x00000001, 0x54442bd4,
0x00000001, 0xc6e41596);
__m128i x_tmp3;
__m128 ps_crc0, ps_crc3, ps_res;
x_tmp3 = *xmm_crc3;
*xmm_crc3 = *xmm_crc0;
*xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
*xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x10);
ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
ps_res = _mm_xor_ps(ps_crc0, ps_crc3);
*xmm_crc0 = *xmm_crc1;
*xmm_crc1 = *xmm_crc2;
*xmm_crc2 = x_tmp3;
*xmm_crc3 = _mm_castps_si128(ps_res);
}
static void fold_2(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3) {
const __m128i xmm_fold4 = _mm_set_epi32( 0x00000001, 0x54442bd4,
0x00000001, 0xc6e41596);
__m128i x_tmp3, x_tmp2;
__m128 ps_crc0, ps_crc1, ps_crc2, ps_crc3, ps_res31, ps_res20;
x_tmp3 = *xmm_crc3;
x_tmp2 = *xmm_crc2;
*xmm_crc3 = *xmm_crc1;
*xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01);
*xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x10);
ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
ps_res31 = _mm_xor_ps(ps_crc3, ps_crc1);
*xmm_crc2 = *xmm_crc0;
*xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
*xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x10);
ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
ps_res20 = _mm_xor_ps(ps_crc0, ps_crc2);
*xmm_crc0 = x_tmp2;
*xmm_crc1 = x_tmp3;
*xmm_crc2 = _mm_castps_si128(ps_res20);
*xmm_crc3 = _mm_castps_si128(ps_res31);
}
static void fold_3(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3) {
const __m128i xmm_fold4 = _mm_set_epi32( 0x00000001, 0x54442bd4,
0x00000001, 0xc6e41596);
__m128i x_tmp3;
__m128 ps_crc0, ps_crc1, ps_crc2, ps_crc3, ps_res32, ps_res21, ps_res10;
x_tmp3 = *xmm_crc3;
*xmm_crc3 = *xmm_crc2;
*xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x01);
*xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x10);
ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
ps_res32 = _mm_xor_ps(ps_crc2, ps_crc3);
*xmm_crc2 = *xmm_crc1;
*xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01);
*xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x10);
ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
ps_res21 = _mm_xor_ps(ps_crc1, ps_crc2);
*xmm_crc1 = *xmm_crc0;
*xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
*xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x10);
ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
ps_res10 = _mm_xor_ps(ps_crc0, ps_crc1);
*xmm_crc0 = x_tmp3;
*xmm_crc1 = _mm_castps_si128(ps_res10);
*xmm_crc2 = _mm_castps_si128(ps_res21);
*xmm_crc3 = _mm_castps_si128(ps_res32);
}
static void fold_4(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3) {
const __m128i xmm_fold4 = _mm_set_epi32( 0x00000001, 0x54442bd4,
0x00000001, 0xc6e41596);
__m128i x_tmp0, x_tmp1, x_tmp2, x_tmp3;
__m128 ps_crc0, ps_crc1, ps_crc2, ps_crc3;
__m128 ps_t0, ps_t1, ps_t2, ps_t3;
__m128 ps_res0, ps_res1, ps_res2, ps_res3;
x_tmp0 = *xmm_crc0;
x_tmp1 = *xmm_crc1;
x_tmp2 = *xmm_crc2;
x_tmp3 = *xmm_crc3;
*xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
x_tmp0 = _mm_clmulepi64_si128(x_tmp0, xmm_fold4, 0x10);
ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
ps_t0 = _mm_castsi128_ps(x_tmp0);
ps_res0 = _mm_xor_ps(ps_crc0, ps_t0);
*xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01);
x_tmp1 = _mm_clmulepi64_si128(x_tmp1, xmm_fold4, 0x10);
ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
ps_t1 = _mm_castsi128_ps(x_tmp1);
ps_res1 = _mm_xor_ps(ps_crc1, ps_t1);
*xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x01);
x_tmp2 = _mm_clmulepi64_si128(x_tmp2, xmm_fold4, 0x10);
ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
ps_t2 = _mm_castsi128_ps(x_tmp2);
ps_res2 = _mm_xor_ps(ps_crc2, ps_t2);
*xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x01);
x_tmp3 = _mm_clmulepi64_si128(x_tmp3, xmm_fold4, 0x10);
ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
ps_t3 = _mm_castsi128_ps(x_tmp3);
ps_res3 = _mm_xor_ps(ps_crc3, ps_t3);
*xmm_crc0 = _mm_castps_si128(ps_res0);
*xmm_crc1 = _mm_castps_si128(ps_res1);
*xmm_crc2 = _mm_castps_si128(ps_res2);
*xmm_crc3 = _mm_castps_si128(ps_res3);
}
/* Byte-shuffle control vectors used by partial_fold(). There are 15 rows of
 * four 32-bit words (one 16-byte vector each); partial_fold() loads the row
 * at word index 4 * (len - 1), i.e. row n-1 serves a partial length of n
 * bytes. Each row is used both as stored (the "shl" control) and with every
 * byte XORed with 0x80 (the "shr" control). */
static const unsigned ALIGNED_(32) pshufb_shf_table[60] = {
0x84838281, 0x88878685, 0x8c8b8a89, 0x008f8e8d, /* shl 15 (16 - 1)/shr1 */
0x85848382, 0x89888786, 0x8d8c8b8a, 0x01008f8e, /* shl 14 (16 - 2)/shr2 */
0x86858483, 0x8a898887, 0x8e8d8c8b, 0x0201008f, /* shl 13 (16 - 3)/shr3 */
0x87868584, 0x8b8a8988, 0x8f8e8d8c, 0x03020100, /* shl 12 (16 - 4)/shr4 */
0x88878685, 0x8c8b8a89, 0x008f8e8d, 0x04030201, /* shl 11 (16 - 5)/shr5 */
0x89888786, 0x8d8c8b8a, 0x01008f8e, 0x05040302, /* shl 10 (16 - 6)/shr6 */
0x8a898887, 0x8e8d8c8b, 0x0201008f, 0x06050403, /* shl 9 (16 - 7)/shr7 */
0x8b8a8988, 0x8f8e8d8c, 0x03020100, 0x07060504, /* shl 8 (16 - 8)/shr8 */
0x8c8b8a89, 0x008f8e8d, 0x04030201, 0x08070605, /* shl 7 (16 - 9)/shr9 */
0x8d8c8b8a, 0x01008f8e, 0x05040302, 0x09080706, /* shl 6 (16 -10)/shr10*/
0x8e8d8c8b, 0x0201008f, 0x06050403, 0x0a090807, /* shl 5 (16 -11)/shr11*/
0x8f8e8d8c, 0x03020100, 0x07060504, 0x0b0a0908, /* shl 4 (16 -12)/shr12*/
0x008f8e8d, 0x04030201, 0x08070605, 0x0c0b0a09, /* shl 3 (16 -13)/shr13*/
0x01008f8e, 0x05040302, 0x09080706, 0x0d0c0b0a, /* shl 2 (16 -14)/shr14*/
0x0201008f, 0x06050403, 0x0a090807, 0x0e0d0c0b /* shl 1 (16 -15)/shr15*/
};
/* Fold a partial vector of `len` bytes (taken from *xmm_crc_part) into the
 * four-lane state.
 *
 * `len` must be in 1..15: it selects row (len - 1) of the 15-row
 * pshufb_shf_table, and len == 0 would wrap the unsigned index.
 *
 * Going by the table's "shl"/"shr" row labels, the lanes are treated as one
 * long byte string that is shifted by `len` bytes: each lane keeps its "shr"
 * part and receives the "shl" part of the next lane, and lane 3 receives the
 * "shl" part of *xmm_crc_part. The "shl" part of the old lane 0 (the bytes
 * that leave the window) is multiplied by the fold constants and XORed into
 * lane 3.
 *
 * Side effect: *xmm_crc_part is overwritten with its shuffled value.
 */
static void partial_fold(const size_t len, __m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2,
__m128i *xmm_crc3, __m128i *xmm_crc_part) {
const __m128i xmm_fold4 = _mm_set_epi32( 0x00000001, 0x54442bd4,
0x00000001, 0xc6e41596);
/* XORing a control row with this flips bit 7 of every control byte,
 * turning the "shl" control into the complementary "shr" control. */
const __m128i xmm_mask3 = _mm_set1_epi32((int32_t)0x80808080);
__m128i xmm_shl, xmm_shr, xmm_tmp1, xmm_tmp2, xmm_tmp3;
__m128i xmm_a0_0, xmm_a0_1;
__m128 ps_crc3, psa0_0, psa0_1, ps_res;
xmm_shl = _mm_load_si128((__m128i *)(pshufb_shf_table + (4 * (len - 1))));
xmm_shr = xmm_shl;
xmm_shr = _mm_xor_si128(xmm_shr, xmm_mask3);
/* Part of lane 0 that leaves the window; folded into lane 3 further down. */
xmm_a0_0 = _mm_shuffle_epi8(*xmm_crc0, xmm_shl);
/* lane[i] = shr(lane[i]) | shl(lane[i + 1]) for i = 0..2 */
*xmm_crc0 = _mm_shuffle_epi8(*xmm_crc0, xmm_shr);
xmm_tmp1 = _mm_shuffle_epi8(*xmm_crc1, xmm_shl);
*xmm_crc0 = _mm_or_si128(*xmm_crc0, xmm_tmp1);
*xmm_crc1 = _mm_shuffle_epi8(*xmm_crc1, xmm_shr);
xmm_tmp2 = _mm_shuffle_epi8(*xmm_crc2, xmm_shl);
*xmm_crc1 = _mm_or_si128(*xmm_crc1, xmm_tmp2);
*xmm_crc2 = _mm_shuffle_epi8(*xmm_crc2, xmm_shr);
xmm_tmp3 = _mm_shuffle_epi8(*xmm_crc3, xmm_shl);
*xmm_crc2 = _mm_or_si128(*xmm_crc2, xmm_tmp3);
/* lane 3 = shr(lane 3) | shl(partial data) */
*xmm_crc3 = _mm_shuffle_epi8(*xmm_crc3, xmm_shr);
*xmm_crc_part = _mm_shuffle_epi8(*xmm_crc_part, xmm_shl);
*xmm_crc3 = _mm_or_si128(*xmm_crc3, *xmm_crc_part);
/* Multiply the departed bytes by both fold constants and XOR both products into lane 3. */
xmm_a0_1 = _mm_clmulepi64_si128(xmm_a0_0, xmm_fold4, 0x10);
xmm_a0_0 = _mm_clmulepi64_si128(xmm_a0_0, xmm_fold4, 0x01);
ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
psa0_0 = _mm_castsi128_ps(xmm_a0_0);
psa0_1 = _mm_castsi128_ps(xmm_a0_1);
ps_res = _mm_xor_ps(ps_crc3, psa0_0);
ps_res = _mm_xor_ps(ps_res, psa0_1);
*xmm_crc3 = _mm_castps_si128(ps_res);
}
/* Read the four consecutive 128-bit lanes stored at `fold` into
 * *fold0..*fold3. Aligned loads are used, so `fold` must be 16-byte aligned. */
static inline void crc32_fold_load(__m128i *fold, __m128i *fold0, __m128i *fold1, __m128i *fold2, __m128i *fold3) {
    const __m128i *lanes = fold;

    *fold0 = _mm_load_si128(&lanes[0]);
    *fold1 = _mm_load_si128(&lanes[1]);
    *fold2 = _mm_load_si128(&lanes[2]);
    *fold3 = _mm_load_si128(&lanes[3]);
}
/* Write *fold0..*fold3 to the four consecutive 128-bit slots starting at
 * `fold`. Unaligned stores are used, so `fold` needs no particular alignment
 * here. */
static inline void crc32_fold_save(__m128i *fold, const __m128i *fold0, const __m128i *fold1,
                                   const __m128i *fold2, const __m128i *fold3) {
    __m128i *slot = fold;

    _mm_storeu_si128(slot++, *fold0);
    _mm_storeu_si128(slot++, *fold1);
    _mm_storeu_si128(slot++, *fold2);
    _mm_storeu_si128(slot, *fold3);
}
/* Initialise the folding state in crc->fold: lane 0 receives the constant
 * 0x9db42487 in its low 32 bits (remaining bits zero) and lanes 1..3 are
 * cleared. Always returns 0. */
Z_INTERNAL uint32_t CRC32_FOLD_RESET(crc32_fold *crc) {
    const __m128i cleared_lane = _mm_setzero_si128();
    const __m128i seed_lane = _mm_cvtsi32_si128(0x9db42487);

    crc32_fold_save((__m128i *)crc->fold, &seed_lane, &cleared_lane, &cleared_lane, &cleared_lane);
    return 0;
}
/* ONCE(op) runs `op` only while a variable named `first` in the expanding
 * scope is non-zero, and clears `first` before doing so.
 * NOTE(review): it expands to a bare `if` statement, so it must not be placed
 * directly in front of an `else`.
 * XOR_INITIAL128 / XOR_INITIAL512 use it to XOR the initial CRC (variables
 * `xmm_initial` / `zmm_initial` in the expanding scope) into `where` exactly
 * once. */
#define ONCE(op) if (first) { first = 0; op; }
#define XOR_INITIAL128(where) ONCE(where = _mm_xor_si128(where, xmm_initial))
#ifdef X86_VPCLMULQDQ
# define XOR_INITIAL512(where) ONCE(where = _mm512_xor_si512(where, zmm_initial))
#endif
/* Expand the folding templates twice: first without COPY, then again after
 * COPY has been defined. The 512-bit helper template is included only when
 * X86_VPCLMULQDQ is defined. */
#ifdef X86_VPCLMULQDQ
# include "crc32_fold_vpclmulqdq_tpl.h"
#endif
#include "crc32_fold_pclmulqdq_tpl.h"
#define COPY
#ifdef X86_VPCLMULQDQ
# include "crc32_fold_vpclmulqdq_tpl.h"
#endif
#include "crc32_fold_pclmulqdq_tpl.h"
/* Constants for the final reduction in CRC32_FOLD_FINAL, stored as pairs of
 * 32-bit words. They are loaded 16 bytes at a time from crc_k, crc_k + 4 and
 * crc_k + 8, i.e. as the pairs (rk1, rk2), (rk5, rk6) and (rk7, rk8). */
static const unsigned ALIGNED_(16) crc_k[] = {
0xccaa009e, 0x00000000, /* rk1 */
0x751997d0, 0x00000001, /* rk2 */
0xccaa009e, 0x00000000, /* rk5 */
0x63cd6124, 0x00000001, /* rk6 */
0xf7011640, 0x00000001, /* rk7 */
0xdb710640, 0x00000001 /* rk8 */
};
/* AND mask that keeps the low 64 bits of a vector and clears the high 64. */
static const unsigned ALIGNED_(16) crc_mask[4] = {
0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000
};
/* AND mask that clears the lowest 32 bits of a vector and keeps the upper 96. */
static const unsigned ALIGNED_(16) crc_mask2[4] = {
0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF
};
/* Reduce the four-lane folding state in crc->fold to the final 32-bit CRC.
 *
 * The result is the bitwise complement of 32-bit element 2 of the reduced
 * vector; it is stored in crc->value and also returned. crc->fold itself is
 * only read, not written.
 *
 * The three stages below are labelled after the constant pair they load from
 * crc_k; see the white paper referenced in the file header for the maths.
 */
Z_INTERNAL uint32_t CRC32_FOLD_FINAL(crc32_fold *crc) {
const __m128i xmm_mask = _mm_load_si128((__m128i *)crc_mask);
const __m128i xmm_mask2 = _mm_load_si128((__m128i *)crc_mask2);
__m128i xmm_crc0, xmm_crc1, xmm_crc2, xmm_crc3;
__m128i x_tmp0, x_tmp1, x_tmp2, crc_fold;
crc32_fold_load((__m128i *)crc->fold, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
/*
* k1
*
* Collapse the four lanes into xmm_crc3: lane 0 is multiplied by the
* (rk1, rk2) pair and XORed into lane 1, the result likewise into lane 2,
* and that result into lane 3.
*/
crc_fold = _mm_load_si128((__m128i *)crc_k);
x_tmp0 = _mm_clmulepi64_si128(xmm_crc0, crc_fold, 0x10);
xmm_crc0 = _mm_clmulepi64_si128(xmm_crc0, crc_fold, 0x01);
xmm_crc1 = _mm_xor_si128(xmm_crc1, x_tmp0);
xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_crc0);
x_tmp1 = _mm_clmulepi64_si128(xmm_crc1, crc_fold, 0x10);
xmm_crc1 = _mm_clmulepi64_si128(xmm_crc1, crc_fold, 0x01);
xmm_crc2 = _mm_xor_si128(xmm_crc2, x_tmp1);
xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_crc1);
x_tmp2 = _mm_clmulepi64_si128(xmm_crc2, crc_fold, 0x10);
xmm_crc2 = _mm_clmulepi64_si128(xmm_crc2, crc_fold, 0x01);
xmm_crc3 = _mm_xor_si128(xmm_crc3, x_tmp2);
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc2);
/*
* k5
*
* Narrow the single remaining lane using the (rk5, rk6) pair; xmm_mask2
* then clears its lowest 32 bits.
*/
crc_fold = _mm_load_si128((__m128i *)(crc_k + 4));
xmm_crc0 = xmm_crc3;
xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0);
xmm_crc0 = _mm_srli_si128(xmm_crc0, 8);
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc0);
xmm_crc0 = xmm_crc3;
xmm_crc3 = _mm_slli_si128(xmm_crc3, 4);
xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0x10);
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc0);
xmm_crc3 = _mm_and_si128(xmm_crc3, xmm_mask2);
/*
* k7
*
* Final reduction with the (rk7, rk8) pair -- presumably the Barrett
* reduction step of the referenced white paper; confirm against it.
*/
xmm_crc1 = xmm_crc3;
xmm_crc2 = xmm_crc3;
crc_fold = _mm_load_si128((__m128i *)(crc_k + 8));
xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0);
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc2);
xmm_crc3 = _mm_and_si128(xmm_crc3, xmm_mask);
xmm_crc2 = xmm_crc3;
xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0x10);
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc2);
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc1);
/* The CRC sits in 32-bit element 2; complement it for the final value. */
crc->value = ~((uint32_t)_mm_extract_epi32(xmm_crc3, 2));
return crc->value;
}
/* One-shot CRC32 of `len` bytes at `buf`, continuing from the running value
 * `crc32`.
 *
 * Inputs of 64 bytes or more go through the folding implementation using a
 * temporary, 16-byte aligned state (reset, fold, finalise). Shorter inputs
 * are delegated to crc32_braid, which is faster at those lengths (the CRC32
 * instruction might also prove effective there).
 */
Z_INTERNAL uint32_t CRC32(uint32_t crc32, const uint8_t *buf, size_t len) {
    crc32_fold ALIGNED_(16) fold_ctx;

    if (len >= 64) {
        CRC32_FOLD_RESET(&fold_ctx);
        CRC32_FOLD(&fold_ctx, buf, len, crc32);
        return CRC32_FOLD_FINAL(&fold_ctx);
    }

    return PREFIX(crc32_braid)(crc32, buf, len);
}

View File

@@ -0,0 +1,17 @@
/* crc32_vpclmulqdq.c -- VPCLMULQDQ-based CRC32 folding implementation.
* Copyright Wangyang Guo (wangyang.guo@intel.com)
* For conditions of distribution and use, see copyright notice in zlib.h
*/
/* When both X86_PCLMULQDQ_CRC and X86_VPCLMULQDQ_CRC are defined, bind the
 * generic names used inside crc32_pclmulqdq_tpl.h to the *_vpclmulqdq symbols
 * and include the template with X86_VPCLMULQDQ defined, which enables its
 * 512-bit code paths. */
#if defined(X86_PCLMULQDQ_CRC) && defined(X86_VPCLMULQDQ_CRC)
#define X86_VPCLMULQDQ
#define CRC32_FOLD_COPY crc32_fold_vpclmulqdq_copy
#define CRC32_FOLD crc32_fold_vpclmulqdq
#define CRC32_FOLD_RESET crc32_fold_vpclmulqdq_reset
#define CRC32_FOLD_FINAL crc32_fold_vpclmulqdq_final
#define CRC32 crc32_vpclmulqdq
#include "crc32_pclmulqdq_tpl.h"
#endif

View File

@@ -0,0 +1,50 @@
/* insert_string_sse42.c -- insert_string integer hash variant using SSE4.2's CRC instructions
*
* Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler
* For conditions of distribution and use, see copyright notice in zlib.h
*
*/
#include "../../zbuild.h"
#include <immintrin.h>
#ifdef _MSC_VER
# include <nmmintrin.h>
#endif
#include "../../deflate.h"
/* HASH_CALC(s, h, val) updates `h` by applying the CRC32 instruction to `h`
 * and `val`; the `s` argument is not used by any variant. One of four
 * variants is chosen:
 *   X86_SSE42_CRC_INTRIN defined:   compiler intrinsic (_MSC_VER) or builtin;
 *   X86_SSE42_CRC_INTRIN undefined: inline assembly (_MSC_VER or GNU syntax).
 */
#ifdef X86_SSE42_CRC_INTRIN
# ifdef _MSC_VER
# define HASH_CALC(s, h, val)\
h = _mm_crc32_u32(h, val)
# else
# define HASH_CALC(s, h, val)\
h = __builtin_ia32_crc32si(h, val)
# endif
#else
# ifdef _MSC_VER
/* NOTE(review): this variant puts `val` in eax and `h` in edx before
 * `crc32 eax, edx` and stores eax back to h, while the other three variants
 * pass h first and val second. If the first operand is the accumulator in
 * this assembler syntax, the operand roles are swapped here relative to the
 * other variants -- confirm whether that is intended. */
# define HASH_CALC(s, h, val) {\
__asm mov edx, h\
__asm mov eax, val\
__asm crc32 eax, edx\
__asm mov h, eax\
}
# else
/* Note that this expansion already ends in a semicolon. */
# define HASH_CALC(s, h, val) \
__asm__ __volatile__ (\
"crc32 %1,%0\n\t"\
: "+r" (h)\
: "r" (val)\
);
# endif
#endif
/* Remaining names handed to insert_string_tpl.h: the hash accumulator and its
 * initialiser, plus the names the template's functions are given. The
 * template is only included when X86_SSE42 is defined. */
#define HASH_CALC_VAR h
#define HASH_CALC_VAR_INIT uint32_t h = 0
#define UPDATE_HASH update_hash_sse42
#define INSERT_STRING insert_string_sse42
#define QUICK_INSERT_STRING quick_insert_string_sse42
#ifdef X86_SSE42
# include "../../insert_string_tpl.h"
#endif

39
deps/zlib-ng/arch/x86/slide_hash_avx2.c vendored Normal file
View File

@@ -0,0 +1,39 @@
/*
* AVX2 optimized hash slide, based on Intel's slide_sse implementation
*
* Copyright (C) 2017 Intel Corporation
* Authors:
* Arjan van de Ven <arjan@linux.intel.com>
* Jim Kukunas <james.t.kukunas@linux.intel.com>
* Mika T. Lindqvist <postmaster@raasu.org>
*
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "../../zbuild.h"
#include "../../deflate.h"
#include <immintrin.h>
/* Subtract `wsize` from every entry of `table` with unsigned saturation (an
 * entry smaller than wsize becomes 0), working from the end of the table
 * towards its start in groups of 16 entries.
 *
 * `entries` must be a non-zero multiple of 16: the loop always processes at
 * least one group and only stops when the counter reaches exactly 0.
 */
static inline void slide_hash_chain(Pos *table, uint32_t entries, const __m256i wsize) {
    Pos *cursor = table + entries;
    uint32_t remaining = entries;

    do {
        cursor -= 16;
        const __m256i before = _mm256_loadu_si256((__m256i *)cursor);
        const __m256i after = _mm256_subs_epu16(before, wsize);
        _mm256_storeu_si256((__m256i *)cursor, after);
        remaining -= 16;
    } while (remaining > 0);
}
/* AVX2 hash slide: lowers every entry of s->head (HASH_SIZE entries) and of
 * s->prev (w_size entries) by the window size, saturating at 0. The window
 * size is truncated to 16 bits before being broadcast. */
Z_INTERNAL void slide_hash_avx2(deflate_state *s) {
    const uint16_t window = (uint16_t)s->w_size;
    const __m256i ymm_window = _mm256_set1_epi16((short)window);

    slide_hash_chain(s->head, HASH_SIZE, ymm_window);
    slide_hash_chain(s->prev, window, ymm_window);
}

62
deps/zlib-ng/arch/x86/slide_hash_sse2.c vendored Normal file
View File

@@ -0,0 +1,62 @@
/*
* SSE optimized hash slide
*
* Copyright (C) 2017 Intel Corporation
* Authors:
* Arjan van de Ven <arjan@linux.intel.com>
* Jim Kukunas <james.t.kukunas@linux.intel.com>
*
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "../../zbuild.h"
#include "../../deflate.h"
#include <immintrin.h>
#include <assert.h>
/* Subtract `wsize` from every entry of two tables with unsigned saturation
 * (an entry smaller than wsize becomes 0). table0 (entries0 entries) is
 * processed first, then table1 (entries1 entries); each is walked from its
 * end towards its start, 16 entries (two 128-bit vectors) at a time.
 *
 * Each entry count must be a non-zero multiple of 16. Aligned loads and
 * stores are used, so both tables must be 16-byte aligned; the caller in this
 * file asserts that.
 */
static inline void slide_hash_chain(Pos *table0, Pos *table1, uint32_t entries0,
                                    uint32_t entries1, const __m128i wsize) {
    Pos *const tables[2] = { table0, table1 };
    const uint32_t counts[2] = { entries0, entries1 };

    for (int pass = 0; pass < 2; ++pass) {
        Pos *cursor = tables[pass] + counts[pass];
        uint32_t remaining = counts[pass];

        do {
            cursor -= 16;
            const __m128i lower = _mm_load_si128((__m128i *)cursor);
            const __m128i upper = _mm_load_si128((__m128i *)(cursor + 8));
            const __m128i lower_slid = _mm_subs_epu16(lower, wsize);
            const __m128i upper_slid = _mm_subs_epu16(upper, wsize);
            _mm_store_si128((__m128i *)cursor, lower_slid);
            _mm_store_si128((__m128i *)(cursor + 8), upper_slid);
            remaining -= 16;
        } while (remaining > 0);
    }
}
/* SSE2 hash slide: lowers every entry of s->head (HASH_SIZE entries) and of
 * s->prev (w_size entries) by the window size, saturating at 0. The window
 * size is truncated to 16 bits before being broadcast. Both tables are
 * asserted to be 16-byte aligned because the worker uses aligned accesses. */
Z_INTERNAL void slide_hash_sse2(deflate_state *s) {
    const uint16_t window = (uint16_t)s->w_size;
    const __m128i xmm_window = _mm_set1_epi16((short)window);

    assert(((uintptr_t)s->head & 15) == 0);
    assert(((uintptr_t)s->prev & 15) == 0);

    slide_hash_chain(s->head, s->prev, HASH_SIZE, window, xmm_window);
}

97
deps/zlib-ng/arch/x86/x86_features.c vendored Normal file
View File

@@ -0,0 +1,97 @@
/* x86_features.c - x86 feature check
*
* Copyright (C) 2013 Intel Corporation. All rights reserved.
* Author:
* Jim Kukunas
*
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "../../zbuild.h"
#include "x86_features.h"
#ifdef _WIN32
# include <intrin.h>
#else
// Newer versions of GCC and clang come with cpuid.h
# include <cpuid.h>
#endif
#include <string.h>
/* Execute CPUID for leaf `info` and store the four result registers through
 * the eax/ebx/ecx/edx out-pointers. On _WIN32 the __cpuid intrinsic fills a
 * four-element array that is then copied out; elsewhere the __cpuid macro
 * from <cpuid.h> writes the outputs directly. */
static inline void cpuid(int info, unsigned* eax, unsigned* ebx, unsigned* ecx, unsigned* edx) {
#ifdef _WIN32
unsigned int registers[4];
__cpuid((int *)registers, info);
*eax = registers[0];
*ebx = registers[1];
*ecx = registers[2];
*edx = registers[3];
#else
__cpuid(info, *eax, *ebx, *ecx, *edx);
#endif
}
/* Same as cpuid(), but additionally passes the sub-leaf `subinfo`: __cpuidex
 * on _WIN32, __cpuid_count from <cpuid.h> elsewhere. */
static inline void cpuidex(int info, int subinfo, unsigned* eax, unsigned* ebx, unsigned* ecx, unsigned* edx) {
#ifdef _WIN32
unsigned int registers[4];
__cpuidex((int *)registers, info, subinfo);
*eax = registers[0];
*ebx = registers[1];
*ecx = registers[2];
*edx = registers[3];
#else
__cpuid_count(info, subinfo, *eax, *ebx, *ecx, *edx);
#endif
}
/* Read extended control register `xcr` and return it as a 64-bit value
 * (EDX in the high half, EAX in the low half). On _WIN32 the _xgetbv
 * intrinsic is used. Elsewhere the instruction is emitted as raw bytes
 * 0x0f 0x01 0xd0 with `xcr` in ECX -- presumably the XGETBV encoding spelled
 * out for assemblers that lack the mnemonic; confirm against the ISA
 * reference. */
static inline uint64_t xgetbv(unsigned int xcr) {
#ifdef _WIN32
return _xgetbv(xcr);
#else
uint32_t eax, edx;
__asm__ ( ".byte 0x0f, 0x01, 0xd0" : "=a"(eax), "=d"(edx) : "c"(xcr));
return (uint64_t)(edx) << 32 | eax;
#endif
}
/* Detect the x86 CPU features used by the optimized code paths and record
 * them in *features.
 *
 * Every member of *features is written. Flags are stored as the raw masked
 * CPUID bit, so a supported feature is "non-zero", not necessarily 1; the two
 * has_os_save_* members are 0 or 1.
 *
 * AVX2 and AVX-512 are reported only when the OS has enabled saving of the
 * corresponding register state (checked through XGETBV when CPUID reports
 * OSXSAVE).
 */
void Z_INTERNAL x86_check_features(struct x86_cpu_features *features) {
    unsigned eax, ebx, ecx, edx;
    unsigned maxbasic;

    /* Start from "nothing supported". Several members are only assigned
     * inside the conditional branches below (and has_os_save_ymm/zmm are read
     * back later), so without this their values would depend on the caller
     * having pre-zeroed the structure. */
    memset(features, 0, sizeof(*features));

    /* Leaf 0: EAX reports the highest supported basic leaf. */
    cpuid(0, &maxbasic, &ebx, &ecx, &edx);

    cpuid(1 /*CPU_PROCINFO_AND_FEATUREBITS*/, &eax, &ebx, &ecx, &edx);

    features->has_sse2 = edx & 0x4000000;     /* leaf 1 EDX bit 26 */
    features->has_ssse3 = ecx & 0x200;        /* leaf 1 ECX bit 9 */
    features->has_sse42 = ecx & 0x100000;     /* leaf 1 ECX bit 20 */
    features->has_pclmulqdq = ecx & 0x2;      /* leaf 1 ECX bit 1 */

    /* Leaf 1 ECX bit 27 (OSXSAVE): XGETBV is usable, so ask which register
     * states the OS saves and restores. */
    if (ecx & 0x08000000) {
        uint64_t xfeature = xgetbv(0);

        /* XCR0 bits 1 and 2 must both be set for YMM state. */
        features->has_os_save_ymm = ((xfeature & 0x06) == 0x06);
        /* XCR0 bits 1, 2, 5, 6 and 7 must all be set for ZMM state. */
        features->has_os_save_zmm = ((xfeature & 0xe6) == 0xe6);
    }

    if (maxbasic >= 7) {
        cpuidex(7, 0, &eax, &ebx, &ecx, &edx);

        // check VPCLMULQDQ bit (leaf 7, sub-leaf 0, ECX bit 10)
        // Reference: https://software.intel.com/sites/default/files/article/405250/how-to-detect-new-instruction-support-in-the-4th-generation-intel-core-processor-family.pdf
        features->has_vpclmulqdq = ecx & 0x400;

        // check AVX2 bit if the OS supports saving YMM registers
        if (features->has_os_save_ymm) {
            features->has_avx2 = ebx & 0x20;
        }

        // check AVX512 bits if the OS supports saving ZMM registers
        if (features->has_os_save_zmm) {
            features->has_avx512 = ebx & 0x00010000;
            features->has_avx512vnni = ecx & 0x800;
        }
    }
}

24
deps/zlib-ng/arch/x86/x86_features.h vendored Normal file
View File

@@ -0,0 +1,24 @@
/* x86_features.h -- check for CPU features
* Copyright (C) 2013 Intel Corporation Jim Kukunas
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifndef X86_FEATURES_H_
#define X86_FEATURES_H_
/* Feature flags filled in by x86_check_features(). The instruction-set flags
 * hold the raw masked CPUID bit, so test them for non-zero rather than for
 * equality with 1. */
struct x86_cpu_features {
int has_avx2; /* CPUID leaf 7 EBX bit 5; only assigned when has_os_save_ymm is set */
int has_avx512; /* CPUID leaf 7 EBX bit 16; only assigned when has_os_save_zmm is set */
int has_avx512vnni; /* CPUID leaf 7 ECX bit 11; only assigned when has_os_save_zmm is set */
int has_sse2; /* CPUID leaf 1 EDX bit 26 */
int has_ssse3; /* CPUID leaf 1 ECX bit 9 */
int has_sse42; /* CPUID leaf 1 ECX bit 20 */
int has_pclmulqdq; /* CPUID leaf 1 ECX bit 1 */
int has_vpclmulqdq; /* CPUID leaf 7 ECX bit 10; only assigned when leaf 7 exists */
int has_os_save_ymm; /* XCR0 bits 1-2 all set; only assigned when CPUID reports OSXSAVE */
int has_os_save_zmm; /* XCR0 bits 1-2 and 5-7 all set; only assigned when CPUID reports OSXSAVE */
};
void Z_INTERNAL x86_check_features(struct x86_cpu_features *features);
#endif /* X86_FEATURES_H_ */

42
deps/zlib-ng/chunkset.c vendored Normal file
View File

@@ -0,0 +1,42 @@
/* chunkset.c -- inline functions to copy small data chunks.
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "zbuild.h"
/* Plain-C variant: a chunk is one 64-bit integer, i.e. 8 bytes. */
typedef uint64_t chunk_t;
#define CHUNK_SIZE 8
/* Tell chunkset_tpl.h that the dist == 4 and dist == 8 fast-path helpers
 * (chunkmemset_4 / chunkmemset_8) are provided by this file. */
#define HAVE_CHUNKMEMSET_4
#define HAVE_CHUNKMEMSET_8
/* Fill *chunk with the 4 bytes at `from` repeated twice (low half, then high half). */
static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
    uint8_t *halves = (uint8_t *)chunk;
    unsigned half;

    for (half = 0; half < 2; ++half) {
        memcpy(halves + half * 4, from, sizeof(uint32_t));
    }
}
/* Fill *chunk with the 8 bytes found at `from`. */
static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
    const void *pattern = from;

    memcpy(chunk, pattern, sizeof(*chunk)); /* chunk_t is uint64_t: 8 bytes */
}
/* Read one chunk (8 bytes) from the possibly unaligned address `s` into *chunk. */
static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
    const void *src = s;

    memcpy(chunk, src, sizeof(*chunk)); /* chunk_t is uint64_t: 8 bytes */
}
/* Write one chunk (8 bytes) from *chunk to the possibly unaligned address `out`. */
static inline void storechunk(uint8_t *out, chunk_t *chunk) {
    void *dst = out;

    memcpy(dst, chunk, sizeof(*chunk)); /* chunk_t is uint64_t: 8 bytes */
}
/* chunkset_tpl.h defines its functions under the macro names below; mapping
 * them to *_c names makes the include emit the plain-C implementations. */
#define CHUNKSIZE chunksize_c
#define CHUNKCOPY chunkcopy_c
#define CHUNKUNROLL chunkunroll_c
#define CHUNKMEMSET chunkmemset_c
#define CHUNKMEMSET_SAFE chunkmemset_safe_c
#include "chunkset_tpl.h"
/* NOTE(review): inffast_tpl.h is not visible here; presumably it defines the
 * function named by INFLATE_FAST in the same template fashion -- confirm. */
#define INFLATE_FAST inflate_fast_c
#include "inffast_tpl.h"

200
deps/zlib-ng/chunkset_tpl.h vendored Normal file
View File

@@ -0,0 +1,200 @@
/* chunkset_tpl.h -- inline functions to copy small data chunks.
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "zbuild.h"
#include <stdlib.h>
/* When instantiated with 32-byte chunks and both SSSE3 and SSE2 enabled,
 * CHUNKMEMSET below hands copies of 16 bytes or fewer to this external helper. */
#if CHUNK_SIZE == 32 && defined(X86_SSSE3) && defined(X86_SSE2)
extern uint8_t* chunkmemset_ssse3(uint8_t *out, unsigned dist, unsigned len);
#endif
/* Report the size of one chunk_t in bytes. */
Z_INTERNAL uint32_t CHUNKSIZE(void) {
    const uint32_t bytes_per_chunk = (uint32_t)sizeof(chunk_t);

    return bytes_per_chunk;
}
/* memcpy-like copy of `len` bytes from `from` to `out`, done in whole chunks.
   Returns `out + len`.

   Preconditions the caller must guarantee:
     - `len` is non-zero;
     - at least sizeof(chunk_t) bytes of output may be overwritten, even when
       `len` is smaller than that;
     - `from` lags `out` by at least sizeof(chunk_t) bytes (or the two do not
       overlap at all, or simply the distance is less than the length of the
       copy).

   The first store covers a full chunk but the pointers only advance by the
   odd-sized head, so everything that remains is a whole number of chunks.
   Copies of chunk_t bytes or fewer therefore never enter the loop, which
   keeps the branch predictable, and the wide accesses use the memory bus
   better than a byte loop would. */
#ifndef HAVE_CHUNKCOPY
Z_INTERNAL uint8_t* CHUNKCOPY(uint8_t *out, uint8_t const *from, unsigned len) {
    Assert(len > 0, "chunkcopy should never have a length 0");
    chunk_t buf;
    /* 1..sizeof(chunk_t): how far the first (full-width) store advances. */
    int32_t head = ((len - 1) % sizeof(chunk_t)) + 1;

    loadchunk(from, &buf);
    storechunk(out, &buf);
    out += head;
    from += head;

    /* What is left is an exact multiple of the chunk size. */
    for (len -= head; len > 0; len -= sizeof(chunk_t)) {
        loadchunk(from, &buf);
        storechunk(out, &buf);
        out += sizeof(chunk_t);
        from += sizeof(chunk_t);
    }
    return out;
}
#endif
/* Repeat short copies, doubling *dist each time, until the distance is at
   least sizeof(chunk_t) or is no longer smaller than the remaining length.
   *dist and *len are updated in place; the advanced output pointer is returned.

   Each step stores a full chunk, so the caller must allow at least the first
   2*sizeof(chunk_t) bytes of output to be overwritten even for shorter copies.
   That holds because inflate_fast() starts every iteration with at least 258
   bytes of output space available (258 being the maximum length output from a
   single token; see inflate_fast()'s assumptions below). */
#ifndef HAVE_CHUNKUNROLL
Z_INTERNAL uint8_t* CHUNKUNROLL(uint8_t *out, unsigned *dist, unsigned *len) {
    /* The source is fixed at the start of the pattern for every step. */
    unsigned char const *pattern = out - *dist;
    chunk_t buf;

    for (;;) {
        if (*dist >= *len || *dist >= sizeof(chunk_t))
            break;
        loadchunk(pattern, &buf);
        storechunk(out, &buf);
        out += *dist;
        *len -= *dist;
        *dist *= 2;
    }
    return out;
}
#endif
#ifndef HAVE_CHUNK_MAG
/* Build a "magazine": one chunk_t filled with the `dist`-byte string at `buf`
 * repeated as many times as fits, the last repetition being cut short if
 * necessary. *chunk_rem receives the size of the final piece copied. */
static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t dist) {
    chunk_t magazine;
    uint8_t *fill = (uint8_t *)&magazine;
    uint32_t room;
    uint32_t piece;

    for (room = sizeof(chunk_t); room; room -= piece, fill += piece) {
        piece = MIN(dist, room);
        memcpy(fill, buf, piece);
        /* Recording the piece size on every pass lets the caller avoid an
         * expensive integer division: the loop is counting anyway. */
        *chunk_rem = piece;
    }
    return magazine;
}
#endif
/* Copy DIST bytes from OUT - DIST into OUT + DIST * k, for 0 <= k < LEN/DIST.
Return OUT + LEN.
In other words: extend the DIST-byte pattern that ends just before OUT over
the next LEN bytes. Stores are done a whole chunk_t at a time, so bytes past
OUT + LEN may be written. */
Z_INTERNAL uint8_t* CHUNKMEMSET(uint8_t *out, unsigned dist, unsigned len) {
/* Debug performance related issues when len < sizeof(uint64_t):
Assert(len >= sizeof(uint64_t), "chunkmemset should be called on larger chunks"); */
Assert(dist > 0, "chunkmemset cannot have a distance 0");
/* Only AVX2 */
#if CHUNK_SIZE == 32 && defined(X86_SSSE3) && defined(X86_SSE2)
if (len <= 16) {
return chunkmemset_ssse3(out, dist, len);
}
#endif
uint8_t *from = out - dist;
/* A one-byte pattern is a plain memset; a pattern longer than a chunk never
 * repeats inside one chunk, so it is handed to CHUNKCOPY instead. */
if (dist == 1) {
memset(out, *from, len);
return out + len;
} else if (dist > sizeof(chunk_t)) {
return CHUNKCOPY(out, out - dist, len);
}
chunk_t chunk_load;
/* Stays 0 unless GET_CHUNK_MAG is used, which sets it to the size of the
 * last piece it copied into the chunk. */
uint32_t chunk_mod = 0;
/* TODO: possibly build up a permutation table for this if not an even modulus */
/* Each #ifdef below contributes one `if (...) {...} else` link; the links
 * chain onto each other and the braced block after the last #endif is the
 * final else. */
#ifdef HAVE_CHUNKMEMSET_2
if (dist == 2) {
chunkmemset_2(from, &chunk_load);
} else
#endif
#ifdef HAVE_CHUNKMEMSET_4
if (dist == 4) {
chunkmemset_4(from, &chunk_load);
} else
#endif
#ifdef HAVE_CHUNKMEMSET_8
if (dist == 8) {
chunkmemset_8(from, &chunk_load);
} else if (dist == sizeof(chunk_t)) {
loadchunk(from, &chunk_load);
} else
#endif
{
chunk_load = GET_CHUNK_MAG(from, &chunk_mod, dist);
}
/* If we're lucky enough and dist happens to be an even modulus of our vector length,
* we can do two stores per loop iteration, which for most ISAs, especially x86, is beneficial */
if (chunk_mod == 0) {
while (len >= (2 * sizeof(chunk_t))) {
storechunk(out, &chunk_load);
storechunk(out + sizeof(chunk_t), &chunk_load);
out += 2 * sizeof(chunk_t);
len -= 2 * sizeof(chunk_t);
}
}
/* If we don't have a "dist" length that divides evenly into a vector
* register, we can write the whole vector register but we need only
* advance by the amount of the whole string that fits in our chunk_t.
* If we do divide evenly into the vector length, adv_amount = chunk_t size*/
uint32_t adv_amount = sizeof(chunk_t) - chunk_mod;
while (len >= sizeof(chunk_t)) {
storechunk(out, &chunk_load);
len -= adv_amount;
out += adv_amount;
}
/* Fewer than sizeof(chunk_t) bytes remain: copy exactly that many from the
 * prepared chunk so this last write does not run past out + len. */
if (len) {
memcpy(out, &chunk_load, len);
out += len;
}
return out;
}
/* Bounded front end for CHUNKMEMSET.
 *
 * out:  destination; the pattern source is out - dist.
 * dist: distance back to the pattern.
 * len:  number of bytes requested; clamped to `left`.
 * left: number of bytes that may still be written at `out`.
 *
 * Copies byte by byte while `out` is unaligned (unless UNALIGNED64_OK is
 * defined) and whenever fewer than 3 chunks of space remain; otherwise the
 * rest is delegated to CHUNKMEMSET. Returns the advanced output pointer. */
Z_INTERNAL uint8_t* CHUNKMEMSET_SAFE(uint8_t *out, unsigned dist, unsigned len, unsigned left) {
#if !defined(UNALIGNED64_OK)
#  if !defined(UNALIGNED_OK)
    static const uint32_t align_mask = 7;
#  else
    static const uint32_t align_mask = 3;
#  endif
#endif
    uint8_t *src;

    /* Never produce more than the space that is left. */
    if (left < len)
        len = left;
    src = out - dist;

#if !defined(UNALIGNED64_OK)
    /* Byte copies until the output pointer satisfies align_mask. */
    for (; len > 0 && ((uintptr_t)out & align_mask) != 0; --len, --left)
        *out++ = *src++;
#endif

    /* Too close to the end of the buffer for chunk-wide stores. */
    if (left < (unsigned)(3 * sizeof(chunk_t))) {
        for (; len > 0; --len)
            *out++ = *src++;
        return out;
    }

    return len ? CHUNKMEMSET(out, dist, len) : out;
}

111
deps/zlib-ng/cmake/detect-arch.c vendored Normal file
View File

@@ -0,0 +1,111 @@
// detect-arch.c -- Detect compiler architecture and raise preprocessor error
//                  containing a simple arch identifier.
// Copyright (C) 2019 Hans Kristian Rosbach
// Licensed under the Zlib license, see LICENSE.md for details
//
// This file is never meant to compile. Every path through the conditionals
// ends in `#error archfound <id>`, and detect-arch.cmake extracts <id> from
// the compiler's diagnostic output. Nested chains therefore need an #else of
// their own: a path without any #error would produce no tag at all.

// x86_64
#if defined(__x86_64__) || defined(_M_X64)
    #error archfound x86_64

// x86
#elif defined(__i386) || defined(_M_IX86)
    #error archfound i686

// ARM
#elif defined(__aarch64__) || defined(__arm64__) || defined(_M_ARM64)
    #error archfound aarch64
#elif defined(__arm__) || defined(__arm) || defined(_M_ARM) || defined(__TARGET_ARCH_ARM)
    #if defined(__ARM64_ARCH_8__) || defined(__ARMv8__) || defined(__ARMv8_A__)
        #error archfound armv8
    #elif defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__)
        #error archfound armv7
    #elif defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6T2__) || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6M__)
        #error archfound armv6
    #elif defined(__ARM_ARCH_5T__) || defined(__ARM_ARCH_5TE__) || defined(__ARM_ARCH_5TEJ__)
        #error archfound armv5
    #elif defined(__ARM_ARCH_4T__) || defined(__TARGET_ARCH_5E__)
        #error archfound armv4
    #elif defined(__ARM_ARCH_3__) || defined(__TARGET_ARCH_3M__)
        #error archfound armv3
    #elif defined(__ARM_ARCH_2__)
        #error archfound armv2
    #else
        // ARM target whose architecture revision macro is not listed above
        #error archfound arm
    #endif

// PowerPC
#elif defined(__powerpc__) || defined(__ppc__) || defined(__PPC__)
    #if defined(__64BIT__) || defined(__powerpc64__) || defined(__ppc64__)
        #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
            #error archfound powerpc64le
        #else
            #error archfound powerpc64
        #endif
    #else
        #error archfound powerpc
    #endif

// --------------- Less common architectures alphabetically below ---------------

// ALPHA
#elif defined(__alpha__) || defined(__alpha)
    #error archfound alpha

// Blackfin
#elif defined(__BFIN__)
    #error archfound blackfin

// Itanium
#elif defined(__ia64) || defined(_M_IA64)
    #error archfound ia64

// MIPS
#elif defined(__mips__) || defined(__mips)
    #error archfound mips

// Motorola 68000-series
#elif defined(__m68k__)
    #error archfound m68k

// SuperH
#elif defined(__sh__)
    #error archfound sh

// SPARC
#elif defined(__sparc__) || defined(__sparc)
    #if defined(__sparcv9) || defined(__sparc_v9__)
        #error archfound sparc9
    #elif defined(__sparcv8) || defined(__sparc_v8__)
        #error archfound sparc8
    #else
        // SPARC target that does not announce v8 or v9
        #error archfound sparc
    #endif

// SystemZ
#elif defined(__370__)
    #error archfound s370
// 64-bit s390x compilers define __s390__ as well, so the 64-bit macros have
// to be tested before it or s390x could never be reported.
#elif defined(__s390x__) || defined(__s390x)
    #error archfound s390x
#elif defined(__s390__)
    #error archfound s390
#elif defined(__zarch__)
    #error archfound s390x

// PARISC
#elif defined(__hppa__)
    #error archfound parisc

// RS-6000
#elif defined(__THW_RS6000)
    #error archfound rs6000

// RISC-V
#elif defined(__riscv)
    #if __riscv_xlen == 64
        #error archfound riscv64
    #elif __riscv_xlen == 32
        #error archfound riscv32
    #else
        // Neither 32 nor 64 bit: report it like any other unknown target
        #error archfound unrecognized
    #endif

// Emscripten (WebAssembly)
#elif defined(__EMSCRIPTEN__)
    #error archfound wasm32

// return 'unrecognized' if we do not know what architecture this is
#else
    #error archfound unrecognized
#endif

101
deps/zlib-ng/cmake/detect-arch.cmake vendored Normal file
View File

@@ -0,0 +1,101 @@
# detect-arch.cmake -- Detect compiler architecture and set ARCH and BASEARCH
# Copyright (C) 2019 Hans Kristian Rosbach
# Licensed under the Zlib license, see LICENSE.md for details
# Determine ARCH, the architecture the C compiler targets.
#
# Sources, in order of precedence:
#   1. CMAKE_OSX_ARCHITECTURES (first entry)
#   2. MSVC_C_ARCHITECTURE_ID when building with MSVC
#   3. EMSCRIPTEN
#   4. CMAKE_C_COMPILER_TARGET when cross-compiling
#   5. the `archfound <id>` tag raised by compiling detect-arch.c
# If none of these yields a usable value, CMAKE_SYSTEM_PROCESSOR is used.
set(ARCHDETECT_FOUND TRUE)

if(CMAKE_OSX_ARCHITECTURES)
    # If multiple architectures are requested (universal build), pick only the first
    list(GET CMAKE_OSX_ARCHITECTURES 0 ARCH)
elseif(MSVC)
    if("${MSVC_C_ARCHITECTURE_ID}" STREQUAL "X86")
        set(ARCH "i686")
    elseif("${MSVC_C_ARCHITECTURE_ID}" STREQUAL "x64")
        set(ARCH "x86_64")
    elseif("${MSVC_C_ARCHITECTURE_ID}" STREQUAL "ARM" OR "${MSVC_C_ARCHITECTURE_ID}" STREQUAL "ARMV7")
        set(ARCH "arm")
    elseif ("${MSVC_C_ARCHITECTURE_ID}" STREQUAL "ARM64")
        set(ARCH "aarch64")
    endif()
elseif(EMSCRIPTEN)
    set(ARCH "wasm32")
elseif(CMAKE_CROSSCOMPILING)
    set(ARCH ${CMAKE_C_COMPILER_TARGET})
else()
    # Let preprocessor parse detect-arch.c and raise an error containing the arch identifier
    enable_language(C)
    try_run(
        run_result_unused
        compile_result_unused
        ${CMAKE_CURRENT_BINARY_DIR}
        ${CMAKE_CURRENT_LIST_DIR}/detect-arch.c
        COMPILE_OUTPUT_VARIABLE RAWOUTPUT
        CMAKE_FLAGS CMAKE_OSX_ARCHITECTURES=${CMAKE_OSX_ARCHITECTURES}
    )

    # Find the archfound tag and extract the arch word into the ARCH variable.
    # string(REGEX REPLACE) returns its input unchanged when nothing matches,
    # which would put the entire compiler output into ARCH, so only extract
    # when the tag is really there.
    if("${RAWOUTPUT}" MATCHES "archfound [a-zA-Z0-9_]+")
        string(REGEX REPLACE ".*archfound ([a-zA-Z0-9_]+).*" "\\1" ARCH "${RAWOUTPUT}")
    else()
        set(ARCH unknown)
    endif()
endif()

# Make sure we have ARCH set. detect-arch.c reports targets it does not know
# as "unrecognized"; treat that like "unknown" so the fallback below applies.
if(NOT ARCH OR ARCH STREQUAL "unknown" OR ARCH STREQUAL "unrecognized")
    set(ARCH ${CMAKE_SYSTEM_PROCESSOR})
    message(STATUS "Arch not recognized, falling back to cmake arch: '${ARCH}'")
else()
    message(STATUS "Arch detected: '${ARCH}'")
endif()
# Base arch detection
# Map ARCH onto a coarse architecture family: BASEARCH receives the family
# name and BASEARCH_<FAMILY>_FOUND is set to TRUE. The patterns are unanchored
# regular expressions and the first branch that matches wins, so the order of
# the branches matters (for example "sh" matches any ARCH containing those two
# letters that was not claimed by an earlier branch).
if("${ARCH}" MATCHES "(x86_64|AMD64|i[3-6]86)")
set(BASEARCH "x86")
set(BASEARCH_X86_FOUND TRUE)
elseif("${ARCH}" MATCHES "(arm(v[0-9])?|aarch64)")
set(BASEARCH "arm")
set(BASEARCH_ARM_FOUND TRUE)
elseif("${ARCH}" MATCHES "ppc(64(le)?)?|powerpc(64(le)?)?")
set(BASEARCH "ppc")
set(BASEARCH_PPC_FOUND TRUE)
elseif("${ARCH}" MATCHES "alpha")
set(BASEARCH "alpha")
set(BASEARCH_ALPHA_FOUND TRUE)
elseif("${ARCH}" MATCHES "blackfin")
set(BASEARCH "blackfin")
set(BASEARCH_BLACKFIN_FOUND TRUE)
elseif("${ARCH}" MATCHES "ia64")
set(BASEARCH "ia64")
set(BASEARCH_IA64_FOUND TRUE)
elseif("${ARCH}" MATCHES "mips")
set(BASEARCH "mips")
set(BASEARCH_MIPS_FOUND TRUE)
elseif("${ARCH}" MATCHES "m68k")
set(BASEARCH "m68k")
set(BASEARCH_M68K_FOUND TRUE)
elseif("${ARCH}" MATCHES "sh")
set(BASEARCH "sh")
set(BASEARCH_SH_FOUND TRUE)
elseif("${ARCH}" MATCHES "sparc[89]?")
set(BASEARCH "sparc")
set(BASEARCH_SPARC_FOUND TRUE)
elseif("${ARCH}" MATCHES "s3[679]0x?")
set(BASEARCH "s360")
set(BASEARCH_S360_FOUND TRUE)
elseif("${ARCH}" MATCHES "parisc")
set(BASEARCH "parisc")
set(BASEARCH_PARISC_FOUND TRUE)
elseif("${ARCH}" MATCHES "rs6000")
set(BASEARCH "rs6000")
set(BASEARCH_RS6000_FOUND TRUE)
elseif("${ARCH}" MATCHES "riscv(32|64)")
set(BASEARCH "riscv")
set(BASEARCH_RISCV_FOUND TRUE)
elseif("${ARCH}" MATCHES "wasm32")
set(BASEARCH "wasm32")
set(BASEARCH_WASM32_FOUND TRUE)
else()
# Nothing matched: x86 is assumed and the fact is reported.
set(BASEARCH "x86")
set(BASEARCH_X86_FOUND TRUE)
message(STATUS "Basearch '${ARCH}' not recognized, defaulting to 'x86'.")
endif()
message(STATUS "Basearch of '${ARCH}' has been detected as: '${BASEARCH}'")

View File

@@ -0,0 +1,46 @@
# detect-coverage.cmake -- Detect supported compiler coverage flags
# Licensed under the Zlib license, see LICENSE.md for details
# add_code_coverage()
# Enable code-coverage instrumentation for everything defined after the call.
# Tries the `-coverage` shorthand first and falls back to the explicit
# -ftest-coverage/-fprofile-arcs flags. When neither is accepted a warning is
# printed and WITH_CODE_COVERAGE is set to OFF. Because this is a macro, all
# variables it sets are set in the caller's scope.
macro(add_code_coverage)
# Check for -coverage flag support for Clang/GCC
# CMAKE_REQUIRED_LINK_OPTIONS is only used from CMake 3.14 on; older versions
# get the gcov library through CMAKE_REQUIRED_LIBRARIES instead.
if(CMAKE_VERSION VERSION_LESS 3.14)
set(CMAKE_REQUIRED_LIBRARIES -lgcov)
else()
set(CMAKE_REQUIRED_LINK_OPTIONS -coverage)
endif()
check_c_compiler_flag(-coverage HAVE_COVERAGE)
# Clear the probe settings so they do not affect later checks.
set(CMAKE_REQUIRED_LIBRARIES)
set(CMAKE_REQUIRED_LINK_OPTIONS)
if(HAVE_COVERAGE)
add_compile_options(-coverage)
add_link_options(-coverage)
message(STATUS "Code coverage enabled using: -coverage")
else()
# Some versions of GCC don't support -coverage shorthand
if(CMAKE_VERSION VERSION_LESS 3.14)
set(CMAKE_REQUIRED_LIBRARIES -lgcov)
else()
set(CMAKE_REQUIRED_LINK_OPTIONS -lgcov -fprofile-arcs)
endif()
check_c_compiler_flag("-ftest-coverage -fprofile-arcs -fprofile-values" HAVE_TEST_COVERAGE)
set(CMAKE_REQUIRED_LIBRARIES)
set(CMAKE_REQUIRED_LINK_OPTIONS)
if(HAVE_TEST_COVERAGE)
add_compile_options(-ftest-coverage -fprofile-arcs -fprofile-values)
add_link_options(-lgcov -fprofile-arcs)
message(STATUS "Code coverage enabled using: -ftest-coverage")
else()
message(WARNING "Compiler does not support code coverage")
set(WITH_CODE_COVERAGE OFF)
endif()
endif()
# Set optimization level to zero for code coverage builds
if (WITH_CODE_COVERAGE)
# Use CMake compiler flag variables due to add_compile_options failure on Windows GCC
set(CMAKE_C_FLAGS "-O0 ${CMAKE_C_FLAGS}")
set(CMAKE_CXX_FLAGS "-O0 ${CMAKE_CXX_FLAGS}")
endif()
endmacro()

View File

@@ -0,0 +1,43 @@
# detect-install-dirs.cmake -- Detect install directory parameters
# Copyright (C) 2021 Hans Kristian Rosbach
# Licensed under the Zlib license, see LICENSE.md for details
# For each kind of directory two user-settable names are honoured:
# <KIND>_INSTALL_DIR (deprecated, re-stored in the cache) and
# INSTALL_<KIND>_DIR. The former wins when both are defined. The values are
# copied into the matching CMAKE_INSTALL_* variable before GNUInstallDirs is
# included below.
# Determine installation directory for executables
if (DEFINED BIN_INSTALL_DIR)
set(BIN_INSTALL_DIR "${BIN_INSTALL_DIR}" CACHE PATH "Installation directory for executables (Deprecated)" FORCE)
set(CMAKE_INSTALL_BINDIR "${BIN_INSTALL_DIR}")
elseif (DEFINED INSTALL_BIN_DIR)
set(CMAKE_INSTALL_BINDIR "${INSTALL_BIN_DIR}")
endif()
# Determine installation directory for libraries
if (DEFINED LIB_INSTALL_DIR)
set(LIB_INSTALL_DIR "${LIB_INSTALL_DIR}" CACHE PATH "Installation directory for libraries (Deprecated)" FORCE)
set(CMAKE_INSTALL_LIBDIR "${LIB_INSTALL_DIR}")
elseif (DEFINED INSTALL_LIB_DIR)
set(CMAKE_INSTALL_LIBDIR "${INSTALL_LIB_DIR}")
endif()
# Determine installation directory for include files
if (DEFINED INC_INSTALL_DIR)
set(INC_INSTALL_DIR "${INC_INSTALL_DIR}" CACHE PATH "Installation directory for headers (Deprecated)" FORCE)
set(CMAKE_INSTALL_INCLUDEDIR "${INC_INSTALL_DIR}")
elseif (DEFINED INSTALL_INC_DIR)
set(CMAKE_INSTALL_INCLUDEDIR "${INSTALL_INC_DIR}")
endif()
# Define GNU standard installation directories
# NOTE(review): this relies on GNUInstallDirs leaving CMAKE_INSTALL_* values
# that are already set untouched -- confirm against the CMake documentation.
include(GNUInstallDirs)
# Determine installation directory for pkgconfig files
# The first defined name in the chain below wins; without any of them the
# default is "<CMAKE_INSTALL_LIBDIR>/pkgconfig", stored without FORCE.
if (DEFINED PKGCONFIG_INSTALL_DIR)
set(PKGCONFIG_INSTALL_DIR "${PKGCONFIG_INSTALL_DIR}" CACHE PATH "Installation directory for pkgconfig (.pc) files" FORCE)
elseif (DEFINED INSTALL_PKGCONFIG_DIR)
set(PKGCONFIG_INSTALL_DIR "${INSTALL_PKGCONFIG_DIR}" CACHE PATH "Installation directory for pkgconfig (.pc) files" FORCE)
elseif (DEFINED CMAKE_INSTALL_PKGCONFIGDIR)
set(PKGCONFIG_INSTALL_DIR "${CMAKE_INSTALL_PKGCONFIGDIR}" CACHE PATH "Installation directory for pkgconfig (.pc) files" FORCE)
elseif (DEFINED CMAKE_INSTALL_FULL_PKGCONFIGDIR)
set(PKGCONFIG_INSTALL_DIR "${CMAKE_INSTALL_FULL_PKGCONFIGDIR}" CACHE PATH "Installation directory for pkgconfig (.pc) files" FORCE)
else()
set(PKGCONFIG_INSTALL_DIR "${CMAKE_INSTALL_LIBDIR}/pkgconfig" CACHE PATH "Installation directory for pkgconfig (.pc) files")
endif()

View File

@@ -0,0 +1,548 @@
# detect-intrinsics.cmake -- Detect compiler intrinsics support
# Licensed under the Zlib license, see LICENSE.md for details
# check_acle_compiler_flag()
# Find out whether the compiler can build ARM ACLE (CRC) code.
# Reads:  NATIVEFLAG
# Sets:   HAVE_ACLE_FLAG - TRUE when ACLE can be used
#         ACLEFLAG       - cached compiler option enabling it (GNU/Clang only)
# Leaves CMAKE_REQUIRED_FLAGS empty on the non-MSVC path.
macro(check_acle_compiler_flag)
if(MSVC)
# Both ARM and ARM64-targeting msvc support intrinsics, but
# ARM msvc is missing some intrinsics introduced with ARMv8, e.g. crc32
if(MSVC_C_ARCHITECTURE_ID STREQUAL "ARM64")
set(HAVE_ACLE_FLAG TRUE)
endif()
else()
if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
if(NOT NATIVEFLAG AND NOT HAVE_ACLE_FLAG)
set(ACLEFLAG "-march=armv8-a+crc" CACHE INTERNAL "Compiler option to enable ACLE support")
endif()
endif()
# Check whether compiler supports ACLE flag
set(CMAKE_REQUIRED_FLAGS "${ACLEFLAG} ${NATIVEFLAG}")
check_c_source_compiles(
"int main() { return 0; }"
HAVE_ACLE_FLAG FAIL_REGEX "not supported")
# Second attempt with the +simd extension added when the first flag was rejected.
if(NOT NATIVEFLAG AND NOT HAVE_ACLE_FLAG)
set(ACLEFLAG "-march=armv8-a+crc+simd" CACHE INTERNAL "Compiler option to enable ACLE support" FORCE)
# Check whether compiler supports ACLE flag
set(CMAKE_REQUIRED_FLAGS "${ACLEFLAG}")
check_c_source_compiles(
"int main() { return 0; }"
HAVE_ACLE_FLAG2 FAIL_REGEX "not supported")
set(HAVE_ACLE_FLAG ${HAVE_ACLE_FLAG2} CACHE INTERNAL "Have compiler option to enable ACLE intrinsics" FORCE)
unset(HAVE_ACLE_FLAG2 CACHE) # Don't cache this internal variable
endif()
set(CMAKE_REQUIRED_FLAGS)
endif()
endmacro()
# check_avx512_intrinsics()
# Pick the compiler option for AVX512 and probe two intrinsic families.
# Reads:  NATIVEFLAG
# Sets:   AVX512FLAG         - option(s) chosen for the detected compiler
#         HAVE_AVX512_INTRIN - result of the _mm512_* probe
#         HAVE_MASK_INTRIN   - result of the mask-register (_knot_mask16) probe
# NOTE(review): check_c_source_compile_or_run is defined outside this chunk;
# presumably it compiles (or runs) the snippet with CMAKE_REQUIRED_FLAGS -- confirm.
macro(check_avx512_intrinsics)
if(CMAKE_C_COMPILER_ID MATCHES "Intel")
if(CMAKE_HOST_UNIX OR APPLE)
set(AVX512FLAG "-mavx512f -mavx512dq -mavx512bw -mavx512vl")
else()
set(AVX512FLAG "/arch:AVX512")
endif()
elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
if(NOT NATIVEFLAG)
# For CPUs that can benefit from AVX512, it seems GCC generates suboptimal
# instruction scheduling unless you specify a reasonable -mtune= target
set(AVX512FLAG "-mavx512f -mavx512dq -mavx512bw -mavx512vl")
if(NOT CMAKE_GENERATOR_TOOLSET MATCHES "ClangCl")
check_c_compiler_flag("-mtune=cascadelake" HAVE_CASCADE_LAKE)
if(HAVE_CASCADE_LAKE)
set(AVX512FLAG "${AVX512FLAG} -mtune=cascadelake")
else()
set(AVX512FLAG "${AVX512FLAG} -mtune=skylake-avx512")
endif()
unset(HAVE_CASCADE_LAKE)
endif()
endif()
elseif(MSVC)
set(AVX512FLAG "/arch:AVX512")
endif()
# Check whether compiler supports AVX512 intrinsics
set(CMAKE_REQUIRED_FLAGS "${AVX512FLAG} ${NATIVEFLAG}")
check_c_source_compile_or_run(
"#include <immintrin.h>
int main(void) {
__m512i x = _mm512_set1_epi8(2);
const __m512i y = _mm512_set_epi32(0x1020304, 0x5060708, 0x90a0b0c, 0xd0e0f10,
0x11121314, 0x15161718, 0x191a1b1c, 0x1d1e1f20,
0x21222324, 0x25262728, 0x292a2b2c, 0x2d2e2f30,
0x31323334, 0x35363738, 0x393a3b3c, 0x3d3e3f40);
x = _mm512_sub_epi8(x, y);
(void)x;
return 0;
}"
HAVE_AVX512_INTRIN
)
# Evidently both GCC and clang were late to implementing these
check_c_source_compile_or_run(
"#include <immintrin.h>
int main(void) {
__mmask16 a = 0xFF;
a = _knot_mask16(a);
(void)a;
return 0;
}"
HAVE_MASK_INTRIN
)
# Reset so the flags do not leak into later checks.
set(CMAKE_REQUIRED_FLAGS)
endmacro()
# check_avx512vnni_intrinsics()
# Pick the compiler option for AVX512-VNNI and probe _mm512_dpbusd_epi32.
# Reads:  NATIVEFLAG
# Sets:   AVX512VNNIFLAG, HAVE_AVX512VNNI_INTRIN
# NOTE(review): check_c_source_compile_or_run is defined outside this chunk.
macro(check_avx512vnni_intrinsics)
if(CMAKE_C_COMPILER_ID MATCHES "Intel")
if(CMAKE_HOST_UNIX OR APPLE)
set(AVX512VNNIFLAG "-mavx512f -mavx512bw -mavx512dq -mavx512vl -mavx512vnni")
else()
set(AVX512VNNIFLAG "/arch:AVX512")
endif()
elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
if(NOT NATIVEFLAG)
set(AVX512VNNIFLAG "-mavx512f -mavx512dq -mavx512bw -mavx512vl -mavx512vnni")
if(NOT CMAKE_GENERATOR_TOOLSET MATCHES "ClangCl")
# Prefer tuning for Cascade Lake; fall back to skylake-avx512 when the
# compiler does not accept that -mtune value.
check_c_compiler_flag("-mtune=cascadelake" HAVE_CASCADE_LAKE)
if(HAVE_CASCADE_LAKE)
set(AVX512VNNIFLAG "${AVX512VNNIFLAG} -mtune=cascadelake")
else()
set(AVX512VNNIFLAG "${AVX512VNNIFLAG} -mtune=skylake-avx512")
endif()
unset(HAVE_CASCADE_LAKE)
endif()
endif()
elseif(MSVC)
set(AVX512VNNIFLAG "/arch:AVX512")
endif()
# Check whether compiler supports AVX512vnni intrinsics
set(CMAKE_REQUIRED_FLAGS "${AVX512VNNIFLAG} ${NATIVEFLAG}")
check_c_source_compile_or_run(
"#include <immintrin.h>
int main(void) {
__m512i x = _mm512_set1_epi8(2);
const __m512i y = _mm512_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37,
38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55,
56, 57, 58, 59, 60, 61, 62, 63, 64);
__m512i z = _mm512_setzero_epi32();
z = _mm512_dpbusd_epi32(z, x, y);
(void)z;
return 0;
}"
HAVE_AVX512VNNI_INTRIN
)
# Reset so the flags do not leak into later checks.
set(CMAKE_REQUIRED_FLAGS)
endmacro()
# check_avx2_intrinsics()
# Pick the compiler option for AVX2 and probe a _mm256_* intrinsic.
# Reads:  NATIVEFLAG
# Sets:   AVX2FLAG, HAVE_AVX2_INTRIN
# NOTE(review): check_c_source_compile_or_run is defined outside this chunk.
macro(check_avx2_intrinsics)
if(CMAKE_C_COMPILER_ID MATCHES "Intel")
if(CMAKE_HOST_UNIX OR APPLE)
set(AVX2FLAG "-mavx2")
else()
set(AVX2FLAG "/arch:AVX2")
endif()
elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
if(NOT NATIVEFLAG)
set(AVX2FLAG "-mavx2")
endif()
elseif(MSVC)
set(AVX2FLAG "/arch:AVX2")
endif()
# Check whether compiler supports AVX2 intrinsics
set(CMAKE_REQUIRED_FLAGS "${AVX2FLAG} ${NATIVEFLAG}")
check_c_source_compile_or_run(
"#include <immintrin.h>
int main(void) {
__m256i x = _mm256_set1_epi16(2);
const __m256i y = _mm256_set1_epi16(1);
x = _mm256_subs_epu16(x, y);
(void)x;
return 0;
}"
HAVE_AVX2_INTRIN
)
# Reset so the flags do not leak into later checks.
set(CMAKE_REQUIRED_FLAGS)
endmacro()
# check_neon_compiler_flag()
# Pick the compiler option for ARM NEON and check that the NEON header can be
# included with it.
# Reads:  NATIVEFLAG, ARCH
# Sets:   NEONFLAG (GNU/Clang without NATIVEFLAG only), MFPU_NEON_AVAILABLE
macro(check_neon_compiler_flag)
if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
if(NOT NATIVEFLAG)
if("${ARCH}" MATCHES "aarch64")
set(NEONFLAG "-march=armv8-a+simd")
else()
set(NEONFLAG "-mfpu=neon")
endif()
endif()
endif()
# Check whether compiler supports NEON flag
set(CMAKE_REQUIRED_FLAGS "${NEONFLAG} ${NATIVEFLAG}")
check_c_source_compiles(
"#ifdef _M_ARM64
# include <arm64_neon.h>
#else
# include <arm_neon.h>
#endif
int main() { return 0; }"
MFPU_NEON_AVAILABLE FAIL_REGEX "not supported")
# Reset so the flags do not leak into later checks.
set(CMAKE_REQUIRED_FLAGS)
endmacro()
# check_neon_ld4_intrinsics()
# Check whether the compiler provides the vld1q_s32_x4 intrinsic, i.e. can
# load four NEON vectors into a register range with one call.
# Reads:  NATIVEFLAG, ARCH
# Sets:   NEONFLAG (GNU/Clang without NATIVEFLAG only), NEON_HAS_LD4
macro(check_neon_ld4_intrinsics)
    if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
        if(NOT NATIVEFLAG)
            if("${ARCH}" MATCHES "aarch64")
                set(NEONFLAG "-march=armv8-a+simd")
            else()
                set(NEONFLAG "-mfpu=neon")
            endif()
        endif()
    endif()
    # Check whether compiler supports loading 4 neon vecs into a register range.
    # NATIVEFLAG has to be part of the probe flags like in the other checks:
    # when it is set, NEONFLAG is not chosen above and the probe would
    # otherwise be compiled without any architecture option at all.
    set(CMAKE_REQUIRED_FLAGS "${NEONFLAG} ${NATIVEFLAG}")
    check_c_source_compiles(
        "#ifdef _M_ARM64
# include <arm64_neon.h>
#else
# include <arm_neon.h>
#endif
int main(void) {
int stack_var[16];
int32x4x4_t v = vld1q_s32_x4(stack_var);
(void)v;
return 0;
}"
        NEON_HAS_LD4)
    # Reset so the flags do not leak into later checks.
    set(CMAKE_REQUIRED_FLAGS)
endmacro()
# check_pclmulqdq_intrinsics()
# Pick the compiler option for PCLMULQDQ and probe _mm_clmulepi64_si128.
# Reads:  NATIVEFLAG, ARCH
# Sets:   PCLMULFLAG (GNU/Clang without NATIVEFLAG only), HAVE_PCLMULQDQ_INTRIN
# On Apple with an ARCH matching "i386" the probe is skipped and the result is
# forced to OFF.
# NOTE(review): check_c_source_compile_or_run is defined outside this chunk.
macro(check_pclmulqdq_intrinsics)
if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
if(NOT NATIVEFLAG)
set(PCLMULFLAG "-mpclmul")
endif()
endif()
# Check whether compiler supports PCLMULQDQ intrinsics
if(NOT (APPLE AND "${ARCH}" MATCHES "i386"))
# The pclmul code currently crashes on Mac in 32bit mode. Avoid for now.
set(CMAKE_REQUIRED_FLAGS "${PCLMULFLAG} ${NATIVEFLAG}")
check_c_source_compile_or_run(
"#include <immintrin.h>
int main(void) {
__m128i a = _mm_setzero_si128();
__m128i b = _mm_setzero_si128();
__m128i c = _mm_clmulepi64_si128(a, b, 0x10);
(void)c;
return 0;
}"
HAVE_PCLMULQDQ_INTRIN
)
set(CMAKE_REQUIRED_FLAGS)
else()
set(HAVE_PCLMULQDQ_INTRIN OFF)
endif()
endmacro()
# check_vpclmulqdq_intrinsics()
# Pick the compiler options for VPCLMULQDQ and probe _mm512_clmulepi64_epi128.
# Reads:  NATIVEFLAG, ARCH
# Sets:   VPCLMULFLAG (GNU/Clang without NATIVEFLAG only), HAVE_VPCLMULQDQ_INTRIN
# On Apple with an ARCH matching "i386" the probe is skipped and the result is
# forced to OFF.
# NOTE(review): check_c_source_compile_or_run is defined outside this chunk.
macro(check_vpclmulqdq_intrinsics)
if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
if(NOT NATIVEFLAG)
set(VPCLMULFLAG "-mvpclmulqdq -mavx512f")
endif()
endif()
# Check whether compiler supports VPCLMULQDQ intrinsics
if(NOT (APPLE AND "${ARCH}" MATCHES "i386"))
set(CMAKE_REQUIRED_FLAGS "${VPCLMULFLAG} ${NATIVEFLAG}")
check_c_source_compile_or_run(
"#include <immintrin.h>
int main(void) {
__m512i a = _mm512_setzero_si512();
__m512i b = _mm512_setzero_si512();
__m512i c = _mm512_clmulepi64_epi128(a, b, 0x10);
(void)c;
return 0;
}"
HAVE_VPCLMULQDQ_INTRIN
)
set(CMAKE_REQUIRED_FLAGS)
else()
set(HAVE_VPCLMULQDQ_INTRIN OFF)
endif()
endmacro()
# check_ppc_intrinsics()
# Probe PowerPC AltiVec support in three steps.
# Reads:  NATIVEFLAG
# Sets:   HAVE_ALTIVEC - AltiVec code compiles with -maltivec
#         HAVE_NOVSX   - the same code compiles with -maltivec -mno-vsx
#         PPCFLAGS     - "-maltivec" and/or "-mno-vsx" according to the above
#         HAVE_VMX     - the hwcap query for PPC_FEATURE_HAS_ALTIVEC compiles
macro(check_ppc_intrinsics)
# Check if compiler supports AltiVec
set(CMAKE_REQUIRED_FLAGS "-maltivec")
check_c_source_compiles(
"#include <altivec.h>
int main(void)
{
vector int a = vec_splats(0);
vector int b = vec_splats(0);
a = vec_add(a, b);
return 0;
}"
HAVE_ALTIVEC
)
set(CMAKE_REQUIRED_FLAGS)
if(HAVE_ALTIVEC)
set(PPCFLAGS "-maltivec")
endif()
# Same probe again with VSX switched off.
set(CMAKE_REQUIRED_FLAGS "-maltivec -mno-vsx")
check_c_source_compiles(
"#include <altivec.h>
int main(void)
{
vector int a = vec_splats(0);
vector int b = vec_splats(0);
a = vec_add(a, b);
return 0;
}"
HAVE_NOVSX
)
set(CMAKE_REQUIRED_FLAGS)
if(HAVE_NOVSX)
set(PPCFLAGS "${PPCFLAGS} -mno-vsx")
endif()
# Check if we have what we need for AltiVec optimizations
set(CMAKE_REQUIRED_FLAGS "${PPCFLAGS} ${NATIVEFLAG}")
check_c_source_compiles(
"#include <sys/auxv.h>
#ifdef __FreeBSD__
#include <machine/cpu.h>
#endif
int main() {
#ifdef __FreeBSD__
unsigned long hwcap;
elf_aux_info(AT_HWCAP, &hwcap, sizeof(hwcap));
return (hwcap & PPC_FEATURE_HAS_ALTIVEC);
#else
return (getauxval(AT_HWCAP) & PPC_FEATURE_HAS_ALTIVEC);
#endif
}"
HAVE_VMX
)
set(CMAKE_REQUIRED_FLAGS)
endmacro()
# check_power8_intrinsics()
# Pick the compiler option for POWER8 and check that the hwcap query for
# PPC_FEATURE2_ARCH_2_07 compiles.
# Reads:  NATIVEFLAG
# Sets:   POWER8FLAG (GNU/Clang without NATIVEFLAG only), HAVE_POWER8_INTRIN
macro(check_power8_intrinsics)
if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
if(NOT NATIVEFLAG)
set(POWER8FLAG "-mcpu=power8")
endif()
endif()
# Check if we have what we need for POWER8 optimizations
set(CMAKE_REQUIRED_FLAGS "${POWER8FLAG} ${NATIVEFLAG}")
check_c_source_compiles(
"#include <sys/auxv.h>
#ifdef __FreeBSD__
#include <machine/cpu.h>
#endif
int main() {
#ifdef __FreeBSD__
unsigned long hwcap;
elf_aux_info(AT_HWCAP2, &hwcap, sizeof(hwcap));
return (hwcap & PPC_FEATURE2_ARCH_2_07);
#else
return (getauxval(AT_HWCAP2) & PPC_FEATURE2_ARCH_2_07);
#endif
}"
HAVE_POWER8_INTRIN
)
# Reset so the flags do not leak into later checks.
set(CMAKE_REQUIRED_FLAGS)
endmacro()
# check_rvv_intrinsics()
# Pick the compiler option for the RISC-V vector extension and check that
# <riscv_vector.h> can be included with it.
# Reads:  NATIVEFLAG
# Sets:   RISCVFLAG (GNU/Clang without NATIVEFLAG only), HAVE_RVV_INTRIN
macro(check_rvv_intrinsics)
if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
if(NOT NATIVEFLAG)
set(RISCVFLAG "-march=rv64gcv")
endif()
endif()
# Check whether compiler supports RVV
set(CMAKE_REQUIRED_FLAGS "${RISCVFLAG} ${NATIVEFLAG}")
check_c_source_compiles(
"#include <riscv_vector.h>
int main() {
return 0;
}"
HAVE_RVV_INTRIN
)
# Reset so the flags do not leak into later checks.
set(CMAKE_REQUIRED_FLAGS)
endmacro()
# check_s390_intrinsics()
# Check that the hwcap query for the s390 vector facility compiles.
# The snippet maps HWCAP_S390_VXRS onto HWCAP_S390_VX when the former is not
# defined by the headers. No compiler flags are set for this probe.
# Sets:   HAVE_S390_INTRIN
macro(check_s390_intrinsics)
check_c_source_compiles(
"#include <sys/auxv.h>
#ifndef HWCAP_S390_VXRS
#define HWCAP_S390_VXRS HWCAP_S390_VX
#endif
int main() {
return (getauxval(AT_HWCAP) & HWCAP_S390_VXRS);
}"
HAVE_S390_INTRIN
)
endmacro()
# check_power9_intrinsics()
# Pick the compiler option for POWER9 and check that a trivial program
# compiles with it.
# Reads:  NATIVEFLAG
# Sets:   POWER9FLAG (GNU/Clang without NATIVEFLAG only), HAVE_POWER9_INTRIN
macro(check_power9_intrinsics)
if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
if(NOT NATIVEFLAG)
set(POWER9FLAG "-mcpu=power9")
endif()
endif()
# Check if we have what we need for POWER9 optimizations
set(CMAKE_REQUIRED_FLAGS "${POWER9FLAG} ${NATIVEFLAG}")
check_c_source_compiles(
"int main() {
return 0;
}"
HAVE_POWER9_INTRIN
)
# Reset so the flags do not leak into later checks.
set(CMAKE_REQUIRED_FLAGS)
endmacro()
# check_sse2_intrinsics()
# Pick the compiler option for SSE2 and probe _mm_setzero_si128.
# Reads:  NATIVEFLAG, ARCH
# Sets:   SSE2FLAG, HAVE_SSE2_INTRIN
# NOTE(review): check_c_source_compile_or_run is defined outside this chunk.
macro(check_sse2_intrinsics)
if(CMAKE_C_COMPILER_ID MATCHES "Intel")
if(CMAKE_HOST_UNIX OR APPLE)
set(SSE2FLAG "-msse2")
else()
set(SSE2FLAG "/arch:SSE2")
endif()
elseif(MSVC)
# With MSVC the option is only set when ARCH does not match x86_64.
if(NOT "${ARCH}" MATCHES "x86_64")
set(SSE2FLAG "/arch:SSE2")
endif()
elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
if(NOT NATIVEFLAG)
set(SSE2FLAG "-msse2")
endif()
endif()
# Check whether compiler supports SSE2 intrinsics
set(CMAKE_REQUIRED_FLAGS "${SSE2FLAG} ${NATIVEFLAG}")
check_c_source_compile_or_run(
"#include <immintrin.h>
int main(void) {
__m128i zero = _mm_setzero_si128();
(void)zero;
return 0;
}"
HAVE_SSE2_INTRIN
)
# Reset so the flags do not leak into later checks.
set(CMAKE_REQUIRED_FLAGS)
endmacro()
# check_ssse3_intrinsics()
# Pick the compiler option for SSSE3 and probe _mm_hadd_epi32.
# Reads:  NATIVEFLAG
# Sets:   SSSE3FLAG, HAVE_SSSE3_INTRIN
# Leaves CMAKE_REQUIRED_FLAGS empty, like the other checks in this file.
macro(check_ssse3_intrinsics)
    if(CMAKE_C_COMPILER_ID MATCHES "Intel")
        if(CMAKE_HOST_UNIX OR APPLE)
            set(SSSE3FLAG "-mssse3")
        else()
            set(SSSE3FLAG "/arch:SSSE3")
        endif()
    elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
        if(NOT NATIVEFLAG)
            set(SSSE3FLAG "-mssse3")
        endif()
    endif()
    # Check whether compiler supports SSSE3 intrinsics
    set(CMAKE_REQUIRED_FLAGS "${SSSE3FLAG} ${NATIVEFLAG}")
    check_c_source_compile_or_run(
        "#include <immintrin.h>
int main(void) {
__m128i u, v, w;
u = _mm_set1_epi32(1);
v = _mm_set1_epi32(2);
w = _mm_hadd_epi32(u, v);
(void)w;
return 0;
}"
        HAVE_SSSE3_INTRIN
    )
    # Reset the probe flags. Without this the SSSE3 option stays in
    # CMAKE_REQUIRED_FLAGS and silently takes part in every later check that
    # does not set the variable itself.
    set(CMAKE_REQUIRED_FLAGS)
endmacro()
# check_sse42_intrinsics()
# Pick the compiler option for SSE4.2 and probe the crc32 instruction in two
# forms.
# Reads:  NATIVEFLAG
# Sets:   SSE42FLAG
#         HAVE_SSE42CRC_INLINE_ASM - crc32 usable through inline assembly
#         HAVE_SSE42CRC_INTRIN     - crc32 usable through an intrinsic/builtin
# NOTE(review): check_c_source_compile_or_run is defined outside this chunk.
macro(check_sse42_intrinsics)
if(CMAKE_C_COMPILER_ID MATCHES "Intel")
if(CMAKE_HOST_UNIX OR APPLE)
set(SSE42FLAG "-msse4.2")
else()
set(SSE42FLAG "/arch:SSE4.2")
endif()
elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
if(NOT NATIVEFLAG)
set(SSE42FLAG "-msse4.2")
endif()
endif()
# Check whether compiler supports SSE4.2 CRC inline asm
set(CMAKE_REQUIRED_FLAGS "${SSE42FLAG} ${NATIVEFLAG}")
check_c_source_compile_or_run(
"int main(void) {
unsigned val = 0, h = 0;
#if defined(_MSC_VER)
{ __asm mov edx, h __asm mov eax, val __asm crc32 eax, edx __asm mov h, eax }
#else
__asm__ __volatile__ ( \"crc32 %1,%0\" : \"+r\" (h) : \"r\" (val) );
#endif
return (int)h;
}"
HAVE_SSE42CRC_INLINE_ASM
)
# Check whether compiler supports SSE4.2 CRC intrinsics
# (runs with the same CMAKE_REQUIRED_FLAGS as the probe above)
check_c_source_compile_or_run(
"#include <immintrin.h>
int main(void) {
unsigned crc = 0;
char c = 'c';
#if defined(_MSC_VER)
crc = _mm_crc32_u32(crc, c);
#else
crc = __builtin_ia32_crc32qi(crc, c);
#endif
(void)crc;
return 0;
}"
HAVE_SSE42CRC_INTRIN
)
# Reset so the flags do not leak into later checks.
set(CMAKE_REQUIRED_FLAGS)
endmacro()
# check_vgfma_intrinsics()
# Pick the compiler options for the z13 vector facility and probe
# vec_gfmsum_accum_128.
# Reads:  NATIVEFLAG
# Sets:   VGFMAFLAG (without NATIVEFLAG only), HAVE_VGFMA_INTRIN
macro(check_vgfma_intrinsics)
if(NOT NATIVEFLAG)
set(VGFMAFLAG "-march=z13")
# Compiler-specific additions: -mzarch for GNU, -fzvector for Clang.
if(CMAKE_C_COMPILER_ID MATCHES "GNU")
set(VGFMAFLAG "${VGFMAFLAG} -mzarch")
endif()
if(CMAKE_C_COMPILER_ID MATCHES "Clang")
set(VGFMAFLAG "${VGFMAFLAG} -fzvector")
endif()
endif()
# Check whether compiler supports "VECTOR GALOIS FIELD MULTIPLY SUM AND ACCUMULATE" intrinsic
set(CMAKE_REQUIRED_FLAGS "${VGFMAFLAG} ${NATIVEFLAG}")
check_c_source_compiles(
"#include <vecintrin.h>
int main(void) {
unsigned long long a __attribute__((vector_size(16))) = { 0 };
unsigned long long b __attribute__((vector_size(16))) = { 0 };
unsigned char c __attribute__((vector_size(16))) = { 0 };
c = vec_gfmsum_accum_128(a, b, c);
return c[0];
}"
HAVE_VGFMA_INTRIN FAIL_REGEX "not supported")
# Reset so the flags do not leak into later checks.
set(CMAKE_REQUIRED_FLAGS)
endmacro()
# check_xsave_intrinsics()
# Pick the compiler option for XSAVE and check that _xgetbv is available.
# Reads:  NATIVEFLAG
# Sets:   XSAVEFLAG (non-MSVC without NATIVEFLAG only), HAVE_XSAVE_INTRIN
macro(check_xsave_intrinsics)
if(NOT NATIVEFLAG AND NOT MSVC)
set(XSAVEFLAG "-mxsave")
endif()
set(CMAKE_REQUIRED_FLAGS "${XSAVEFLAG} ${NATIVEFLAG}")
check_c_source_compiles(
"#ifdef _WIN32
# include <intrin.h>
#else
# include <x86gprintrin.h>
#endif
int main(void) {
return _xgetbv(0);
}"
HAVE_XSAVE_INTRIN FAIL_REGEX "not supported")
# Reset so the flags do not leak into later checks.
set(CMAKE_REQUIRED_FLAGS)
endmacro()

View File

@@ -0,0 +1,166 @@
# detect-sanitizer.cmake -- Detect supported compiler sanitizer flags
# Licensed under the Zlib license, see LICENSE.md for details
# Add the compile/link options shared by every sanitizer configuration:
# -g3 on GNU/Clang, plus -fno-omit-frame-pointer and
# -fno-optimize-sibling-calls when the compiler accepts them.
macro(add_common_sanitizer_flags)
    if(CMAKE_C_COMPILER_ID MATCHES "GNU|Clang")
        add_compile_options(-g3)
    endif()

    # Each optional flag is probed first and only applied (to both the compile
    # and the link step) when the probe succeeds.
    check_c_compiler_flag(-fno-omit-frame-pointer HAVE_NO_OMIT_FRAME_POINTER)
    if(HAVE_NO_OMIT_FRAME_POINTER)
        add_compile_options(-fno-omit-frame-pointer)
        add_link_options(-fno-omit-frame-pointer)
    endif()

    check_c_compiler_flag(-fno-optimize-sibling-calls HAVE_NO_OPTIMIZE_SIBLING_CALLS)
    if(HAVE_NO_OPTIMIZE_SIBLING_CALLS)
        add_compile_options(-fno-optimize-sibling-calls)
        add_link_options(-fno-optimize-sibling-calls)
    endif()
endmacro()
# check_sanitizer_support(<known_checks> <supported_checks>)
#
#   known_checks     - list of -fsanitize= check names to try, in order
#   supported_checks - name of the variable that receives the comma-separated
#                      list of checks that compiled successfully
#
# Checks are tried cumulatively: each candidate is compiled together with all
# checks accepted so far, and is kept only if that compilation succeeds.
# The per-check result is stored in HAVE_SANITIZER_<check>.
macro(check_sanitizer_support known_checks supported_checks)
    set(available_checks "")

    foreach(check ${known_checks})
        # Candidate flag list = everything accepted so far + the new check
        if(NOT available_checks STREQUAL "")
            set(compile_checks "${available_checks},${check}")
        else()
            set(compile_checks "${check}")
        endif()

        set(CMAKE_REQUIRED_FLAGS -fsanitize=${compile_checks})
        check_c_source_compiles(
            "int main() { return 0; }"
            HAVE_SANITIZER_${check}
            FAIL_REGEX "not supported|unrecognized command|unknown option"
        )
        unset(CMAKE_REQUIRED_FLAGS)

        # Promote the candidate list only when the compiler accepted it
        if(HAVE_SANITIZER_${check})
            set(available_checks ${compile_checks})
        endif()
    endforeach()

    set(${supported_checks} ${available_checks})
endmacro()
# Enable the address sanitizer (together with the pointer-compare and
# pointer-subtract checks where the compiler accepts them) and, unless a
# cross-compiling emulator is configured, the leak sanitizer.
macro(add_address_sanitizer)
    set(known_checks
        address
        pointer-compare
        pointer-subtract
    )
    check_sanitizer_support("${known_checks}" supported_checks)
    # The expansion is quoted so that an empty result is compared as an empty
    # string instead of vanishing from the condition.
    if(NOT "${supported_checks}" STREQUAL "")
        message(STATUS "Address sanitizer is enabled: ${supported_checks}")
        add_compile_options(-fsanitize=${supported_checks})
        add_link_options(-fsanitize=${supported_checks})
        add_common_sanitizer_flags()
    else()
        message(STATUS "Address sanitizer is not supported")
    endif()

    if(CMAKE_CROSSCOMPILING_EMULATOR)
        # Only check for leak sanitizer if not cross-compiling due to qemu crash
        message(WARNING "Leak sanitizer is not supported when cross compiling")
    else()
        # Leak sanitizer requires address sanitizer
        check_sanitizer_support("leak" supported_checks)
        if(NOT "${supported_checks}" STREQUAL "")
            message(STATUS "Leak sanitizer is enabled: ${supported_checks}")
            add_compile_options(-fsanitize=${supported_checks})
            add_link_options(-fsanitize=${supported_checks})
            add_common_sanitizer_flags()
        else()
            message(STATUS "Leak sanitizer is not supported")
        endif()
    endif()
endmacro()
# Enable the memory sanitizer when the compiler supports it, and additionally
# -fsanitize-memory-track-origins when that flag is accepted.
macro(add_memory_sanitizer)
    check_sanitizer_support("memory" supported_checks)
    # The expansion is quoted so that an empty result is compared as an empty
    # string instead of vanishing from the condition.
    if(NOT "${supported_checks}" STREQUAL "")
        message(STATUS "Memory sanitizer is enabled: ${supported_checks}")
        add_compile_options(-fsanitize=${supported_checks})
        add_link_options(-fsanitize=${supported_checks})
        add_common_sanitizer_flags()

        # Origin tracking is optional; apply it only if the flag is accepted
        check_c_compiler_flag(-fsanitize-memory-track-origins HAVE_MEMORY_TRACK_ORIGINS)
        if(HAVE_MEMORY_TRACK_ORIGINS)
            add_compile_options(-fsanitize-memory-track-origins)
            add_link_options(-fsanitize-memory-track-origins)
        endif()
    else()
        message(STATUS "Memory sanitizer is not supported")
    endif()
endmacro()
# Enable the thread sanitizer when the compiler supports it.
macro(add_thread_sanitizer)
    check_sanitizer_support("thread" supported_checks)
    # The expansion is quoted so that an empty result is compared as an empty
    # string instead of vanishing from the condition.
    if(NOT "${supported_checks}" STREQUAL "")
        message(STATUS "Thread sanitizer is enabled: ${supported_checks}")
        add_compile_options(-fsanitize=${supported_checks})
        add_link_options(-fsanitize=${supported_checks})
        add_common_sanitizer_flags()
    else()
        message(STATUS "Thread sanitizer is not supported")
    endif()
endmacro()
# Enable every undefined-behavior sanitizer check from the list below that the
# compiler accepts. The alignment check is only attempted when WITH_UNALIGNED
# is off, and object-size only when CMAKE_C_FLAGS does not contain -O0.
macro(add_undefined_sanitizer)
    set(known_checks
        array-bounds
        bool
        bounds
        builtin
        enum
        float-cast-overflow
        float-divide-by-zero
        function
        integer-divide-by-zero
        local-bounds
        null
        nonnull-attribute
        pointer-overflow
        return
        returns-nonnull-attribute
        shift
        shift-base
        shift-exponent
        signed-integer-overflow
        undefined
        unsigned-integer-overflow
        unsigned-shift-base
        vla-bound
        vptr
    )
    # Only check for alignment sanitizer flag if unaligned access is not supported
    if(NOT WITH_UNALIGNED)
        list(APPEND known_checks alignment)
    endif()
    # Object size sanitizer has no effect at -O0 and produces compiler warning if enabled
    if(NOT CMAKE_C_FLAGS MATCHES "-O0")
        list(APPEND known_checks object-size)
    endif()

    check_sanitizer_support("${known_checks}" supported_checks)
    # The expansion is quoted so that an empty result is compared as an empty
    # string instead of vanishing from the condition.
    if(NOT "${supported_checks}" STREQUAL "")
        message(STATUS "Undefined behavior sanitizer is enabled: ${supported_checks}")
        add_compile_options(-fsanitize=${supported_checks})
        add_link_options(-fsanitize=${supported_checks})
        # Group sanitizer flag -fsanitize=undefined will automatically add alignment, even if
        # it is not in our sanitize flag list, so we need to explicitly disable alignment sanitizing.
        if(WITH_UNALIGNED)
            add_compile_options(-fno-sanitize=alignment)
        endif()
        add_common_sanitizer_flags()
    else()
        message(STATUS "Undefined behavior sanitizer is not supported")
    endif()
endmacro()

View File

@@ -0,0 +1,19 @@
# fallback-macros.cmake -- CMake fallback macros
# Copyright (C) 2022 Nathan Moinvaziri
# Licensed under the Zlib license, see LICENSE.md for details
# CMake less than version 3.5.2
if(NOT COMMAND add_compile_options)
    # Fallback used only when the running CMake has no add_compile_options()
    # command: append every given option to the global C and C++ flag strings.
    macro(add_compile_options options)
        # ARGV covers all arguments (including `options`). Each option is
        # prefixed with a space so it does not fuse with the flag before it.
        foreach(fallback_compile_option ${ARGV})
            string(APPEND CMAKE_C_FLAGS " ${fallback_compile_option}")
            string(APPEND CMAKE_CXX_FLAGS " ${fallback_compile_option}")
        endforeach()
    endmacro()
endif()
# CMake less than version 3.14
if(NOT COMMAND add_link_options)
    # Fallback used only when the running CMake has no add_link_options()
    # command: append every given option to the executable and shared-library
    # linker flag strings.
    macro(add_link_options options)
        # ARGV covers all arguments (including `options`). Each option is
        # prefixed with a space so it does not fuse with the flag before it.
        foreach(fallback_link_option ${ARGV})
            string(APPEND CMAKE_EXE_LINKER_FLAGS " ${fallback_link_option}")
            string(APPEND CMAKE_SHARED_LINKER_FLAGS " ${fallback_link_option}")
        endforeach()
    endmacro()
endif()

View File

@@ -0,0 +1,24 @@
# Toolchain description: Linux / aarch64, GNU cross-compiler, qemu emulator.

# Target system
set(CMAKE_SYSTEM_NAME "Linux")
set(CMAKE_SYSTEM_PROCESSOR "aarch64")
set(CMAKE_SYSTEM_VERSION 1)

# Target triple used for both the C and C++ compilers
set(CMAKE_C_COMPILER_TARGET "aarch64-linux-gnu")
set(CMAKE_CXX_COMPILER_TARGET "aarch64-linux-gnu")

# Cross-compiling, with binaries executed through qemu
set(CMAKE_CROSSCOMPILING TRUE)
set(CMAKE_CROSSCOMPILING_EMULATOR qemu-aarch64 -L /usr/${CMAKE_C_COMPILER_TARGET}/)

# Search-mode settings for find_program / find_library / find_path
set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)

# The C cross-compiler is mandatory
find_program(C_COMPILER_FULL_PATH NAMES ${CMAKE_C_COMPILER_TARGET}-gcc)
if(C_COMPILER_FULL_PATH)
    set(CMAKE_C_COMPILER ${C_COMPILER_FULL_PATH})
else()
    message(FATAL_ERROR "Cross-compiler for ${CMAKE_C_COMPILER_TARGET} not found")
endif()

# The C++ cross-compiler is optional
find_program(CXX_COMPILER_FULL_PATH
    NAMES
        g++-${CMAKE_CXX_COMPILER_TARGET}
        ${CMAKE_CXX_COMPILER_TARGET}-g++
)
if(CXX_COMPILER_FULL_PATH)
    set(CMAKE_CXX_COMPILER ${CXX_COMPILER_FULL_PATH})
endif()

29
deps/zlib-ng/cmake/toolchain-arm.cmake vendored Normal file
View File

@@ -0,0 +1,29 @@
# Toolchain description: Linux / arm, GNU cross-compiler, qemu emulator.
# The target triple defaults to arm-linux-gnueabi but may be preset by the user.

# Target system
set(CMAKE_SYSTEM_NAME "Linux")
set(CMAKE_SYSTEM_PROCESSOR "arm")
set(CMAKE_SYSTEM_VERSION 1)

# Default target triples, applied only when not already defined
if(NOT DEFINED CMAKE_C_COMPILER_TARGET)
    set(CMAKE_C_COMPILER_TARGET "arm-linux-gnueabi")
endif()
if(NOT DEFINED CMAKE_CXX_COMPILER_TARGET)
    set(CMAKE_CXX_COMPILER_TARGET "arm-linux-gnueabi")
endif()

# Cross-compiling, with binaries executed through qemu
set(CMAKE_CROSSCOMPILING TRUE)
set(CMAKE_CROSSCOMPILING_EMULATOR qemu-arm -L /usr/${CMAKE_C_COMPILER_TARGET}/)

# Search-mode settings for find_program / find_library / find_path / find_package
set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY)

# The C cross-compiler is mandatory
find_program(C_COMPILER_FULL_PATH NAMES ${CMAKE_C_COMPILER_TARGET}-gcc)
if(C_COMPILER_FULL_PATH)
    set(CMAKE_C_COMPILER ${C_COMPILER_FULL_PATH})
else()
    message(FATAL_ERROR "Cross-compiler for ${CMAKE_C_COMPILER_TARGET} not found")
endif()

# The C++ cross-compiler is optional
find_program(CXX_COMPILER_FULL_PATH
    NAMES
        g++-${CMAKE_CXX_COMPILER_TARGET}
        ${CMAKE_CXX_COMPILER_TARGET}-g++
)
if(CXX_COMPILER_FULL_PATH)
    set(CMAKE_CXX_COMPILER ${CXX_COMPILER_FULL_PATH})
endif()

View File

@@ -0,0 +1,25 @@
# Toolchain description: Linux / arm (arm-linux-gnueabihf triple), GNU
# cross-compiler, qemu emulator.

# Target system
set(CMAKE_SYSTEM_NAME "Linux")
set(CMAKE_SYSTEM_PROCESSOR "arm")
set(CMAKE_SYSTEM_VERSION 1)

# Target triple used for both the C and C++ compilers
set(CMAKE_C_COMPILER_TARGET "arm-linux-gnueabihf")
set(CMAKE_CXX_COMPILER_TARGET "arm-linux-gnueabihf")

# Cross-compiling, with binaries executed through qemu
set(CMAKE_CROSSCOMPILING TRUE)
set(CMAKE_CROSSCOMPILING_EMULATOR qemu-arm -L /usr/${CMAKE_C_COMPILER_TARGET}/)

# Search-mode settings for find_program / find_library / find_path / find_package
set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY)

# The C cross-compiler is mandatory
find_program(C_COMPILER_FULL_PATH NAMES ${CMAKE_C_COMPILER_TARGET}-gcc)
if(C_COMPILER_FULL_PATH)
    set(CMAKE_C_COMPILER ${C_COMPILER_FULL_PATH})
else()
    message(FATAL_ERROR "Cross-compiler for ${CMAKE_C_COMPILER_TARGET} not found")
endif()

# The C++ cross-compiler is optional
find_program(CXX_COMPILER_FULL_PATH
    NAMES
        g++-${CMAKE_CXX_COMPILER_TARGET}
        ${CMAKE_CXX_COMPILER_TARGET}-g++
)
if(CXX_COMPILER_FULL_PATH)
    set(CMAKE_CXX_COMPILER ${CXX_COMPILER_FULL_PATH})
endif()

View File

@@ -0,0 +1,35 @@
# Toolchain description: Windows target built with the i686-w64-mingw32
# cross tools, binaries executed through wine.

# Target system
set(CMAKE_SYSTEM_NAME "Windows")

# Tool prefix shared by the C, C++ and resource compilers
set(CMAKE_C_COMPILER_TARGET "i686-w64-mingw32")
set(CMAKE_CXX_COMPILER_TARGET "i686-w64-mingw32")
set(CMAKE_RC_COMPILER_TARGET "i686-w64-mingw32")

# Cross-compiling, with binaries executed through wine
set(CMAKE_CROSSCOMPILING TRUE)
set(CMAKE_CROSSCOMPILING_EMULATOR wine)

# Root path and search-mode settings for find_* commands
set(CMAKE_FIND_ROOT_PATH "/usr/i686-w64-mingw32")
set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)

# Prefer posix gcc variant for gtest pthread support
find_program(C_COMPILER_FULL_PATH
    NAMES
        ${CMAKE_C_COMPILER_TARGET}-gcc-posix
        ${CMAKE_C_COMPILER_TARGET}-gcc
)
# The C cross-compiler is mandatory
if(C_COMPILER_FULL_PATH)
    set(CMAKE_C_COMPILER ${C_COMPILER_FULL_PATH})
else()
    message(FATAL_ERROR "Cross-compiler for ${CMAKE_C_COMPILER_TARGET} not found")
endif()

# The C++ cross-compiler is optional (posix variant tried first)
find_program(CXX_COMPILER_FULL_PATH
    NAMES
        ${CMAKE_CXX_COMPILER_TARGET}-g++-posix
        ${CMAKE_CXX_COMPILER_TARGET}-g++
)
if(CXX_COMPILER_FULL_PATH)
    set(CMAKE_CXX_COMPILER ${CXX_COMPILER_FULL_PATH})
endif()

# The resource compiler is optional
find_program(RC_COMPILER_FULL_PATH NAMES ${CMAKE_RC_COMPILER_TARGET}-windres)
if(RC_COMPILER_FULL_PATH)
    set(CMAKE_RC_COMPILER ${RC_COMPILER_FULL_PATH})
endif()

View File

@@ -0,0 +1,34 @@
# Toolchain description: Windows target built with the x86_64-w64-mingw32
# cross tools, binaries executed through wine.

# Target system
set(CMAKE_SYSTEM_NAME "Windows")

# Tool prefix shared by the C, C++ and resource compilers
set(CMAKE_C_COMPILER_TARGET "x86_64-w64-mingw32")
set(CMAKE_CXX_COMPILER_TARGET "x86_64-w64-mingw32")
set(CMAKE_RC_COMPILER_TARGET "x86_64-w64-mingw32")

# Cross-compiling, with binaries executed through wine
set(CMAKE_CROSSCOMPILING TRUE)
set(CMAKE_CROSSCOMPILING_EMULATOR wine)

# Root path and search-mode settings for find_* commands
set(CMAKE_FIND_ROOT_PATH "/usr/x86_64-w64-mingw32")
set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)

# Prefer posix gcc variant for gtest pthread support
find_program(C_COMPILER_FULL_PATH
    NAMES
        ${CMAKE_C_COMPILER_TARGET}-gcc-posix
        ${CMAKE_C_COMPILER_TARGET}-gcc
)
# The C cross-compiler is mandatory
if(C_COMPILER_FULL_PATH)
    set(CMAKE_C_COMPILER ${C_COMPILER_FULL_PATH})
else()
    message(FATAL_ERROR "Cross-compiler for ${CMAKE_C_COMPILER_TARGET} not found")
endif()

# The C++ cross-compiler is optional (posix variant tried first)
find_program(CXX_COMPILER_FULL_PATH
    NAMES
        ${CMAKE_CXX_COMPILER_TARGET}-g++-posix
        ${CMAKE_CXX_COMPILER_TARGET}-g++
)
if(CXX_COMPILER_FULL_PATH)
    set(CMAKE_CXX_COMPILER ${CXX_COMPILER_FULL_PATH})
endif()

# The resource compiler is optional
find_program(RC_COMPILER_FULL_PATH NAMES ${CMAKE_RC_COMPILER_TARGET}-windres)
if(RC_COMPILER_FULL_PATH)
    set(CMAKE_RC_COMPILER ${RC_COMPILER_FULL_PATH})
endif()

29
deps/zlib-ng/cmake/toolchain-mips.cmake vendored Normal file
View File

@@ -0,0 +1,29 @@
# Toolchain description: Linux / mips, GNU cross-compiler, qemu emulator.
# The target triple defaults to mips-linux-gnu but may be preset by the user.

# Target system
set(CMAKE_SYSTEM_NAME "Linux")
set(CMAKE_SYSTEM_PROCESSOR "mips")
set(CMAKE_SYSTEM_VERSION 1)

# Default target triples, applied only when not already defined
if(NOT DEFINED CMAKE_C_COMPILER_TARGET)
    set(CMAKE_C_COMPILER_TARGET "mips-linux-gnu")
endif()
if(NOT DEFINED CMAKE_CXX_COMPILER_TARGET)
    set(CMAKE_CXX_COMPILER_TARGET "mips-linux-gnu")
endif()

# Cross-compiling, with binaries executed through qemu
set(CMAKE_CROSSCOMPILING TRUE)
set(CMAKE_CROSSCOMPILING_EMULATOR qemu-mips -L /usr/${CMAKE_C_COMPILER_TARGET}/)

# Search-mode settings for find_program / find_library / find_path / find_package
set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY)

# The C cross-compiler is mandatory
find_program(C_COMPILER_FULL_PATH NAMES ${CMAKE_C_COMPILER_TARGET}-gcc)
if(C_COMPILER_FULL_PATH)
    set(CMAKE_C_COMPILER ${C_COMPILER_FULL_PATH})
else()
    message(FATAL_ERROR "Cross-compiler for ${CMAKE_C_COMPILER_TARGET} not found")
endif()

# The C++ cross-compiler is optional
find_program(CXX_COMPILER_FULL_PATH
    NAMES
        g++-${CMAKE_CXX_COMPILER_TARGET}
        ${CMAKE_CXX_COMPILER_TARGET}-g++
)
if(CXX_COMPILER_FULL_PATH)
    set(CMAKE_CXX_COMPILER ${CXX_COMPILER_FULL_PATH})
endif()

Some files were not shown because too many files have changed in this diff Show More