mirror of
https://github.com/libgit2/libgit2.git
synced 2026-06-22 06:26:26 +00:00
zlib-ng
This commit is contained in:
@@ -38,7 +38,7 @@ option(USE_GSSAPI "Link with libgssapi for SPNEGO auth" OFF)
|
||||
set(USE_HTTP_PARSER "" CACHE STRING "Specifies the HTTP Parser implementation; either system or builtin.")
|
||||
# set(USE_XDIFF "" CACHE STRING "Specifies the xdiff implementation; either system or builtin.")
|
||||
set(REGEX_BACKEND "" CACHE STRING "Regular expression implementation. One of regcomp_l, pcre2, pcre, regcomp, or builtin.")
|
||||
option(USE_BUNDLED_ZLIB "Use the bundled version of zlib. Can be set to one of Bundled(ON)/Chromium. The Chromium option requires a x86_64 processor with SSE4.2 and CLMUL" OFF)
|
||||
# Selects which zlib implementation is built in. The value is matched later
# against "ON"/"Bundled", "Chromium" and "zlib-ng"; "OFF" or an empty value
# triggers a search for the system zlib first.
set(USE_BUNDLED_ZLIB "" CACHE STRING "Use the bundled version of zlib. Can be set to one of Bundled(ON)/Chromium/zlib-ng. The Chromium option requires a x86_64 processor with SSE4.2 and CLMUL")
|
||||
|
||||
# Debugging options
|
||||
option(USE_LEAK_CHECKER "Run tests with leak checker" OFF)
|
||||
|
||||
@@ -2,11 +2,11 @@
|
||||
include(SanitizeBool)
|
||||
|
||||
SanitizeBool(USE_BUNDLED_ZLIB)
|
||||
if(USE_BUNDLED_ZLIB STREQUAL ON)
|
||||
if(USE_BUNDLED_ZLIB STREQUAL "ON")
|
||||
set(USE_BUNDLED_ZLIB "Bundled")
|
||||
endif()
|
||||
|
||||
if(USE_BUNDLED_ZLIB STREQUAL "OFF")
|
||||
if(USE_BUNDLED_ZLIB STREQUAL "OFF" OR USE_BUNDLED_ZLIB STREQUAL "")
|
||||
find_package(ZLIB)
|
||||
if(ZLIB_FOUND)
|
||||
list(APPEND LIBGIT2_SYSTEM_INCLUDES ${ZLIB_INCLUDE_DIRS})
|
||||
@@ -17,16 +17,26 @@ if(USE_BUNDLED_ZLIB STREQUAL "OFF")
|
||||
list(APPEND LIBGIT2_PC_REQUIRES "zlib")
|
||||
endif()
|
||||
add_feature_info(zlib ON "using system zlib")
|
||||
elseif(USE_BUNDLED_ZLIB STREQUAL "OFF")
|
||||
message(FATAL_ERROR "zlib was not found")
|
||||
else()
|
||||
message(STATUS "zlib was not found; using bundled 3rd-party sources." )
|
||||
message(WARNING "zlib was not found; using bundled 3rd-party sources." )
|
||||
set(USE_BUNDLED_ZLIB "Bundled")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if(USE_BUNDLED_ZLIB STREQUAL "Chromium")
|
||||
add_subdirectory("${PROJECT_SOURCE_DIR}/deps/chromium-zlib" "${PROJECT_BINARY_DIR}/deps/chromium-zlib")
|
||||
list(APPEND LIBGIT2_DEPENDENCY_INCLUDES "${PROJECT_SOURCE_DIR}/deps/chromium-zlib")
|
||||
list(APPEND LIBGIT2_DEPENDENCY_OBJECTS $<TARGET_OBJECTS:chromium_zlib>)
|
||||
add_feature_info(zlib ON "using (Chromium) bundled zlib")
|
||||
elseif(USE_BUNDLED_ZLIB OR NOT ZLIB_FOUND)
|
||||
elseif(USE_BUNDLED_ZLIB STREQUAL "zlib-ng")
|
||||
add_subdirectory("${PROJECT_SOURCE_DIR}/deps/zlib-ng" "${PROJECT_BINARY_DIR}/deps/zlib-ng")
|
||||
# Make the vendored zlib-ng headers visible to libgit2. The directory is
# appended a single time; repeating the entry only duplicates the include path.
list(APPEND LIBGIT2_DEPENDENCY_INCLUDES "${PROJECT_SOURCE_DIR}/deps/zlib-ng")
|
||||
list(APPEND LIBGIT2_DEPENDENCY_OBJECTS $<TARGET_OBJECTS:zlibstatic>)
|
||||
add_feature_info(zlib ON "using bundled zlib-ng")
|
||||
elseif(USE_BUNDLED_ZLIB)
|
||||
add_subdirectory("${PROJECT_SOURCE_DIR}/deps/zlib" "${PROJECT_BINARY_DIR}/deps/zlib")
|
||||
list(APPEND LIBGIT2_DEPENDENCY_INCLUDES "${PROJECT_SOURCE_DIR}/deps/zlib")
|
||||
list(APPEND LIBGIT2_DEPENDENCY_OBJECTS $<TARGET_OBJECTS:zlib>)
|
||||
|
||||
1268
deps/zlib-ng/CMakeLists.txt
vendored
Normal file
1268
deps/zlib-ng/CMakeLists.txt
vendored
Normal file
File diff suppressed because it is too large
Load Diff
374
deps/zlib-ng/FAQ.zlib
vendored
Normal file
374
deps/zlib-ng/FAQ.zlib
vendored
Normal file
@@ -0,0 +1,374 @@
|
||||
##
|
||||
# THIS IS AN UNMAINTAINED COPY OF THE ORIGINAL FILE DISTRIBUTED WITH ZLIB 1.2.11
|
||||
##
|
||||
|
||||
|
||||
|
||||
|
||||
Frequently Asked Questions about zlib
|
||||
|
||||
|
||||
If your question is not there, please check the zlib home page
|
||||
https://zlib.net/ which may have more recent information.
|
||||
The latest zlib FAQ is at https://zlib.net/zlib_faq.html
|
||||
|
||||
|
||||
1. Is zlib Y2K-compliant?
|
||||
|
||||
Yes. zlib doesn't handle dates.
|
||||
|
||||
2. Where can I get a Windows DLL version?
|
||||
|
||||
The zlib sources can be compiled without change to produce a DLL. See the
|
||||
file win32/DLL_FAQ.txt in the zlib distribution. Pointers to the
|
||||
precompiled DLL are found in the zlib web site at https://zlib.net/ .
|
||||
|
||||
3. Where can I get a Visual Basic interface to zlib?
|
||||
|
||||
See
|
||||
* https://marknelson.us/1997/01/01/zlib-engine/
|
||||
* win32/DLL_FAQ.txt in the zlib distribution
|
||||
|
||||
4. compress() returns Z_BUF_ERROR.
|
||||
|
||||
Make sure that before the call of compress(), the length of the compressed
|
||||
buffer is equal to the available size of the compressed buffer and not
|
||||
zero. For Visual Basic, check that this parameter is passed by reference
|
||||
("as any"), not by value ("as long").
|
||||
|
||||
5. deflate() or inflate() returns Z_BUF_ERROR.
|
||||
|
||||
Before making the call, make sure that avail_in and avail_out are not zero.
|
||||
When setting the parameter flush equal to Z_FINISH, also make sure that
|
||||
avail_out is big enough to allow processing all pending input. Note that a
|
||||
Z_BUF_ERROR is not fatal--another call to deflate() or inflate() can be
|
||||
made with more input or output space. A Z_BUF_ERROR may in fact be
|
||||
unavoidable depending on how the functions are used, since it is not
|
||||
possible to tell whether or not there is more output pending when
|
||||
strm.avail_out returns with zero. See https://zlib.net/zlib_how.html for a
|
||||
heavily annotated example.
|
||||
|
||||
6. Where's the zlib documentation (man pages, etc.)?
|
||||
|
||||
It's in zlib.h . Examples of zlib usage are in the files test/example.c
|
||||
and test/minigzip.c, with more in examples/ .
|
||||
|
||||
7. Why don't you use GNU autoconf or libtool or ...?
|
||||
|
||||
Because we would like to keep zlib as a very small and simple package.
|
||||
zlib is rather portable and doesn't need much configuration.
|
||||
|
||||
8. I found a bug in zlib.
|
||||
|
||||
Most of the time, such problems are due to an incorrect usage of zlib.
|
||||
Please try to reproduce the problem with a small program and send the
|
||||
corresponding source to us at zlib@gzip.org . Do not send multi-megabyte
|
||||
data files without prior agreement.
|
||||
|
||||
9. Why do I get "undefined reference to gzputc"?
|
||||
|
||||
If "make test" produces something like
|
||||
|
||||
example.o(.text+0x154): undefined reference to `gzputc'
|
||||
|
||||
check that you don't have old files libz.* in /usr/lib, /usr/local/lib or
|
||||
/usr/X11R6/lib. Remove any old versions, then do "make install".
|
||||
|
||||
10. I need a Delphi interface to zlib.
|
||||
|
||||
See the contrib/delphi directory in the zlib distribution.
|
||||
|
||||
11. Can zlib handle .zip archives?
|
||||
|
||||
Not by itself, no. See the directory contrib/minizip in the zlib
|
||||
distribution.
|
||||
|
||||
12. Can zlib handle .Z files?
|
||||
|
||||
No, sorry. You have to spawn an uncompress or gunzip subprocess, or adapt
|
||||
the code of uncompress on your own.
|
||||
|
||||
13. How can I make a Unix shared library?
|
||||
|
||||
By default a shared (and a static) library is built for Unix. So:
|
||||
|
||||
make distclean
|
||||
./configure
|
||||
make
|
||||
|
||||
14. How do I install a shared zlib library on Unix?
|
||||
|
||||
After the above, then:
|
||||
|
||||
make install
|
||||
|
||||
However, many flavors of Unix come with a shared zlib already installed.
|
||||
Before going to the trouble of compiling a shared version of zlib and
|
||||
trying to install it, you may want to check if it's already there! If you
|
||||
can #include <zlib.h>, it's there. The -lz option will probably link to
|
||||
it. You can check the version at the top of zlib.h or with the
|
||||
ZLIB_VERSION symbol defined in zlib.h .
|
||||
|
||||
15. I have a question about OttoPDF.
|
||||
|
||||
We are not the authors of OttoPDF. The real author is on the OttoPDF web
|
||||
site: Joel Hainley, jhainley@myndkryme.com.
|
||||
|
||||
16. Can zlib decode Flate data in an Adobe PDF file?
|
||||
|
||||
Yes. See https://www.pdflib.com/ . To modify PDF forms, see
|
||||
https://sourceforge.net/projects/acroformtool/ .
|
||||
|
||||
17. Why am I getting this "register_frame_info not found" error on Solaris?
|
||||
|
||||
After installing zlib 1.1.4 on Solaris 2.6, running applications using zlib
|
||||
generates an error such as:
|
||||
|
||||
ld.so.1: rpm: fatal: relocation error: file /usr/local/lib/libz.so:
|
||||
symbol __register_frame_info: referenced symbol not found
|
||||
|
||||
The symbol __register_frame_info is not part of zlib, it is generated by
|
||||
the C compiler (cc or gcc). You must recompile applications using zlib
|
||||
which have this problem. This problem is specific to Solaris. See
|
||||
http://www.sunfreeware.com/ for Solaris versions of zlib and applications
|
||||
using zlib.
|
||||
|
||||
18. Why does gzip give an error on a file I make with compress/deflate?
|
||||
|
||||
The compress and deflate functions produce data in the zlib format, which
|
||||
is different and incompatible with the gzip format. The gz* functions in
|
||||
zlib on the other hand use the gzip format. Both the zlib and gzip formats
|
||||
use the same compressed data format internally, but have different headers
|
||||
and trailers around the compressed data.
|
||||
|
||||
19. Ok, so why are there two different formats?
|
||||
|
||||
The gzip format was designed to retain the directory information about a
|
||||
single file, such as the name and last modification date. The zlib format
|
||||
on the other hand was designed for in-memory and communication channel
|
||||
applications, and has a much more compact header and trailer and uses a
|
||||
faster integrity check than gzip.
|
||||
|
||||
20. Well that's nice, but how do I make a gzip file in memory?
|
||||
|
||||
You can request that deflate write the gzip format instead of the zlib
|
||||
format using deflateInit2(). You can also request that inflate decode the
|
||||
gzip format using inflateInit2(). Read zlib.h for more details.
|
||||
|
||||
21. Is zlib thread-safe?
|
||||
|
||||
Yes. However any library routines that zlib uses and any application-
|
||||
provided memory allocation routines must also be thread-safe. zlib's gz*
|
||||
functions use stdio library routines, and most of zlib's functions use the
|
||||
library memory allocation routines by default. zlib's *Init* functions
|
||||
allow for the application to provide custom memory allocation routines.
|
||||
|
||||
Of course, you should only operate on any given zlib or gzip stream from a
|
||||
single thread at a time.
|
||||
|
||||
22. Can I use zlib in my commercial application?
|
||||
|
||||
Yes. Please read the license in zlib.h.
|
||||
|
||||
23. Is zlib under the GNU license?
|
||||
|
||||
No. Please read the license in zlib.h.
|
||||
|
||||
24. The license says that altered source versions must be "plainly marked". So
|
||||
what exactly do I need to do to meet that requirement?
|
||||
|
||||
You need to change the ZLIB_VERSION and ZLIB_VERNUM #defines in zlib.h. In
|
||||
particular, the final version number needs to be changed to "f", and an
|
||||
identification string should be appended to ZLIB_VERSION. Version numbers
|
||||
x.x.x.f are reserved for modifications to zlib by others than the zlib
|
||||
maintainers. For example, if the version of the base zlib you are altering
|
||||
is "1.2.3.4", then in zlib.h you should change ZLIB_VERNUM to 0x123f, and
|
||||
ZLIB_VERSION to something like "1.2.3.f-zachary-mods-v3". You can also
|
||||
update the version strings in deflate.c and inftrees.c.
|
||||
|
||||
For altered source distributions, you should also note the origin and
|
||||
nature of the changes in zlib.h, as well as in ChangeLog and README, along
|
||||
with the dates of the alterations. The origin should include at least your
|
||||
name (or your company's name), and an email address to contact for help or
|
||||
issues with the library.
|
||||
|
||||
Note that distributing a compiled zlib library along with zlib.h and
|
||||
zconf.h is also a source distribution, and so you should change
|
||||
ZLIB_VERSION and ZLIB_VERNUM and note the origin and nature of the changes
|
||||
in zlib.h as you would for a full source distribution.
|
||||
|
||||
25. Will zlib work on a big-endian or little-endian architecture, and can I
|
||||
exchange compressed data between them?
|
||||
|
||||
Yes and yes.
|
||||
|
||||
26. Will zlib work on a 64-bit machine?
|
||||
|
||||
Yes. It has been tested on 64-bit machines, and has no dependence on any
|
||||
data types being limited to 32-bits in length. If you have any
|
||||
difficulties, please provide a complete problem report to zlib@gzip.org
|
||||
|
||||
27. Will zlib decompress data from the PKWare Data Compression Library?
|
||||
|
||||
No. The PKWare DCL uses a completely different compressed data format than
|
||||
does PKZIP and zlib. However, you can look in zlib's contrib/blast
|
||||
directory for a possible solution to your problem.
|
||||
|
||||
28. Can I access data randomly in a compressed stream?
|
||||
|
||||
No, not without some preparation. If when compressing you periodically use
|
||||
Z_FULL_FLUSH, carefully write all the pending data at those points, and
|
||||
keep an index of those locations, then you can start decompression at those
|
||||
points. You have to be careful to not use Z_FULL_FLUSH too often, since it
|
||||
can significantly degrade compression. Alternatively, you can scan a
|
||||
deflate stream once to generate an index, and then use that index for
|
||||
random access. See examples/zran.c .
|
||||
|
||||
29. Does zlib work on MVS, OS/390, CICS, etc.?
|
||||
|
||||
It has in the past, but we have not heard of any recent evidence. There
|
||||
were working ports of zlib 1.1.4 to MVS, but those links no longer work.
|
||||
If you know of recent, successful applications of zlib on these operating
|
||||
systems, please let us know. Thanks.
|
||||
|
||||
30. Is there some simpler, easier to read version of inflate I can look at to
|
||||
understand the deflate format?
|
||||
|
||||
First off, you should read RFC 1951. Second, yes. Look in zlib's
|
||||
contrib/puff directory.
|
||||
|
||||
31. Does zlib infringe on any patents?
|
||||
|
||||
As far as we know, no. In fact, that was originally the whole point behind
|
||||
zlib. Look here for some more information:
|
||||
|
||||
https://www.gzip.org/#faq11
|
||||
|
||||
32. Can zlib work with greater than 4 GB of data?
|
||||
|
||||
Yes. inflate() and deflate() will process any amount of data correctly.
|
||||
Each call of inflate() or deflate() is limited to input and output chunks
|
||||
of the maximum value that can be stored in the compiler's "unsigned int"
|
||||
type, but there is no limit to the number of chunks. Note however that the
|
||||
strm.total_in and strm.total_out counters may be limited to 4 GB. These
|
||||
counters are provided as a convenience and are not used internally by
|
||||
inflate() or deflate(). The application can easily set up its own counters
|
||||
updated after each call of inflate() or deflate() to count beyond 4 GB.
|
||||
compress() and uncompress() may be limited to 4 GB, since they operate in a
|
||||
single call. gzseek() and gztell() may be limited to 4 GB depending on how
|
||||
zlib is compiled. See the zlibCompileFlags() function in zlib.h.
|
||||
|
||||
The word "may" appears several times above since there is a 4 GB limit only
|
||||
if the compiler's "long" type is 32 bits. If the compiler's "long" type is
|
||||
64 bits, then the limit is 16 exabytes.
|
||||
|
||||
33. Does zlib have any security vulnerabilities?
|
||||
|
||||
The only one that we are aware of is potentially in gzprintf(). If zlib is
|
||||
compiled to use sprintf() or vsprintf(), then there is no protection
|
||||
against a buffer overflow of an 8K string space (or other value as set by
|
||||
gzbuffer()), other than the caller of gzprintf() assuring that the output
|
||||
will not exceed 8K. On the other hand, if zlib is compiled to use
|
||||
snprintf() or vsnprintf(), which should normally be the case, then there is
|
||||
no vulnerability. The ./configure script will display warnings if an
|
||||
insecure variation of sprintf() will be used by gzprintf(). Also the
|
||||
zlibCompileFlags() function will return information on what variant of
|
||||
sprintf() is used by gzprintf().
|
||||
|
||||
If you don't have snprintf() or vsnprintf() and would like one, you can
|
||||
find a portable implementation here:
|
||||
|
||||
https://www.ijs.si/software/snprintf/
|
||||
|
||||
Note that you should be using the most recent version of zlib. Versions
|
||||
1.1.3 and before were subject to a double-free vulnerability, and versions
|
||||
1.2.1 and 1.2.2 were subject to an access exception when decompressing
|
||||
invalid compressed data.
|
||||
|
||||
34. Is there a Java version of zlib?
|
||||
|
||||
Probably what you want is to use zlib in Java. zlib is already included
|
||||
as part of the Java SDK in the java.util.zip package. If you really want
|
||||
a version of zlib written in the Java language, look on the zlib home
|
||||
page for links: https://zlib.net/ .
|
||||
|
||||
35. I get this or that compiler or source-code scanner warning when I crank it
|
||||
up to maximally-pedantic. Can't you guys write proper code?
|
||||
|
||||
Many years ago, we gave up attempting to avoid warnings on every compiler
|
||||
in the universe. It just got to be a waste of time, and some compilers
|
||||
were downright silly as well as contradicted each other. So now, we simply
|
||||
make sure that the code always works.
|
||||
|
||||
36. Valgrind (or some similar memory access checker) says that deflate is
|
||||
performing a conditional jump that depends on an uninitialized value.
|
||||
Isn't that a bug?
|
||||
|
||||
No. That is intentional for performance reasons, and the output of deflate
|
||||
is not affected. This only started showing up recently since zlib 1.2.x
|
||||
uses malloc() by default for allocations, whereas earlier versions used
|
||||
calloc(), which zeros out the allocated memory. Even though the code was
|
||||
correct, versions 1.2.4 and later were changed to not stimulate these
|
||||
checkers.
|
||||
|
||||
37. Will zlib read the (insert any ancient or arcane format here) compressed
|
||||
data format?
|
||||
|
||||
Probably not. Look in the comp.compression FAQ for pointers to various
|
||||
formats and associated software.
|
||||
|
||||
38. How can I encrypt/decrypt zip files with zlib?
|
||||
|
||||
zlib doesn't support encryption. The original PKZIP encryption is very
|
||||
weak and can be broken with freely available programs. To get strong
|
||||
encryption, use GnuPG, https://www.gnupg.org/ , which already includes zlib
|
||||
compression. For PKZIP compatible "encryption", look at
|
||||
http://infozip.sourceforge.net/
|
||||
|
||||
39. What's the difference between the "gzip" and "deflate" HTTP 1.1 encodings?
|
||||
|
||||
"gzip" is the gzip format, and "deflate" is the zlib format. They should
|
||||
probably have called the second one "zlib" instead to avoid confusion with
|
||||
the raw deflate compressed data format. While the HTTP 1.1 RFC 2616
|
||||
correctly points to the zlib specification in RFC 1950 for the "deflate"
|
||||
transfer encoding, there have been reports of servers and browsers that
|
||||
incorrectly produce or expect raw deflate data per the deflate
|
||||
specification in RFC 1951, most notably Microsoft. So even though the
|
||||
"deflate" transfer encoding using the zlib format would be the more
|
||||
efficient approach (and in fact exactly what the zlib format was designed
|
||||
for), using the "gzip" transfer encoding is probably more reliable due to
|
||||
an unfortunate choice of name on the part of the HTTP 1.1 authors.
|
||||
|
||||
Bottom line: use the gzip format for HTTP 1.1 encoding.
|
||||
|
||||
40. Does zlib support the new "Deflate64" format introduced by PKWare?
|
||||
|
||||
No. PKWare has apparently decided to keep that format proprietary, since
|
||||
they have not documented it as they have previous compression formats. In
|
||||
any case, the compression improvements are so modest compared to other more
|
||||
modern approaches, that it's not worth the effort to implement.
|
||||
|
||||
41. I'm having a problem with the zip functions in zlib, can you help?
|
||||
|
||||
There are no zip functions in zlib. You are probably using minizip by
|
||||
Gilles Vollant, which is found in the contrib directory of zlib. It is not
|
||||
part of zlib. In fact none of the stuff in contrib is part of zlib. The
|
||||
files in there are not supported by the zlib authors. You need to contact
|
||||
the authors of the respective contribution for help.
|
||||
|
||||
42. The match.asm code in contrib is under the GNU General Public License.
|
||||
Since it's part of zlib, doesn't that mean that all of zlib falls under the
|
||||
GNU GPL?
|
||||
|
||||
No. The files in contrib are not part of zlib. They were contributed by
|
||||
other authors and are provided as a convenience to the user within the zlib
|
||||
distribution. Each item in contrib has its own license.
|
||||
|
||||
43. Is zlib subject to export controls? What is its ECCN?
|
||||
|
||||
zlib is not subject to export controls, and so is classified as EAR99.
|
||||
|
||||
44. Can you please sign these lengthy legal documents and fax them back to us
|
||||
so that we can use your software in our product?
|
||||
|
||||
No. Go away. Shoo.
|
||||
36
deps/zlib-ng/INDEX.md
vendored
Normal file
36
deps/zlib-ng/INDEX.md
vendored
Normal file
@@ -0,0 +1,36 @@
|
||||
Contents
|
||||
--------
|
||||
|
||||
| Name | Description |
|
||||
|:-----------------|:---------------------------------------------------------------|
|
||||
| arch/ | Architecture-specific code |
|
||||
| doc/ | Documentation for formats and algorithms |
|
||||
| test/example.c | Zlib usage examples for build testing |
|
||||
| test/minigzip.c | Minimal gzip-like functionality for build testing |
|
||||
| test/infcover.c | Inflate code coverage for build testing |
|
||||
| win32/ | Shared library version resources for Windows |
|
||||
| CMakeLists.txt | CMake build script |
|
||||
| configure | Bash configure/build script |
|
||||
| adler32.c | Compute the Adler-32 checksum of a data stream |
|
||||
| chunkset.* | Inline functions to copy small data chunks |
|
||||
| compress.c | Compress a memory buffer |
|
||||
| deflate.* | Compress data using the deflate algorithm |
|
||||
| deflate_fast.c | Compress data using the deflate algorithm with fast strategy |
|
||||
| deflate_medium.c | Compress data using the deflate algorithm with medium strategy |
|
||||
| deflate_slow.c | Compress data using the deflate algorithm with slow strategy |
|
||||
| functable.* | Struct containing function pointers to optimized functions |
|
||||
| gzguts.h | Internal definitions for gzip operations |
|
||||
| gzlib.c | Functions common to reading and writing gzip files |
|
||||
| gzread.c | Read gzip files |
|
||||
| gzwrite.c | Write gzip files |
|
||||
| infback.* | Inflate using a callback interface |
|
||||
| inflate.* | Decompress data |
|
||||
| inffast.* | Decompress data with speed optimizations |
|
||||
| inffixed_tbl.h | Table for decoding fixed codes |
|
||||
| inftrees.h | Generate Huffman trees for efficient decoding |
|
||||
| trees.* | Output deflated data using Huffman coding |
|
||||
| uncompr.c | Decompress a memory buffer |
|
||||
| zconf.h.cmakein | zconf.h template for cmake |
|
||||
| zendian.h | BYTE_ORDER for endian tests |
|
||||
| zlib.map | Linux symbol information |
|
||||
| zlib.pc.in | Pkg-config template |
|
||||
19
deps/zlib-ng/LICENSE.md
vendored
Normal file
19
deps/zlib-ng/LICENSE.md
vendored
Normal file
@@ -0,0 +1,19 @@
|
||||
(C) 1995-2013 Jean-loup Gailly and Mark Adler
|
||||
|
||||
This software is provided 'as-is', without any express or implied
|
||||
warranty. In no event will the authors be held liable for any damages
|
||||
arising from the use of this software.
|
||||
|
||||
Permission is granted to anyone to use this software for any purpose,
|
||||
including commercial applications, and to alter it and redistribute it
|
||||
freely, subject to the following restrictions:
|
||||
|
||||
1. The origin of this software must not be misrepresented; you must not
|
||||
claim that you wrote the original software. If you use this software
|
||||
in a product, an acknowledgment in the product documentation would be
|
||||
appreciated but is not required.
|
||||
|
||||
2. Altered source versions must be plainly marked as such, and must not be
|
||||
misrepresented as being the original software.
|
||||
|
||||
3. This notice may not be removed or altered from any source distribution.
|
||||
395
deps/zlib-ng/Makefile.in
vendored
Normal file
395
deps/zlib-ng/Makefile.in
vendored
Normal file
@@ -0,0 +1,395 @@
|
||||
# Makefile for zlib
|
||||
# Copyright (C) 1995-2016 Jean-loup Gailly, Mark Adler
|
||||
# For conditions of distribution and use, see copyright notice in zlib.h
|
||||
|
||||
# To compile and test, type:
|
||||
# ./configure; make test
|
||||
# Normally configure builds both a static and a shared library.
|
||||
# If you want to build just a static library, use: ./configure --static
|
||||
|
||||
# To install /usr/local/lib/libz.* and /usr/local/include/zlib.h, type:
|
||||
# make install
|
||||
# To install in $HOME instead of /usr/local, use:
|
||||
# make install prefix=$HOME
|
||||
|
||||
CC=cc
|
||||
|
||||
CFLAGS=-O
|
||||
#CFLAGS=-O -DMAX_WBITS=14 -DMAX_MEM_LEVEL=7
|
||||
#CFLAGS=-g -DZLIB_DEBUG
|
||||
#CFLAGS=-O3 -Wall -Wwrite-strings -Wpointer-arith -Wconversion \
|
||||
# -Wstrict-prototypes -Wmissing-prototypes
|
||||
|
||||
SFLAGS=-O
|
||||
LDFLAGS=-L.
|
||||
LIBNAME1=libz-ng
|
||||
LIBNAME2=zlib-ng
|
||||
SUFFIX=-ng
|
||||
TEST_LIBS=$(LIBNAME1).a
|
||||
LDSHARED=$(CC)
|
||||
LDSHAREDFLAGS=-shared
|
||||
|
||||
VER=2.1.2
|
||||
VER1=2
|
||||
|
||||
STATICLIB=$(LIBNAME1).a
|
||||
SHAREDLIB=$(LIBNAME1).so
|
||||
SHAREDLIBV=$(LIBNAME1).so.$(VER)
|
||||
SHAREDLIBM=$(LIBNAME1).so.$(VER1)
|
||||
IMPORTLIB=
|
||||
SHAREDTARGET=$(LIBNAME1).so.$(VER)
|
||||
PKGFILE=$(LIBNAME2).pc
|
||||
|
||||
LIBS=$(STATICLIB) $(SHAREDTARGET)
|
||||
|
||||
AR=ar
|
||||
ARFLAGS=rc
|
||||
DEFFILE=
|
||||
RC=
|
||||
RCFLAGS=
|
||||
RCOBJS=
|
||||
STRIP=
|
||||
RANLIB=ranlib
|
||||
LDCONFIG=ldconfig
|
||||
LDSHAREDLIBC=
|
||||
EXE=
|
||||
|
||||
SRCDIR=.
|
||||
INCLUDES=-I$(SRCDIR)
|
||||
|
||||
BUILDDIR=.
|
||||
|
||||
ARCHDIR=arch/generic
|
||||
ARCH_STATIC_OBJS=
|
||||
ARCH_SHARED_OBJS=
|
||||
|
||||
prefix = /usr/local
|
||||
exec_prefix = ${prefix}
|
||||
bindir = ${exec_prefix}/bin
|
||||
libdir = ${exec_prefix}/lib
|
||||
sharedlibdir = ${libdir}
|
||||
includedir = ${prefix}/include
|
||||
mandir = ${prefix}/share/man
|
||||
man3dir = ${mandir}/man3
|
||||
pkgconfigdir = ${libdir}/pkgconfig
|
||||
|
||||
OBJZ = \
|
||||
adler32.o \
|
||||
adler32_fold.o \
|
||||
chunkset.o \
|
||||
compare256.o \
|
||||
compress.o \
|
||||
cpu_features.o \
|
||||
crc32_braid.o \
|
||||
crc32_braid_comb.o \
|
||||
crc32_fold.o \
|
||||
deflate.o \
|
||||
deflate_fast.o \
|
||||
deflate_huff.o \
|
||||
deflate_medium.o \
|
||||
deflate_quick.o \
|
||||
deflate_rle.o \
|
||||
deflate_slow.o \
|
||||
deflate_stored.o \
|
||||
functable.o \
|
||||
infback.o \
|
||||
inflate.o \
|
||||
inftrees.o \
|
||||
insert_string.o \
|
||||
insert_string_roll.o \
|
||||
slide_hash.o \
|
||||
trees.o \
|
||||
uncompr.o \
|
||||
zutil.o \
|
||||
$(ARCH_STATIC_OBJS)
|
||||
|
||||
OBJG = \
|
||||
gzlib.o \
|
||||
gzread.o \
|
||||
gzwrite.o
|
||||
|
||||
TESTOBJG =
|
||||
OBJC = $(OBJZ) $(OBJG)
|
||||
|
||||
PIC_OBJZ = \
|
||||
adler32.lo \
|
||||
adler32_fold.lo \
|
||||
chunkset.lo \
|
||||
compare256.lo \
|
||||
compress.lo \
|
||||
cpu_features.lo \
|
||||
crc32_braid.lo \
|
||||
crc32_braid_comb.lo \
|
||||
crc32_fold.lo \
|
||||
deflate.lo \
|
||||
deflate_fast.lo \
|
||||
deflate_huff.lo \
|
||||
deflate_medium.lo \
|
||||
deflate_quick.lo \
|
||||
deflate_rle.lo \
|
||||
deflate_slow.lo \
|
||||
deflate_stored.lo \
|
||||
functable.lo \
|
||||
infback.lo \
|
||||
inflate.lo \
|
||||
inftrees.lo \
|
||||
insert_string.lo \
|
||||
insert_string_roll.lo \
|
||||
slide_hash.lo \
|
||||
trees.lo \
|
||||
uncompr.lo \
|
||||
zutil.lo \
|
||||
$(ARCH_SHARED_OBJS)
|
||||
|
||||
PIC_OBJG = \
|
||||
gzlib.lo \
|
||||
gzread.lo \
|
||||
gzwrite.lo
|
||||
|
||||
PIC_TESTOBJG =
|
||||
PIC_OBJC = $(PIC_OBJZ) $(PIC_OBJG)
|
||||
|
||||
OBJS = $(OBJC)
|
||||
|
||||
PIC_OBJS = $(PIC_OBJC)
|
||||
|
||||
all: static shared
|
||||
|
||||
static: example$(EXE) minigzip$(EXE) makefixed$(EXE) maketrees$(EXE) makecrct$(EXE)
|
||||
|
||||
shared: examplesh$(EXE) minigzipsh$(EXE)
|
||||
|
||||
check: test
|
||||
|
||||
.SECONDARY:
|
||||
|
||||
$(ARCHDIR)/%.o: $(SRCDIR)/$(ARCHDIR)/%.c
|
||||
$(MAKE) -C $(ARCHDIR) $(notdir $@)
|
||||
|
||||
$(ARCHDIR)/%.lo: $(SRCDIR)/$(ARCHDIR)/%.c
|
||||
$(MAKE) -C $(ARCHDIR) $(notdir $@)
|
||||
|
||||
%.o: $(ARCHDIR)/%.o
|
||||
-cp $< $@
|
||||
|
||||
%.lo: $(ARCHDIR)/%.lo
|
||||
-cp $< $@
|
||||
|
||||
test: all
|
||||
$(MAKE) -C test
|
||||
|
||||
infcover.o: $(SRCDIR)/test/infcover.c zlib$(SUFFIX).h zconf$(SUFFIX).h zlib_name_mangling$(SUFFIX).h
|
||||
$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/test/infcover.c
|
||||
|
||||
infcover$(EXE): infcover.o $(STATICLIB)
|
||||
$(CC) $(CFLAGS) $(LDFLAGS) -o $@ infcover.o $(STATICLIB)
|
||||
ifneq ($(STRIP),)
|
||||
$(STRIP) $@
|
||||
endif
|
||||
|
||||
cover: infcover$(EXE)
|
||||
rm -f *.gcda
|
||||
./infcover
|
||||
gcov inf*.c
|
||||
|
||||
$(STATICLIB): $(OBJS)
|
||||
$(AR) $(ARFLAGS) $@ $(OBJS)
|
||||
-@ ($(RANLIB) $@ || true) >/dev/null 2>&1
|
||||
|
||||
example.o:
|
||||
$(CC) $(CFLAGS) -DWITH_GZFILEOP $(INCLUDES) -c -o $@ $(SRCDIR)/test/example.c
|
||||
|
||||
minigzip.o:
|
||||
$(CC) $(CFLAGS) -DWITH_GZFILEOP $(INCLUDES) -c -o $@ $(SRCDIR)/test/minigzip.c
|
||||
|
||||
makefixed.o:
|
||||
$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/tools/makefixed.c
|
||||
|
||||
maketrees.o:
|
||||
$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/tools/maketrees.c
|
||||
|
||||
makecrct.o:
|
||||
$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/tools/makecrct.c
|
||||
|
||||
zlibrc.o: $(SRCDIR)/win32/zlib$(SUFFIX)1.rc
|
||||
$(RC) $(RCFLAGS) -o $@ $(SRCDIR)/win32/zlib$(SUFFIX)1.rc
|
||||
|
||||
.SUFFIXES: .lo
|
||||
|
||||
%.o: $(SRCDIR)/%.c
|
||||
$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $<
|
||||
|
||||
%.lo: $(SRCDIR)/%.c
|
||||
$(CC) $(SFLAGS) -DPIC $(INCLUDES) -c -o $@ $<
|
||||
|
||||
gzlib.o: $(SRCDIR)/gzlib.c
|
||||
$(CC) $(CFLAGS) -DWITH_GZFILEOP $(INCLUDES) -c -o $@ $<
|
||||
|
||||
gzlib.lo: $(SRCDIR)/gzlib.c
|
||||
$(CC) $(SFLAGS) -DPIC -DWITH_GZFILEOP $(INCLUDES) -c -o $@ $<
|
||||
|
||||
# Builds gzread.o with gzip file operations enabled (-DWITH_GZFILEOP).
# NOTE(review): unlike the gzlib.o and gzwrite.o rules, the prerequisite here
# has no $(SRCDIR)/ prefix — presumably gzread.c is expected in the build
# directory rather than the source tree; confirm before "fixing" the path.
gzread.o: gzread.c
|
||||
$(CC) $(CFLAGS) -DWITH_GZFILEOP $(INCLUDES) -c -o $@ $<
|
||||
|
||||
gzread.lo: gzread.c
|
||||
$(CC) $(SFLAGS) -DPIC -DWITH_GZFILEOP $(INCLUDES) -c -o $@ $<
|
||||
|
||||
gzwrite.o: $(SRCDIR)/gzwrite.c
|
||||
$(CC) $(CFLAGS) -DWITH_GZFILEOP $(INCLUDES) -c -o $@ $<
|
||||
|
||||
gzwrite.lo: $(SRCDIR)/gzwrite.c
|
||||
$(CC) $(SFLAGS) -DPIC -DWITH_GZFILEOP $(INCLUDES) -c -o $@ $<
|
||||
|
||||
$(SHAREDTARGET): $(PIC_OBJS) $(DEFFILE) $(RCOBJS)
|
||||
ifneq ($(SHAREDTARGET),)
|
||||
$(LDSHARED) $(CFLAGS) $(LDSHAREDFLAGS) $(LDFLAGS) -o $@ $(DEFFILE) $(PIC_OBJS) $(RCOBJS) $(LDSHAREDLIBC)
|
||||
ifneq ($(STRIP),)
|
||||
$(STRIP) $@
|
||||
endif
|
||||
ifneq ($(SHAREDLIB),$(SHAREDTARGET))
|
||||
rm -f $(SHAREDLIB) $(SHAREDLIBM)
|
||||
ln -s $@ $(SHAREDLIB)
|
||||
ln -s $@ $(SHAREDLIBM)
|
||||
endif
|
||||
endif
|
||||
|
||||
example$(EXE): example.o $(TESTOBJG) $(STATICLIB)
|
||||
$(CC) $(CFLAGS) $(LDFLAGS) -o $@ example.o $(TESTOBJG) $(TEST_LIBS) $(LDSHAREDLIBC)
|
||||
ifneq ($(STRIP),)
|
||||
$(STRIP) $@
|
||||
endif
|
||||
|
||||
minigzip$(EXE): minigzip.o $(TESTOBJG) $(STATICLIB)
|
||||
$(CC) $(CFLAGS) $(LDFLAGS) -o $@ minigzip.o $(TESTOBJG) $(TEST_LIBS) $(LDSHAREDLIBC)
|
||||
ifneq ($(STRIP),)
|
||||
$(STRIP) $@
|
||||
endif
|
||||
|
||||
minigzipsh$(EXE): minigzip.o $(PIC_TESTOBJG) $(SHAREDTARGET)
|
||||
$(CC) $(CFLAGS) $(LDFLAGS) -o $@ minigzip.o $(PIC_TESTOBJG) $(SHAREDLIB) $(LDSHAREDLIBC)
|
||||
ifneq ($(STRIP),)
|
||||
$(STRIP) $@
|
||||
endif
|
||||
|
||||
|
||||
examplesh$(EXE): example.o $(PIC_TESTOBJG) $(SHAREDTARGET)
|
||||
$(CC) $(CFLAGS) $(LDFLAGS) -o $@ example.o $(PIC_TESTOBJG) $(SHAREDLIB) $(LDSHAREDLIBC)
|
||||
ifneq ($(STRIP),)
|
||||
$(STRIP) $@
|
||||
endif
|
||||
|
||||
makefixed$(EXE): makefixed.o $(STATICLIB)
|
||||
$(CC) $(CFLAGS) $(LDFLAGS) -o $@ makefixed.o $(TEST_LIBS) $(LDSHAREDLIBC)
|
||||
ifneq ($(STRIP),)
|
||||
$(STRIP) $@
|
||||
endif
|
||||
|
||||
maketrees$(EXE): maketrees.o $(STATICLIB)
|
||||
$(CC) $(CFLAGS) $(LDFLAGS) -o $@ maketrees.o $(TEST_LIBS) $(LDSHAREDLIBC)
|
||||
ifneq ($(STRIP),)
|
||||
$(STRIP) $@
|
||||
endif
|
||||
|
||||
makecrct$(EXE): makecrct.o $(STATICLIB)
|
||||
$(CC) $(CFLAGS) $(LDFLAGS) -o $@ makecrct.o $(TEST_LIBS) $(LDSHAREDLIBC)
|
||||
ifneq ($(STRIP),)
|
||||
$(STRIP) $@
|
||||
endif
|
||||
|
||||
install-shared: $(SHAREDTARGET)
|
||||
ifneq ($(SHAREDTARGET),)
|
||||
-@if [ ! -d $(DESTDIR)$(sharedlibdir) ]; then mkdir -p $(DESTDIR)$(sharedlibdir); fi
|
||||
rm -f $(DESTDIR)$(sharedlibdir)/$(SHAREDTARGET)
|
||||
cp $(SHAREDTARGET) $(DESTDIR)$(sharedlibdir)
|
||||
chmod 755 $(DESTDIR)$(sharedlibdir)/$(SHAREDTARGET)
|
||||
ifneq ($(SHAREDLIB),$(SHAREDTARGET))
|
||||
rm -f $(DESTDIR)$(sharedlibdir)/$(SHAREDLIB) $(DESTDIR)$(sharedlibdir)/$(SHAREDLIBM)
|
||||
ln -s $(SHAREDLIBV) $(DESTDIR)$(sharedlibdir)/$(SHAREDLIB)
|
||||
ln -s $(SHAREDLIBV) $(DESTDIR)$(sharedlibdir)/$(SHAREDLIBM)
|
||||
($(LDCONFIG) || true) >/dev/null 2>&1
|
||||
# ldconfig is for Linux
|
||||
endif
|
||||
ifneq ($(IMPORTLIB),)
|
||||
cp $(IMPORTLIB) $(DESTDIR)$(sharedlibdir)
|
||||
chmod 644 $(DESTDIR)$(sharedlibdir)/$(IMPORTLIB)
|
||||
endif
|
||||
endif
|
||||
|
||||
install-static: $(STATICLIB)
|
||||
-@if [ ! -d $(DESTDIR)$(libdir) ]; then mkdir -p $(DESTDIR)$(libdir); fi
|
||||
rm -f $(DESTDIR)$(libdir)/$(STATICLIB)
|
||||
cp $(STATICLIB) $(DESTDIR)$(libdir)
|
||||
chmod 644 $(DESTDIR)$(libdir)/$(STATICLIB)
|
||||
-@($(RANLIB) $(DESTDIR)$(libdir)/$(STATICLIB) || true) >/dev/null 2>&1
|
||||
# The ranlib in install-static is needed on NeXTSTEP which checks file times
|
||||
|
||||
install-libs: install-shared install-static
|
||||
-@if [ ! -d $(DESTDIR)$(man3dir) ]; then mkdir -p $(DESTDIR)$(man3dir); fi
|
||||
-@if [ ! -d $(DESTDIR)$(pkgconfigdir) ]; then mkdir -p $(DESTDIR)$(pkgconfigdir); fi
|
||||
rm -f $(DESTDIR)$(pkgconfigdir)/$(PKGFILE)
|
||||
cp $(PKGFILE) $(DESTDIR)$(pkgconfigdir)
|
||||
chmod 644 $(DESTDIR)$(pkgconfigdir)/$(PKGFILE)
|
||||
|
||||
install: install-libs
|
||||
-@if [ ! -d $(DESTDIR)$(includedir) ]; then mkdir -p $(DESTDIR)$(includedir); fi
|
||||
rm -f $(DESTDIR)$(includedir)/zlib$(SUFFIX).h $(DESTDIR)$(includedir)/zconf$(SUFFIX).h $(DESTDIR)$(includedir)/zlib_name_mangling$(SUFFIX).h
|
||||
cp zlib$(SUFFIX).h $(DESTDIR)$(includedir)/zlib$(SUFFIX).h
|
||||
cp zconf$(SUFFIX).h $(DESTDIR)$(includedir)/zconf$(SUFFIX).h
|
||||
cp zlib_name_mangling$(SUFFIX).h $(DESTDIR)$(includedir)/zlib_name_mangling$(SUFFIX).h
|
||||
chmod 644 $(DESTDIR)$(includedir)/zlib$(SUFFIX).h $(DESTDIR)$(includedir)/zconf$(SUFFIX).h $(DESTDIR)$(includedir)/zlib_name_mangling$(SUFFIX).h
|
||||
|
||||
uninstall-static:
|
||||
cd $(DESTDIR)$(libdir) && rm -f $(STATICLIB)
|
||||
|
||||
uninstall-shared:
|
||||
ifneq ($(SHAREDLIB),)
|
||||
cd $(DESTDIR)$(sharedlibdir) && rm -f $(SHAREDLIBV) $(SHAREDLIB) $(SHAREDLIBM)
|
||||
endif
|
||||
ifneq ($(IMPORTLIB),)
|
||||
cd $(DESTDIR)$(sharedlibdir) && rm -f $(IMPORTLIB)
|
||||
endif
|
||||
|
||||
uninstall: uninstall-static uninstall-shared
|
||||
cd $(DESTDIR)$(includedir) && rm -f zlib$(SUFFIX).h zconf$(SUFFIX).h zlib_name_mangling$(SUFFIX).h
|
||||
cd $(DESTDIR)$(pkgconfigdir) && rm -f $(PKGFILE)
|
||||
|
||||
mostlyclean: clean
|
||||
# Remove everything a build leaves behind in this directory: objects, the
# static/shared/import libraries, the test and tool executables, coverage
# data and scratch directories. The arch and test sub-makefiles are cleaned
# first, but only when they exist (i.e. after configure has generated them).
# examplesh$(EXE) is listed alongside minigzipsh$(EXE) because this Makefile's
# examplesh$(EXE) link rule produces it.
clean:
	@if [ -f $(ARCHDIR)/Makefile ]; then $(MAKE) -C $(ARCHDIR) clean; fi
	@if [ -f test/Makefile ]; then $(MAKE) -C test clean; fi
	rm -f *.o *.lo *~ \
	   example$(EXE) examplesh$(EXE) minigzip$(EXE) minigzipsh$(EXE) \
	   infcover makefixed$(EXE) maketrees$(EXE) makecrct$(EXE) \
	   $(STATICLIB) $(IMPORTLIB) $(SHAREDLIB) $(SHAREDLIBV) $(SHAREDLIBM) \
	   foo.gz so_locations \
	   _match.s maketree
	rm -rf objs
	rm -f *.gcda *.gcno *.gcov
	rm -f a.out a.exe
	rm -f *._h
	rm -rf btmp1 btmp2 pkgtmp1 pkgtmp2
|
||||
|
||||
maintainer-clean: distclean
|
||||
# Return the tree to its pre-configure state: run `clean`, let the arch and
# test sub-makefiles distclean themselves when present, then delete the files
# generated by configure (pkg-config file, log, generated headers).
# The header list names zlib_name_mangling$(SUFFIX).h exactly; a stray brace
# after $(SUFFIX) previously produced a file name that never exists, so the
# generated name-mangling header survived distclean.
distclean: clean
	@if [ -f $(ARCHDIR)/Makefile ]; then $(MAKE) -C $(ARCHDIR) distclean; fi
	@if [ -f test/Makefile ]; then $(MAKE) -C test distclean; fi
	rm -f $(PKGFILE) configure.log zconf.h zconf.h.cmakein zlib$(SUFFIX).h zlib_name_mangling$(SUFFIX).h *.pc
	-@rm -f .DS_Store
# Reset Makefile if building inside source tree
	@if [ -f Makefile.in ]; then \
	printf 'all:\n\t-@echo "Please use ./configure first. Thank you."\n' > Makefile ; \
	printf '\ndistclean:\n\t$(MAKE) -f Makefile.in distclean\n' >> Makefile ; \
	touch -r $(SRCDIR)/Makefile.in Makefile ; fi
# Reset zconf.h and zconf.h.cmakein if building inside source tree
	@if [ -f zconf.h.in ]; then \
	cp -p $(SRCDIR)/zconf.h.in zconf.h ; \
	grep -v '^#cmakedefine' $(SRCDIR)/zconf.h.in > zconf.h.cmakein &&\
	touch -r $(SRCDIR)/zconf.h.in zconf.h.cmakein ; fi
# Cleanup these files if building outside source tree
	@if [ ! -f README.md ]; then rm -f Makefile; fi
# Remove arch and test directory if building outside source tree
	@if [ ! -f $(ARCHDIR)/Makefile.in ]; then rm -rf arch; fi
	@if [ ! -f test/Makefile.in ]; then rm -rf test; fi
|
||||
|
||||
tags:
|
||||
etags $(SRCDIR)/*.[ch]
|
||||
79
deps/zlib-ng/PORTING.md
vendored
Normal file
79
deps/zlib-ng/PORTING.md
vendored
Normal file
@@ -0,0 +1,79 @@
|
||||
Porting applications to use zlib-ng
|
||||
===================================
|
||||
|
||||
Zlib-ng can be used/compiled in two different modes, that require some
|
||||
consideration by the application developer.
|
||||
|
||||
zlib-compat mode
|
||||
----------------
|
||||
Zlib-ng can be compiled in zlib-compat mode, suitable for zlib-replacement
|
||||
in a single application or system-wide.
|
||||
|
||||
Please note that zlib-ng in zlib-compat mode tries to maintain both API and
|
||||
ABI compatibility with the original zlib. Any issues regarding compatibility
|
||||
can be reported as bugs.
|
||||
|
||||
In certain instances you may not be able to simply replace the zlib library/dll
|
||||
files and expect the application to work. The application may need to be
|
||||
recompiled against the zlib-ng headers and libs to ensure full compatibility.
|
||||
|
||||
It is also possible for the deflate output stream to differ from the original
|
||||
zlib due to algorithmic differences between the two libraries. Any tests or
|
||||
applications that depend on the exact length of the deflate stream being a
|
||||
certain value will need to be updated.
|
||||
|
||||
**Advantages:**
|
||||
- Easy to port to, since it only requires a recompile of the application and
|
||||
no changes to the application code.
|
||||
|
||||
**Disadvantages:**
|
||||
- Can conflict with a system-installed zlib, as that can often be linked in
|
||||
by another library you are linking into your application. This can cause
|
||||
crashes or incorrect output.
|
||||
- If your application is pre-allocating a memory buffer and you are providing
|
||||
deflate/inflate init with your own allocator that allocates from that buffer
|
||||
(looking at you nginx), you should be aware that zlib-ng needs to allocate
|
||||
more memory than stock zlib needs. The same problem exists with Intel’s and
|
||||
Cloudflare’s zlib forks. Doing this is not recommended since it makes it
|
||||
very hard to maintain compatibility over time.
|
||||
|
||||
**Build Considerations:**
|
||||
- Compile against the *zlib.h* provided by zlib-ng
|
||||
- Configuration header is named *zconf.h*
|
||||
- Static library is *libz.a* on Unix and macOS, or *zlib.lib* on Windows
|
||||
- Shared library is *libz.so* on Unix, *libz.dylib* on macOS, or *zlib1.dll*
|
||||
on Windows
|
||||
- Type `z_size_t` is *unsigned __int64* on 64-bit Windows, and *unsigned long* on 32-bit Windows, Unix and macOS
|
||||
- Type `z_uintmax_t` is *unsigned long* in zlib-compat mode, and *size_t* with zlib-ng API
|
||||
|
||||
zlib-ng native mode
|
||||
-------------------
|
||||
Zlib-ng in native mode is suitable for co-existing with the standard zlib
|
||||
library, allowing applications to implement support and testing separately.
|
||||
|
||||
Zlib-ng native mode implements some modernizations and simplifications
|
||||
in its API, intended to make life easier for application developers.
|
||||
|
||||
**Advantages:**
|
||||
- Does not conflict with other zlib implementations, and can co-exist as a
|
||||
system library along with zlib.
|
||||
- In certain places zlib-ng native uses more appropriate data types, removing
|
||||
the need for some workarounds in the API compared to zlib.
|
||||
|
||||
**Disadvantages:**
|
||||
- Requires minor changes to applications to use the prefixed zlib-ng
|
||||
function calls and structs. Usually this means a small prefix `zng_` has to be added.
|
||||
|
||||
**Build Considerations:**
|
||||
- Compile against *zlib-ng.h*
|
||||
- Configuration header is named *zconf-ng.h*
|
||||
- Static library is *libz-ng.a* on Unix and macOS, or *zlib-ng.lib* on Windows
|
||||
- Shared library is *libz-ng.so* on Unix, *libz-ng.dylib* on macOS, or
|
||||
*zlib-ng2.dll* on Windows
|
||||
- Type `z_size_t` is *size_t*
|
||||
|
||||
zlib-ng compile-time detection
|
||||
------------------------------
|
||||
|
||||
To distinguish zlib-ng from other zlib implementations at compile-time check for the
|
||||
existence of `ZLIBNG_VERSION` defined in the zlib header.
|
||||
216
deps/zlib-ng/README.md
vendored
Normal file
216
deps/zlib-ng/README.md
vendored
Normal file
@@ -0,0 +1,216 @@
|
||||
| CI | Stable | Develop |
|
||||
|:---|:-------|:--------|
|
||||
| GitHub Actions | [](https://github.com/zlib-ng/zlib-ng/actions) <br> [](https://github.com/zlib-ng/zlib-ng/actions) <br> [](https://github.com/zlib-ng/zlib-ng/actions) | [](https://github.com/zlib-ng/zlib-ng/actions) <br> [](https://github.com/zlib-ng/zlib-ng/actions) <br> [](https://github.com/zlib-ng/zlib-ng/actions) |
|
||||
| CodeFactor | [](https://www.codefactor.io/repository/github/zlib-ng/zlib-ng/overview/stable) | [](https://www.codefactor.io/repository/github/zlib-ng/zlib-ng/overview/develop) |
|
||||
| OSS-Fuzz | [](https://bugs.chromium.org/p/oss-fuzz/issues/list?sort=-opened&can=1&q=proj:zlib-ng) | [](https://bugs.chromium.org/p/oss-fuzz/issues/list?sort=-opened&can=1&q=proj:zlib-ng) |
|
||||
| Codecov | [](https://codecov.io/github/zlib-ng/zlib-ng/tree/stable) | [](https://codecov.io/github/zlib-ng/zlib-ng/tree/develop) |
|
||||
|
||||
## zlib-ng
|
||||
*zlib data compression library for the next generation systems*
|
||||
|
||||
Maintained by Hans Kristian Rosbach
|
||||
aka Dead2 (zlib-ng àt circlestorm dót org)
|
||||
|
||||
Features
|
||||
--------
|
||||
|
||||
* Zlib compatible API with support for dual-linking
|
||||
* Modernized native API based on zlib API for ease of porting
|
||||
* Modern C11 syntax and a clean code layout
|
||||
* Deflate medium and quick algorithms based on Intel’s zlib fork
|
||||
* Support for CPU intrinsics when available
|
||||
* Adler32 implementation using SSSE3, AVX2, AVX512, AVX512-VNNI, Neon, VMX & VSX
|
||||
* CRC32-B implementation using PCLMULQDQ, VPCLMULQDQ, ACLE, & IBM Z
|
||||
* Hash table implementation using CRC32-C intrinsics on x86 and ARM
|
||||
* Slide hash implementations using SSE2, AVX2, Neon, VMX & VSX
|
||||
* Compare256 implementations using SSE2, AVX2, Neon, & POWER9
|
||||
* Inflate chunk copying using SSE2, SSSE3, AVX, Neon & VSX
|
||||
* Support for hardware-accelerated deflate using IBM Z DFLTCC
|
||||
* Unaligned memory read/writes and large bit buffer improvements
|
||||
* Includes improvements from Cloudflare and Intel forks
|
||||
* Configure, CMake, and NMake build system support
|
||||
* Comprehensive set of CMake unit tests
|
||||
* Code sanitizers, fuzzing, and coverage
|
||||
* GitHub Actions continuous integration on Windows, macOS, and Linux
|
||||
* Emulated CI for ARM, AARCH64, PPC, PPC64, RISCV, SPARC64, S390x using qemu
|
||||
|
||||
|
||||
History
|
||||
-------
|
||||
|
||||
The motivation for this fork was seeing several 3rd party contributions with new optimizations not getting
|
||||
implemented into the official zlib repository.
|
||||
|
||||
Mark Adler has been maintaining zlib for a very long time, and he has done a great job and hopefully he will continue
|
||||
for a long time yet. The idea of zlib-ng is not to replace zlib, but to co-exist as a drop-in replacement with a
|
||||
lower threshold for code change.
|
||||
|
||||
zlib has a long history and is incredibly portable, even supporting many systems that predate the Internet.<br>
|
||||
That is great, but it can complicate further development and maintainability. The zlib code contains many workarounds
|
||||
for really old compilers or to accommodate systems with limitations such as operating in a 16-bit environment.
|
||||
|
||||
Many of these workarounds are only maintenance burdens, some of them are pretty huge code-wise. With many workarounds
|
||||
cluttered throughout the code, it makes it harder for new programmers with an idea/interest for zlib to contribute.
|
||||
|
||||
I decided to make a fork, merge all the Intel optimizations, some of the Cloudflare optimizations, plus a couple other
|
||||
smaller patches. Then started cleaning out workarounds, various dead code, all contrib and example code.<br>
|
||||
The result is a better performing and easier to maintain zlib-ng.
|
||||
|
||||
A lot of improvements have gone into zlib-ng since its start, and numerous people and companies have contributed both
|
||||
small and big improvements, or valuable testing.
|
||||
|
||||
|
||||
Build
|
||||
-----
|
||||
<sup>Please read LICENSE.md, it is very simple and very liberal.</sup>
|
||||
|
||||
There are two ways to build zlib-ng:
|
||||
|
||||
### Cmake
|
||||
|
||||
To build zlib-ng using the cross-platform makefile generator CMake:
|
||||
|
||||
```
|
||||
cmake .
|
||||
cmake --build . --config Release
|
||||
ctest --verbose -C Release
|
||||
```
|
||||
|
||||
Alternatively, you can use the cmake configuration GUI tool ccmake:
|
||||
|
||||
```
|
||||
ccmake .
|
||||
```
|
||||
|
||||
### Configure
|
||||
|
||||
To build zlib-ng using the bash configure script:
|
||||
|
||||
```
|
||||
./configure
|
||||
make
|
||||
make test
|
||||
```
|
||||
|
||||
Build Options
|
||||
-------------
|
||||
|
||||
| CMake | configure | Description | Default |
|
||||
|:-------------------------|:-------------------------|:--------------------------------------------------------------------------------------|---------|
|
||||
| ZLIB_COMPAT | --zlib-compat | Compile with zlib compatible API | OFF |
|
||||
| ZLIB_ENABLE_TESTS | | Build test binaries | ON |
|
||||
| WITH_GZFILEOP | --without-gzfileops | Compile with support for gzFile related functions | ON |
|
||||
| WITH_OPTIM | --without-optimizations | Build with optimisations | ON |
|
||||
| WITH_NEW_STRATEGIES | --without-new-strategies | Use new strategies | ON |
|
||||
| WITH_NATIVE_INSTRUCTIONS | --native | Compiles with full instruction set supported on this host (gcc/clang -march=native) | OFF |
|
||||
| WITH_SANITIZER | | Build with sanitizer (memory, address, undefined) | OFF |
|
||||
| WITH_GTEST | | Build gtest_zlib | ON |
|
||||
| WITH_FUZZERS | | Build test/fuzz | OFF |
|
||||
| WITH_BENCHMARKS | | Build test/benchmarks | OFF |
|
||||
| WITH_MAINTAINER_WARNINGS | | Build with project maintainer warnings | OFF |
|
||||
| WITH_CODE_COVERAGE | | Enable code coverage reporting | OFF |
|
||||
|
||||
|
||||
Install
|
||||
-------
|
||||
|
||||
WARNING: We do not recommend manually installing unless you really know what you are doing, because this can
|
||||
potentially override the system default zlib library, and any incompatibility or wrong configuration of zlib-ng
|
||||
can make the whole system unusable, requiring recovery or reinstall.
|
||||
If you still want a manual install, we recommend using the /opt/ path prefix.
|
||||
|
||||
For Linux distros, an alternative way to use zlib-ng (if compiled in zlib-compat mode) instead of zlib, is through
|
||||
the use of the _LD_PRELOAD_ environment variable. If the program is dynamically linked with zlib, then the program
|
||||
will temporarily attempt to use zlib-ng instead, without risking system-wide instability.
|
||||
|
||||
```
|
||||
LD_PRELOAD=/opt/zlib-ng/libz.so.1.2.13.zlib-ng /usr/bin/program
|
||||
```
|
||||
|
||||
### Cmake
|
||||
|
||||
To install zlib-ng system-wide using cmake:
|
||||
|
||||
```
|
||||
cmake --build . --target install
|
||||
```
|
||||
|
||||
### Configure
|
||||
|
||||
To install zlib-ng system-wide using the configure script:
|
||||
|
||||
```
|
||||
make install
|
||||
```
|
||||
|
||||
### Vcpkg
|
||||
|
||||
Alternatively, you can build and install zlib-ng using the [vcpkg](https://github.com/Microsoft/vcpkg/) dependency manager:
|
||||
|
||||
```sh or powershell
|
||||
git clone https://github.com/Microsoft/vcpkg.git
|
||||
cd vcpkg
|
||||
./bootstrap-vcpkg.sh # "./bootstrap-vcpkg.bat" for powershell
|
||||
./vcpkg integrate install
|
||||
./vcpkg install zlib-ng
|
||||
```
|
||||
|
||||
The zlib-ng port in vcpkg is kept up to date by Microsoft team members and community contributors.
|
||||
If the version is out of date, please [create an issue or pull request](https://github.com/Microsoft/vcpkg) on the vcpkg repository.
|
||||
|
||||
Contributing
|
||||
------------
|
||||
|
||||
Zlib-ng is aiming to be open to contributions, and we would be delighted to receive pull requests on GitHub.
|
||||
Help with testing, reviewing pull requests, etc. is also very much appreciated.
|
||||
|
||||
Please check the Wiki for more info: [Contributing](https://github.com/zlib-ng/zlib-ng/wiki/Contributing)
|
||||
|
||||
Acknowledgments
|
||||
----------------
|
||||
|
||||
Thanks go out to all the people and companies who have taken the time to contribute
|
||||
code reviews, testing and/or patches. Zlib-ng would not have been nearly as good without you.
|
||||
|
||||
The deflate format used by zlib was defined by Phil Katz.<br>
|
||||
The deflate and zlib specifications were written by L. Peter Deutsch.
|
||||
|
||||
zlib was originally created by Jean-loup Gailly (compression) and Mark Adler (decompression).
|
||||
|
||||
|
||||
Advanced Build Options
|
||||
----------------------
|
||||
|
||||
| CMake | configure | Description | Default |
|
||||
|:--------------------------------|:----------------------|:--------------------------------------------------------------------|------------------------|
|
||||
| FORCE_SSE2 | --force-sse2 | Skip runtime check for SSE2 instructions (Always on for x86_64) | OFF (x86) |
|
||||
| WITH_AVX2 | | Build with AVX2 intrinsics | ON |
|
||||
| WITH_AVX512 | | Build with AVX512 intrinsics | ON |
|
||||
| WITH_AVX512VNNI | | Build with AVX512VNNI intrinsics | ON |
|
||||
| WITH_SSE2 | | Build with SSE2 intrinsics | ON |
|
||||
| WITH_SSSE3 | | Build with SSSE3 intrinsics | ON |
|
||||
| WITH_SSE42 | | Build with SSE42 intrinsics | ON |
|
||||
| WITH_PCLMULQDQ | | Build with PCLMULQDQ intrinsics | ON |
|
||||
| WITH_VPCLMULQDQ | --without-vpclmulqdq | Build with VPCLMULQDQ intrinsics | ON |
|
||||
| WITH_ACLE | --without-acle | Build with ACLE intrinsics | ON |
|
||||
| WITH_NEON | --without-neon | Build with NEON intrinsics | ON |
|
||||
| WITH_ALTIVEC | --without-altivec | Build with AltiVec (VMX) intrinsics | ON |
|
||||
| WITH_POWER8 | --without-power8 | Build with POWER8 optimisations | ON |
|
||||
| WITH_RVV | | Build with RVV intrinsics | ON |
|
||||
| WITH_CRC32_VX | --without-crc32-vx | Build with vectorized CRC32 on IBM Z | ON |
|
||||
| WITH_DFLTCC_DEFLATE | --with-dfltcc-deflate | Build with DFLTCC intrinsics for compression on IBM Z | OFF |
|
||||
| WITH_DFLTCC_INFLATE | --with-dfltcc-inflate | Build with DFLTCC intrinsics for decompression on IBM Z | OFF |
|
||||
| WITH_UNALIGNED | --without-unaligned | Allow optimizations that use unaligned reads if safe on current arch| ON |
|
||||
| WITH_INFLATE_STRICT | | Build with strict inflate distance checking | OFF |
|
||||
| WITH_INFLATE_ALLOW_INVALID_DIST | | Build with zero fill for inflate invalid distances | OFF |
|
||||
| INSTALL_UTILS | | Copy minigzip and minideflate during install | OFF |
|
||||
| ZLIBNG_ENABLE_TESTS | | Test zlib-ng specific API | ON |
|
||||
|
||||
|
||||
Related Projects
|
||||
----------------
|
||||
|
||||
* Fork of the popular minizip https://github.com/zlib-ng/minizip-ng
|
||||
* Python tool to benchmark minigzip/minideflate https://github.com/zlib-ng/deflatebench
|
||||
* Python tool to benchmark pigz https://github.com/zlib-ng/pigzbench
|
||||
* 3rd party patches for zlib-ng compatibility https://github.com/zlib-ng/patches
|
||||
115
deps/zlib-ng/adler32.c
vendored
Normal file
115
deps/zlib-ng/adler32.c
vendored
Normal file
@@ -0,0 +1,115 @@
|
||||
/* adler32.c -- compute the Adler-32 checksum of a data stream
|
||||
* Copyright (C) 1995-2011, 2016 Mark Adler
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#include "zbuild.h"
|
||||
#include "functable.h"
|
||||
#include "adler32_p.h"
|
||||
|
||||
/* ========================================================================= */
|
||||
/* Portable C Adler-32.
 *
 * adler : running checksum to continue from
 * buf   : input bytes; a NULL pointer yields the initial checksum value 1
 *         (unless len == 1, which is dispatched before the NULL test)
 * len   : number of bytes available at buf
 *
 * Returns the updated checksum, low sum in bits 0..15 and the sum of sums in
 * bits 16..31.
 */
Z_INTERNAL uint32_t adler32_c(uint32_t adler, const uint8_t *buf, size_t len) {
    /* Unpack the two 16-bit component sums. */
    uint32_t sum2 = (adler >> 16) & 0xffff;
    adler &= 0xffff;

    /* Single-byte updates get their own fast path, tested first. */
    if (UNLIKELY(len == 1))
        return adler32_len_1(adler, buf, sum2);

    /* NULL buffer: hand back the initial Adler-32 value. */
    if (UNLIKELY(buf == NULL))
        return 1L;

    /* Short inputs skip the block machinery entirely. */
    if (UNLIKELY(len < 16))
        return adler32_len_16(adler, buf, len, sum2);

    /* Whole NMAX-sized blocks: the sums are only reduced once per block. */
    for (; len >= NMAX; len -= NMAX) {
#ifdef UNROLL_MORE
        unsigned rounds = NMAX / 16;    /* NMAX is divisible by 16 */
#else
        unsigned rounds = NMAX / 8;     /* NMAX is divisible by 8 */
#endif
        for (; rounds != 0; --rounds) {
#ifdef UNROLL_MORE
            DO16(adler, sum2, buf);     /* 16 bytes per round */
            buf += 16;
#else
            DO8(adler, sum2, buf, 0);   /* 8 bytes per round */
            buf += 8;
#endif
        }
        adler %= BASE;
        sum2 %= BASE;
    }

    /* Fewer than NMAX bytes remain; the helper performs the final reduction. */
    return adler32_len_64(adler, buf, len, sum2);
}
|
||||
|
||||
#ifdef ZLIB_COMPAT
|
||||
unsigned long Z_EXPORT PREFIX(adler32_z)(unsigned long adler, const unsigned char *buf, size_t len) {
|
||||
return (unsigned long)functable.adler32((uint32_t)adler, buf, len);
|
||||
}
|
||||
#else
|
||||
uint32_t Z_EXPORT PREFIX(adler32_z)(uint32_t adler, const unsigned char *buf, size_t len) {
|
||||
return functable.adler32(adler, buf, len);
|
||||
}
|
||||
#endif
|
||||
|
||||
/* ========================================================================= */
|
||||
#ifdef ZLIB_COMPAT
|
||||
unsigned long Z_EXPORT PREFIX(adler32)(unsigned long adler, const unsigned char *buf, unsigned int len) {
|
||||
return (unsigned long)functable.adler32((uint32_t)adler, buf, len);
|
||||
}
|
||||
#else
|
||||
uint32_t Z_EXPORT PREFIX(adler32)(uint32_t adler, const unsigned char *buf, uint32_t len) {
|
||||
return functable.adler32(adler, buf, len);
|
||||
}
|
||||
#endif
|
||||
|
||||
/* ========================================================================= */
|
||||
/* Combine two Adler-32 values.
 *
 * adler1 : checksum of the first data block
 * adler2 : checksum of the second data block
 * len2   : length in bytes of the second block
 *
 * Returns the checksum of the two blocks concatenated, or 0xffffffff when
 * len2 is negative (an invalid checksum, meant as a debugging hint).
 */
static uint32_t adler32_combine_(uint32_t adler1, uint32_t adler2, z_off64_t len2) {
    const unsigned long twice_base = (unsigned long)BASE << 1;
    uint32_t lo;
    uint32_t hi;
    unsigned rem;

    if (len2 < 0)
        return 0xffffffff;

    /* Only len2 modulo BASE matters; it is known to be non-negative here. */
    rem = (unsigned)(len2 % BASE);

    lo = adler1 & 0xffff;
    hi = (rem * lo) % BASE;

    lo += (adler2 & 0xffff) + BASE - 1;
    hi += ((adler1 >> 16) & 0xffff) + ((adler2 >> 16) & 0xffff) + BASE - rem;

    /* Bring both sums back below BASE with conditional subtractions. */
    if (lo >= BASE)
        lo -= BASE;
    if (lo >= BASE)
        lo -= BASE;
    if (hi >= twice_base)
        hi -= twice_base;
    if (hi >= BASE)
        hi -= BASE;

    return (hi << 16) | lo;
}
|
||||
|
||||
/* ========================================================================= */
|
||||
#ifdef ZLIB_COMPAT
|
||||
unsigned long Z_EXPORT PREFIX(adler32_combine)(unsigned long adler1, unsigned long adler2, z_off_t len2) {
|
||||
return (unsigned long)adler32_combine_((uint32_t)adler1, (uint32_t)adler2, len2);
|
||||
}
|
||||
|
||||
unsigned long Z_EXPORT PREFIX4(adler32_combine)(unsigned long adler1, unsigned long adler2, z_off64_t len2) {
|
||||
return (unsigned long)adler32_combine_((uint32_t)adler1, (uint32_t)adler2, len2);
|
||||
}
|
||||
#else
|
||||
uint32_t Z_EXPORT PREFIX4(adler32_combine)(uint32_t adler1, uint32_t adler2, z_off64_t len2) {
|
||||
return adler32_combine_(adler1, adler2, len2);
|
||||
}
|
||||
#endif
|
||||
16
deps/zlib-ng/adler32_fold.c
vendored
Normal file
16
deps/zlib-ng/adler32_fold.c
vendored
Normal file
@@ -0,0 +1,16 @@
|
||||
/* adler32_fold.c -- adler32 folding interface
|
||||
* Copyright (C) 2022 Adam Stylinski
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#include "zbuild.h"
|
||||
#include "functable.h"
|
||||
#include "adler32_fold.h"
|
||||
|
||||
#include <limits.h>
|
||||
|
||||
/* Generic checksum-and-copy: continue the Adler-32 in `adler` over the `len`
 * bytes at `src` (via the functable implementation), copy those bytes to
 * `dst`, and return the updated checksum. */
Z_INTERNAL uint32_t adler32_fold_copy_c(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
    const uint32_t updated = functable.adler32(adler, src, len);

    memcpy(dst, src, len);
    return updated;
}
|
||||
11
deps/zlib-ng/adler32_fold.h
vendored
Normal file
11
deps/zlib-ng/adler32_fold.h
vendored
Normal file
@@ -0,0 +1,11 @@
|
||||
/* adler32_fold.h -- adler32 folding interface
|
||||
* Copyright (C) 2022 Adam Stylinski
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#ifndef ADLER32_FOLD_H_
|
||||
#define ADLER32_FOLD_H_
|
||||
|
||||
/* Generic C checksum-and-copy routine: updates `adler` over the `len` bytes at
 * `src`, copies them to `dst`, and returns the updated Adler-32. */
Z_INTERNAL uint32_t adler32_fold_copy_c(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
|
||||
|
||||
#endif
|
||||
70
deps/zlib-ng/adler32_p.h
vendored
Normal file
70
deps/zlib-ng/adler32_p.h
vendored
Normal file
@@ -0,0 +1,70 @@
|
||||
/* adler32_p.h -- Private inline functions and macros shared with
|
||||
* different computation of the Adler-32 checksum
|
||||
* of a data stream.
|
||||
* Copyright (C) 1995-2011, 2016 Mark Adler
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#ifndef ADLER32_P_H
|
||||
#define ADLER32_P_H
|
||||
|
||||
#define BASE 65521U /* largest prime smaller than 65536 */
|
||||
#define NMAX 5552
|
||||
/* NMAX is the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 */
|
||||
|
||||
#define DO1(sum1, sum2, buf, i) {(sum1) += buf[(i)]; (sum2) += (sum1);}
|
||||
#define DO2(sum1, sum2, buf, i) {DO1(sum1, sum2, buf, i); DO1(sum1, sum2, buf, i+1);}
|
||||
#define DO4(sum1, sum2, buf, i) {DO2(sum1, sum2, buf, i); DO2(sum1, sum2, buf, i+2);}
|
||||
#define DO8(sum1, sum2, buf, i) {DO4(sum1, sum2, buf, i); DO4(sum1, sum2, buf, i+4);}
|
||||
#define DO16(sum1, sum2, buf) {DO8(sum1, sum2, buf, 0); DO8(sum1, sum2, buf, 8);}
|
||||
|
||||
static inline uint32_t adler32_len_1(uint32_t adler, const uint8_t *buf, uint32_t sum2) {
|
||||
adler += buf[0];
|
||||
adler %= BASE;
|
||||
sum2 += adler;
|
||||
sum2 %= BASE;
|
||||
return adler | (sum2 << 16);
|
||||
}
|
||||
|
||||
static inline uint32_t adler32_len_16(uint32_t adler, const uint8_t *buf, size_t len, uint32_t sum2) {
|
||||
while (len) {
|
||||
--len;
|
||||
adler += *buf++;
|
||||
sum2 += adler;
|
||||
}
|
||||
adler %= BASE;
|
||||
sum2 %= BASE; /* only added so many BASE's */
|
||||
/* return recombined sums */
|
||||
return adler | (sum2 << 16);
|
||||
}
|
||||
|
||||
static inline uint32_t adler32_copy_len_16(uint32_t adler, const uint8_t *buf, uint8_t *dst, size_t len, uint32_t sum2) {
|
||||
while (len--) {
|
||||
*dst = *buf++;
|
||||
adler += *dst++;
|
||||
sum2 += adler;
|
||||
}
|
||||
adler %= BASE;
|
||||
sum2 %= BASE; /* only added so many BASE's */
|
||||
/* return recombined sums */
|
||||
return adler | (sum2 << 16);
|
||||
}
|
||||
|
||||
/* Unrolled update for a trailing run of bytes.
 * Consumes the input in unrolled groups (16 bytes with UNROLL_MORE, otherwise
 * 8), then hands the remainder to adler32_len_16, which also performs the
 * modulo reduction and recombines the sums. */
static inline uint32_t adler32_len_64(uint32_t adler, const uint8_t *buf, size_t len, uint32_t sum2) {
#ifdef UNROLL_MORE
    for (; len >= 16; len -= 16, buf += 16) {
        DO16(adler, sum2, buf);
    }
#else
    for (; len >= 8; len -= 8, buf += 8) {
        DO8(adler, sum2, buf, 0);
    }
#endif
    /* Tail: fewer bytes than one unrolled group remain. */
    return adler32_len_16(adler, buf, len, sum2);
}
|
||||
|
||||
#endif /* ADLER32_P_H */
|
||||
2
deps/zlib-ng/arch/.gitignore
vendored
Normal file
2
deps/zlib-ng/arch/.gitignore
vendored
Normal file
@@ -0,0 +1,2 @@
|
||||
# ignore Makefiles; they're all automatically generated
|
||||
Makefile
|
||||
77
deps/zlib-ng/arch/arm/Makefile.in
vendored
Normal file
77
deps/zlib-ng/arch/arm/Makefile.in
vendored
Normal file
@@ -0,0 +1,77 @@
|
||||
# Makefile for zlib
|
||||
# Copyright (C) 1995-2013 Jean-loup Gailly, Mark Adler
|
||||
# For conditions of distribution and use, see copyright notice in zlib.h
|
||||
|
||||
CC=
|
||||
CFLAGS=
|
||||
SFLAGS=
|
||||
INCLUDES=
|
||||
SUFFIX=
|
||||
|
||||
ACLEFLAG=
|
||||
NEONFLAG=
|
||||
NOLTOFLAG=
|
||||
|
||||
SRCDIR=.
|
||||
SRCTOP=../..
|
||||
TOPDIR=$(SRCTOP)
|
||||
|
||||
all: \
|
||||
adler32_neon.o adler32_neon.lo \
|
||||
arm_features.o arm_features.lo \
|
||||
chunkset_neon.o chunkset_neon.lo \
|
||||
compare256_neon.o compare256_neon.lo \
|
||||
crc32_acle.o crc32_acle.lo \
|
||||
slide_hash_neon.o slide_hash_neon.lo \
|
||||
insert_string_acle.o insert_string_acle.lo
|
||||
|
||||
adler32_neon.o:
|
||||
$(CC) $(CFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_neon.c
|
||||
|
||||
adler32_neon.lo:
|
||||
$(CC) $(SFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_neon.c
|
||||
|
||||
arm_features.o:
|
||||
$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/arm_features.c
|
||||
|
||||
arm_features.lo:
|
||||
$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/arm_features.c
|
||||
|
||||
chunkset_neon.o:
|
||||
$(CC) $(CFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_neon.c
|
||||
|
||||
chunkset_neon.lo:
|
||||
$(CC) $(SFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_neon.c
|
||||
|
||||
compare256_neon.o:
|
||||
$(CC) $(CFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_neon.c
|
||||
|
||||
compare256_neon.lo:
|
||||
$(CC) $(SFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_neon.c
|
||||
|
||||
crc32_acle.o:
|
||||
$(CC) $(CFLAGS) $(ACLEFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_acle.c
|
||||
|
||||
crc32_acle.lo:
|
||||
$(CC) $(SFLAGS) $(ACLEFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_acle.c
|
||||
|
||||
slide_hash_neon.o:
|
||||
$(CC) $(CFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_neon.c
|
||||
|
||||
slide_hash_neon.lo:
|
||||
$(CC) $(SFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_neon.c
|
||||
|
||||
insert_string_acle.o:
|
||||
$(CC) $(CFLAGS) $(ACLEFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_acle.c
|
||||
|
||||
insert_string_acle.lo:
|
||||
$(CC) $(SFLAGS) $(ACLEFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_acle.c
|
||||
|
||||
mostlyclean: clean
|
||||
clean:
|
||||
rm -f *.o *.lo *~
|
||||
rm -rf objs
|
||||
rm -f *.gcda *.gcno *.gcov
|
||||
|
||||
distclean: clean
|
||||
rm -f Makefile
|
||||
215
deps/zlib-ng/arch/arm/adler32_neon.c
vendored
Normal file
215
deps/zlib-ng/arch/arm/adler32_neon.c
vendored
Normal file
@@ -0,0 +1,215 @@
|
||||
/* Copyright (C) 1995-2011, 2016 Mark Adler
|
||||
* Copyright (C) 2017 ARM Holdings Inc.
|
||||
* Authors:
|
||||
* Adenilson Cavalcanti <adenilson.cavalcanti@arm.com>
|
||||
* Adam Stylinski <kungfujesus06@gmail.com>
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
#ifdef ARM_NEON
|
||||
#include "neon_intrins.h"
|
||||
#include "../../zbuild.h"
|
||||
#include "../../adler32_p.h"
|
||||
|
||||
static void NEON_accum32(uint32_t *s, const uint8_t *buf, size_t len) {
|
||||
static const uint16_t ALIGNED_(16) taps[64] = {
|
||||
64, 63, 62, 61, 60, 59, 58, 57,
|
||||
56, 55, 54, 53, 52, 51, 50, 49,
|
||||
48, 47, 46, 45, 44, 43, 42, 41,
|
||||
40, 39, 38, 37, 36, 35, 34, 33,
|
||||
32, 31, 30, 29, 28, 27, 26, 25,
|
||||
24, 23, 22, 21, 20, 19, 18, 17,
|
||||
16, 15, 14, 13, 12, 11, 10, 9,
|
||||
8, 7, 6, 5, 4, 3, 2, 1 };
|
||||
|
||||
uint32x4_t adacc = vdupq_n_u32(0);
|
||||
uint32x4_t s2acc = vdupq_n_u32(0);
|
||||
uint32x4_t s2acc_0 = vdupq_n_u32(0);
|
||||
uint32x4_t s2acc_1 = vdupq_n_u32(0);
|
||||
uint32x4_t s2acc_2 = vdupq_n_u32(0);
|
||||
|
||||
adacc = vsetq_lane_u32(s[0], adacc, 0);
|
||||
s2acc = vsetq_lane_u32(s[1], s2acc, 0);
|
||||
|
||||
uint32x4_t s3acc = vdupq_n_u32(0);
|
||||
uint32x4_t adacc_prev = adacc;
|
||||
|
||||
uint16x8_t s2_0, s2_1, s2_2, s2_3;
|
||||
s2_0 = s2_1 = s2_2 = s2_3 = vdupq_n_u16(0);
|
||||
|
||||
uint16x8_t s2_4, s2_5, s2_6, s2_7;
|
||||
s2_4 = s2_5 = s2_6 = s2_7 = vdupq_n_u16(0);
|
||||
|
||||
size_t num_iter = len >> 2;
|
||||
int rem = len & 3;
|
||||
|
||||
for (size_t i = 0; i < num_iter; ++i) {
|
||||
uint8x16x4_t d0_d3 = vld1q_u8_x4(buf);
|
||||
|
||||
/* Unfortunately it doesn't look like there's a direct sum 8 bit to 32
|
||||
* bit instruction, we'll have to make due summing to 16 bits first */
|
||||
uint16x8x2_t hsum, hsum_fold;
|
||||
hsum.val[0] = vpaddlq_u8(d0_d3.val[0]);
|
||||
hsum.val[1] = vpaddlq_u8(d0_d3.val[1]);
|
||||
|
||||
hsum_fold.val[0] = vpadalq_u8(hsum.val[0], d0_d3.val[2]);
|
||||
hsum_fold.val[1] = vpadalq_u8(hsum.val[1], d0_d3.val[3]);
|
||||
|
||||
adacc = vpadalq_u16(adacc, hsum_fold.val[0]);
|
||||
s3acc = vaddq_u32(s3acc, adacc_prev);
|
||||
adacc = vpadalq_u16(adacc, hsum_fold.val[1]);
|
||||
|
||||
/* If we do straight widening additions to the 16 bit values, we don't incur
|
||||
* the usual penalties of a pairwise add. We can defer the multiplications
|
||||
* until the very end. These will not overflow because we are incurring at
|
||||
* most 408 loop iterations (NMAX / 64), and a given lane is only going to be
|
||||
* summed into once. This means for the maximum input size, the largest value
|
||||
* we will see is 255 * 102 = 26010, safely under uint16 max */
|
||||
s2_0 = vaddw_u8(s2_0, vget_low_u8(d0_d3.val[0]));
|
||||
s2_1 = vaddw_high_u8(s2_1, d0_d3.val[0]);
|
||||
s2_2 = vaddw_u8(s2_2, vget_low_u8(d0_d3.val[1]));
|
||||
s2_3 = vaddw_high_u8(s2_3, d0_d3.val[1]);
|
||||
s2_4 = vaddw_u8(s2_4, vget_low_u8(d0_d3.val[2]));
|
||||
s2_5 = vaddw_high_u8(s2_5, d0_d3.val[2]);
|
||||
s2_6 = vaddw_u8(s2_6, vget_low_u8(d0_d3.val[3]));
|
||||
s2_7 = vaddw_high_u8(s2_7, d0_d3.val[3]);
|
||||
|
||||
adacc_prev = adacc;
|
||||
buf += 64;
|
||||
}
|
||||
|
||||
s3acc = vshlq_n_u32(s3acc, 6);
|
||||
|
||||
if (rem) {
|
||||
uint32x4_t s3acc_0 = vdupq_n_u32(0);
|
||||
while (rem--) {
|
||||
uint8x16_t d0 = vld1q_u8(buf);
|
||||
uint16x8_t adler;
|
||||
adler = vpaddlq_u8(d0);
|
||||
s2_6 = vaddw_u8(s2_6, vget_low_u8(d0));
|
||||
s2_7 = vaddw_high_u8(s2_7, d0);
|
||||
adacc = vpadalq_u16(adacc, adler);
|
||||
s3acc_0 = vaddq_u32(s3acc_0, adacc_prev);
|
||||
adacc_prev = adacc;
|
||||
buf += 16;
|
||||
}
|
||||
|
||||
s3acc_0 = vshlq_n_u32(s3acc_0, 4);
|
||||
s3acc = vaddq_u32(s3acc_0, s3acc);
|
||||
}
|
||||
|
||||
uint16x8x4_t t0_t3 = vld1q_u16_x4(taps);
|
||||
uint16x8x4_t t4_t7 = vld1q_u16_x4(taps + 32);
|
||||
|
||||
s2acc = vmlal_high_u16(s2acc, t0_t3.val[0], s2_0);
|
||||
s2acc_0 = vmlal_u16(s2acc_0, vget_low_u16(t0_t3.val[0]), vget_low_u16(s2_0));
|
||||
s2acc_1 = vmlal_high_u16(s2acc_1, t0_t3.val[1], s2_1);
|
||||
s2acc_2 = vmlal_u16(s2acc_2, vget_low_u16(t0_t3.val[1]), vget_low_u16(s2_1));
|
||||
|
||||
s2acc = vmlal_high_u16(s2acc, t0_t3.val[2], s2_2);
|
||||
s2acc_0 = vmlal_u16(s2acc_0, vget_low_u16(t0_t3.val[2]), vget_low_u16(s2_2));
|
||||
s2acc_1 = vmlal_high_u16(s2acc_1, t0_t3.val[3], s2_3);
|
||||
s2acc_2 = vmlal_u16(s2acc_2, vget_low_u16(t0_t3.val[3]), vget_low_u16(s2_3));
|
||||
|
||||
s2acc = vmlal_high_u16(s2acc, t4_t7.val[0], s2_4);
|
||||
s2acc_0 = vmlal_u16(s2acc_0, vget_low_u16(t4_t7.val[0]), vget_low_u16(s2_4));
|
||||
s2acc_1 = vmlal_high_u16(s2acc_1, t4_t7.val[1], s2_5);
|
||||
s2acc_2 = vmlal_u16(s2acc_2, vget_low_u16(t4_t7.val[1]), vget_low_u16(s2_5));
|
||||
|
||||
s2acc = vmlal_high_u16(s2acc, t4_t7.val[2], s2_6);
|
||||
s2acc_0 = vmlal_u16(s2acc_0, vget_low_u16(t4_t7.val[2]), vget_low_u16(s2_6));
|
||||
s2acc_1 = vmlal_high_u16(s2acc_1, t4_t7.val[3], s2_7);
|
||||
s2acc_2 = vmlal_u16(s2acc_2, vget_low_u16(t4_t7.val[3]), vget_low_u16(s2_7));
|
||||
|
||||
s2acc = vaddq_u32(s2acc_0, s2acc);
|
||||
s2acc_2 = vaddq_u32(s2acc_1, s2acc_2);
|
||||
s2acc = vaddq_u32(s2acc, s2acc_2);
|
||||
|
||||
uint32x2_t adacc2, s2acc2, as;
|
||||
s2acc = vaddq_u32(s2acc, s3acc);
|
||||
adacc2 = vpadd_u32(vget_low_u32(adacc), vget_high_u32(adacc));
|
||||
s2acc2 = vpadd_u32(vget_low_u32(s2acc), vget_high_u32(s2acc));
|
||||
as = vpadd_u32(adacc2, s2acc2);
|
||||
s[0] = vget_lane_u32(as, 0);
|
||||
s[1] = vget_lane_u32(as, 1);
|
||||
}
|
||||
|
||||
/* Scalar Adler-32 update: add `len` bytes from `buf` into pair[0] (byte sum)
 * and accumulate the running value of pair[0] into pair[1]. No modulo
 * reduction is performed here; the caller is responsible for it.
 *
 * The index is a size_t so that it has the same width as `len`; a narrower
 * unsigned index would wrap and never reach a length above UINT_MAX.
 */
static void NEON_handle_tail(uint32_t *pair, const uint8_t *buf, size_t len) {
    size_t i;

    for (i = 0; i < len; ++i) {
        pair[0] += buf[i];
        pair[1] += pair[0];
    }
}
|
||||
|
||||
/* Compute the Adler-32 checksum of `len` bytes at `buf`, continuing from the
 * checksum `adler`, using NEON for the bulk of the data.
 *
 * Returns the updated checksum, or 1 (the initial Adler-32 value) when `buf`
 * is NULL and `len` is not 1.
 *
 * Byte counts are tracked as size_t throughout: the previous int/unsigned
 * bookkeeping truncated for very large buffers, which made the vector loop
 * exit early and left the remainder to the unreduced scalar tail.
 */
Z_INTERNAL uint32_t adler32_neon(uint32_t adler, const uint8_t *buf, size_t len) {
    /* split Adler-32 into component sums */
    uint32_t sum2 = (adler >> 16) & 0xffff;
    adler &= 0xffff;

    /* in case user likes doing a byte at a time, keep it fast */
    if (len == 1)
        return adler32_len_1(adler, buf, sum2);

    /* initial Adler-32 value (deferred check for len == 1 speed) */
    if (buf == NULL)
        return 1L;

    /* in case short lengths are provided, keep it somewhat fast */
    if (len < 16)
        return adler32_len_16(adler, buf, len, sum2);

    uint32_t pair[2];
    size_t n = NMAX;      /* byte budget before the sums must be reduced */
    size_t done = 0;      /* bytes consumed so far */

    /* Split Adler-32 into component sums, it can be supplied by
     * the caller sites (e.g. in a PNG file).
     */
    pair[0] = adler;
    pair[1] = sum2;

    /* If memory is not SIMD aligned, do scalar sums to an aligned
     * offset, provided that doing so doesn't completely eliminate
     * SIMD operation. Aligned loads are still faster on ARM, even
     * though there's no explicit aligned load instruction */
    unsigned int align_offset = ((uintptr_t)buf & 15);
    unsigned int align_adj = (align_offset) ? 16 - align_offset : 0;

    if (align_offset && len >= (16 + align_adj)) {
        NEON_handle_tail(pair, buf, align_adj);
        /* The scalar prefix is not reduced, so it eats into the first
         * vector pass's budget. */
        n -= align_adj;
        done += align_adj;
    } else {
        /* If here, we failed the len criteria test, it wouldn't be
         * worthwhile to do scalar aligning sums */
        align_adj = 0;
    }

    while (done < len) {
        size_t remaining = len - done;
        /* Only the first pass uses the (possibly shortened) budget. */
        size_t budget = (done == align_adj) ? n : (size_t)NMAX;

        n = (remaining < budget) ? remaining : budget;

        if (n < 16)
            break;

        /* NEON_accum32 takes its length in 16-byte blocks. */
        NEON_accum32(pair, buf + done, n >> 4);
        pair[0] %= BASE;
        pair[1] %= BASE;

        /* Only whole 16-byte blocks were consumed. */
        done += (n >> 4) << 4;
    }

    /* Handle the tail elements. */
    if (done < len) {
        NEON_handle_tail(pair, (buf + done), len - done);
        pair[0] %= BASE;
        pair[1] %= BASE;
    }

    /* D = B * 65536 + A, see: https://en.wikipedia.org/wiki/Adler-32. */
    return (pair[1] << 16) | pair[0];
}
|
||||
|
||||
#endif
|
||||
82
deps/zlib-ng/arch/arm/arm_features.c
vendored
Normal file
82
deps/zlib-ng/arch/arm/arm_features.c
vendored
Normal file
@@ -0,0 +1,82 @@
|
||||
#include "../../zbuild.h"
|
||||
#include "arm_features.h"
|
||||
|
||||
#if defined(__linux__) && defined(HAVE_SYS_AUXV_H)
|
||||
# include <sys/auxv.h>
|
||||
# ifdef ARM_ASM_HWCAP
|
||||
# include <asm/hwcap.h>
|
||||
# endif
|
||||
#elif defined(__FreeBSD__) && defined(__aarch64__)
|
||||
# include <machine/armreg.h>
|
||||
# ifndef ID_AA64ISAR0_CRC32_VAL
|
||||
# define ID_AA64ISAR0_CRC32_VAL ID_AA64ISAR0_CRC32
|
||||
# endif
|
||||
#elif defined(__APPLE__)
|
||||
# if !defined(_DARWIN_C_SOURCE)
|
||||
# define _DARWIN_C_SOURCE /* enable types aliases (eg u_int) */
|
||||
# endif
|
||||
# include <sys/sysctl.h>
|
||||
#elif defined(_WIN32)
|
||||
# include <windows.h>
|
||||
#endif
|
||||
|
||||
/* Report whether the CRC32 instructions are usable on the running CPU.
 *
 * Returns non-zero when the platform-specific probe selected at compile time
 * reports support, and 0 when it does not or when no probe is available.
 * Defining ARM_NOCHECK_ACLE makes the function return 1 on platforms that
 * have no probe of their own.
 *
 * Declared with (void) so the definition is a real prototype in C.
 */
static int arm_has_crc32(void) {
#if defined(__linux__) && defined(ARM_AUXV_HAS_CRC32)
#  ifdef HWCAP_CRC32
    return (getauxval(AT_HWCAP) & HWCAP_CRC32) != 0 ? 1 : 0;
#  else
    return (getauxval(AT_HWCAP2) & HWCAP2_CRC32) != 0 ? 1 : 0;
#  endif
#elif defined(__FreeBSD__) && defined(__aarch64__)
    /* The ID register is only consulted when QEMU_EMULATING is unset. */
    return getenv("QEMU_EMULATING") == NULL
      && ID_AA64ISAR0_CRC32_VAL(READ_SPECIALREG(id_aa64isar0_el1)) >= ID_AA64ISAR0_CRC32_BASE;
#elif defined(__APPLE__)
    int hascrc32;
    size_t size = sizeof(hascrc32);
    /* hascrc32 is only read when sysctlbyname succeeded and filled it in. */
    return sysctlbyname("hw.optional.armv8_crc32", &hascrc32, &size, NULL, 0) == 0
      && hascrc32 == 1;
#elif defined(_WIN32)
    return IsProcessorFeaturePresent(PF_ARM_V8_CRC32_INSTRUCTIONS_AVAILABLE);
#elif defined(ARM_NOCHECK_ACLE)
    return 1;
#else
    return 0;
#endif
}
|
||||
|
||||
/* AArch64 has neon. */
|
||||
#if !defined(__aarch64__) && !defined(_M_ARM64)
|
||||
/* Report whether NEON is usable on the running CPU.
 *
 * Returns 1 when the platform probe selected at compile time reports support.
 * When no probe applies, the result is 1 if ARM_NOCHECK_NEON is defined and
 * 0 otherwise.
 *
 * Declared with (void) so the definition is a real prototype in C.
 */
static inline int arm_has_neon(void) {
#if defined(__linux__) && defined(ARM_AUXV_HAS_NEON)
#  ifdef HWCAP_ARM_NEON
    return (getauxval(AT_HWCAP) & HWCAP_ARM_NEON) != 0 ? 1 : 0;
#  else
    return (getauxval(AT_HWCAP) & HWCAP_NEON) != 0 ? 1 : 0;
#  endif
#elif defined(__APPLE__)
    int hasneon;
    size_t size = sizeof(hasneon);
    /* hasneon is only read when sysctlbyname succeeded and filled it in. */
    return sysctlbyname("hw.optional.neon", &hasneon, &size, NULL, 0) == 0
      && hasneon == 1;
#elif defined(_M_ARM) && defined(WINAPI_FAMILY_PARTITION)
#  if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_PHONE_APP)
    return 1; /* Always supported */
#  endif
#endif

    /* Fallback for platforms without a probe above. */
#if defined(ARM_NOCHECK_NEON)
    return 1;
#else
    return 0;
#endif
}
|
||||
#endif
|
||||
|
||||
/* Populate `features` with the CPU capabilities this build can detect.
 * has_crc32 always comes from the run-time probe; has_neon is fixed at 1 on
 * AArch64 builds and probed at run time otherwise.
 */
void Z_INTERNAL arm_check_features(struct arm_cpu_features *features) {
    features->has_crc32 = arm_has_crc32();

#if !defined(__aarch64__) && !defined(_M_ARM64)
    features->has_neon = arm_has_neon();
#else
    features->has_neon = 1; /* always available */
#endif
}
|
||||
15
deps/zlib-ng/arch/arm/arm_features.h
vendored
Normal file
15
deps/zlib-ng/arch/arm/arm_features.h
vendored
Normal file
@@ -0,0 +1,15 @@
|
||||
/* arm_features.h -- check for ARM features.
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#ifndef ARM_H_
|
||||
#define ARM_H_
|
||||
|
||||
struct arm_cpu_features {
|
||||
int has_neon;
|
||||
int has_crc32;
|
||||
};
|
||||
|
||||
void Z_INTERNAL arm_check_features(struct arm_cpu_features *features);
|
||||
|
||||
#endif /* ARM_H_ */
|
||||
101
deps/zlib-ng/arch/arm/chunkset_neon.c
vendored
Normal file
101
deps/zlib-ng/arch/arm/chunkset_neon.c
vendored
Normal file
@@ -0,0 +1,101 @@
|
||||
/* chunkset_neon.c -- NEON inline functions to copy small data chunks.
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#ifdef ARM_NEON
|
||||
#include "neon_intrins.h"
|
||||
#include "../../zbuild.h"
|
||||
#include "../generic/chunk_permute_table.h"
|
||||
|
||||
typedef uint8x16_t chunk_t;
|
||||
|
||||
#define CHUNK_SIZE 16
|
||||
|
||||
#define HAVE_CHUNKMEMSET_2
|
||||
#define HAVE_CHUNKMEMSET_4
|
||||
#define HAVE_CHUNKMEMSET_8
|
||||
#define HAVE_CHUNK_MAG
|
||||
|
||||
static const lut_rem_pair perm_idx_lut[13] = {
|
||||
{0, 1}, /* 3 */
|
||||
{0, 0}, /* don't care */
|
||||
{1 * 32, 1}, /* 5 */
|
||||
{2 * 32, 4}, /* 6 */
|
||||
{3 * 32, 2}, /* 7 */
|
||||
{0 * 32, 0}, /* don't care */
|
||||
{4 * 32, 7}, /* 9 */
|
||||
{5 * 32, 6}, /* 10 */
|
||||
{6 * 32, 5}, /* 11 */
|
||||
{7 * 32, 4}, /* 12 */
|
||||
{8 * 32, 3}, /* 13 */
|
||||
{9 * 32, 2}, /* 14 */
|
||||
{10 * 32, 1},/* 15 */
|
||||
};
|
||||
|
||||
/* Fill *chunk with the 2-byte pattern found at `from`, repeated across all
 * 16 bytes. memcpy is used for the read so `from` may be unaligned. */
static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
    uint16_t pattern;
    memcpy(&pattern, from, sizeof(pattern));

    uint16x8_t lanes = vdupq_n_u16(pattern);
    *chunk = vreinterpretq_u8_u16(lanes);
}
|
||||
|
||||
/* Fill *chunk with the 4-byte pattern found at `from`, repeated across all
 * 16 bytes. memcpy is used for the read so `from` may be unaligned. */
static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
    uint32_t pattern;
    memcpy(&pattern, from, sizeof(pattern));

    uint32x4_t lanes = vdupq_n_u32(pattern);
    *chunk = vreinterpretq_u8_u32(lanes);
}
|
||||
|
||||
/* Fill *chunk with the 8-byte pattern found at `from`, repeated across all
 * 16 bytes. memcpy is used for the read so `from` may be unaligned. */
static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
    uint64_t pattern;
    memcpy(&pattern, from, sizeof(pattern));

    uint64x2_t lanes = vdupq_n_u64(pattern);
    *chunk = vreinterpretq_u8_u64(lanes);
}
|
||||
|
||||
#define CHUNKSIZE chunksize_neon
|
||||
#define CHUNKCOPY chunkcopy_neon
|
||||
#define CHUNKUNROLL chunkunroll_neon
|
||||
#define CHUNKMEMSET chunkmemset_neon
|
||||
#define CHUNKMEMSET_SAFE chunkmemset_safe_neon
|
||||
|
||||
/* Read one 16-byte chunk from `s` into *chunk. */
static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
    chunk_t loaded = vld1q_u8(s);
    *chunk = loaded;
}
|
||||
|
||||
/* Write the 16-byte chunk *chunk to `out`. */
static inline void storechunk(uint8_t *out, chunk_t *chunk) {
    chunk_t value = *chunk;
    vst1q_u8(out, value);
}
|
||||
|
||||
static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t dist) {
|
||||
lut_rem_pair lut_rem = perm_idx_lut[dist - 3];
|
||||
*chunk_rem = lut_rem.remval;
|
||||
|
||||
#ifdef Z_MEMORY_SANITIZER
|
||||
/* See note in chunkset_ssse3.c for why this is ok */
|
||||
__msan_unpoison(buf + dist, 16 - dist);
|
||||
#endif
|
||||
|
||||
/* This version of table is only available on aarch64 */
|
||||
#if defined(_M_ARM64) || defined(__aarch64__)
|
||||
uint8x16_t ret_vec = vld1q_u8(buf);
|
||||
|
||||
uint8x16_t perm_vec = vld1q_u8(permute_table + lut_rem.idx);
|
||||
return vqtbl1q_u8(ret_vec, perm_vec);
|
||||
#else
|
||||
uint8x8_t ret0, ret1, a, b, perm_vec0, perm_vec1;
|
||||
perm_vec0 = vld1_u8(permute_table + lut_rem.idx);
|
||||
perm_vec1 = vld1_u8(permute_table + lut_rem.idx + 8);
|
||||
a = vld1_u8(buf);
|
||||
b = vld1_u8(buf + 8);
|
||||
ret0 = vtbl1_u8(a, perm_vec0);
|
||||
uint8x8x2_t ab = {{a, b}};
|
||||
ret1 = vtbl2_u8(ab, perm_vec1);
|
||||
return vcombine_u8(ret0, ret1);
|
||||
#endif
|
||||
}
|
||||
|
||||
#include "chunkset_tpl.h"
|
||||
|
||||
#define INFLATE_FAST inflate_fast_neon
|
||||
|
||||
#include "inffast_tpl.h"
|
||||
|
||||
#endif
|
||||
59
deps/zlib-ng/arch/arm/compare256_neon.c
vendored
Normal file
59
deps/zlib-ng/arch/arm/compare256_neon.c
vendored
Normal file
@@ -0,0 +1,59 @@
|
||||
/* compare256_neon.c - NEON version of compare256
|
||||
* Copyright (C) 2022 Nathan Moinvaziri
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#include "../../zbuild.h"
|
||||
|
||||
#include "fallback_builtins.h"
|
||||
|
||||
#if defined(ARM_NEON) && defined(HAVE_BUILTIN_CTZLL)
|
||||
#include "neon_intrins.h"
|
||||
|
||||
static inline uint32_t compare256_neon_static(const uint8_t *src0, const uint8_t *src1) {
|
||||
uint32_t len = 0;
|
||||
|
||||
do {
|
||||
uint8x16_t a, b, cmp;
|
||||
uint64_t lane;
|
||||
|
||||
a = vld1q_u8(src0);
|
||||
b = vld1q_u8(src1);
|
||||
|
||||
cmp = veorq_u8(a, b);
|
||||
|
||||
lane = vgetq_lane_u64(vreinterpretq_u64_u8(cmp), 0);
|
||||
if (lane) {
|
||||
uint32_t match_byte = (uint32_t)__builtin_ctzll(lane) / 8;
|
||||
return len + match_byte;
|
||||
}
|
||||
len += 8;
|
||||
lane = vgetq_lane_u64(vreinterpretq_u64_u8(cmp), 1);
|
||||
if (lane) {
|
||||
uint32_t match_byte = (uint32_t)__builtin_ctzll(lane) / 8;
|
||||
return len + match_byte;
|
||||
}
|
||||
len += 8;
|
||||
|
||||
src0 += 16, src1 += 16;
|
||||
} while (len < 256);
|
||||
|
||||
return 256;
|
||||
}
|
||||
|
||||
/* Externally visible entry point; forwards to the inlinable static version. */
Z_INTERNAL uint32_t compare256_neon(const uint8_t *src0, const uint8_t *src1) {
    const uint32_t match_len = compare256_neon_static(src0, src1);
    return match_len;
}
|
||||
|
||||
#define LONGEST_MATCH longest_match_neon
|
||||
#define COMPARE256 compare256_neon_static
|
||||
|
||||
#include "match_tpl.h"
|
||||
|
||||
#define LONGEST_MATCH_SLOW
|
||||
#define LONGEST_MATCH longest_match_slow_neon
|
||||
#define COMPARE256 compare256_neon_static
|
||||
|
||||
#include "match_tpl.h"
|
||||
|
||||
#endif
|
||||
98
deps/zlib-ng/arch/arm/crc32_acle.c
vendored
Normal file
98
deps/zlib-ng/arch/arm/crc32_acle.c
vendored
Normal file
@@ -0,0 +1,98 @@
|
||||
/* crc32_acle.c -- compute the CRC-32 of a data stream
|
||||
* Copyright (C) 1995-2006, 2010, 2011, 2012 Mark Adler
|
||||
* Copyright (C) 2016 Yang Zhang
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*
|
||||
*/
|
||||
|
||||
#ifdef ARM_ACLE
|
||||
#ifdef _MSC_VER
|
||||
# include <intrin.h>
|
||||
#else
|
||||
# include <arm_acle.h>
|
||||
#endif
|
||||
#include "../../zbuild.h"
|
||||
|
||||
/* Update the CRC-32 value `crc` with `len` bytes from `buf` using the ACLE
 * CRC32 intrinsics, and return the new CRC.
 *
 * The leading bytes are consumed in 1-, 2- and (on AArch64) 4-byte steps so
 * the main loop runs on naturally aligned words; the remaining bytes are then
 * consumed in decreasing step sizes.
 */
Z_INTERNAL uint32_t crc32_acle(uint32_t crc, const uint8_t *buf, size_t len) {
    Z_REGISTER uint32_t c;
    Z_REGISTER const uint16_t *buf2;
    Z_REGISTER const uint32_t *buf4;

    c = ~crc;

    /* Step to a 2-byte boundary. */
    if (len && ((ptrdiff_t)buf & 1)) {
        c = __crc32b(c, *buf++);
        len--;
    }

    /* Step to a 4-byte boundary. Note that `buf` itself is not advanced
     * here; from this point on buf4 is the current read position. */
    if ((len >= sizeof(uint16_t)) && ((ptrdiff_t)buf & sizeof(uint16_t))) {
        buf2 = (const uint16_t *) buf;
        c = __crc32h(c, *buf2++);
        len -= sizeof(uint16_t);
        buf4 = (const uint32_t *) buf2;
    } else {
        buf4 = (const uint32_t *) buf;
    }

#if defined(__aarch64__) || defined(_M_ARM64)
    /* Step to an 8-byte boundary. The test must look at buf4, the current
     * position: `buf` is stale whenever the 2-byte step above was taken, and
     * testing it leaves the 64-bit loop below misaligned. */
    if ((len >= sizeof(uint32_t)) && ((ptrdiff_t)buf4 & sizeof(uint32_t))) {
        c = __crc32w(c, *buf4++);
        len -= sizeof(uint32_t);
    }

    if (len == 0) {
        c = ~c;
        return c;
    }

    const uint64_t *buf8 = (const uint64_t *) buf4;

    /* Main loop: 8 bytes per instruction. */
    while (len >= sizeof(uint64_t)) {
        c = __crc32d(c, *buf8++);
        len -= sizeof(uint64_t);
    }

    if (len >= sizeof(uint32_t)) {
        buf4 = (const uint32_t *) buf8;
        c = __crc32w(c, *buf4++);
        len -= sizeof(uint32_t);
        buf2 = (const uint16_t *) buf4;
    } else {
        buf2 = (const uint16_t *) buf8;
    }

    if (len >= sizeof(uint16_t)) {
        c = __crc32h(c, *buf2++);
        len -= sizeof(uint16_t);
    }

    buf = (const unsigned char *) buf2;
#else /* __aarch64__ */

    if (len == 0) {
        c = ~c;
        return c;
    }

    /* Main loop: 4 bytes per instruction. */
    while (len >= sizeof(uint32_t)) {
        c = __crc32w(c, *buf4++);
        len -= sizeof(uint32_t);
    }

    if (len >= sizeof(uint16_t)) {
        buf2 = (const uint16_t *) buf4;
        c = __crc32h(c, *buf2++);
        len -= sizeof(uint16_t);
        buf = (const unsigned char *) buf2;
    } else {
        buf = (const unsigned char *) buf4;
    }
#endif /* __aarch64__ */

    /* At most one byte can be left at this point. */
    if (len) {
        c = __crc32b(c, *buf);
    }

    c = ~c;
    return c;
}
|
||||
#endif
|
||||
26
deps/zlib-ng/arch/arm/insert_string_acle.c
vendored
Normal file
26
deps/zlib-ng/arch/arm/insert_string_acle.c
vendored
Normal file
@@ -0,0 +1,26 @@
|
||||
/* insert_string_acle.c -- insert_string integer hash variant using ACLE's CRC instructions
|
||||
*
|
||||
* Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*
|
||||
*/
|
||||
|
||||
#ifdef ARM_ACLE
|
||||
#ifndef _MSC_VER
|
||||
# include <arm_acle.h>
|
||||
#endif
|
||||
#include "../../zbuild.h"
|
||||
#include "../../deflate.h"
|
||||
|
||||
#define HASH_CALC(s, h, val) \
|
||||
h = __crc32w(0, val)
|
||||
|
||||
#define HASH_CALC_VAR h
|
||||
#define HASH_CALC_VAR_INIT uint32_t h = 0
|
||||
|
||||
#define UPDATE_HASH update_hash_acle
|
||||
#define INSERT_STRING insert_string_acle
|
||||
#define QUICK_INSERT_STRING quick_insert_string_acle
|
||||
|
||||
#include "../../insert_string_tpl.h"
|
||||
#endif
|
||||
57
deps/zlib-ng/arch/arm/neon_intrins.h
vendored
Normal file
57
deps/zlib-ng/arch/arm/neon_intrins.h
vendored
Normal file
@@ -0,0 +1,57 @@
|
||||
#ifndef ARM_NEON_INTRINS_H
|
||||
#define ARM_NEON_INTRINS_H
|
||||
|
||||
#ifdef _M_ARM64
|
||||
# include <arm64_neon.h>
|
||||
#else
|
||||
# include <arm_neon.h>
|
||||
#endif
|
||||
|
||||
#if defined(ARM_NEON) && !defined(__aarch64__) && !defined(_M_ARM64)
|
||||
/* Compatibility shim for the _high family of functions */
|
||||
#define vmull_high_u8(a, b) vmull_u8(vget_high_u8(a), vget_high_u8(b))
|
||||
#define vmlal_high_u8(a, b, c) vmlal_u8(a, vget_high_u8(b), vget_high_u8(c))
|
||||
#define vmlal_high_u16(a, b, c) vmlal_u16(a, vget_high_u16(b), vget_high_u16(c))
|
||||
#define vaddw_high_u8(a, b) vaddw_u8(a, vget_high_u8(b))
|
||||
#endif
|
||||
|
||||
#ifdef ARM_NEON
|
||||
|
||||
#define vqsubq_u16_x4_x1(out, a, b) do { \
|
||||
out.val[0] = vqsubq_u16(a.val[0], b); \
|
||||
out.val[1] = vqsubq_u16(a.val[1], b); \
|
||||
out.val[2] = vqsubq_u16(a.val[2], b); \
|
||||
out.val[3] = vqsubq_u16(a.val[3], b); \
|
||||
} while (0)
|
||||
|
||||
|
||||
# ifndef ARM_NEON_HASLD4
|
||||
|
||||
static inline uint16x8x4_t vld1q_u16_x4(uint16_t const *a) {
|
||||
uint16x8x4_t ret = (uint16x8x4_t) {{
|
||||
vld1q_u16(a),
|
||||
vld1q_u16(a+8),
|
||||
vld1q_u16(a+16),
|
||||
vld1q_u16(a+24)}};
|
||||
return ret;
|
||||
}
|
||||
|
||||
static inline uint8x16x4_t vld1q_u8_x4(uint8_t const *a) {
|
||||
uint8x16x4_t ret = (uint8x16x4_t) {{
|
||||
vld1q_u8(a),
|
||||
vld1q_u8(a+16),
|
||||
vld1q_u8(a+32),
|
||||
vld1q_u8(a+48)}};
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* Fallback for toolchains without vst1q_u16_x4: store the four 8-lane
 * vectors of `a` to 32 consecutive uint16 slots starting at `p`. */
static inline void vst1q_u16_x4(uint16_t *p, uint16x8x4_t a) {
    int part;

    for (part = 0; part < 4; ++part) {
        vst1q_u16(p + 8 * part, a.val[part]);
    }
}
|
||||
# endif // HASLD4 check
|
||||
#endif
|
||||
|
||||
#endif // include guard ARM_NEON_INTRINS_H
|
||||
46
deps/zlib-ng/arch/arm/slide_hash_neon.c
vendored
Normal file
46
deps/zlib-ng/arch/arm/slide_hash_neon.c
vendored
Normal file
@@ -0,0 +1,46 @@
|
||||
/* slide_hash_neon.c -- Optimized hash table shifting for ARM with support for NEON instructions
|
||||
* Copyright (C) 2017-2020 Mika T. Lindqvist
|
||||
*
|
||||
* Authors:
|
||||
* Mika T. Lindqvist <postmaster@raasu.org>
|
||||
* Jun He <jun.he@arm.com>
|
||||
*
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#ifdef ARM_NEON
|
||||
#include "neon_intrins.h"
|
||||
#include "../../zbuild.h"
|
||||
#include "../../deflate.h"
|
||||
|
||||
/* SIMD version of hash_chain rebase */
|
||||
/* SIMD version of hash_chain rebase: subtract `wsize` from each of the
 * `entries` positions in `table` with unsigned saturation, so entries smaller
 * than `wsize` become 0.
 *
 * The table is processed 64 entries (eight 128-bit vectors) per pass, so its
 * byte size must be a non-zero multiple of sizeof(uint16x8_t) * 8.
 */
static inline void slide_hash_chain(Pos *table, uint32_t entries, uint16_t wsize) {
    Z_REGISTER uint16x8_t v;
    uint16x8x4_t p0, p1;
    Z_REGISTER size_t n;

    size_t size = entries*sizeof(table[0]);
    /* Parenthesised divisor: `%` and `*` share precedence and associate left
     * to right, so without the parentheses this only checked size % 16. */
    Assert((size % (sizeof(uint16x8_t) * 8) == 0), "hash table size err");

    Assert(sizeof(Pos) == 2, "Wrong Pos size");
    v = vdupq_n_u16(wsize);

    n = size / (sizeof(uint16x8_t) * 8);
    do {
        p0 = vld1q_u16_x4(table);
        p1 = vld1q_u16_x4(table+32);
        vqsubq_u16_x4_x1(p0, p0, v);
        vqsubq_u16_x4_x1(p1, p1, v);
        vst1q_u16_x4(table, p0);
        vst1q_u16_x4(table+32, p1);
        table += 64;
    } while (--n);
}
|
||||
|
||||
/* Rebase both hash tables of the deflate state by the window size: first the
 * head table (HASH_SIZE entries), then the prev table (w_size entries). */
Z_INTERNAL void slide_hash_neon(deflate_state *s) {
    const unsigned int window = s->w_size;

    slide_hash_chain(s->head, HASH_SIZE, window);
    slide_hash_chain(s->prev, window, window);
}
|
||||
#endif
|
||||
24
deps/zlib-ng/arch/generic/Makefile.in
vendored
Normal file
24
deps/zlib-ng/arch/generic/Makefile.in
vendored
Normal file
@@ -0,0 +1,24 @@
|
||||
# Makefile for zlib
|
||||
# Copyright (C) 1995-2013 Jean-loup Gailly, Mark Adler
|
||||
# For conditions of distribution and use, see copyright notice in zlib.h
|
||||
|
||||
CC=
|
||||
CFLAGS=
|
||||
SFLAGS=
|
||||
INCLUDES=
|
||||
|
||||
SRCDIR=.
|
||||
SRCTOP=../..
|
||||
TOPDIR=$(SRCTOP)
|
||||
|
||||
all:
|
||||
|
||||
|
||||
mostlyclean: clean
|
||||
# Remove build products of this directory: objects, the objs directory and
# gcov coverage data. Each rm is its own recipe line; a trailing backslash on
# the first one would splice "rm -rf objs" into its argument list.
clean:
	rm -f *.o *.lo *~
	rm -rf objs
	rm -f *.gcda *.gcno *.gcov
|
||||
|
||||
distclean: clean
|
||||
rm -f Makefile
|
||||
53
deps/zlib-ng/arch/generic/chunk_permute_table.h
vendored
Normal file
53
deps/zlib-ng/arch/generic/chunk_permute_table.h
vendored
Normal file
@@ -0,0 +1,53 @@
|
||||
/* chunk_permute_table.h - shared AVX/SSSE3 permutation table for use with chunkmemset family of functions.
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#ifndef CHUNK_PERMUTE_TABLE_H_
|
||||
#define CHUNK_PERMUTE_TABLE_H_
|
||||
|
||||
#include "zbuild.h"
|
||||
|
||||
/* Need entries for all numbers not an even modulus for 1, 2, 4, 8, 16 & 32 */
|
||||
static const ALIGNED_(32) uint8_t permute_table[26*32] = {
|
||||
0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, /* dist 3 */
|
||||
0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, /* dist 5 */
|
||||
0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, /* dist 6 */
|
||||
0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, /* dist 7 */
|
||||
0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, /* dist 9 */
|
||||
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, /* dist 10 */
|
||||
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, /* dist 11 */
|
||||
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, /* dist 12 */
|
||||
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 0, 1, 2, 3, 4, 5, /* dist 13 */
|
||||
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0, 1, 2, 3, /* dist 14 */
|
||||
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0, 1, /* dist 15 */
|
||||
|
||||
/* Beyond dists of 15 means we have to permute from a vector > len(m128i). Because AVX couldn't permute
|
||||
* beyond 128 bit lanes until AVX512 for sub 4-byte sequences, we have to do some math here for an eventual
|
||||
* blend with a comparison. That means we need to wrap the indices with yet another derived table. For simplicity,
|
||||
* we'll use absolute indexing here to derive a blend vector. This is actually a lot simpler with ARM's TBL, but,
|
||||
* this is what we're dealt.
|
||||
*/
|
||||
|
||||
16, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, /* dist 17 */
|
||||
16, 17, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, /* dist 18 */
|
||||
16, 17, 18, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, /* dist 19 */
|
||||
16, 17, 18, 19, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, /* dist 20 */
|
||||
16, 17, 18, 19, 20, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, /* dist 21 */
|
||||
16, 17, 18, 19, 20, 21, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, /* dist 22 */
|
||||
16, 17, 18, 19, 20, 21, 22, 0, 1, 2, 3, 4, 5, 6, 7, 8, /* dist 23 */
|
||||
16, 17, 18, 19, 20, 21, 22, 23, 0, 1, 2, 3, 4, 5, 6, 7, /* dist 24 */
|
||||
16, 17, 18, 19, 20, 21, 22, 23, 24, 0, 1, 2, 3, 4, 5, 6, /* dist 25 */
|
||||
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 0, 1, 2, 3, 4, 5, /* dist 26 */
|
||||
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 0, 1, 2, 3, 4, /* dist 27 */
|
||||
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 0, 1, 2, 3, /* dist 28 */
|
||||
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 0, 1, 2, /* dist 29 */
|
||||
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 0, 1, /* dist 30 */
|
||||
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 0, /* dist 31 */
|
||||
};
|
||||
|
||||
typedef struct lut_rem_pair_s {
|
||||
uint16_t idx;
|
||||
uint16_t remval;
|
||||
} lut_rem_pair;
|
||||
|
||||
#endif
|
||||
93
deps/zlib-ng/arch/power/Makefile.in
vendored
Normal file
93
deps/zlib-ng/arch/power/Makefile.in
vendored
Normal file
@@ -0,0 +1,93 @@
|
||||
# Makefile for POWER-specific files
|
||||
# Copyright (C) 2020 Matheus Castanho <msc@linux.ibm.com>, IBM
|
||||
# Copyright (C) 2021 Mika T. Lindqvist <postmaster@raasu.org>
|
||||
# For conditions of distribution and use, see copyright notice in zlib.h
|
||||
|
||||
CC=
|
||||
CFLAGS=
|
||||
SFLAGS=
|
||||
INCLUDES=
|
||||
SUFFIX=
|
||||
|
||||
P8FLAGS=-mcpu=power8
|
||||
P9FLAGS=-mcpu=power9
|
||||
PPCFLAGS=-maltivec
|
||||
NOLTOFLAG=
|
||||
|
||||
SRCDIR=.
|
||||
SRCTOP=../..
|
||||
TOPDIR=$(SRCTOP)
|
||||
|
||||
all: power_features.o \
|
||||
power_features.lo \
|
||||
adler32_power8.o \
|
||||
adler32_power8.lo \
|
||||
adler32_vmx.o \
|
||||
adler32_vmx.lo \
|
||||
chunkset_power8.o \
|
||||
chunkset_power8.lo \
|
||||
compare256_power9.o \
|
||||
compare256_power9.lo \
|
||||
crc32_power8.o \
|
||||
crc32_power8.lo \
|
||||
slide_hash_power8.o \
|
||||
slide_hash_power8.lo \
|
||||
slide_hash_vmx.o \
|
||||
slide_hash_vmx.lo
|
||||
|
||||
power_features.o:
|
||||
$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/power_features.c
|
||||
|
||||
power_features.lo:
|
||||
$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/power_features.c
|
||||
|
||||
adler32_power8.o:
|
||||
$(CC) $(CFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_power8.c
|
||||
|
||||
adler32_power8.lo:
|
||||
$(CC) $(SFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_power8.c
|
||||
|
||||
adler32_vmx.o:
|
||||
$(CC) $(CFLAGS) $(PPCFLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_vmx.c
|
||||
|
||||
adler32_vmx.lo:
|
||||
$(CC) $(SFLAGS) $(PPCFLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_vmx.c
|
||||
|
||||
chunkset_power8.o:
|
||||
$(CC) $(CFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_power8.c
|
||||
|
||||
chunkset_power8.lo:
|
||||
$(CC) $(SFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_power8.c
|
||||
|
||||
compare256_power9.o:
|
||||
$(CC) $(CFLAGS) $(P9FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_power9.c
|
||||
|
||||
compare256_power9.lo:
|
||||
$(CC) $(SFLAGS) $(P9FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_power9.c
|
||||
|
||||
crc32_power8.o:
|
||||
$(CC) $(CFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_power8.c
|
||||
|
||||
crc32_power8.lo:
|
||||
$(CC) $(SFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_power8.c
|
||||
|
||||
slide_hash_power8.o:
|
||||
$(CC) $(CFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_power8.c
|
||||
|
||||
slide_hash_power8.lo:
|
||||
$(CC) $(SFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_power8.c
|
||||
|
||||
slide_hash_vmx.o:
|
||||
$(CC) $(CFLAGS) ${PPCFLAGS} $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_vmx.c
|
||||
|
||||
slide_hash_vmx.lo:
|
||||
$(CC) $(SFLAGS) ${PPCFLAGS} $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_vmx.c
|
||||
|
||||
mostlyclean: clean
|
||||
clean:
|
||||
rm -f *.o *.lo *~
|
||||
rm -rf objs
|
||||
rm -f *.gcda *.gcno *.gcov
|
||||
|
||||
distclean: clean
|
||||
rm -f Makefile
|
||||
153
deps/zlib-ng/arch/power/adler32_power8.c
vendored
Normal file
153
deps/zlib-ng/arch/power/adler32_power8.c
vendored
Normal file
@@ -0,0 +1,153 @@
|
||||
/* Adler32 for POWER8 using VSX instructions.
|
||||
* Copyright (C) 2020 IBM Corporation
|
||||
* Author: Rogerio Alves <rcardoso@linux.ibm.com>
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*
|
||||
* Calculate adler32 checksum for 16 bytes at once using POWER8+ VSX (vector)
|
||||
* instructions.
|
||||
*
|
||||
* If adler32 do 1 byte at time on the first iteration s1 is s1_0 (_n means
|
||||
* iteration n) is the initial value of adler - at start _0 is 1 unless
|
||||
* adler initial value is different than 1. So s1_1 = s1_0 + c[0] after
|
||||
* the first calculation. For the iteration s1_2 = s1_1 + c[1] and so on.
|
||||
* Hence, for iteration N, s1_N = s1_(N-1) + c[N] is the value of s1 on
|
||||
* after iteration N.
|
||||
*
|
||||
* Therefore, for s2 and iteration N, s2_N = s2_0 + N*s1_0 + N*c[1] +
|
||||
* (N-1)*c[2] + ... + c[N]
|
||||
*
|
||||
* In a more general way:
|
||||
*
|
||||
* s1_N = s1_0 + sum(i=1 to N)c[i]
|
||||
* s2_N = s2_0 + N*s1_0 + sum (i=1 to N)(N-i+1)*c[i]
|
||||
*
|
||||
* Where s1_N, s2_N are the values for s1, s2 after N iterations. So if we
|
||||
* can process N bytes at a time we can do this at once.
|
||||
*
|
||||
* Since VSX vector registers are 16 bytes wide, we can process
|
||||
* 16 bytes at a time; using N = 16 we have:
|
||||
*
|
||||
* s1 = s1_16 = s1_(16-1) + c[16] = s1_0 + sum(i=1 to 16)c[i]
|
||||
* s2 = s2_16 = s2_0 + 16*s1_0 + sum(i=1 to 16)(16-i+1)*c[i]
|
||||
*
|
||||
* After the first iteration we calculate the adler32 checksum for 16 bytes.
|
||||
*
|
||||
* For more background about adler32 please check the RFC:
|
||||
* https://www.ietf.org/rfc/rfc1950.txt
|
||||
*/
|
||||
|
||||
#ifdef POWER8_VSX
|
||||
|
||||
#include <altivec.h>
|
||||
#include "zbuild.h"
|
||||
#include "adler32_p.h"
|
||||
|
||||
/* Vector across sum unsigned int (saturate). */
|
||||
static inline vector unsigned int vec_sumsu(vector unsigned int __a, vector unsigned int __b) {
|
||||
__b = vec_sld(__a, __a, 8);
|
||||
__b = vec_add(__b, __a);
|
||||
__a = vec_sld(__b, __b, 4);
|
||||
__a = vec_add(__a, __b);
|
||||
|
||||
return __a;
|
||||
}
|
||||
|
||||
/* Adler-32 of buf[0..len) continuing from the running checksum `adler`.
 *
 * Inputs shorter than 64 bytes are delegated to the scalar helpers; longer
 * inputs are consumed 16 bytes at a time with vector loads, and whatever is
 * left (fewer than 16 bytes) is finished by adler32_len_16(). */
Z_INTERNAL uint32_t adler32_power8(uint32_t adler, const uint8_t *buf, size_t len) {
    /* Split the packed checksum into its low and high 16-bit halves. */
    uint32_t s1 = adler & 0xffff;
    uint32_t s2 = (adler >> 16) & 0xffff;

    /* Byte-at-a-time callers get a dedicated fast path. */
    if (UNLIKELY(len == 1))
        return adler32_len_1(s1, buf, s2);

    /* A NULL buffer yields the Adler-32 initial value. */
    if (UNLIKELY(buf == NULL))
        return 1;

    /* Below 64 bytes the non-vector helper is faster than the VSX code. */
    if (len < 64)
        return adler32_len_64(s1, buf, len, s2);

    /* Vector path for len >= 64. */
    const vector unsigned int zero_v = { 0 };
    /* Weights 16..1 applied to the 16 bytes of each load for the s2 sum. */
    const vector unsigned char weights = {16, 15, 14, 13, 12, 11, 10, 9, 8, 7,
                                          6, 5, 4, 3, 2, 1};
    const vector unsigned char shift4 = vec_splat_u8(4);
    /* Keeps element 0 and clears the other three elements. */
    const vector unsigned int lane0_mask = {0xffffffff, 0x0, 0x0, 0x0};
    vector unsigned int acc1 = { 0 };
    vector unsigned int acc2 = { 0 };
    vector unsigned int acc1_hist = { 0 };
    vector unsigned int blk1, blk2;
    vector unsigned char data;
    int rounds;

    acc1[0] = s1;
    acc2[0] = s2;

    /* Whole blocks of NMAX bytes: reduce modulo BASE once per block. */
    while (len >= NMAX) {
        len -= NMAX;
        rounds = NMAX / 16;
        do {
            data = vec_xl(0, (unsigned char *) buf);
            buf += 16;

            blk1 = vec_sum4s(data, zero_v);          /* sum(i=1 to 16) buf[i]. */
            blk2 = vec_msum(data, weights, zero_v);  /* sum(i=1 to 16) buf[i]*(16-i+1). */

            /* Remember acc1 as it was before this load is added in. */
            acc1_hist = vec_add(acc1_hist, acc1);

            /* Accumulate the sums. */
            acc1 = vec_add(blk1, acc1);
            acc2 = vec_add(blk2, acc2);
        } while (--rounds);

        /* Fold once per NMAX block. */
        acc1 = vec_sumsu(acc1, blk1);
        acc1_hist = vec_sll(acc1_hist, shift4);  /* 16*acc1_hist. */
        acc2 = vec_add(acc1_hist, acc2);
        acc2 = vec_sumsu(acc2, blk2);

        /* Reduce element 0 of each accumulator modulo BASE. */
        acc1[0] %= BASE;
        acc2[0] %= BASE;

        /* Drop the other elements and restart the history for the next block. */
        acc1 = vec_and(acc1, lane0_mask);
        acc2 = vec_and(acc2, lane0_mask);
        acc1_hist = zero_v;
    }

    /* Fewer than NMAX bytes remain, so a single final modulo is enough. */
    if (len >= 16) {
        do {
            data = vec_xl(0, (unsigned char *) buf);
            buf += 16;
            len -= 16;

            blk1 = vec_sum4s(data, zero_v);          /* sum(i=1 to 16) buf[i]. */
            blk2 = vec_msum(data, weights, zero_v);  /* sum(i=1 to 16) buf[i]*(16-i+1). */

            /* Remember acc1 as it was before this load is added in. */
            acc1_hist = vec_add(acc1_hist, acc1);

            /* Accumulate the sums. */
            acc1 = vec_add(blk1, acc1);
            acc2 = vec_add(blk2, acc2);
        } while (len >= 16);

        /* The remaining size is always below NMAX, so fold only once. */
        acc1 = vec_sumsu(acc1, blk1);
        acc1_hist = vec_sll(acc1_hist, shift4);  /* 16*acc1_hist. */
        acc2 = vec_add(acc1_hist, acc2);
        acc2 = vec_sumsu(acc2, blk2);
    }

    /* Copy the results back to s1, s2 (mod 65521). */
    s1 = acc1[0] % BASE;
    s2 = acc2[0] % BASE;

    /* Finish the tail (len < 16). */
    return adler32_len_16(s1, buf, len, s2);
}
|
||||
|
||||
#endif /* POWER8_VSX */
|
||||
181
deps/zlib-ng/arch/power/adler32_vmx.c
vendored
Normal file
181
deps/zlib-ng/arch/power/adler32_vmx.c
vendored
Normal file
@@ -0,0 +1,181 @@
|
||||
/* adler32_vmx.c -- compute the Adler-32 checksum of a data stream
|
||||
* Copyright (C) 1995-2011 Mark Adler
|
||||
* Copyright (C) 2017-2021 Mika T. Lindqvist <postmaster@raasu.org>
|
||||
* Copyright (C) 2021 Adam Stylinski <kungfujesus06@gmail.com>
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#ifdef PPC_VMX
|
||||
#include <altivec.h>
|
||||
#include "zbuild.h"
|
||||
#include "adler32_p.h"
|
||||
|
||||
#define vmx_zero() (vec_splat_u32(0))
|
||||
|
||||
/* Scalar Adler-32 update for bytes that are not handled by the vector code.
 *
 * pair[0] is the running byte sum and pair[1] the running sum of those sums;
 * both are updated in place for each of the `len` bytes at `buf`. No modulo
 * reduction happens here: the caller reduces afterwards. */
static inline void vmx_handle_head_or_tail(uint32_t *pair, const uint8_t *buf, size_t len) {
    /* The index is size_t so it has the same width as len; a narrower
     * counter could wrap before reaching a very large len. */
    for (size_t i = 0; i < len; ++i) {
        pair[0] += buf[i];
        pair[1] += pair[0];
    }
}
|
||||
|
||||
/* Accumulate `len` 16-byte vectors from `buf` into the two running sums at `s`.
 *
 * s[0] (byte sum) and s[1] (sum of sums) are read on entry and overwritten
 * with the updated, unreduced totals on exit. The input is consumed in groups
 * of four vectors (64 bytes), followed by up to three single vectors.
 * The caller in this file passes a 16-byte-aligned array whose elements
 * s[2] and s[3] are zero; the 16-byte load of `s` below picks those up. */
static void vmx_accum32(uint32_t *s, const uint8_t *buf, size_t len) {
|
||||
/* Different taps for the separable components of sums */
|
||||
const vector unsigned char t0 = {64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49};
|
||||
const vector unsigned char t1 = {48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33};
|
||||
const vector unsigned char t2 = {32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17};
|
||||
const vector unsigned char t3 = {16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1};
|
||||
/* As silly and inefficient as it seems, creating 1 permutation vector to permute
|
||||
* a 2 element vector from a single load + a subsequent shift is just barely faster
|
||||
* than doing 2 indexed insertions into zero initialized vectors from unaligned memory. */
|
||||
const vector unsigned char s0_perm = {0, 1, 2, 3, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
|
||||
const vector unsigned char shift_vec = vec_sl(vec_splat_u8(8), vec_splat_u8(2));
|
||||
vector unsigned int adacc, s2acc;
|
||||
/* One 16-byte load brings in s[0..3]. */
vector unsigned int pair_vec = vec_ld(0, s);
|
||||
/* adacc: bytes 0-3 of the load (s[0]) followed by copies of byte 8. */
adacc = vec_perm(pair_vec, pair_vec, s0_perm);
|
||||
/* s2acc: the load shifted left by whole bytes -- presumably 4, which would put s[1] first; confirm. */
s2acc = vec_slo(pair_vec, shift_vec);
|
||||
|
||||
vector unsigned int zero = vmx_zero();
|
||||
/* s3acc / s3acc_0 collect the byte sums seen before each 64-byte group. */
vector unsigned int s3acc = zero;
|
||||
vector unsigned int s3acc_0 = zero;
|
||||
vector unsigned int adacc_prev = adacc;
|
||||
vector unsigned int adacc_prev_0 = zero;
|
||||
|
||||
vector unsigned int s2acc_0 = zero;
|
||||
vector unsigned int s2acc_1 = zero;
|
||||
vector unsigned int s2acc_2 = zero;
|
||||
|
||||
/* Maintain a running sum of a second half, this might help us break yet another
|
||||
* data dependency bubble in the sum */
|
||||
vector unsigned int adacc_0 = zero;
|
||||
|
||||
/* len counts 16-byte vectors: groups of four, then 0-3 leftovers. */
int num_iter = len / 4;
|
||||
int rem = len & 3;
|
||||
|
||||
for (int i = 0; i < num_iter; ++i) {
|
||||
vector unsigned char d0 = vec_ld(0, buf);
|
||||
vector unsigned char d1 = vec_ld(16, buf);
|
||||
vector unsigned char d2 = vec_ld(32, buf);
|
||||
vector unsigned char d3 = vec_ld(48, buf);
|
||||
|
||||
/* The core operation of the loop, basically
|
||||
* what is being unrolled below */
|
||||
adacc = vec_sum4s(d0, adacc);
|
||||
s3acc = vec_add(s3acc, adacc_prev);
|
||||
s3acc_0 = vec_add(s3acc_0, adacc_prev_0);
|
||||
s2acc = vec_msum(t0, d0, s2acc);
|
||||
|
||||
/* interleave dependent sums in here */
|
||||
adacc_0 = vec_sum4s(d1, adacc_0);
|
||||
s2acc_0 = vec_msum(t1, d1, s2acc_0);
|
||||
adacc = vec_sum4s(d2, adacc);
|
||||
s2acc_1 = vec_msum(t2, d2, s2acc_1);
|
||||
s2acc_2 = vec_msum(t3, d3, s2acc_2);
|
||||
adacc_0 = vec_sum4s(d3, adacc_0);
|
||||
|
||||
adacc_prev = adacc;
|
||||
adacc_prev_0 = adacc_0;
|
||||
buf += 64;
|
||||
}
|
||||
|
||||
adacc = vec_add(adacc, adacc_0);
|
||||
s3acc = vec_add(s3acc, s3acc_0);
|
||||
/* Shift left by 6 bits, i.e. multiply by 64 (bytes per group above). */
s3acc = vec_sl(s3acc, vec_splat_u32(6));
|
||||
|
||||
if (rem) {
|
||||
adacc_prev = vec_add(adacc_prev_0, adacc_prev);
|
||||
/* Shift left by 4 bits, i.e. multiply by 16 (bytes per leftover vector). */
adacc_prev = vec_sl(adacc_prev, vec_splat_u32(4));
|
||||
while (rem--) {
|
||||
vector unsigned char d0 = vec_ld(0, buf);
|
||||
adacc = vec_sum4s(d0, adacc);
|
||||
s3acc = vec_add(s3acc, adacc_prev);
|
||||
s2acc = vec_msum(t3, d0, s2acc);
|
||||
adacc_prev = vec_sl(adacc, vec_splat_u32(4));
|
||||
buf += 16;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/* Sum up independent second sums */
|
||||
s2acc = vec_add(s2acc, s2acc_0);
|
||||
s2acc_2 = vec_add(s2acc_1, s2acc_2);
|
||||
s2acc = vec_add(s2acc, s2acc_2);
|
||||
|
||||
s2acc = vec_add(s2acc, s3acc);
|
||||
|
||||
/* Fold the four 32-bit lanes of each accumulator together (rotate by 8, then by 4 bytes). */
adacc = vec_add(adacc, vec_sld(adacc, adacc, 8));
|
||||
s2acc = vec_add(s2acc, vec_sld(s2acc, s2acc, 8));
|
||||
adacc = vec_add(adacc, vec_sld(adacc, adacc, 4));
|
||||
s2acc = vec_add(s2acc, vec_sld(s2acc, s2acc, 4));
|
||||
|
||||
/* Write the totals back to s[0] and s[1]. */
vec_ste(adacc, 0, s);
|
||||
vec_ste(s2acc, 0, s+1);
|
||||
}
|
||||
|
||||
/* Adler-32 of buf[0..len) continuing from the running checksum `adler`.
 *
 * Bytes before the first 16-byte boundary and bytes after the last whole
 * vector are summed by vmx_handle_head_or_tail(); everything in between is
 * handed to vmx_accum32() in rounds of at most NMAX bytes, with a modulo
 * BASE reduction after each round. Returns the packed checksum
 * (sum2 << 16) | sum1. */
Z_INTERNAL uint32_t adler32_vmx(uint32_t adler, const uint8_t *buf, size_t len) {
    uint32_t sum2;
    uint32_t pair[16] ALIGNED_(16);
    /* Clear every element after the two running sums. vmx_accum32() loads
     * pair[0..3] as one vector, so at least pair[2] and pair[3] must be 0. */
    memset(&pair[2], 0, sizeof(pair) - 2 * sizeof(pair[0]));
    int n = NMAX;
    /* Byte counters are size_t so they cannot wrap before reaching len. */
    size_t done = 0;

    /* Split Adler-32 into component sums, it can be supplied by
     * the caller sites (e.g. in a PNG file).
     */
    sum2 = (adler >> 16) & 0xffff;
    adler &= 0xffff;
    pair[0] = adler;
    pair[1] = sum2;

    /* in case user likes doing a byte at a time, keep it fast */
    if (UNLIKELY(len == 1))
        return adler32_len_1(adler, buf, sum2);

    /* initial Adler-32 value (deferred check for len == 1 speed) */
    if (UNLIKELY(buf == NULL))
        return 1L;

    /* in case short lengths are provided, keep it somewhat fast */
    if (UNLIKELY(len < 16))
        return adler32_len_16(adler, buf, len, sum2);

    /* Sum the bytes in front of the first 16-byte boundary one at a time. */
    unsigned int al = 0;
    if ((uintptr_t)buf & 0xf) {
        al = 16-((uintptr_t)buf & 0xf);
        if (al > len) {
            al=len;
        }
        vmx_handle_head_or_tail(pair, buf, al);

        done += al;
        /* Rather than rebasing, we can reduce the max sums for the
         * first round only */
        n -= al;
    }

    /* NOTE(review): when the first round's n is not a multiple of 16, i
     * advances by n while done advances only by the whole vectors consumed.
     * vmx_accum32() reads with vec_ld, which presumably ignores the low
     * address bits, so its loads would still line up with done -- confirm
     * before changing either counter. */
    for (size_t i = al; i < len; i += n) {
        size_t remaining = len - i;
        int limit = (i == al) ? n : NMAX;
        /* Compare in size_t so a huge remainder is never narrowed to int. */
        n = (remaining < (size_t)limit) ? (int)remaining : limit;

        if (n < 16)
            break;

        vmx_accum32(pair, buf + i, n / 16);
        pair[0] %= BASE;
        pair[1] %= BASE;

        done += (n / 16) * 16;
    }

    /* Handle the tail elements. */
    if (done < len) {
        vmx_handle_head_or_tail(pair, (buf + done), len - done);
        pair[0] %= BASE;
        pair[1] %= BASE;
    }

    /* D = B * 65536 + A, see: https://en.wikipedia.org/wiki/Adler-32. */
    return (pair[1] << 16) | pair[0];
}
|
||||
#endif
|
||||
55
deps/zlib-ng/arch/power/chunkset_power8.c
vendored
Normal file
55
deps/zlib-ng/arch/power/chunkset_power8.c
vendored
Normal file
@@ -0,0 +1,55 @@
|
||||
/* chunkset_power8.c -- VSX inline functions to copy small data chunks.
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#ifdef POWER8_VSX
|
||||
#include <altivec.h>
|
||||
#include "../../zbuild.h"
|
||||
|
||||
typedef vector unsigned char chunk_t;
|
||||
|
||||
#define CHUNK_SIZE 16
|
||||
|
||||
#define HAVE_CHUNKMEMSET_2
|
||||
#define HAVE_CHUNKMEMSET_4
|
||||
#define HAVE_CHUNKMEMSET_8
|
||||
|
||||
/* Fill *chunk with the 2-byte pattern found at `from`, repeated across the vector. */
static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
    uint16_t pattern;
    /* Copy the bytes out instead of dereferencing `from` as a uint16_t. */
    memcpy(&pattern, from, sizeof(pattern));
    chunk[0] = (vector unsigned char)vec_splats(pattern);
}
|
||||
|
||||
/* Fill *chunk with the 4-byte pattern found at `from`, repeated across the vector. */
static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
    uint32_t pattern;
    /* Copy the bytes out instead of dereferencing `from` as a uint32_t. */
    memcpy(&pattern, from, sizeof(pattern));
    chunk[0] = (vector unsigned char)vec_splats(pattern);
}
|
||||
|
||||
/* Fill *chunk with the 8-byte pattern found at `from`, repeated across the vector. */
static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
    uint64_t pattern;
    /* Copy the bytes out instead of dereferencing `from` as a uint64_t. */
    memcpy(&pattern, from, sizeof(pattern));
    chunk[0] = (vector unsigned char)vec_splats(pattern);
}
|
||||
|
||||
/* Read one chunk (CHUNK_SIZE bytes) starting at `s` into *chunk. */
static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
    chunk[0] = vec_xl(0, s);
}
|
||||
|
||||
/* Write the chunk held in *chunk to the CHUNK_SIZE bytes starting at `out`. */
static inline void storechunk(uint8_t *out, chunk_t *chunk) {
    vec_xst(chunk[0], 0, out);
}
|
||||
|
||||
#define CHUNKSIZE chunksize_power8
|
||||
#define CHUNKCOPY chunkcopy_power8
|
||||
#define CHUNKUNROLL chunkunroll_power8
|
||||
#define CHUNKMEMSET chunkmemset_power8
|
||||
#define CHUNKMEMSET_SAFE chunkmemset_safe_power8
|
||||
|
||||
#include "chunkset_tpl.h"
|
||||
|
||||
#define INFLATE_FAST inflate_fast_power8
|
||||
|
||||
#include "inffast_tpl.h"
|
||||
|
||||
#endif
|
||||
66
deps/zlib-ng/arch/power/compare256_power9.c
vendored
Normal file
66
deps/zlib-ng/arch/power/compare256_power9.c
vendored
Normal file
@@ -0,0 +1,66 @@
|
||||
/* compare256_power9.c - Power9 version of compare256
|
||||
* Copyright (C) 2019 Matheus Castanho <msc@linux.ibm.com>, IBM
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#ifdef POWER9
|
||||
#include <altivec.h>
|
||||
#include "../../zbuild.h"
|
||||
#include "../../zendian.h"
|
||||
|
||||
/* Older versions of GCC misimplemented semantics for these bit counting builtins.
|
||||
* https://gcc.gnu.org/git/gitweb.cgi?p=gcc.git;h=3f30f2d1dbb3228b8468b26239fe60c2974ce2ac */
|
||||
#if defined(__GNUC__) && (__GNUC__ < 12)
|
||||
# define zng_vec_vctzlsbb(vc, len) __asm__ volatile("vctzlsbb %0, %1\n\t" : "=r" (len) : "v" (vc))
|
||||
# define zng_vec_vclzlsbb(vc, len) __asm__ volatile("vclzlsbb %0, %1\n\t" : "=r" (len) : "v" (vc))
|
||||
#else
|
||||
# define zng_vec_vctzlsbb(vc, len) len = __builtin_vec_vctzlsbb(vc)
|
||||
# define zng_vec_vclzlsbb(vc, len) len = __builtin_vec_vclzlsbb(vc)
|
||||
#endif
|
||||
|
||||
static inline uint32_t compare256_power9_static(const uint8_t *src0, const uint8_t *src1) {
|
||||
uint32_t len = 0, cmplen;
|
||||
|
||||
do {
|
||||
vector unsigned char vsrc0, vsrc1, vc;
|
||||
|
||||
vsrc0 = *((vector unsigned char *)src0);
|
||||
vsrc1 = *((vector unsigned char *)src1);
|
||||
|
||||
/* Compare 16 bytes at a time. Each byte of vc will be either
|
||||
* all ones or all zeroes, depending on the result of the comparison. */
|
||||
vc = (vector unsigned char)vec_cmpne(vsrc0, vsrc1);
|
||||
|
||||
/* Since the index of matching bytes will contain only zeroes
|
||||
* on vc (since we used cmpne), counting the number of consecutive
|
||||
* bytes where LSB == 0 is the same as counting the length of the match. */
|
||||
#if BYTE_ORDER == LITTLE_ENDIAN
|
||||
zng_vec_vctzlsbb(vc, cmplen);
|
||||
#else
|
||||
zng_vec_vclzlsbb(vc, cmplen);
|
||||
#endif
|
||||
if (cmplen != 16)
|
||||
return len + cmplen;
|
||||
|
||||
src0 += 16, src1 += 16, len += 16;
|
||||
} while (len < 256);
|
||||
|
||||
return 256;
|
||||
}
|
||||
|
||||
/* Non-inline entry point that forwards to compare256_power9_static(). */
Z_INTERNAL uint32_t compare256_power9(const uint8_t *src0, const uint8_t *src1) {
    const uint32_t match_len = compare256_power9_static(src0, src1);
    return match_len;
}
|
||||
|
||||
#define LONGEST_MATCH longest_match_power9
|
||||
#define COMPARE256 compare256_power9_static
|
||||
|
||||
#include "match_tpl.h"
|
||||
|
||||
#define LONGEST_MATCH_SLOW
|
||||
#define LONGEST_MATCH longest_match_slow_power9
|
||||
#define COMPARE256 compare256_power9_static
|
||||
|
||||
#include "match_tpl.h"
|
||||
|
||||
#endif
|
||||
1123
deps/zlib-ng/arch/power/crc32_constants.h
vendored
Normal file
1123
deps/zlib-ng/arch/power/crc32_constants.h
vendored
Normal file
File diff suppressed because it is too large
Load Diff
589
deps/zlib-ng/arch/power/crc32_power8.c
vendored
Normal file
589
deps/zlib-ng/arch/power/crc32_power8.c
vendored
Normal file
@@ -0,0 +1,589 @@
|
||||
/* crc32 for POWER8 using VSX instructions
|
||||
* Copyright (C) 2021 IBM Corporation
|
||||
*
|
||||
* Author: Rogerio Alves <rogealve@br.ibm.com>
|
||||
*
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*
|
||||
* Calculate the checksum of data that is 16 byte aligned and a multiple of
|
||||
* 16 bytes.
|
||||
*
|
||||
* The first step is to reduce it to 1024 bits. We do this in 8 parallel
|
||||
* chunks in order to mask the latency of the vpmsum instructions. If we
|
||||
* have more than 32 kB of data to checksum we repeat this step multiple
|
||||
* times, passing in the previous 1024 bits.
|
||||
*
|
||||
* The next step is to reduce the 1024 bits to 64 bits. This step adds
|
||||
* 32 bits of 0s to the end - this matches what a CRC does. We just
|
||||
* calculate constants that land the data in this 32 bits.
|
||||
*
|
||||
* We then use fixed point Barrett reduction to compute a mod n over GF(2)
|
||||
* for n = CRC using POWER8 instructions. We use x = 32.
|
||||
*
|
||||
* http://en.wikipedia.org/wiki/Barrett_reduction
|
||||
*
|
||||
* This code uses gcc vector builtins instead using assembly directly.
|
||||
*/
|
||||
|
||||
#include <altivec.h>
|
||||
#include "zendian.h"
|
||||
#include "zbuild.h"
|
||||
|
||||
#include "crc32_constants.h"
|
||||
#include "crc32_braid_tbl.h"
|
||||
|
||||
#if defined (__clang__)
|
||||
#include "fallback_builtins.h"
|
||||
#endif
|
||||
|
||||
#define MAX_SIZE 32768
|
||||
#define VMX_ALIGN 16
|
||||
#define VMX_ALIGN_MASK (VMX_ALIGN-1)
|
||||
|
||||
static unsigned int crc32_align(unsigned int crc, const unsigned char *p, unsigned long len) {
|
||||
while (len--)
|
||||
crc = crc_table[(crc ^ *p++) & 0xff] ^ (crc >> 8);
|
||||
return crc;
|
||||
}
|
||||
|
||||
static unsigned int ALIGNED_(32) __crc32_vpmsum(unsigned int crc, const void* p, unsigned long len);
|
||||
|
||||
/* CRC-32 of p[0..len) continuing from `crc`.
 *
 * Returns 0 when p is NULL. Inputs shorter than VMX_ALIGN + VMX_ALIGN_MASK
 * bytes are handled entirely by crc32_align(). Otherwise the bytes up to
 * the next VMX_ALIGN boundary go through crc32_align(), the following
 * multiple of VMX_ALIGN bytes through __crc32_vpmsum(), and any remainder
 * through crc32_align() again. The value is inverted on entry and on exit. */
Z_INTERNAL uint32_t crc32_power8(uint32_t crc, const unsigned char *p, size_t _len) {
    unsigned long len = (unsigned long) _len;

    if (p == (const unsigned char *) 0x0)
        return 0;

    crc ^= 0xffffffff;

    if (len < VMX_ALIGN + VMX_ALIGN_MASK) {
        /* Too short for the vector routine. */
        crc = crc32_align(crc, p, len);
    } else {
        unsigned int prealign;
        unsigned int tail;

        /* Consume bytes until p sits on a VMX_ALIGN boundary. */
        if ((unsigned long)p & VMX_ALIGN_MASK) {
            prealign = VMX_ALIGN - ((unsigned long)p & VMX_ALIGN_MASK);
            crc = crc32_align(crc, p, prealign);
            len -= prealign;
            p += prealign;
        }

        /* Vector routine over the largest multiple of VMX_ALIGN bytes. */
        crc = __crc32_vpmsum(crc, p, len & ~VMX_ALIGN_MASK);

        /* Whatever is left after the last full VMX_ALIGN bytes. */
        tail = len & VMX_ALIGN_MASK;
        if (tail) {
            p += len & ~VMX_ALIGN_MASK;
            crc = crc32_align(crc, p, tail);
        }
    }

    crc ^= 0xffffffff;

    return crc;
}
|
||||
|
||||
/* When we have a load-store in a single-dispatch group and address overlap
|
||||
* such that forward is not allowed (load-hit-store) the group must be flushed.
|
||||
* A group ending NOP prevents the flush.
|
||||
*/
|
||||
#define GROUP_ENDING_NOP __asm__("ori 2,2,0" ::: "memory")
|
||||
|
||||
#if BYTE_ORDER == BIG_ENDIAN
|
||||
#define BYTESWAP_DATA
|
||||
#endif
|
||||
|
||||
#ifdef BYTESWAP_DATA
|
||||
#define VEC_PERM(vr, va, vb, vc) vr = vec_perm(va, vb, (__vector unsigned char) vc)
|
||||
#if BYTE_ORDER == LITTLE_ENDIAN
|
||||
/* Byte reverse permute constant LE. */
|
||||
static const __vector unsigned long long vperm_const ALIGNED_(16) = { 0x08090A0B0C0D0E0FUL, 0x0001020304050607UL };
|
||||
#else
|
||||
static const __vector unsigned long long vperm_const ALIGNED_(16) = { 0x0F0E0D0C0B0A0908UL, 0X0706050403020100UL };
|
||||
#endif
|
||||
#else
|
||||
#define VEC_PERM(vr, va, vb, vc)
|
||||
#endif
|
||||
|
||||
static unsigned int ALIGNED_(32) __crc32_vpmsum(unsigned int crc, const void* p, unsigned long len) {
|
||||
|
||||
const __vector unsigned long long vzero = {0,0};
|
||||
const __vector unsigned long long vones = {0xffffffffffffffffUL, 0xffffffffffffffffUL};
|
||||
|
||||
const __vector unsigned long long vmask_32bit =
|
||||
(__vector unsigned long long)vec_sld((__vector unsigned char)vzero, (__vector unsigned char)vones, 4);
|
||||
|
||||
const __vector unsigned long long vmask_64bit =
|
||||
(__vector unsigned long long)vec_sld((__vector unsigned char)vzero, (__vector unsigned char)vones, 8);
|
||||
|
||||
__vector unsigned long long vcrc;
|
||||
|
||||
__vector unsigned long long vconst1, vconst2;
|
||||
|
||||
/* vdata0-vdata7 will contain our data (p). */
|
||||
__vector unsigned long long vdata0, vdata1, vdata2, vdata3, vdata4, vdata5, vdata6, vdata7;
|
||||
|
||||
/* v0-v7 will contain our checksums */
|
||||
__vector unsigned long long v0 = {0,0};
|
||||
__vector unsigned long long v1 = {0,0};
|
||||
__vector unsigned long long v2 = {0,0};
|
||||
__vector unsigned long long v3 = {0,0};
|
||||
__vector unsigned long long v4 = {0,0};
|
||||
__vector unsigned long long v5 = {0,0};
|
||||
__vector unsigned long long v6 = {0,0};
|
||||
__vector unsigned long long v7 = {0,0};
|
||||
|
||||
|
||||
/* Vector auxiliary variables. */
|
||||
__vector unsigned long long va0, va1, va2, va3, va4, va5, va6, va7;
|
||||
|
||||
unsigned int offset; /* Constant table offset. */
|
||||
|
||||
unsigned long i; /* Counter. */
|
||||
unsigned long chunks;
|
||||
|
||||
unsigned long block_size;
|
||||
int next_block = 0;
|
||||
|
||||
/* Align by 128 bits. The last 128 bit block will be processed at end. */
|
||||
unsigned long length = len & 0xFFFFFFFFFFFFFF80UL;
|
||||
|
||||
vcrc = (__vector unsigned long long)__builtin_pack_vector_int128(0UL, crc);
|
||||
|
||||
/* Short version. */
|
||||
if (len < 256) {
|
||||
/* Calculate where in the constant table we need to start. */
|
||||
offset = 256 - len;
|
||||
|
||||
vconst1 = vec_ld(offset, vcrc_short_const);
|
||||
vdata0 = vec_ld(0, (__vector unsigned long long*) p);
|
||||
VEC_PERM(vdata0, vdata0, vconst1, vperm_const);
|
||||
|
||||
/* xor initial value */
|
||||
vdata0 = vec_xor(vdata0, vcrc);
|
||||
|
||||
vdata0 = (__vector unsigned long long) __builtin_crypto_vpmsumw(
|
||||
(__vector unsigned int)vdata0, (__vector unsigned int)vconst1);
|
||||
v0 = vec_xor(v0, vdata0);
|
||||
|
||||
for (i = 16; i < len; i += 16) {
|
||||
vconst1 = vec_ld(offset + i, vcrc_short_const);
|
||||
vdata0 = vec_ld(i, (__vector unsigned long long*) p);
|
||||
VEC_PERM(vdata0, vdata0, vconst1, vperm_const);
|
||||
vdata0 = (__vector unsigned long long) __builtin_crypto_vpmsumw(
|
||||
(__vector unsigned int)vdata0, (__vector unsigned int)vconst1);
|
||||
v0 = vec_xor(v0, vdata0);
|
||||
}
|
||||
} else {
|
||||
|
||||
/* Load initial values. */
|
||||
vdata0 = vec_ld(0, (__vector unsigned long long*) p);
|
||||
vdata1 = vec_ld(16, (__vector unsigned long long*) p);
|
||||
|
||||
VEC_PERM(vdata0, vdata0, vdata0, vperm_const);
|
||||
VEC_PERM(vdata1, vdata1, vdata1, vperm_const);
|
||||
|
||||
vdata2 = vec_ld(32, (__vector unsigned long long*) p);
|
||||
vdata3 = vec_ld(48, (__vector unsigned long long*) p);
|
||||
|
||||
VEC_PERM(vdata2, vdata2, vdata2, vperm_const);
|
||||
VEC_PERM(vdata3, vdata3, vdata3, vperm_const);
|
||||
|
||||
vdata4 = vec_ld(64, (__vector unsigned long long*) p);
|
||||
vdata5 = vec_ld(80, (__vector unsigned long long*) p);
|
||||
|
||||
VEC_PERM(vdata4, vdata4, vdata4, vperm_const);
|
||||
VEC_PERM(vdata5, vdata5, vdata5, vperm_const);
|
||||
|
||||
vdata6 = vec_ld(96, (__vector unsigned long long*) p);
|
||||
vdata7 = vec_ld(112, (__vector unsigned long long*) p);
|
||||
|
||||
VEC_PERM(vdata6, vdata6, vdata6, vperm_const);
|
||||
VEC_PERM(vdata7, vdata7, vdata7, vperm_const);
|
||||
|
||||
/* xor in initial value */
|
||||
vdata0 = vec_xor(vdata0, vcrc);
|
||||
|
||||
p = (char *)p + 128;
|
||||
|
||||
do {
|
||||
/* Checksum in blocks of MAX_SIZE. */
|
||||
block_size = length;
|
||||
if (block_size > MAX_SIZE) {
|
||||
block_size = MAX_SIZE;
|
||||
}
|
||||
|
||||
length = length - block_size;
|
||||
|
||||
/*
|
||||
* Work out the offset into the constants table to start at. Each
|
||||
* constant is 16 bytes, and it is used against 128 bytes of input
|
||||
* data - 128 / 16 = 8
|
||||
*/
|
||||
offset = (MAX_SIZE/8) - (block_size/8);
|
||||
/* We reduce our final 128 bytes in a separate step */
|
||||
chunks = (block_size/128)-1;
|
||||
|
||||
vconst1 = vec_ld(offset, vcrc_const);
|
||||
|
||||
va0 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata0,
|
||||
(__vector unsigned long long)vconst1);
|
||||
va1 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata1,
|
||||
(__vector unsigned long long)vconst1);
|
||||
va2 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata2,
|
||||
(__vector unsigned long long)vconst1);
|
||||
va3 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata3,
|
||||
(__vector unsigned long long)vconst1);
|
||||
va4 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata4,
|
||||
(__vector unsigned long long)vconst1);
|
||||
va5 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata5,
|
||||
(__vector unsigned long long)vconst1);
|
||||
va6 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata6,
|
||||
(__vector unsigned long long)vconst1);
|
||||
va7 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata7,
|
||||
(__vector unsigned long long)vconst1);
|
||||
|
||||
if (chunks > 1) {
|
||||
offset += 16;
|
||||
vconst2 = vec_ld(offset, vcrc_const);
|
||||
GROUP_ENDING_NOP;
|
||||
|
||||
vdata0 = vec_ld(0, (__vector unsigned long long*) p);
|
||||
VEC_PERM(vdata0, vdata0, vdata0, vperm_const);
|
||||
|
||||
vdata1 = vec_ld(16, (__vector unsigned long long*) p);
|
||||
VEC_PERM(vdata1, vdata1, vdata1, vperm_const);
|
||||
|
||||
vdata2 = vec_ld(32, (__vector unsigned long long*) p);
|
||||
VEC_PERM(vdata2, vdata2, vdata2, vperm_const);
|
||||
|
||||
vdata3 = vec_ld(48, (__vector unsigned long long*) p);
|
||||
VEC_PERM(vdata3, vdata3, vdata3, vperm_const);
|
||||
|
||||
vdata4 = vec_ld(64, (__vector unsigned long long*) p);
|
||||
VEC_PERM(vdata4, vdata4, vdata4, vperm_const);
|
||||
|
||||
vdata5 = vec_ld(80, (__vector unsigned long long*) p);
|
||||
VEC_PERM(vdata5, vdata5, vdata5, vperm_const);
|
||||
|
||||
vdata6 = vec_ld(96, (__vector unsigned long long*) p);
|
||||
VEC_PERM(vdata6, vdata6, vdata6, vperm_const);
|
||||
|
||||
vdata7 = vec_ld(112, (__vector unsigned long long*) p);
|
||||
VEC_PERM(vdata7, vdata7, vdata7, vperm_const);
|
||||
|
||||
p = (char *)p + 128;
|
||||
|
||||
/*
|
||||
* main loop. Each iteration calculates the CRC for a 128-byte
|
||||
* block.
|
||||
*/
|
||||
for (i = 0; i < chunks-2; i++) {
|
||||
vconst1 = vec_ld(offset, vcrc_const);
|
||||
offset += 16;
|
||||
GROUP_ENDING_NOP;
|
||||
|
||||
v0 = vec_xor(v0, va0);
|
||||
va0 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata0,
|
||||
(__vector unsigned long long)vconst2);
|
||||
vdata0 = vec_ld(0, (__vector unsigned long long*) p);
|
||||
VEC_PERM(vdata0, vdata0, vdata0, vperm_const);
|
||||
GROUP_ENDING_NOP;
|
||||
|
||||
v1 = vec_xor(v1, va1);
|
||||
va1 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata1,
|
||||
(__vector unsigned long long)vconst2);
|
||||
vdata1 = vec_ld(16, (__vector unsigned long long*) p);
|
||||
VEC_PERM(vdata1, vdata1, vdata1, vperm_const);
|
||||
GROUP_ENDING_NOP;
|
||||
|
||||
v2 = vec_xor(v2, va2);
|
||||
va2 = __builtin_crypto_vpmsumd((__vector unsigned long long)
|
||||
vdata2, (__vector unsigned long long)vconst2);
|
||||
vdata2 = vec_ld(32, (__vector unsigned long long*) p);
|
||||
VEC_PERM(vdata2, vdata2, vdata2, vperm_const);
|
||||
GROUP_ENDING_NOP;
|
||||
|
||||
v3 = vec_xor(v3, va3);
|
||||
va3 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata3,
|
||||
(__vector unsigned long long)vconst2);
|
||||
vdata3 = vec_ld(48, (__vector unsigned long long*) p);
|
||||
VEC_PERM(vdata3, vdata3, vdata3, vperm_const);
|
||||
|
||||
vconst2 = vec_ld(offset, vcrc_const);
|
||||
GROUP_ENDING_NOP;
|
||||
|
||||
v4 = vec_xor(v4, va4);
|
||||
va4 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata4,
|
||||
(__vector unsigned long long)vconst1);
|
||||
vdata4 = vec_ld(64, (__vector unsigned long long*) p);
|
||||
VEC_PERM(vdata4, vdata4, vdata4, vperm_const);
|
||||
GROUP_ENDING_NOP;
|
||||
|
||||
v5 = vec_xor(v5, va5);
|
||||
va5 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata5,
|
||||
(__vector unsigned long long)vconst1);
|
||||
vdata5 = vec_ld(80, (__vector unsigned long long*) p);
|
||||
VEC_PERM(vdata5, vdata5, vdata5, vperm_const);
|
||||
GROUP_ENDING_NOP;
|
||||
|
||||
v6 = vec_xor(v6, va6);
|
||||
va6 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata6,
|
||||
(__vector unsigned long long)vconst1);
|
||||
vdata6 = vec_ld(96, (__vector unsigned long long*) p);
|
||||
VEC_PERM(vdata6, vdata6, vdata6, vperm_const);
|
||||
GROUP_ENDING_NOP;
|
||||
|
||||
v7 = vec_xor(v7, va7);
|
||||
va7 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata7,
|
||||
(__vector unsigned long long)vconst1);
|
||||
vdata7 = vec_ld(112, (__vector unsigned long long*) p);
|
||||
VEC_PERM(vdata7, vdata7, vdata7, vperm_const);
|
||||
|
||||
p = (char *)p + 128;
|
||||
}
|
||||
|
||||
/* First cool down */
|
||||
vconst1 = vec_ld(offset, vcrc_const);
|
||||
offset += 16;
|
||||
|
||||
v0 = vec_xor(v0, va0);
|
||||
va0 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata0,
|
||||
(__vector unsigned long long)vconst1);
|
||||
GROUP_ENDING_NOP;
|
||||
|
||||
v1 = vec_xor(v1, va1);
|
||||
va1 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata1,
|
||||
(__vector unsigned long long)vconst1);
|
||||
GROUP_ENDING_NOP;
|
||||
|
||||
v2 = vec_xor(v2, va2);
|
||||
va2 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata2,
|
||||
(__vector unsigned long long)vconst1);
|
||||
GROUP_ENDING_NOP;
|
||||
|
||||
v3 = vec_xor(v3, va3);
|
||||
va3 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata3,
|
||||
(__vector unsigned long long)vconst1);
|
||||
GROUP_ENDING_NOP;
|
||||
|
||||
v4 = vec_xor(v4, va4);
|
||||
va4 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata4,
|
||||
(__vector unsigned long long)vconst1);
|
||||
GROUP_ENDING_NOP;
|
||||
|
||||
v5 = vec_xor(v5, va5);
|
||||
va5 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata5,
|
||||
(__vector unsigned long long)vconst1);
|
||||
GROUP_ENDING_NOP;
|
||||
|
||||
v6 = vec_xor(v6, va6);
|
||||
va6 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata6,
|
||||
(__vector unsigned long long)vconst1);
|
||||
GROUP_ENDING_NOP;
|
||||
|
||||
v7 = vec_xor(v7, va7);
|
||||
va7 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata7,
|
||||
(__vector unsigned long long)vconst1);
|
||||
}/* else */
|
||||
|
||||
/* Second cool down. */
|
||||
v0 = vec_xor(v0, va0);
|
||||
v1 = vec_xor(v1, va1);
|
||||
v2 = vec_xor(v2, va2);
|
||||
v3 = vec_xor(v3, va3);
|
||||
v4 = vec_xor(v4, va4);
|
||||
v5 = vec_xor(v5, va5);
|
||||
v6 = vec_xor(v6, va6);
|
||||
v7 = vec_xor(v7, va7);
|
||||
|
||||
/*
|
||||
* vpmsumd produces a 96 bit result in the least significant bits
|
||||
* of the register. Since we are bit reflected we have to shift it
|
||||
* left 32 bits so it occupies the least significant bits in the
|
||||
* bit reflected domain.
|
||||
*/
|
||||
v0 = (__vector unsigned long long)vec_sld((__vector unsigned char)v0,
|
||||
(__vector unsigned char)vzero, 4);
|
||||
v1 = (__vector unsigned long long)vec_sld((__vector unsigned char)v1,
|
||||
(__vector unsigned char)vzero, 4);
|
||||
v2 = (__vector unsigned long long)vec_sld((__vector unsigned char)v2,
|
||||
(__vector unsigned char)vzero, 4);
|
||||
v3 = (__vector unsigned long long)vec_sld((__vector unsigned char)v3,
|
||||
(__vector unsigned char)vzero, 4);
|
||||
v4 = (__vector unsigned long long)vec_sld((__vector unsigned char)v4,
|
||||
(__vector unsigned char)vzero, 4);
|
||||
v5 = (__vector unsigned long long)vec_sld((__vector unsigned char)v5,
|
||||
(__vector unsigned char)vzero, 4);
|
||||
v6 = (__vector unsigned long long)vec_sld((__vector unsigned char)v6,
|
||||
(__vector unsigned char)vzero, 4);
|
||||
v7 = (__vector unsigned long long)vec_sld((__vector unsigned char)v7,
|
||||
(__vector unsigned char)vzero, 4);
|
||||
|
||||
/* xor with the last 1024 bits. */
|
||||
va0 = vec_ld(0, (__vector unsigned long long*) p);
|
||||
VEC_PERM(va0, va0, va0, vperm_const);
|
||||
|
||||
va1 = vec_ld(16, (__vector unsigned long long*) p);
|
||||
VEC_PERM(va1, va1, va1, vperm_const);
|
||||
|
||||
va2 = vec_ld(32, (__vector unsigned long long*) p);
|
||||
VEC_PERM(va2, va2, va2, vperm_const);
|
||||
|
||||
va3 = vec_ld(48, (__vector unsigned long long*) p);
|
||||
VEC_PERM(va3, va3, va3, vperm_const);
|
||||
|
||||
va4 = vec_ld(64, (__vector unsigned long long*) p);
|
||||
VEC_PERM(va4, va4, va4, vperm_const);
|
||||
|
||||
va5 = vec_ld(80, (__vector unsigned long long*) p);
|
||||
VEC_PERM(va5, va5, va5, vperm_const);
|
||||
|
||||
va6 = vec_ld(96, (__vector unsigned long long*) p);
|
||||
VEC_PERM(va6, va6, va6, vperm_const);
|
||||
|
||||
va7 = vec_ld(112, (__vector unsigned long long*) p);
|
||||
VEC_PERM(va7, va7, va7, vperm_const);
|
||||
|
||||
p = (char *)p + 128;
|
||||
|
||||
vdata0 = vec_xor(v0, va0);
|
||||
vdata1 = vec_xor(v1, va1);
|
||||
vdata2 = vec_xor(v2, va2);
|
||||
vdata3 = vec_xor(v3, va3);
|
||||
vdata4 = vec_xor(v4, va4);
|
||||
vdata5 = vec_xor(v5, va5);
|
||||
vdata6 = vec_xor(v6, va6);
|
||||
vdata7 = vec_xor(v7, va7);
|
||||
|
||||
/* Check if we have more blocks to process */
|
||||
next_block = 0;
|
||||
if (length != 0) {
|
||||
next_block = 1;
|
||||
|
||||
/* zero v0-v7 */
|
||||
v0 = vec_xor(v0, v0);
|
||||
v1 = vec_xor(v1, v1);
|
||||
v2 = vec_xor(v2, v2);
|
||||
v3 = vec_xor(v3, v3);
|
||||
v4 = vec_xor(v4, v4);
|
||||
v5 = vec_xor(v5, v5);
|
||||
v6 = vec_xor(v6, v6);
|
||||
v7 = vec_xor(v7, v7);
|
||||
}
|
||||
length = length + 128;
|
||||
|
||||
} while (next_block);
|
||||
|
||||
/* Calculate how many bytes we have left. */
|
||||
length = (len & 127);
|
||||
|
||||
/* Calculate where in (short) constant table we need to start. */
|
||||
offset = 128 - length;
|
||||
|
||||
v0 = vec_ld(offset, vcrc_short_const);
|
||||
v1 = vec_ld(offset + 16, vcrc_short_const);
|
||||
v2 = vec_ld(offset + 32, vcrc_short_const);
|
||||
v3 = vec_ld(offset + 48, vcrc_short_const);
|
||||
v4 = vec_ld(offset + 64, vcrc_short_const);
|
||||
v5 = vec_ld(offset + 80, vcrc_short_const);
|
||||
v6 = vec_ld(offset + 96, vcrc_short_const);
|
||||
v7 = vec_ld(offset + 112, vcrc_short_const);
|
||||
|
||||
offset += 128;
|
||||
|
||||
v0 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
|
||||
(__vector unsigned int)vdata0, (__vector unsigned int)v0);
|
||||
v1 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
|
||||
(__vector unsigned int)vdata1, (__vector unsigned int)v1);
|
||||
v2 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
|
||||
(__vector unsigned int)vdata2, (__vector unsigned int)v2);
|
||||
v3 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
|
||||
(__vector unsigned int)vdata3, (__vector unsigned int)v3);
|
||||
v4 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
|
||||
(__vector unsigned int)vdata4, (__vector unsigned int)v4);
|
||||
v5 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
|
||||
(__vector unsigned int)vdata5, (__vector unsigned int)v5);
|
||||
v6 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
|
||||
(__vector unsigned int)vdata6, (__vector unsigned int)v6);
|
||||
v7 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
|
||||
(__vector unsigned int)vdata7, (__vector unsigned int)v7);
|
||||
|
||||
/* Now reduce the tail (0-112 bytes). */
|
||||
for (i = 0; i < length; i+=16) {
|
||||
vdata0 = vec_ld(i,(__vector unsigned long long*)p);
|
||||
VEC_PERM(vdata0, vdata0, vdata0, vperm_const);
|
||||
va0 = vec_ld(offset + i,vcrc_short_const);
|
||||
va0 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
|
||||
(__vector unsigned int)vdata0, (__vector unsigned int)va0);
|
||||
v0 = vec_xor(v0, va0);
|
||||
}
|
||||
|
||||
/* xor all parallel chunks together. */
|
||||
v0 = vec_xor(v0, v1);
|
||||
v2 = vec_xor(v2, v3);
|
||||
v4 = vec_xor(v4, v5);
|
||||
v6 = vec_xor(v6, v7);
|
||||
|
||||
v0 = vec_xor(v0, v2);
|
||||
v4 = vec_xor(v4, v6);
|
||||
|
||||
v0 = vec_xor(v0, v4);
|
||||
}
|
||||
|
||||
/* Barrett Reduction */
|
||||
vconst1 = vec_ld(0, v_Barrett_const);
|
||||
vconst2 = vec_ld(16, v_Barrett_const);
|
||||
|
||||
v1 = (__vector unsigned long long)vec_sld((__vector unsigned char)v0,
|
||||
(__vector unsigned char)v0, 8);
|
||||
v0 = vec_xor(v1,v0);
|
||||
|
||||
/* shift left one bit */
|
||||
__vector unsigned char vsht_splat = vec_splat_u8 (1);
|
||||
v0 = (__vector unsigned long long)vec_sll((__vector unsigned char)v0, vsht_splat);
|
||||
|
||||
v0 = vec_and(v0, vmask_64bit);
|
||||
|
||||
/*
|
||||
* The reflected version of Barrett reduction. Instead of bit
|
||||
* reflecting our data (which is expensive to do), we bit reflect our
|
||||
* constants and our algorithm, which means the intermediate data in
|
||||
* our vector registers goes from 0-63 instead of 63-0. We can reflect
|
||||
* the algorithm because we don't carry in mod 2 arithmetic.
|
||||
*/
|
||||
|
||||
/* bottom 32 bits of a */
|
||||
v1 = vec_and(v0, vmask_32bit);
|
||||
|
||||
/* ma */
|
||||
v1 = __builtin_crypto_vpmsumd((__vector unsigned long long)v1,
|
||||
(__vector unsigned long long)vconst1);
|
||||
|
||||
/* bottom 32bits of ma */
|
||||
v1 = vec_and(v1, vmask_32bit);
|
||||
/* qn */
|
||||
v1 = __builtin_crypto_vpmsumd((__vector unsigned long long)v1,
|
||||
(__vector unsigned long long)vconst2);
|
||||
/* a - qn, subtraction is xor in GF(2) */
|
||||
v0 = vec_xor (v0, v1);
|
||||
|
||||
/*
|
||||
* Since we are bit reflected, the result (ie the low 32 bits) is in
|
||||
* the high 32 bits. We just need to shift it left 4 bytes
|
||||
* V0 [ 0 1 X 3 ]
|
||||
* V0 [ 0 X 2 3 ]
|
||||
*/
|
||||
|
||||
/* shift result into top 64 bits of */
|
||||
v0 = (__vector unsigned long long)vec_sld((__vector unsigned char)v0,
|
||||
(__vector unsigned char)vzero, 4);
|
||||
|
||||
#if BYTE_ORDER == BIG_ENDIAN
|
||||
return v0[0];
|
||||
#else
|
||||
return v0[1];
|
||||
#endif
|
||||
}
|
||||
31
deps/zlib-ng/arch/power/fallback_builtins.h
vendored
Normal file
31
deps/zlib-ng/arch/power/fallback_builtins.h
vendored
Normal file
@@ -0,0 +1,31 @@
|
||||
/* Helper functions to work around issues with clang builtins
|
||||
* Copyright (C) 2021 IBM Corporation
|
||||
*
|
||||
* Authors:
|
||||
* Daniel Black <daniel@linux.vnet.ibm.com>
|
||||
* Rogerio Alves <rogealve@br.ibm.com>
|
||||
* Tulio Magno Quites Machado Filho <tuliom@linux.ibm.com>
|
||||
*
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#ifndef POWER_BUILTINS_H
|
||||
#define POWER_BUILTINS_H
|
||||
|
||||
/*
|
||||
* These stubs fix clang incompatibilities with GCC builtins.
|
||||
*/
|
||||
|
||||
#ifndef __builtin_crypto_vpmsumw
|
||||
#define __builtin_crypto_vpmsumw __builtin_crypto_vpmsumb
|
||||
#endif
|
||||
#ifndef __builtin_crypto_vpmsumd
|
||||
#define __builtin_crypto_vpmsumd __builtin_crypto_vpmsumb
|
||||
#endif
|
||||
|
||||
static inline __vector unsigned long long __attribute__((overloadable))
|
||||
vec_ld(int __a, const __vector unsigned long long* __b) {
|
||||
return (__vector unsigned long long)__builtin_altivec_lvx(__a, __b);
|
||||
}
|
||||
|
||||
#endif
|
||||
42
deps/zlib-ng/arch/power/power_features.c
vendored
Normal file
42
deps/zlib-ng/arch/power/power_features.c
vendored
Normal file
@@ -0,0 +1,42 @@
|
||||
/* power_features.c - POWER feature check
|
||||
* Copyright (C) 2020 Matheus Castanho <msc@linux.ibm.com>, IBM
|
||||
* Copyright (C) 2021-2022 Mika T. Lindqvist <postmaster@raasu.org>
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#ifdef HAVE_SYS_AUXV_H
|
||||
# include <sys/auxv.h>
|
||||
#endif
|
||||
#ifdef __FreeBSD__
|
||||
# include <machine/cpu.h>
|
||||
#endif
|
||||
#include "../../zbuild.h"
|
||||
#include "power_features.h"
|
||||
|
||||
/*
 * Record in *features the POWER capabilities reported through the ELF
 * auxiliary vector (AT_HWCAP when PPC_FEATURES is defined, AT_HWCAP2 when
 * POWER_FEATURES is defined).
 *
 * A field is only ever set to 1 here; fields for capabilities that are not
 * reported are left untouched, so the caller presumably zeroes the structure
 * first (TODO confirm against the caller).
 */
void Z_INTERNAL power_check_features(struct power_cpu_features *features) {
#ifdef PPC_FEATURES
    /* Start from 0: elf_aux_info() signals failure through its return value
     * and does not fill the buffer in that case, so without the initializer
     * the bit test below could read an indeterminate value. */
    unsigned long hwcap = 0;
#ifdef __FreeBSD__
    elf_aux_info(AT_HWCAP, &hwcap, sizeof(hwcap));
#else
    hwcap = getauxval(AT_HWCAP);
#endif

    if (hwcap & PPC_FEATURE_HAS_ALTIVEC)
        features->has_altivec = 1;
#endif

#ifdef POWER_FEATURES
    /* Same reasoning as for hwcap above. */
    unsigned long hwcap2 = 0;
#ifdef __FreeBSD__
    elf_aux_info(AT_HWCAP2, &hwcap2, sizeof(hwcap2));
#else
    hwcap2 = getauxval(AT_HWCAP2);
#endif

    if (hwcap2 & PPC_FEATURE2_ARCH_2_07)
        features->has_arch_2_07 = 1;
    if (hwcap2 & PPC_FEATURE2_ARCH_3_00)
        features->has_arch_3_00 = 1;
#endif
}
|
||||
18
deps/zlib-ng/arch/power/power_features.h
vendored
Normal file
18
deps/zlib-ng/arch/power/power_features.h
vendored
Normal file
@@ -0,0 +1,18 @@
|
||||
/* power_features.h -- check for POWER CPU features
|
||||
* Copyright (C) 2020 Matheus Castanho <msc@linux.ibm.com>, IBM
|
||||
* Copyright (C) 2021 Mika T. Lindqvist <postmaster@raasu.org>
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#ifndef POWER_H_
|
||||
#define POWER_H_
|
||||
|
||||
/* Capability flags filled in by power_check_features(); a field is set to 1
 * when the matching hardware-capability bit is reported. */
struct power_cpu_features {
|
||||
int has_altivec; /* AT_HWCAP reports PPC_FEATURE_HAS_ALTIVEC */
|
||||
int has_arch_2_07; /* AT_HWCAP2 reports PPC_FEATURE2_ARCH_2_07 */
|
||||
int has_arch_3_00; /* AT_HWCAP2 reports PPC_FEATURE2_ARCH_3_00 */
|
||||
};
|
||||
|
||||
/* Sets the flags in *features for the capabilities that are reported. */
void Z_INTERNAL power_check_features(struct power_cpu_features *features);
|
||||
|
||||
#endif /* POWER_H_ */
|
||||
12
deps/zlib-ng/arch/power/slide_hash_power8.c
vendored
Normal file
12
deps/zlib-ng/arch/power/slide_hash_power8.c
vendored
Normal file
@@ -0,0 +1,12 @@
|
||||
/* Optimized slide_hash for POWER processors
|
||||
* Copyright (C) 2019-2020 IBM Corporation
|
||||
* Author: Matheus Castanho <msc@linux.ibm.com>
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#ifdef POWER8_VSX
|
||||
|
||||
#define SLIDE_PPC slide_hash_power8
|
||||
#include "slide_ppc_tpl.h"
|
||||
|
||||
#endif /* POWER8_VSX */
|
||||
10
deps/zlib-ng/arch/power/slide_hash_vmx.c
vendored
Normal file
10
deps/zlib-ng/arch/power/slide_hash_vmx.c
vendored
Normal file
@@ -0,0 +1,10 @@
|
||||
/* Optimized slide_hash for PowerPC processors with VMX instructions
|
||||
* Copyright (C) 2017-2021 Mika T. Lindqvist <postmaster@raasu.org>
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
#ifdef PPC_VMX
|
||||
|
||||
#define SLIDE_PPC slide_hash_vmx
|
||||
#include "slide_ppc_tpl.h"
|
||||
|
||||
#endif /* PPC_VMX */
|
||||
31
deps/zlib-ng/arch/power/slide_ppc_tpl.h
vendored
Normal file
31
deps/zlib-ng/arch/power/slide_ppc_tpl.h
vendored
Normal file
@@ -0,0 +1,31 @@
|
||||
/* Optimized slide_hash for PowerPC processors
|
||||
* Copyright (C) 2017-2021 Mika T. Lindqvist <postmaster@raasu.org>
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#include <altivec.h>
|
||||
#include "zbuild.h"
|
||||
#include "deflate.h"
|
||||
|
||||
/*
 * Subtract wsize from every entry of a hash table, eight entries per
 * iteration, using vec_subs (the saturating subtract of the AltiVec API).
 *
 * The loop body runs before the counter is tested and the counter drops by 8
 * each time, so callers are expected to pass a non-zero multiple of 8.
 * NOTE(review): vec_ld/vec_st at offset 0 presumably require table to be
 * 16-byte aligned -- confirm against the allocation of head/prev.
 */
static inline void slide_hash_chain(Pos *table, uint32_t entries, uint16_t wsize) {
    const vector unsigned short delta = vec_splats(wsize);
    Pos *cursor = table;
    uint32_t remaining = entries;

    do {
        /* Load one vector of entries, slide it, and write it back in place. */
        vector unsigned short slid = vec_subs(vec_ld(0, cursor), delta);
        vec_st(slid, 0, cursor);

        cursor += 8;
        remaining -= 8;
    } while (remaining > 0);
}
|
||||
|
||||
/*
 * Entry point generated under the name given by the SLIDE_PPC macro.
 * Slides both tables of the deflate state by the window size: s->head
 * (HASH_SIZE entries) and s->prev (w_size entries).
 */
void Z_INTERNAL SLIDE_PPC(deflate_state *s) {
    const uint16_t window = s->w_size;

    slide_hash_chain(s->head, HASH_SIZE, window);
    slide_hash_chain(s->prev, window, window);
}
|
||||
45
deps/zlib-ng/arch/riscv/README.md
vendored
Normal file
45
deps/zlib-ng/arch/riscv/README.md
vendored
Normal file
@@ -0,0 +1,45 @@
|
||||
# Building RISC-V Target with Cmake #
|
||||
|
||||
> **Warning**
|
||||
> RVV support cannot be detected at runtime, so running the RVV code on a target without RVV is a risk. Users should disable RVV when the target does not support it.
|
||||
>
|
||||
> We will have a better solution when the kernels update `hwcap` or `hwprobe` for risc-v.
|
||||
|
||||
## Prerequisite: Build RISC-V Clang Toolchain and QEMU ##
|
||||
|
||||
If you don't have prebuilt clang and riscv64 qemu, you can refer to the [script](https://github.com/sifive/prepare-riscv-toolchain-qemu/blob/main/prepare_riscv_toolchain_qemu.sh) to get the source. Copy the script to the zlib-ng root directory, and run it to download the source and build them. Modify the content according to your conditions (e.g., toolchain version).
|
||||
|
||||
```bash
|
||||
./prepare_riscv_toolchain_qemu.sh
|
||||
```
|
||||
|
||||
After running the script, clang & qemu are built in `build-toolchain-qemu/riscv-clang/` & `build-toolchain-qemu/riscv-qemu/`.
|
||||
|
||||
`build-toolchain-qemu/riscv-clang/` is your `TOOLCHAIN_PATH`.
|
||||
`build-toolchain-qemu/riscv-qemu/bin/qemu-riscv64` is your `QEMU_PATH`.
|
||||
|
||||
You can also download the prebuilt toolchain & qemu from [the release page](https://github.com/sifive/prepare-riscv-toolchain-qemu/releases), and enjoy using them.
|
||||
|
||||
## Cross-Compile for RISC-V Target ##
|
||||
|
||||
```bash
|
||||
cmake -G Ninja -B ./build-riscv \
|
||||
-D CMAKE_TOOLCHAIN_FILE=./cmake/toolchain-riscv.cmake \
|
||||
-D CMAKE_INSTALL_PREFIX=./build-riscv/install \
|
||||
-D TOOLCHAIN_PATH={TOOLCHAIN_PATH} \
|
||||
-D QEMU_PATH={QEMU_PATH} \
|
||||
.
|
||||
|
||||
cmake --build ./build-riscv
|
||||
```
|
||||
|
||||
Disable the option if there is no RVV support:
|
||||
```
|
||||
-D WITH_RVV=OFF
|
||||
```
|
||||
|
||||
## Run Unittests on User Mode QEMU ##
|
||||
|
||||
```bash
|
||||
cd ./build-riscv && ctest --verbose
|
||||
```
|
||||
15
deps/zlib-ng/arch/riscv/riscv_features.c
vendored
Normal file
15
deps/zlib-ng/arch/riscv/riscv_features.c
vendored
Normal file
@@ -0,0 +1,15 @@
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "../../zbuild.h"
|
||||
#include "riscv_features.h"
|
||||
|
||||
/* TODO: detect risc-v cpu info at runtime when the kernel updates hwcap or hwprobe for risc-v */
|
||||
/*
 * Report RVV availability in features->has_rvv.
 *
 * The decision is made at compile time: the flag is 1 only when the compiler
 * defines __riscv_v and the target is Linux, and 0 otherwise.
 */
void Z_INTERNAL riscv_check_features(struct riscv_cpu_features *features) {
    int rvv_available = 0;

#if defined(__riscv_v) && defined(__linux__)
    rvv_available = 1;
#endif

    features->has_rvv = rvv_available;
}
|
||||
18
deps/zlib-ng/arch/riscv/riscv_features.h
vendored
Normal file
18
deps/zlib-ng/arch/riscv/riscv_features.h
vendored
Normal file
@@ -0,0 +1,18 @@
|
||||
/* riscv_features.h -- check for riscv features.
|
||||
*
|
||||
* Copyright (C) 2023 SiFive, Inc. All rights reserved.
|
||||
* Contributed by Alex Chiang <alex.chiang@sifive.com>
|
||||
*
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#ifndef RISCV_H_
|
||||
#define RISCV_H_
|
||||
|
||||
/* Feature flags filled in by riscv_check_features(). */
struct riscv_cpu_features {
|
||||
int has_rvv; /* 1 when built with __riscv_v defined on Linux, otherwise 0 */
|
||||
};
|
||||
|
||||
/* Writes the RVV availability flag into *features. */
void Z_INTERNAL riscv_check_features(struct riscv_cpu_features *features);
|
||||
|
||||
#endif /* RISCV_H_ */
|
||||
54
deps/zlib-ng/arch/s390/Makefile.in
vendored
Normal file
54
deps/zlib-ng/arch/s390/Makefile.in
vendored
Normal file
@@ -0,0 +1,54 @@
|
||||
# Makefile for zlib-ng
|
||||
# Copyright (C) 1995-2013 Jean-loup Gailly, Mark Adler
|
||||
# For conditions of distribution and use, see copyright notice in zlib.h
|
||||
|
||||
# Toolchain settings. They are empty in this template; presumably the
# configure step fills them in when it generates the real Makefile
# (confirm against the configure script).
CC=
CFLAGS=
SFLAGS=
INCLUDES=
SUFFIX=
VGFMAFLAG=
NOLTOFLAG=

SRCDIR=.
SRCTOP=../..
TOPDIR=$(SRCTOP)

# The cleanup targets do not produce files of the same name. Declaring them
# phony keeps them working even if a file called "clean" (or similar) exists.
.PHONY: mostlyclean clean distclean

# Every source is compiled twice: the .o object uses CFLAGS and the .lo
# object uses SFLAGS.
s390_features.o:
	$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/s390_features.c

s390_features.lo:
	$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/s390_features.c

dfltcc_common.o:
	$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/dfltcc_common.c

dfltcc_common.lo:
	$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/dfltcc_common.c

dfltcc_deflate.o:
	$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/dfltcc_deflate.c

dfltcc_deflate.lo:
	$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/dfltcc_deflate.c

dfltcc_inflate.o:
	$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/dfltcc_inflate.c

dfltcc_inflate.lo:
	$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/dfltcc_inflate.c

# crc32-vx additionally receives VGFMAFLAG and NOLTOFLAG.
crc32-vx.o:
	$(CC) $(CFLAGS) $(VGFMAFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32-vx.c

crc32-vx.lo:
	$(CC) $(SFLAGS) $(VGFMAFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32-vx.c

mostlyclean: clean
clean:
	rm -f *.o *.lo *~
	rm -rf objs
	rm -f *.gcda *.gcno *.gcov

distclean: clean
	rm -f Makefile
|
||||
284
deps/zlib-ng/arch/s390/README.md
vendored
Normal file
284
deps/zlib-ng/arch/s390/README.md
vendored
Normal file
@@ -0,0 +1,284 @@
|
||||
# Introduction
|
||||
|
||||
This directory contains SystemZ deflate hardware acceleration support.
|
||||
It can be enabled using the following build commands:
|
||||
|
||||
$ ./configure --with-dfltcc-deflate --with-dfltcc-inflate
|
||||
$ make
|
||||
|
||||
or
|
||||
|
||||
$ cmake -DWITH_DFLTCC_DEFLATE=1 -DWITH_DFLTCC_INFLATE=1 .
|
||||
$ make
|
||||
|
||||
When built like this, zlib-ng compresses using hardware on level 1,
|
||||
and using software on all other levels. Decompression will always happen
|
||||
in hardware. In order to enable hardware compression for levels 1-6
|
||||
(i.e. to make it used by default) one could add
|
||||
`-DDFLTCC_LEVEL_MASK=0x7e` to CFLAGS when building zlib-ng.
|
||||
|
||||
SystemZ deflate hardware acceleration is available on [IBM z15](
|
||||
https://www.ibm.com/products/z15) and newer machines under the name [
|
||||
"Integrated Accelerator for zEnterprise Data Compression"](
|
||||
https://www.ibm.com/support/z-content-solutions/compression/). The
|
||||
programming interface to it is a machine instruction called DEFLATE
|
||||
CONVERSION CALL (DFLTCC). It is documented in Chapter 26 of [Principles
|
||||
of Operation](https://publibfp.dhe.ibm.com/epubs/pdf/a227832c.pdf). Both
|
||||
the code and the rest of this document refer to this feature simply as
|
||||
"DFLTCC".
|
||||
|
||||
# Performance
|
||||
|
||||
Performance figures are published [here](
|
||||
https://github.com/iii-i/zlib-ng/wiki/Performance-with-dfltcc-patch-applied-and-dfltcc-support-built-on-dfltcc-enabled-machine
|
||||
). The compression speed-up can be as high as 110x and the decompression
|
||||
speed-up can be as high as 15x.
|
||||
|
||||
# Limitations
|
||||
|
||||
Two DFLTCC compression calls with identical inputs are not guaranteed to
|
||||
produce identical outputs. Therefore care should be taken when using
|
||||
hardware compression when reproducible results are desired. In
|
||||
particular, zlib-ng-specific `zng_deflateSetParams` call allows setting
|
||||
`Z_DEFLATE_REPRODUCIBLE` parameter, which disables DFLTCC support for a
|
||||
particular stream.
|
||||
|
||||
DFLTCC does not support every single zlib-ng feature, in particular:
|
||||
|
||||
* `inflate(Z_BLOCK)` and `inflate(Z_TREES)`
|
||||
* `inflateMark()`
|
||||
* `inflatePrime()`
|
||||
* `inflateSyncPoint()`
|
||||
|
||||
When used, these functions will either switch to software, or, in case
|
||||
this is not possible, gracefully fail.
|
||||
|
||||
# Code structure
|
||||
|
||||
All SystemZ-specific code lives in `arch/s390` directory and is
|
||||
integrated with the rest of zlib-ng using hook macros.
|
||||
|
||||
## Hook macros
|
||||
|
||||
DFLTCC takes as arguments a parameter block, an input buffer, an output
|
||||
buffer and a window. `ZALLOC_DEFLATE_STATE()`, `ZALLOC_INFLATE_STATE()`,
|
||||
`ZFREE_STATE()`, `ZCOPY_DEFLATE_STATE()`, `ZCOPY_INFLATE_STATE()`,
|
||||
`ZALLOC_WINDOW()`, `ZCOPY_WINDOW()` and `TRY_FREE_WINDOW()` macros encapsulate
|
||||
allocation details for the parameter block (which is allocated alongside
|
||||
zlib-ng state) and the window (which must be page-aligned and large enough).
|
||||
|
||||
Software and hardware window formats do not match, therefore,
|
||||
`deflateSetDictionary()`, `deflateGetDictionary()`, `inflateSetDictionary()`
|
||||
and `inflateGetDictionary()` need special handling, which is triggered using
|
||||
`DEFLATE_SET_DICTIONARY_HOOK()`, `DEFLATE_GET_DICTIONARY_HOOK()`,
|
||||
`INFLATE_SET_DICTIONARY_HOOK()` and `INFLATE_GET_DICTIONARY_HOOK()` macros.
|
||||
|
||||
`deflateResetKeep()` and `inflateResetKeep()` update the DFLTCC
|
||||
parameter block using `DEFLATE_RESET_KEEP_HOOK()` and
|
||||
`INFLATE_RESET_KEEP_HOOK()` macros.
|
||||
|
||||
`INFLATE_PRIME_HOOK()`, `INFLATE_MARK_HOOK()` and
|
||||
`INFLATE_SYNC_POINT_HOOK()` macros make the respective unsupported
|
||||
calls gracefully fail.
|
||||
|
||||
`DEFLATE_PARAMS_HOOK()` implements switching between hardware and
|
||||
software compression mid-stream using `deflateParams()`. Switching
|
||||
normally entails flushing the current block, which might not be possible
|
||||
in low memory situations. `deflateParams()` uses `DEFLATE_DONE()` hook
|
||||
in order to detect and gracefully handle such situations.
|
||||
|
||||
The algorithm implemented in hardware has a different compression ratio
|
||||
than the one implemented in software. `DEFLATE_BOUND_ADJUST_COMPLEN()`
|
||||
and `DEFLATE_NEED_CONSERVATIVE_BOUND()` macros make `deflateBound()`
|
||||
return the correct results for the hardware implementation.
|
||||
|
||||
Actual compression and decompression are handled by `DEFLATE_HOOK()` and
|
||||
`INFLATE_TYPEDO_HOOK()` macros. Since inflation with DFLTCC manages the
|
||||
window on its own, calling `updatewindow()` is suppressed using
|
||||
`INFLATE_NEED_UPDATEWINDOW()` macro.
|
||||
|
||||
In addition to compression, DFLTCC computes CRC-32 and Adler-32
|
||||
checksums, therefore, whenever it's used, software checksumming is
|
||||
suppressed using `DEFLATE_NEED_CHECKSUM()` and `INFLATE_NEED_CHECKSUM()`
|
||||
macros.
|
||||
|
||||
While software always produces reproducible compression results, this
|
||||
is not the case for DFLTCC. Therefore, zlib-ng users are given the
|
||||
ability to specify whether or not reproducible compression results
|
||||
are required. While it is always possible to specify this setting
|
||||
before the compression begins, it is not always possible to do so in
|
||||
the middle of a deflate stream - the exact conditions for that are
|
||||
determined by `DEFLATE_CAN_SET_REPRODUCIBLE()` macro.
|
||||
|
||||
## SystemZ-specific code
|
||||
|
||||
When zlib-ng is built with DFLTCC, the hooks described above are
|
||||
converted to calls to functions, which are implemented in
|
||||
`arch/s390/dfltcc_*` files. The functions can be grouped in three broad
|
||||
categories:
|
||||
|
||||
* Base DFLTCC support, e.g. wrapping the machine instruction -
|
||||
`dfltcc()` and allocating aligned memory - `dfltcc_alloc_state()`.
|
||||
* Translating between software and hardware data formats, e.g.
|
||||
`dfltcc_deflate_set_dictionary()`.
|
||||
* Translating between software and hardware state machines, e.g.
|
||||
`dfltcc_deflate()` and `dfltcc_inflate()`.
|
||||
|
||||
The functions from the first two categories are fairly simple, however,
|
||||
various quirks in both software and hardware state machines make the
|
||||
functions from the third category quite complicated.
|
||||
|
||||
### `dfltcc_deflate()` function
|
||||
|
||||
This function is called by `deflate()` and has the following
|
||||
responsibilities:
|
||||
|
||||
* Checking whether DFLTCC can be used with the current stream. If this
|
||||
is not the case, then it returns `0`, making `deflate()` use some
|
||||
other function in order to compress in software. Otherwise it returns
|
||||
`1`.
|
||||
* Block management and Huffman table generation. DFLTCC ends blocks only
|
||||
when explicitly instructed to do so by the software. Furthermore,
|
||||
whether to use fixed or dynamic Huffman tables must also be determined
|
||||
by the software. Since looking at data in order to gather statistics
|
||||
would negate performance benefits, the following approach is used: the
|
||||
first `DFLTCC_FIRST_FHT_BLOCK_SIZE` bytes are placed into a fixed
|
||||
block, and every next `DFLTCC_BLOCK_SIZE` bytes are placed into
|
||||
dynamic blocks.
|
||||
* Writing EOBS. Block Closing Control bit in the parameter block
|
||||
instructs DFLTCC to write EOBS, however, certain conditions need to be
|
||||
met: input data length must be non-zero or Continuation Flag must be
|
||||
set. To put this in simpler terms, DFLTCC will silently refuse to
|
||||
write EOBS if this is the only thing that it is asked to do. Since the
|
||||
code has to be able to emit EOBS in software anyway, in order to avoid
|
||||
tricky corner cases Block Closing Control is never used. Whether to
|
||||
write EOBS is instead controlled by `soft_bcc` variable.
|
||||
* Triggering block post-processing. Depending on flush mode, `deflate()`
|
||||
must perform various additional actions when a block or a stream ends.
|
||||
`dfltcc_deflate()` informs `deflate()` about this using
|
||||
`block_state *result` parameter.
|
||||
* Converting software state fields into hardware parameter block fields,
|
||||
and vice versa. For example, `wrap` and Check Value Type or `bi_valid`
|
||||
and Sub-Byte Boundary. Certain fields cannot be translated and must
|
||||
persist untouched in the parameter block between calls, for example,
|
||||
Continuation Flag or Continuation State Buffer.
|
||||
* Handling flush modes and low-memory situations. These aspects are
|
||||
quite intertwined and pervasive. The general idea here is that the
|
||||
code must not do anything in software - whether explicitly by e.g.
|
||||
calling `send_eobs()`, or implicitly - by returning to `deflate()`
|
||||
with certain return and `*result` values, when Continuation Flag is
|
||||
set.
|
||||
* Ending streams. When a new block is started and flush mode is
|
||||
`Z_FINISH`, Block Header Final parameter block bit is used to mark
|
||||
this block as final. However, sometimes an empty final block is
|
||||
needed, and, unfortunately, just like with EOBS, DFLTCC will silently
|
||||
refuse to do this. The general idea of DFLTCC implementation is to
|
||||
rely as much as possible on the existing code. Here in order to do
|
||||
this, the code pretends that it does not support DFLTCC, which makes
|
||||
`deflate()` call a software compression function, which writes an
|
||||
empty final block. Whether this is required is controlled by
|
||||
`need_empty_block` variable.
|
||||
* Error handling. This is simply converting
|
||||
Operation-Ending-Supplemental Code to string. Errors can only happen
|
||||
due to things like memory corruption, and therefore they don't affect
|
||||
the `deflate()` return code.
|
||||
|
||||
### `dfltcc_inflate()` function
|
||||
|
||||
This function is called by `inflate()` from the `TYPEDO` state (that is,
|
||||
when all the metadata is parsed and the stream is positioned at the type
|
||||
bits of deflate block header) and it's responsible for the following:
|
||||
|
||||
* Falling back to software when flush mode is `Z_BLOCK` or `Z_TREES`.
|
||||
Unfortunately, there is no way to ask DFLTCC to stop decompressing on
|
||||
block or tree boundary.
|
||||
* `inflate()` decompression loop management. This is controlled using
|
||||
the return value, which can be either `DFLTCC_INFLATE_BREAK` or
|
||||
`DFLTCC_INFLATE_CONTINUE`.
|
||||
* Converting software state fields into hardware parameter block fields,
|
||||
and vice versa. For example, `whave` and History Length or `wnext` and
|
||||
History Offset.
|
||||
* Ending streams. This instructs `inflate()` to return `Z_STREAM_END`
|
||||
and is controlled by `last` state field.
|
||||
* Error handling. Like deflate, error handling comprises
|
||||
Operation-Ending-Supplemental Code to string conversion. Unlike
|
||||
deflate, errors may happen due to bad inputs, therefore they are
|
||||
propagated to `inflate()` by setting `mode` field to `MEM` or `BAD`.
|
||||
|
||||
# Testing
|
||||
|
||||
Given the complexity of the DFLTCC machine instruction, it is not clear whether
|
||||
QEMU TCG will ever support it. At the time of writing, one has to have
|
||||
access to an IBM z15+ VM or LPAR in order to test DFLTCC support. Since
|
||||
DFLTCC is a non-privileged instruction, neither special VM/LPAR
|
||||
configuration nor root are required.
|
||||
|
||||
zlib-ng CI uses an IBM-provided z15 self-hosted builder for the DFLTCC
|
||||
testing. There are no IBM Z builds of GitHub Actions runner, and
|
||||
stable qemu-user has problems with .NET apps, so the builder runs the
|
||||
x86_64 runner version with qemu-user built from the master branch.
|
||||
|
||||
## Configuring the builder.
|
||||
|
||||
### Install prerequisites.
|
||||
|
||||
```
|
||||
$ sudo dnf install docker
|
||||
```
|
||||
|
||||
### Add services.
|
||||
|
||||
```
|
||||
$ sudo cp self-hosted-builder/*.service /etc/systemd/system/
|
||||
$ sudo systemctl daemon-reload
|
||||
```
|
||||
|
||||
### Create a config file.
|
||||
|
||||
```
|
||||
$ sudo tee /etc/actions-runner
|
||||
repo=<owner>/<name>
|
||||
access_token=<ghp_***>
|
||||
```
|
||||
|
||||
Access token should have the repo scope, consult
|
||||
https://docs.github.com/en/rest/reference/actions#create-a-registration-token-for-a-repository
|
||||
for details.
|
||||
|
||||
### Autostart the x86_64 emulation support.
|
||||
|
||||
```
|
||||
$ sudo systemctl enable --now qemu-user-static
|
||||
```
|
||||
|
||||
### Autostart the runner.
|
||||
|
||||
```
|
||||
$ sudo systemctl enable --now actions-runner
|
||||
```
|
||||
|
||||
## Rebuilding the image
|
||||
|
||||
In order to update the `iiilinuxibmcom/actions-runner` image, e.g. to get the
|
||||
latest OS security fixes, use the following commands:
|
||||
|
||||
```
|
||||
$ sudo docker build \
|
||||
--pull \
|
||||
-f self-hosted-builder/actions-runner.Dockerfile \
|
||||
-t iiilinuxibmcom/actions-runner .
|
||||
$ sudo systemctl restart actions-runner
|
||||
```
|
||||
|
||||
## Removing persistent data
|
||||
|
||||
The `actions-runner` service stores various temporary data, such as runner
|
||||
registration information, work directories and logs, in the `actions-runner`
|
||||
volume. In order to remove it and start from scratch, e.g. when switching the
|
||||
runner to a different repository, use the following commands:
|
||||
|
||||
```
|
||||
$ sudo systemctl stop actions-runner
|
||||
$ sudo docker rm -f actions-runner
|
||||
$ sudo docker volume rm actions-runner
|
||||
```
|
||||
222
deps/zlib-ng/arch/s390/crc32-vx.c
vendored
Normal file
222
deps/zlib-ng/arch/s390/crc32-vx.c
vendored
Normal file
@@ -0,0 +1,222 @@
|
||||
/*
|
||||
* Hardware-accelerated CRC-32 variants for Linux on z Systems
|
||||
*
|
||||
* Use the z/Architecture Vector Extension Facility to accelerate the
|
||||
* computing of bitreflected CRC-32 checksums.
|
||||
*
|
||||
* This CRC-32 implementation algorithm is bitreflected and processes
|
||||
* the least-significant bit first (Little-Endian).
|
||||
*
|
||||
* This code was originally written by Hendrik Brueckner
|
||||
* <brueckner@linux.vnet.ibm.com> for use in the Linux kernel and has been
|
||||
* relicensed under the zlib license.
|
||||
*/
|
||||
|
||||
#include "../../zbuild.h"
|
||||
#include "crc32_braid_p.h"
|
||||
|
||||
#include <vecintrin.h>
|
||||
|
||||
/* 16-byte vector of unsigned char elements. */
typedef unsigned char uv16qi __attribute__((vector_size(16)));
|
||||
/* 16-byte vector of unsigned int elements. */
typedef unsigned int uv4si __attribute__((vector_size(16)));
|
||||
/* 16-byte vector of unsigned long long elements. */
typedef unsigned long long uv2di __attribute__((vector_size(16)));
|
||||
|
||||
static uint32_t crc32_le_vgfm_16(uint32_t crc, const uint8_t *buf, size_t len) {
|
||||
/*
|
||||
* The CRC-32 constant block contains reduction constants to fold and
|
||||
* process particular chunks of the input data stream in parallel.
|
||||
*
|
||||
* For the CRC-32 variants, the constants are precomputed according to
|
||||
* these definitions:
|
||||
*
|
||||
* R1 = [(x4*128+32 mod P'(x) << 32)]' << 1
|
||||
* R2 = [(x4*128-32 mod P'(x) << 32)]' << 1
|
||||
* R3 = [(x128+32 mod P'(x) << 32)]' << 1
|
||||
* R4 = [(x128-32 mod P'(x) << 32)]' << 1
|
||||
* R5 = [(x64 mod P'(x) << 32)]' << 1
|
||||
* R6 = [(x32 mod P'(x) << 32)]' << 1
|
||||
*
|
||||
* The bitreflected Barret reduction constant, u', is defined as
|
||||
* the bit reversal of floor(x**64 / P(x)).
|
||||
*
|
||||
* where P(x) is the polynomial in the normal domain and the P'(x) is the
|
||||
* polynomial in the reversed (bitreflected) domain.
|
||||
*
|
||||
* CRC-32 (IEEE 802.3 Ethernet, ...) polynomials:
|
||||
*
|
||||
* P(x) = 0x04C11DB7
|
||||
* P'(x) = 0xEDB88320
|
||||
*/
|
||||
const uv16qi perm_le2be = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}; /* BE->LE mask */
|
||||
const uv2di r2r1 = {0x1C6E41596, 0x154442BD4}; /* R2, R1 */
|
||||
const uv2di r4r3 = {0x0CCAA009E, 0x1751997D0}; /* R4, R3 */
|
||||
const uv2di r5 = {0, 0x163CD6124}; /* R5 */
|
||||
const uv2di ru_poly = {0, 0x1F7011641}; /* u' */
|
||||
const uv2di crc_poly = {0, 0x1DB710641}; /* P'(x) << 1 */
|
||||
|
||||
/*
|
||||
* Load the initial CRC value.
|
||||
*
|
||||
* The CRC value is loaded into the rightmost word of the
|
||||
* vector register and is later XORed with the LSB portion
|
||||
* of the loaded input data.
|
||||
*/
|
||||
uv2di v0 = {0, 0};
|
||||
v0 = (uv2di)vec_insert(crc, (uv4si)v0, 3);
|
||||
|
||||
/* Load a 64-byte data chunk and XOR with CRC */
|
||||
uv2di v1 = vec_perm(((uv2di *)buf)[0], ((uv2di *)buf)[0], perm_le2be);
|
||||
uv2di v2 = vec_perm(((uv2di *)buf)[1], ((uv2di *)buf)[1], perm_le2be);
|
||||
uv2di v3 = vec_perm(((uv2di *)buf)[2], ((uv2di *)buf)[2], perm_le2be);
|
||||
uv2di v4 = vec_perm(((uv2di *)buf)[3], ((uv2di *)buf)[3], perm_le2be);
|
||||
|
||||
v1 ^= v0;
|
||||
buf += 64;
|
||||
len -= 64;
|
||||
|
||||
while (len >= 64) {
|
||||
/* Load the next 64-byte data chunk */
|
||||
uv16qi part1 = vec_perm(((uv16qi *)buf)[0], ((uv16qi *)buf)[0], perm_le2be);
|
||||
uv16qi part2 = vec_perm(((uv16qi *)buf)[1], ((uv16qi *)buf)[1], perm_le2be);
|
||||
uv16qi part3 = vec_perm(((uv16qi *)buf)[2], ((uv16qi *)buf)[2], perm_le2be);
|
||||
uv16qi part4 = vec_perm(((uv16qi *)buf)[3], ((uv16qi *)buf)[3], perm_le2be);
|
||||
|
||||
/*
|
||||
* Perform a GF(2) multiplication of the doublewords in V1 with
|
||||
* the R1 and R2 reduction constants in V0. The intermediate result
|
||||
* is then folded (accumulated) with the next data chunk in PART1 and
|
||||
* stored in V1. Repeat this step for the register contents
|
||||
* in V2, V3, and V4 respectively.
|
||||
*/
|
||||
v1 = (uv2di)vec_gfmsum_accum_128(r2r1, v1, part1);
|
||||
v2 = (uv2di)vec_gfmsum_accum_128(r2r1, v2, part2);
|
||||
v3 = (uv2di)vec_gfmsum_accum_128(r2r1, v3, part3);
|
||||
v4 = (uv2di)vec_gfmsum_accum_128(r2r1, v4, part4);
|
||||
|
||||
buf += 64;
|
||||
len -= 64;
|
||||
}
|
||||
|
||||
/*
|
||||
* Fold V1 to V4 into a single 128-bit value in V1. Multiply V1 with R3
|
||||
* and R4 and accumulating the next 128-bit chunk until a single 128-bit
|
||||
* value remains.
|
||||
*/
|
||||
v1 = (uv2di)vec_gfmsum_accum_128(r4r3, v1, (uv16qi)v2);
|
||||
v1 = (uv2di)vec_gfmsum_accum_128(r4r3, v1, (uv16qi)v3);
|
||||
v1 = (uv2di)vec_gfmsum_accum_128(r4r3, v1, (uv16qi)v4);
|
||||
|
||||
while (len >= 16) {
|
||||
/* Load next data chunk */
|
||||
v2 = vec_perm(*(uv2di *)buf, *(uv2di *)buf, perm_le2be);
|
||||
|
||||
/* Fold next data chunk */
|
||||
v1 = (uv2di)vec_gfmsum_accum_128(r4r3, v1, (uv16qi)v2);
|
||||
|
||||
buf += 16;
|
||||
len -= 16;
|
||||
}
|
||||
|
||||
/*
|
||||
* Set up a vector register for byte shifts. The shift value must
|
||||
* be loaded in bits 1-4 in byte element 7 of a vector register.
|
||||
* Shift by 8 bytes: 0x40
|
||||
* Shift by 4 bytes: 0x20
|
||||
*/
|
||||
uv16qi v9 = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
|
||||
v9 = vec_insert((unsigned char)0x40, v9, 7);
|
||||
|
||||
/*
|
||||
* Prepare V0 for the next GF(2) multiplication: shift V0 by 8 bytes
|
||||
* to move R4 into the rightmost doubleword and set the leftmost
|
||||
* doubleword to 0x1.
|
||||
*/
|
||||
v0 = vec_srb(r4r3, (uv2di)v9);
|
||||
v0[0] = 1;
|
||||
|
||||
/*
|
||||
* Compute GF(2) product of V1 and V0. The rightmost doubleword
|
||||
* of V1 is multiplied with R4. The leftmost doubleword of V1 is
|
||||
* multiplied by 0x1 and is then XORed with rightmost product.
|
||||
* Implicitly, the intermediate leftmost product becomes padded
|
||||
*/
|
||||
v1 = (uv2di)vec_gfmsum_128(v0, v1);
|
||||
|
||||
/*
|
||||
* Now do the final 32-bit fold by multiplying the rightmost word
|
||||
* in V1 with R5 and XOR the result with the remaining bits in V1.
|
||||
*
|
||||
* To achieve this by a single VGFMAG, right shift V1 by a word
|
||||
* and store the result in V2 which is then accumulated. Use the
|
||||
* vector unpack instruction to load the rightmost half of the
|
||||
* doubleword into the rightmost doubleword element of V1; the other
|
||||
* half is loaded in the leftmost doubleword.
|
||||
* The vector register with CONST_R5 contains the R5 constant in the
|
||||
* rightmost doubleword and the leftmost doubleword is zero to ignore
|
||||
* the leftmost product of V1.
|
||||
*/
|
||||
v9 = vec_insert((unsigned char)0x20, v9, 7);
|
||||
v2 = vec_srb(v1, (uv2di)v9);
|
||||
v1 = vec_unpackl((uv4si)v1); /* Split rightmost doubleword */
|
||||
v1 = (uv2di)vec_gfmsum_accum_128(r5, v1, (uv16qi)v2);
|
||||
|
||||
/*
|
||||
* Apply a Barret reduction to compute the final 32-bit CRC value.
|
||||
*
|
||||
* The input values to the Barret reduction are the degree-63 polynomial
|
||||
* in V1 (R(x)), degree-32 generator polynomial, and the reduction
|
||||
* constant u. The Barret reduction result is the CRC value of R(x) mod
|
||||
* P(x).
|
||||
*
|
||||
* The Barret reduction algorithm is defined as:
|
||||
*
|
||||
* 1. T1(x) = floor( R(x) / x^32 ) GF2MUL u
|
||||
* 2. T2(x) = floor( T1(x) / x^32 ) GF2MUL P(x)
|
||||
* 3. C(x) = R(x) XOR T2(x) mod x^32
|
||||
*
|
||||
* Note: The leftmost doubleword of vector register containing
|
||||
* CONST_RU_POLY is zero and, thus, the intermediate GF(2) product
|
||||
* is zero and does not contribute to the final result.
|
||||
*/
|
||||
|
||||
/* T1(x) = floor( R(x) / x^32 ) GF2MUL u */
|
||||
v2 = vec_unpackl((uv4si)v1);
|
||||
v2 = (uv2di)vec_gfmsum_128(ru_poly, v2);
|
||||
|
||||
/*
|
||||
* Compute the GF(2) product of the CRC polynomial with T1(x) in
|
||||
* V2 and XOR the intermediate result, T2(x), with the value in V1.
|
||||
* The final result is stored in word element 2 of V2.
|
||||
*/
|
||||
v2 = vec_unpackl((uv4si)v2);
|
||||
v2 = (uv2di)vec_gfmsum_accum_128(crc_poly, v2, (uv16qi)v1);
|
||||
|
||||
return ((uv4si)v2)[2];
|
||||
}
|
||||
|
||||
#define VX_MIN_LEN 64
|
||||
#define VX_ALIGNMENT 16L
|
||||
#define VX_ALIGN_MASK (VX_ALIGNMENT - 1)
|
||||
|
||||
uint32_t Z_INTERNAL crc32_s390_vx(uint32_t crc, const unsigned char *buf, size_t len) {
|
||||
size_t prealign, aligned, remaining;
|
||||
|
||||
if (len < VX_MIN_LEN + VX_ALIGN_MASK)
|
||||
return PREFIX(crc32_braid)(crc, buf, len);
|
||||
|
||||
if ((uintptr_t)buf & VX_ALIGN_MASK) {
|
||||
prealign = VX_ALIGNMENT - ((uintptr_t)buf & VX_ALIGN_MASK);
|
||||
len -= prealign;
|
||||
crc = PREFIX(crc32_braid)(crc, buf, prealign);
|
||||
buf += prealign;
|
||||
}
|
||||
aligned = len & ~VX_ALIGN_MASK;
|
||||
remaining = len & VX_ALIGN_MASK;
|
||||
|
||||
crc = crc32_le_vgfm_16(crc ^ 0xffffffff, buf, aligned) ^ 0xffffffff;
|
||||
|
||||
if (remaining)
|
||||
crc = PREFIX(crc32_braid)(crc, buf + aligned, remaining);
|
||||
|
||||
return crc;
|
||||
}
|
||||
40
deps/zlib-ng/arch/s390/dfltcc_common.c
vendored
Normal file
40
deps/zlib-ng/arch/s390/dfltcc_common.c
vendored
Normal file
@@ -0,0 +1,40 @@
|
||||
/* dfltcc_common.c - IBM Z DEFLATE CONVERSION CALL general support. */
|
||||
|
||||
#include "zbuild.h"
|
||||
#include "dfltcc_common.h"
|
||||
#include "dfltcc_detail.h"
|
||||
|
||||
/*
|
||||
Memory management.
|
||||
|
||||
DFLTCC requires parameter blocks and window to be aligned. zlib-ng allows
|
||||
users to specify their own allocation functions, so using e.g.
|
||||
`posix_memalign' is not an option. Thus, we overallocate and take the
|
||||
aligned portion of the buffer.
|
||||
*/
|
||||
|
||||
static const int PAGE_ALIGN = 0x1000;
|
||||
|
||||
void Z_INTERNAL *PREFIX(dfltcc_alloc_window)(PREFIX3(streamp) strm, uInt items, uInt size) {
|
||||
void *p;
|
||||
void *w;
|
||||
|
||||
/* To simplify freeing, we store the pointer to the allocated buffer right
|
||||
* before the window. Note that DFLTCC always uses HB_SIZE bytes.
|
||||
*/
|
||||
p = ZALLOC(strm, sizeof(void *) + MAX(items * size, HB_SIZE) + PAGE_ALIGN, sizeof(unsigned char));
|
||||
if (p == NULL)
|
||||
return NULL;
|
||||
w = ALIGN_UP((char *)p + sizeof(void *), PAGE_ALIGN);
|
||||
*(void **)((char *)w - sizeof(void *)) = p;
|
||||
return w;
|
||||
}
|
||||
|
||||
/*
 * Copy a window from src to dest.
 *
 * The number of bytes copied is the larger of n and HB_SIZE, so at least
 * HB_SIZE bytes are always copied.
 */
void Z_INTERNAL PREFIX(dfltcc_copy_window)(void *dest, const void *src, size_t n) {
    const size_t nbytes = MAX(n, HB_SIZE);

    memcpy(dest, src, nbytes);
}
|
||||
|
||||
/*
 * Free a window. The pointer stored immediately before w is the one passed
 * to ZFREE. A NULL w is ignored.
 */
void Z_INTERNAL PREFIX(dfltcc_free_window)(PREFIX3(streamp) strm, void *w) {
    void **backlink;

    if (w == NULL)
        return;

    backlink = (void **)((unsigned char *)w - sizeof(void *));
    ZFREE(strm, *backlink);
}
|
||||
44
deps/zlib-ng/arch/s390/dfltcc_common.h
vendored
Normal file
44
deps/zlib-ng/arch/s390/dfltcc_common.h
vendored
Normal file
@@ -0,0 +1,44 @@
|
||||
#ifndef DFLTCC_COMMON_H
|
||||
#define DFLTCC_COMMON_H
|
||||
|
||||
#include "zutil.h"
|
||||
|
||||
/* Window allocation, copy and release helpers. */
void Z_INTERNAL *PREFIX(dfltcc_alloc_window)(PREFIX3(streamp) strm, uInt items, uInt size);
|
||||
void Z_INTERNAL PREFIX(dfltcc_copy_window)(void *dest, const void *src, size_t n);
|
||||
void Z_INTERNAL PREFIX(dfltcc_free_window)(PREFIX3(streamp) strm, void *w);
|
||||
|
||||
/* State is released with the plain ZFREE. */
#define ZFREE_STATE ZFREE
|
||||
|
||||
/* Window handling is mapped to the DFLTCC helpers declared above. */
#define ZALLOC_WINDOW PREFIX(dfltcc_alloc_window)
|
||||
|
||||
#define ZCOPY_WINDOW PREFIX(dfltcc_copy_window)
|
||||
|
||||
#define ZFREE_WINDOW PREFIX(dfltcc_free_window)
|
||||
|
||||
#define TRY_FREE_WINDOW PREFIX(dfltcc_free_window)
|
||||
|
||||
/* Bit counts and limits that feed DEFLATE_BOUND_COMPLEN below. */
#define DFLTCC_BLOCK_HEADER_BITS 3
|
||||
#define DFLTCC_HLITS_COUNT_BITS 5
|
||||
#define DFLTCC_HDISTS_COUNT_BITS 5
|
||||
#define DFLTCC_HCLENS_COUNT_BITS 4
|
||||
#define DFLTCC_MAX_HCLENS 19
|
||||
#define DFLTCC_HCLEN_BITS 3
|
||||
#define DFLTCC_MAX_HLITS 286
|
||||
#define DFLTCC_MAX_HDISTS 30
|
||||
#define DFLTCC_MAX_HLIT_HDIST_BITS 7
|
||||
#define DFLTCC_MAX_SYMBOL_BITS 16
|
||||
#define DFLTCC_MAX_EOBS_BITS 15
|
||||
#define DFLTCC_MAX_PADDING_BITS 7
|
||||
|
||||
/* Sum of the bit counts above, with DFLTCC_MAX_SYMBOL_BITS charged per byte
 * of source_len, converted from bits to bytes by the final shift (rounding
 * down).
 */
#define DEFLATE_BOUND_COMPLEN(source_len) \
|
||||
((DFLTCC_BLOCK_HEADER_BITS + \
|
||||
DFLTCC_HLITS_COUNT_BITS + \
|
||||
DFLTCC_HDISTS_COUNT_BITS + \
|
||||
DFLTCC_HCLENS_COUNT_BITS + \
|
||||
DFLTCC_MAX_HCLENS * DFLTCC_HCLEN_BITS + \
|
||||
(DFLTCC_MAX_HLITS + DFLTCC_MAX_HDISTS) * DFLTCC_MAX_HLIT_HDIST_BITS + \
|
||||
(source_len) * DFLTCC_MAX_SYMBOL_BITS + \
|
||||
DFLTCC_MAX_EOBS_BITS + \
|
||||
DFLTCC_MAX_PADDING_BITS) >> 3)
|
||||
|
||||
#endif
|
||||
404
deps/zlib-ng/arch/s390/dfltcc_deflate.c
vendored
Normal file
404
deps/zlib-ng/arch/s390/dfltcc_deflate.c
vendored
Normal file
@@ -0,0 +1,404 @@
|
||||
/* dfltcc_deflate.c - IBM Z DEFLATE CONVERSION CALL compression support. */
|
||||
|
||||
/*
|
||||
Use the following commands to build zlib-ng with DFLTCC compression support:
|
||||
|
||||
$ ./configure --with-dfltcc-deflate
|
||||
or
|
||||
|
||||
$ cmake -DWITH_DFLTCC_DEFLATE=1 .
|
||||
|
||||
and then
|
||||
|
||||
$ make
|
||||
*/
|
||||
|
||||
#include "zbuild.h"
|
||||
#include "deflate.h"
|
||||
#include "trees_emit.h"
|
||||
#include "dfltcc_deflate.h"
|
||||
#include "dfltcc_detail.h"
|
||||
|
||||
/* DFLTCC state used for compression: the common DFLTCC state followed by the
 * tuning parameters that dfltcc_reset_deflate_state() initializes.
 */
struct dfltcc_deflate_state {
|
||||
struct dfltcc_state common;
|
||||
uint16_t level_mask; /* Levels on which to use DFLTCC */
|
||||
uint32_t block_size; /* New block each X bytes */
|
||||
size_t block_threshold; /* New block after total_in > X */
|
||||
uint32_t dht_threshold; /* New block only if avail_in >= X */
|
||||
};
|
||||
|
||||
/* Same location as GET_DFLTCC_STATE(state), typed as the compression state. */
#define GET_DFLTCC_DEFLATE_STATE(state) ((struct dfltcc_deflate_state *)GET_DFLTCC_STATE(state))
|
||||
|
||||
/*
 * Allocate the deflate state together with its DFLTCC compression state by
 * passing both sizes to dfltcc_alloc_state(); that call's result is returned
 * unchanged.
 */
void Z_INTERNAL *PREFIX(dfltcc_alloc_deflate_state)(PREFIX3(streamp) strm) {
    void *combined;

    combined = dfltcc_alloc_state(strm, sizeof(deflate_state), sizeof(struct dfltcc_deflate_state));
    return combined;
}
|
||||
|
||||
/*
 * Reset the DFLTCC compression state attached to strm: reset the common part
 * and restore every tuning parameter to its compile-time default.
 */
void Z_INTERNAL PREFIX(dfltcc_reset_deflate_state)(PREFIX3(streamp) strm) {
    deflate_state *ds = (deflate_state *)strm->state;
    struct dfltcc_deflate_state *tuning = GET_DFLTCC_DEFLATE_STATE(ds);

    dfltcc_reset_state(&tuning->common);

    /* Tuning parameters start from the DFLTCC_* defaults. */
    tuning->dht_threshold = DFLTCC_DHT_MIN_SAMPLE_SIZE;
    tuning->block_threshold = DFLTCC_FIRST_FHT_BLOCK_SIZE;
    tuning->block_size = DFLTCC_BLOCK_SIZE;
    tuning->level_mask = DFLTCC_LEVEL_MASK;
}
|
||||
|
||||
/*
 * Copy a deflate state and its DFLTCC compression state from src to dst by
 * delegating to dfltcc_copy_state() with the sizes of both parts.
 */
void Z_INTERNAL PREFIX(dfltcc_copy_deflate_state)(void *dst, const void *src) {
    dfltcc_copy_state(dst,
                      src,
                      sizeof(deflate_state),
                      sizeof(struct dfltcc_deflate_state));
}
|
||||
|
||||
/*
 * Decide whether DFLTCC compression can be used for the given settings.
 *
 * Returns 1 only when all of the following hold, and 0 otherwise:
 *   - the bit for `level` is set in the state's level mask,
 *   - window_bits equals HB_BITS,
 *   - strategy is Z_FIXED or Z_DEFAULT_STRATEGY,
 *   - reproducible is zero,
 *   - the queried facilities report GDHT, CMPR and format 0.
 */
static inline int dfltcc_can_deflate_with_params(PREFIX3(streamp) strm, int level, uInt window_bits, int strategy,
                                                 int reproducible) {
    deflate_state *ds = (deflate_state *)strm->state;
    struct dfltcc_deflate_state *tuning = GET_DFLTCC_DEFLATE_STATE(ds);
    int settings_ok;
    int hardware_ok;

    /* Compression settings DFLTCC supports. */
    settings_ok = (tuning->level_mask & (1 << level)) != 0 &&
                  window_bits == HB_BITS &&
                  (strategy == Z_FIXED || strategy == Z_DEFAULT_STRATEGY) &&
                  !reproducible;
    if (!settings_ok)
        return 0;

    /* Functions and formats the hardware must provide. */
    hardware_ok = is_bit_set(tuning->common.af.fns, DFLTCC_GDHT) &&
                  is_bit_set(tuning->common.af.fns, DFLTCC_CMPR) &&
                  is_bit_set(tuning->common.af.fmts, DFLTCC_FMT0);

    return hardware_ok ? 1 : 0;
}
|
||||
|
||||
/*
 * Report whether DFLTCC can be used with the settings currently stored in
 * the stream's deflate state (level, w_bits, strategy, reproducible).
 */
int Z_INTERNAL PREFIX(dfltcc_can_deflate)(PREFIX3(streamp) strm) {
    deflate_state *ds = (deflate_state *)strm->state;
    int usable;

    usable = dfltcc_can_deflate_with_params(strm, ds->level, ds->w_bits, ds->strategy, ds->reproducible);
    return usable;
}
|
||||
|
||||
/*
 * Invoke the DFLTCC_GDHT function on the stream's current input.
 *
 * The available-input count is passed through a local copy, so
 * strm->avail_in itself is not modified by this call; no output or history
 * buffers are supplied.
 */
static inline void dfltcc_gdht(PREFIX3(streamp) strm) {
    deflate_state *ds = (deflate_state *)strm->state;
    struct dfltcc_param_v0 *pb = &GET_DFLTCC_STATE(ds)->param;
    size_t in_len;

    in_len = strm->avail_in;
    dfltcc(DFLTCC_GDHT, pb, NULL, NULL, &strm->next_in, &in_len, NULL);
}
|
||||
|
||||
/*
 * Run one DFLTCC_CMPR call (with HBT_CIRCULAR) on the stream's buffers.
 *
 * The consumed input and produced output are added to total_in/total_out,
 * and avail_in/avail_out are updated to the remaining counts.
 *
 * Returns the condition code reported by dfltcc().
 */
static inline dfltcc_cc dfltcc_cmpr(PREFIX3(streamp) strm) {
    deflate_state *ds = (deflate_state *)strm->state;
    struct dfltcc_param_v0 *pb = &GET_DFLTCC_STATE(ds)->param;
    size_t in_left = strm->avail_in;
    size_t out_left = strm->avail_out;
    dfltcc_cc cc;

    cc = dfltcc(DFLTCC_CMPR | HBT_CIRCULAR,
                pb, &strm->next_out, &out_left,
                &strm->next_in, &in_left, ds->window);

    /* Account for what the call consumed and produced. */
    strm->total_in += (strm->avail_in - in_left);
    strm->total_out += (strm->avail_out - out_left);
    strm->avail_in = in_left;
    strm->avail_out = out_left;

    return cc;
}
|
||||
|
||||
/*
 * Emit the End-of-block Symbol described by param (eobs/eobl) into the bit
 * buffer and flush pending output to the stream.
 *
 * If bytes are still pending after the flush, they are moved to the start
 * of pending_buf and pending_out is reset accordingly.
 */
static inline void send_eobs(PREFIX3(streamp) strm, const struct dfltcc_param_v0 *param) {
    deflate_state *ds = (deflate_state *)strm->state;

    send_bits(ds, PREFIX(bi_reverse)(param->eobs >> (15 - param->eobl), param->eobl), param->eobl, ds->bi_buf, ds->bi_valid);
    PREFIX(flush_pending)(strm);

    /* The remaining data is located in pending_out[0:pending]. If someone
     * calls put_byte() - this might happen in deflate() - the byte will be
     * placed into pending_buf[pending], which is incorrect. Move the
     * remaining data to the beginning of pending_buf so that put_byte() is
     * usable again.
     */
    if (ds->pending != 0) {
        memmove(ds->pending_buf, ds->pending_out, ds->pending);
        ds->pending_out = ds->pending_buf;
    }

#ifdef ZLIB_DEBUG
    ds->compressed_len += param->eobl;
#endif
}
|
||||
|
||||
/* Compress the stream's input with DFLTCC.
 *
 * Returns 0 without setting *result when DFLTCC cannot be used with the
 * current settings, when Z_FINISH is requested with no input left and no
 * buffered DFLTCC output, or when a Z_FINISH call completes while a non-final
 * block was open. In every other case returns 1 with *result set to
 * need_more, block_done or finish_done.
 */
int Z_INTERNAL PREFIX(dfltcc_deflate)(PREFIX3(streamp) strm, int flush, block_state *result) {
|
||||
deflate_state *state = (deflate_state *)strm->state;
|
||||
struct dfltcc_deflate_state *dfltcc_state = GET_DFLTCC_DEFLATE_STATE(state);
|
||||
struct dfltcc_param_v0 *param = &dfltcc_state->common.param;
|
||||
uInt masked_avail_in;
|
||||
dfltcc_cc cc;
|
||||
int need_empty_block;
|
||||
int soft_bcc;
|
||||
int no_flush;
|
||||
|
||||
if (!PREFIX(dfltcc_can_deflate)(strm)) {
|
||||
/* Clear history. */
|
||||
if (flush == Z_FULL_FLUSH)
|
||||
param->hl = 0;
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Re-entered from the end of the function while both input and output
 * space remain; the per-pass flags below are recomputed each time.
 */
again:
|
||||
masked_avail_in = 0;
|
||||
soft_bcc = 0;
|
||||
no_flush = flush == Z_NO_FLUSH;
|
||||
|
||||
/* No input data. Return, except when Continuation Flag is set, which means
|
||||
* that DFLTCC has buffered some output in the parameter block and needs to
|
||||
* be called again in order to flush it.
|
||||
*/
|
||||
if (strm->avail_in == 0 && !param->cf) {
|
||||
/* A block is still open, and the hardware does not support closing
|
||||
* blocks without adding data. Thus, close it manually.
|
||||
*/
|
||||
if (!no_flush && param->bcf) {
|
||||
send_eobs(strm, param);
|
||||
param->bcf = 0;
|
||||
}
|
||||
/* Let one of deflate_* functions write a trailing empty block. */
|
||||
if (flush == Z_FINISH)
|
||||
return 0;
|
||||
/* Clear history. */
|
||||
if (flush == Z_FULL_FLUSH)
|
||||
param->hl = 0;
|
||||
/* Trigger block post-processing if necessary. */
|
||||
*result = no_flush ? need_more : block_done;
|
||||
return 1;
|
||||
}
|
||||
|
||||
/* There is an open non-BFINAL block, we are not going to close it just
|
||||
* yet, we have compressed more than DFLTCC_BLOCK_SIZE bytes and we see
|
||||
* more than DFLTCC_DHT_MIN_SAMPLE_SIZE bytes. Open a new block with a new
|
||||
* DHT in order to adapt to a possibly changed input data distribution.
|
||||
*/
|
||||
if (param->bcf && no_flush &&
|
||||
strm->total_in > dfltcc_state->block_threshold &&
|
||||
strm->avail_in >= dfltcc_state->dht_threshold) {
|
||||
if (param->cf) {
|
||||
/* We need to flush the DFLTCC buffer before writing the
|
||||
* End-of-block Symbol. Mask the input data and proceed as usual.
|
||||
*/
|
||||
masked_avail_in += strm->avail_in;
|
||||
strm->avail_in = 0;
|
||||
no_flush = 0;
|
||||
} else {
|
||||
/* DFLTCC buffer is empty, so we can manually write the
|
||||
* End-of-block Symbol right away.
|
||||
*/
|
||||
send_eobs(strm, param);
|
||||
param->bcf = 0;
|
||||
dfltcc_state->block_threshold = strm->total_in + dfltcc_state->block_size;
|
||||
}
|
||||
}
|
||||
|
||||
/* No space for compressed data. If we proceed, dfltcc_cmpr() will return
|
||||
* DFLTCC_CC_OP1_TOO_SHORT without buffering header bits, but we will still
|
||||
* set BCF=1, which is wrong. Avoid complications and return early.
|
||||
*/
|
||||
if (strm->avail_out == 0) {
|
||||
*result = need_more;
|
||||
return 1;
|
||||
}
|
||||
|
||||
/* The caller gave us too much data. Pass only one block worth of
|
||||
* uncompressed data to DFLTCC and mask the rest, so that on the next
|
||||
* iteration we start a new block.
|
||||
*/
|
||||
if (no_flush && strm->avail_in > dfltcc_state->block_size) {
|
||||
masked_avail_in += (strm->avail_in - dfltcc_state->block_size);
|
||||
strm->avail_in = dfltcc_state->block_size;
|
||||
}
|
||||
|
||||
/* When we have an open non-BFINAL deflate block and caller indicates that
|
||||
* the stream is ending, we need to close an open deflate block and open a
|
||||
* BFINAL one.
|
||||
*/
|
||||
need_empty_block = flush == Z_FINISH && param->bcf && !param->bhf;
|
||||
|
||||
/* Translate stream to parameter block */
|
||||
param->cvt = state->wrap == 2 ? CVT_CRC32 : CVT_ADLER32;
|
||||
if (!no_flush)
|
||||
/* We need to close a block. Always do this in software - when there is
|
||||
* no input data, the hardware will not honor BCC. */
|
||||
soft_bcc = 1;
|
||||
if (flush == Z_FINISH && !param->bcf)
|
||||
/* We are about to open a BFINAL block, set Block Header Final bit
|
||||
* until the stream ends.
|
||||
*/
|
||||
param->bhf = 1;
|
||||
/* DFLTCC-CMPR will write to next_out, so make sure that buffers with
|
||||
* higher precedence are empty.
|
||||
*/
|
||||
Assert(state->pending == 0, "There must be no pending bytes");
|
||||
Assert(state->bi_valid < 8, "There must be less than 8 pending bits");
|
||||
/* Pass the count of pending bits in sbb and, when there are any, store
 * the bit buffer in the first output byte.
 */
param->sbb = (unsigned int)state->bi_valid;
|
||||
if (param->sbb > 0)
|
||||
*strm->next_out = (unsigned char)state->bi_buf;
|
||||
/* Honor history and check value */
|
||||
param->nt = 0;
|
||||
if (state->wrap == 1)
|
||||
param->cv = strm->adler;
|
||||
else if (state->wrap == 2)
|
||||
param->cv = ZSWAP32(state->crc_fold.value);
|
||||
|
||||
/* When opening a block, choose a Huffman-Table Type */
|
||||
if (!param->bcf) {
|
||||
if (state->strategy == Z_FIXED || (strm->total_in == 0 && dfltcc_state->block_threshold > 0))
|
||||
param->htt = HTT_FIXED;
|
||||
else {
|
||||
param->htt = HTT_DYNAMIC;
|
||||
dfltcc_gdht(strm);
|
||||
}
|
||||
}
|
||||
|
||||
/* Deflate */
|
||||
do {
|
||||
cc = dfltcc_cmpr(strm);
|
||||
if (strm->avail_in < 4096 && masked_avail_in > 0)
|
||||
/* We are about to call DFLTCC with a small input buffer, which is
|
||||
* inefficient. Since there is masked data, there will be at least
|
||||
* one more DFLTCC call, so skip the current one and make the next
|
||||
* one handle more data.
|
||||
*/
|
||||
break;
|
||||
} while (cc == DFLTCC_CC_AGAIN);
|
||||
|
||||
/* Translate parameter block to stream */
|
||||
strm->msg = oesc_msg(dfltcc_state->common.msg, param->oesc);
|
||||
state->bi_valid = param->sbb;
|
||||
if (state->bi_valid == 0)
|
||||
state->bi_buf = 0; /* Avoid accessing next_out */
|
||||
else
|
||||
state->bi_buf = *strm->next_out & ((1 << state->bi_valid) - 1);
|
||||
if (state->wrap == 1)
|
||||
strm->adler = param->cv;
|
||||
else if (state->wrap == 2)
|
||||
state->crc_fold.value = ZSWAP32(param->cv);
|
||||
|
||||
/* Unmask the input data */
|
||||
strm->avail_in += masked_avail_in;
|
||||
masked_avail_in = 0;
|
||||
|
||||
/* If we encounter an error, it means there is a bug in DFLTCC call */
|
||||
Assert(cc != DFLTCC_CC_OP2_CORRUPT || param->oesc == 0, "BUG");
|
||||
|
||||
/* Update Block-Continuation Flag. It will be used to check whether to call
|
||||
* GDHT the next time.
|
||||
*/
|
||||
if (cc == DFLTCC_CC_OK) {
|
||||
if (soft_bcc) {
|
||||
send_eobs(strm, param);
|
||||
param->bcf = 0;
|
||||
dfltcc_state->block_threshold = strm->total_in + dfltcc_state->block_size;
|
||||
} else
|
||||
param->bcf = 1;
|
||||
if (flush == Z_FINISH) {
|
||||
if (need_empty_block)
|
||||
/* Make the current deflate() call also close the stream */
|
||||
return 0;
|
||||
else {
|
||||
bi_windup(state);
|
||||
*result = finish_done;
|
||||
}
|
||||
} else {
|
||||
if (flush == Z_FULL_FLUSH)
|
||||
param->hl = 0; /* Clear history */
|
||||
*result = flush == Z_NO_FLUSH ? need_more : block_done;
|
||||
}
|
||||
} else {
|
||||
param->bcf = 1;
|
||||
*result = need_more;
|
||||
}
|
||||
if (strm->avail_in != 0 && strm->avail_out != 0)
|
||||
goto again; /* deflate() must use all input or all output */
|
||||
return 1;
|
||||
}
|
||||
|
||||
/*
|
||||
Switching between hardware and software compression.
|
||||
|
||||
DFLTCC does not support all zlib settings, e.g. generation of non-compressed
|
||||
blocks or alternative window sizes. When such settings are applied on the
|
||||
fly with deflateParams, we need to convert between hardware and software
|
||||
window formats.
|
||||
*/
|
||||
/*
 * Report whether the stream shows signs of earlier compression activity:
 * input has been consumed (total_in > 0), the parameter block's nt field is
 * zero, or its history length hl is non-zero. Returns 1 if any holds, else 0.
 */
static int dfltcc_was_deflate_used(PREFIX3(streamp) strm) {
    deflate_state *ds = (deflate_state *)strm->state;
    struct dfltcc_param_v0 *pb = &GET_DFLTCC_STATE(ds)->param;

    if (strm->total_in > 0)
        return 1;
    if (pb->nt == 0)
        return 1;
    return pb->hl > 0;
}
|
||||
|
||||
/*
 * Hook for parameter changes (level/strategy) on a live stream.
 *
 * If the new parameters switch between "DFLTCC usable" and "DFLTCC not
 * usable" and compression has already been used on the stream, *flush is set
 * to Z_FULL_FLUSH: window formats are not converted, the old data is simply
 * dropped. In every other case *flush is left untouched.
 *
 * Always returns Z_OK.
 */
int Z_INTERNAL PREFIX(dfltcc_deflate_params)(PREFIX3(streamp) strm, int level, int strategy, int *flush) {
    deflate_state *ds = (deflate_state *)strm->state;
    int usable_before = PREFIX(dfltcc_can_deflate)(strm);
    int usable_after = dfltcc_can_deflate_with_params(strm, level, ds->w_bits, strategy, ds->reproducible);

    /* Only a mode switch on an already-used stream needs any action. */
    if (usable_after != usable_before && dfltcc_was_deflate_used(strm))
        *flush = Z_FULL_FLUSH;

    return Z_OK;
}
|
||||
|
||||
/*
 * Report whether deflation is complete from DFLTCC's point of view.
 *
 * Returns 0 when:
 *   - flush is Z_FULL_FLUSH and there is no output space left (the block may
 *     have been closed without resetting the compression state), or
 *   - DFLTCC is in use and either it has buffered data (Continuation Flag
 *     set) or it has not written EOBS yet (Block-Continuation Flag set).
 * Returns 1 otherwise.
 */
int Z_INTERNAL PREFIX(dfltcc_deflate_done)(PREFIX3(streamp) strm, int flush) {
    deflate_state *ds = (deflate_state *)strm->state;
    struct dfltcc_param_v0 *pb = &GET_DFLTCC_STATE(ds)->param;

    if (flush == Z_FULL_FLUSH && strm->avail_out == 0)
        return 0;

    /* Without DFLTCC there is nothing buffered on its side. */
    if (!PREFIX(dfltcc_can_deflate)(strm))
        return 1;

    return !pb->cf && !pb->bcf;
}
|
||||
|
||||
/*
 * Returns 1 when `reproducible` differs from the value stored in the deflate
 * state and compression has not been used on the stream yet; 0 otherwise.
 */
int Z_INTERNAL PREFIX(dfltcc_can_set_reproducible)(PREFIX3(streamp) strm, int reproducible) {
    deflate_state *ds = (deflate_state *)strm->state;

    if (reproducible == ds->reproducible)
        return 0;
    return !dfltcc_was_deflate_used(strm);
}
|
||||
|
||||
/*
|
||||
Preloading history.
|
||||
*/
|
||||
/*
 * Preload the compression history with a dictionary.
 *
 * The dictionary is appended to the history kept in the window via
 * append_history(); strstart and block_start are then both set to 1.
 *
 * Always returns Z_OK.
 */
int Z_INTERNAL PREFIX(dfltcc_deflate_set_dictionary)(PREFIX3(streamp) strm,
                                                     const unsigned char *dictionary, uInt dict_length) {
    deflate_state *ds = (deflate_state *)strm->state;
    struct dfltcc_param_v0 *pb = &GET_DFLTCC_STATE(ds)->param;

    append_history(pb, ds->window, dictionary, dict_length);

    ds->strstart = 1;                 /* Add FDICT to zlib header */
    ds->block_start = ds->strstart;   /* Make deflate_stored happy */

    return Z_OK;
}
|
||||
|
||||
/*
 * Retrieve the current compression history.
 *
 * When `dictionary` is non-NULL the history is copied into it with
 * get_history(); when `dict_length` is non-NULL it receives the parameter
 * block's history length (hl). Either pointer may be NULL.
 *
 * Always returns Z_OK.
 */
int Z_INTERNAL PREFIX(dfltcc_deflate_get_dictionary)(PREFIX3(streamp) strm, unsigned char *dictionary, uInt *dict_length) {
    deflate_state *ds = (deflate_state *)strm->state;
    struct dfltcc_param_v0 *pb = &GET_DFLTCC_STATE(ds)->param;

    if (dictionary != NULL)
        get_history(pb, ds->window, dictionary);

    if (dict_length != NULL)
        *dict_length = pb->hl;

    return Z_OK;
}
|
||||
60
deps/zlib-ng/arch/s390/dfltcc_deflate.h
vendored
Normal file
60
deps/zlib-ng/arch/s390/dfltcc_deflate.h
vendored
Normal file
@@ -0,0 +1,60 @@
|
||||
#ifndef DFLTCC_DEFLATE_H
|
||||
#define DFLTCC_DEFLATE_H
|
||||
|
||||
#include "dfltcc_common.h"
|
||||
|
||||
/* DFLTCC compression entry points; the macros below map hook names to them. */
void Z_INTERNAL *PREFIX(dfltcc_alloc_deflate_state)(PREFIX3(streamp));
|
||||
void Z_INTERNAL PREFIX(dfltcc_reset_deflate_state)(PREFIX3(streamp));
|
||||
void Z_INTERNAL PREFIX(dfltcc_copy_deflate_state)(void *dst, const void *src);
|
||||
int Z_INTERNAL PREFIX(dfltcc_can_deflate)(PREFIX3(streamp) strm);
|
||||
int Z_INTERNAL PREFIX(dfltcc_deflate)(PREFIX3(streamp) strm, int flush, block_state *result);
|
||||
int Z_INTERNAL PREFIX(dfltcc_deflate_params)(PREFIX3(streamp) strm, int level, int strategy, int *flush);
|
||||
int Z_INTERNAL PREFIX(dfltcc_deflate_done)(PREFIX3(streamp) strm, int flush);
|
||||
int Z_INTERNAL PREFIX(dfltcc_can_set_reproducible)(PREFIX3(streamp) strm, int reproducible);
|
||||
int Z_INTERNAL PREFIX(dfltcc_deflate_set_dictionary)(PREFIX3(streamp) strm,
|
||||
const unsigned char *dictionary, uInt dict_length);
|
||||
int Z_INTERNAL PREFIX(dfltcc_deflate_get_dictionary)(PREFIX3(streamp) strm, unsigned char *dictionary, uInt* dict_length);
|
||||
|
||||
#define ZALLOC_DEFLATE_STATE PREFIX(dfltcc_alloc_deflate_state)
|
||||
#define ZCOPY_DEFLATE_STATE PREFIX(dfltcc_copy_deflate_state)
|
||||
|
||||
/* When DFLTCC can be used, returns from the enclosing function with the
 * result of dfltcc_deflate_set_dictionary(); otherwise falls through.
 */
#define DEFLATE_SET_DICTIONARY_HOOK(strm, dict, dict_len) \
|
||||
do { \
|
||||
if (PREFIX(dfltcc_can_deflate)((strm))) \
|
||||
return PREFIX(dfltcc_deflate_set_dictionary)((strm), (dict), (dict_len)); \
|
||||
} while (0)
|
||||
|
||||
/* When DFLTCC can be used, returns from the enclosing function with the
 * result of dfltcc_deflate_get_dictionary(); otherwise falls through.
 */
#define DEFLATE_GET_DICTIONARY_HOOK(strm, dict, dict_len) \
|
||||
do { \
|
||||
if (PREFIX(dfltcc_can_deflate)((strm))) \
|
||||
return PREFIX(dfltcc_deflate_get_dictionary)((strm), (dict), (dict_len)); \
|
||||
} while (0)
|
||||
|
||||
#define DEFLATE_RESET_KEEP_HOOK PREFIX(dfltcc_reset_deflate_state)
|
||||
|
||||
/* Calls dfltcc_deflate_params() and returns from the enclosing function only
 * when it reports Z_STREAM_ERROR.
 */
#define DEFLATE_PARAMS_HOOK(strm, level, strategy, hook_flush) \
|
||||
do { \
|
||||
int err; \
|
||||
\
|
||||
err = PREFIX(dfltcc_deflate_params)((strm), (level), (strategy), (hook_flush)); \
|
||||
if (err == Z_STREAM_ERROR) \
|
||||
return err; \
|
||||
} while (0)
|
||||
|
||||
#define DEFLATE_DONE PREFIX(dfltcc_deflate_done)
|
||||
|
||||
/* Replaces complen with DEFLATE_BOUND_COMPLEN(source_len) when
 * deflateStateCheck(strm) is non-zero or DFLTCC can be used.
 */
#define DEFLATE_BOUND_ADJUST_COMPLEN(strm, complen, source_len) \
|
||||
do { \
|
||||
if (deflateStateCheck((strm)) || PREFIX(dfltcc_can_deflate)((strm))) \
|
||||
(complen) = DEFLATE_BOUND_COMPLEN(source_len); \
|
||||
} while (0)
|
||||
|
||||
#define DEFLATE_NEED_CONSERVATIVE_BOUND(strm) (PREFIX(dfltcc_can_deflate)((strm)))
|
||||
|
||||
#define DEFLATE_HOOK PREFIX(dfltcc_deflate)
|
||||
|
||||
/* Non-zero only when DFLTCC cannot be used. */
#define DEFLATE_NEED_CHECKSUM(strm) (!PREFIX(dfltcc_can_deflate)((strm)))
|
||||
|
||||
#define DEFLATE_CAN_SET_REPRODUCIBLE PREFIX(dfltcc_can_set_reproducible)
|
||||
|
||||
#endif
|
||||
312
deps/zlib-ng/arch/s390/dfltcc_detail.h
vendored
Normal file
312
deps/zlib-ng/arch/s390/dfltcc_detail.h
vendored
Normal file
@@ -0,0 +1,312 @@
|
||||
#include "../../zbuild.h"
|
||||
#include <stdio.h>
|
||||
|
||||
#ifdef HAVE_SYS_SDT_H
|
||||
#include <sys/sdt.h>
|
||||
#endif
|
||||
|
||||
/*
|
||||
Tuning parameters.
|
||||
*/
|
||||
/* Each default below applies only when the macro is not already defined. */
/* Bit mask of compression levels; bit N corresponds to level N. */
#ifndef DFLTCC_LEVEL_MASK
|
||||
#define DFLTCC_LEVEL_MASK 0x2
|
||||
#endif
|
||||
/* Block size, in bytes. */
#ifndef DFLTCC_BLOCK_SIZE
|
||||
#define DFLTCC_BLOCK_SIZE 1048576
|
||||
#endif
|
||||
/* Size, in bytes, of the first block, which uses a fixed Huffman table. */
#ifndef DFLTCC_FIRST_FHT_BLOCK_SIZE
|
||||
#define DFLTCC_FIRST_FHT_BLOCK_SIZE 4096
|
||||
#endif
|
||||
/* Minimum number of input bytes needed to sample a new dynamic Huffman table. */
#ifndef DFLTCC_DHT_MIN_SAMPLE_SIZE
|
||||
#define DFLTCC_DHT_MIN_SAMPLE_SIZE 4096
|
||||
#endif
|
||||
/* NOTE(review): the meaning of RIBM is not visible in this part of the file. */
#ifndef DFLTCC_RIBM
|
||||
#define DFLTCC_RIBM 0
|
||||
#endif
|
||||
|
||||
/*
|
||||
Parameter Block for Query Available Functions.
|
||||
*/
|
||||
#define static_assert(c, msg) __attribute__((unused)) static char static_assert_failed_ ## msg[c ? 1 : -1]
|
||||
|
||||
/* Query Available Functions parameter block: two bit vectors separated by
 * reserved padding, 32 bytes in total.
 */
struct dfltcc_qaf_param {
    char fns[16];       /* bit vector tested with function codes such as DFLTCC_XPND */
    char reserved1[8];
    char fmts[2];       /* bit vector tested with format codes such as DFLTCC_FMT0 */
    char reserved2[6];
};
|
||||
|
||||
#define DFLTCC_SIZEOF_QAF 32
|
||||
static_assert(sizeof(struct dfltcc_qaf_param) == DFLTCC_SIZEOF_QAF, qaf);
|
||||
|
||||
/* Test bit `n` of the bit vector `bits`, where bit 0 is the most significant
 * bit of bits[0]. Returns non-zero (the masked bit, not necessarily 1) when
 * the bit is set and 0 otherwise.
 */
static inline int is_bit_set(const char *bits, int n) {
    const int byte_index = n / 8;
    const int mask = 1 << (7 - (n % 8));

    return bits[byte_index] & mask;
}
|
||||
|
||||
/* Clear bit `n` of the bit vector `bits`, using the same numbering as
 * is_bit_set(): bit 0 is the most significant bit of bits[0].
 */
static inline void clear_bit(char *bits, int n) {
    const int byte_index = n / 8;
    const int mask = 1 << (7 - (n % 8));

    bits[byte_index] &= ~mask;
}
|
||||
|
||||
#define DFLTCC_FACILITY 151
|
||||
|
||||
static inline int is_dfltcc_enabled(void) {
|
||||
uint64_t facilities[(DFLTCC_FACILITY / 64) + 1];
|
||||
Z_REGISTER uint8_t r0 __asm__("r0");
|
||||
|
||||
memset(facilities, 0, sizeof(facilities));
|
||||
r0 = sizeof(facilities) / sizeof(facilities[0]) - 1;
|
||||
/* STFLE is supported since z9-109 and only in z/Architecture mode. When
|
||||
* compiling with -m31, gcc defaults to ESA mode, however, since the kernel
|
||||
* is 64-bit, it's always z/Architecture mode at runtime.
|
||||
*/
|
||||
__asm__ volatile(
|
||||
#ifndef __clang__
|
||||
".machinemode push\n"
|
||||
".machinemode zarch\n"
|
||||
#endif
|
||||
"stfle %[facilities]\n"
|
||||
#ifndef __clang__
|
||||
".machinemode pop\n"
|
||||
#endif
|
||||
: [facilities] "=Q" (facilities), [r0] "+r" (r0) :: "cc");
|
||||
return is_bit_set((const char *)facilities, DFLTCC_FACILITY);
|
||||
}
|
||||
|
||||
#define DFLTCC_FMT0 0
|
||||
|
||||
/*
|
||||
Parameter Block for Generate Dynamic-Huffman Table, Compress and Expand.
|
||||
*/
|
||||
#define CVT_CRC32 0
|
||||
#define CVT_ADLER32 1
|
||||
#define HTT_FIXED 0
|
||||
#define HTT_DYNAMIC 1
|
||||
|
||||
/* Version-0 parameter block shared by the GDHT, CMPR and XPND functions.
 * The layout is fixed: the static_asserts that follow this definition pin
 * offsetof(csb) and sizeof() to DFLTCC_SIZEOF_GDHT_V0 and
 * DFLTCC_SIZEOF_CMPR_XPND_V0, so do not reorder or resize any member.
 */
struct dfltcc_param_v0 {
    uint16_t pbvn;                  /* Parameter-Block-Version Number */
    uint8_t mvn;                    /* Model-Version Number */
    uint8_t ribm;                   /* Reserved for IBM use */
    uint32_t reserved32 : 31;
    uint32_t cf : 1;                /* Continuation Flag */
    uint8_t reserved64[8];
    uint32_t nt : 1;                /* New Task */
    uint32_t reserved129 : 1;
    uint32_t cvt : 1;               /* Check Value Type */
    uint32_t reserved131 : 1;
    uint32_t htt : 1;               /* Huffman-Table Type */
    uint32_t bcf : 1;               /* Block-Continuation Flag */
    uint32_t bcc : 1;               /* Block Closing Control */
    uint32_t bhf : 1;               /* Block Header Final */
    uint32_t reserved136 : 1;
    uint32_t reserved137 : 1;
    uint32_t dhtgc : 1;             /* DHT Generation Control */
    uint32_t reserved139 : 5;
    uint32_t reserved144 : 5;
    uint32_t sbb : 3;               /* Sub-Byte Boundary */
    uint8_t oesc;                   /* Operation-Ending-Supplemental Code */
    uint32_t reserved160 : 12;
    uint32_t ifs : 4;               /* Incomplete-Function Status */
    uint16_t ifl;                   /* Incomplete-Function Length */
    uint8_t reserved192[8];
    uint8_t reserved256[8];
    uint8_t reserved320[4];
    uint16_t hl;                    /* History Length */
    uint32_t reserved368 : 1;
    uint16_t ho : 15;               /* History Offset */
    uint32_t cv;                    /* Check Value */
    uint32_t eobs : 15;             /* End-of-block Symbol */
    uint32_t reserved431: 1;
    uint8_t eobl : 4;               /* End-of-block Length */
    uint32_t reserved436 : 12;
    uint32_t reserved448 : 4;
    uint16_t cdhtl : 12;            /* Compressed-Dynamic-Huffman Table
                                       Length */
    uint8_t reserved464[6];
    uint8_t cdht[288];              /* Compressed-Dynamic-Huffman Table */
    uint8_t reserved[24];
    uint8_t ribm2[8];               /* Reserved for IBM use */
    uint8_t csb[1152];              /* Continuation-State Buffer */
};
|
||||
|
||||
#define DFLTCC_SIZEOF_GDHT_V0 384
|
||||
#define DFLTCC_SIZEOF_CMPR_XPND_V0 1536
|
||||
static_assert(offsetof(struct dfltcc_param_v0, csb) == DFLTCC_SIZEOF_GDHT_V0, gdht_v0);
|
||||
static_assert(sizeof(struct dfltcc_param_v0) == DFLTCC_SIZEOF_CMPR_XPND_V0, cmpr_xpnd_v0);
|
||||
|
||||
/* Turn an Operation-Ending-Supplemental Code into a message.
 *
 * Returns NULL when `oesc` is 0x00 (successful completion). For any other
 * value the message is formatted into `buf`, which the caller must provide
 * large enough to hold it, and `buf` is returned.
 */
static inline z_const char *oesc_msg(char *buf, int oesc) {
    if (oesc != 0x00) {
        sprintf(buf, "Operation-Ending-Supplemental Code is 0x%.2X", oesc);
        return buf;
    }
    return NULL; /* Successful completion */
}
|
||||
|
||||
/*
|
||||
C wrapper for the DEFLATE CONVERSION CALL instruction.
|
||||
*/
|
||||
/* Result codes returned by dfltcc(). OP2_TOO_SHORT and OP2_CORRUPT
 * deliberately share the value 2: the same code is given one name or the
 * other depending on which function the caller requested.
 */
typedef enum {
    DFLTCC_CC_OK            = 0,
    DFLTCC_CC_OP1_TOO_SHORT = 1,
    DFLTCC_CC_OP2_TOO_SHORT = 2,
    DFLTCC_CC_OP2_CORRUPT   = 2,
    DFLTCC_CC_AGAIN         = 3,
} dfltcc_cc;
|
||||
|
||||
#define DFLTCC_QAF 0
|
||||
#define DFLTCC_GDHT 1
|
||||
#define DFLTCC_CMPR 2
|
||||
#define DFLTCC_XPND 4
|
||||
#define HBT_CIRCULAR (1 << 7)
|
||||
#define DFLTCC_FN_MASK ((1 << 7) - 1)
|
||||
#define HB_BITS 15
|
||||
#define HB_SIZE (1 << HB_BITS)
|
||||
|
||||
static inline dfltcc_cc dfltcc(int fn, void *param,
|
||||
unsigned char **op1, size_t *len1,
|
||||
z_const unsigned char **op2, size_t *len2, void *hist) {
|
||||
unsigned char *t2 = op1 ? *op1 : NULL;
|
||||
#ifdef Z_MEMORY_SANITIZER
|
||||
unsigned char *orig_t2 = t2;
|
||||
#endif
|
||||
size_t t3 = len1 ? *len1 : 0;
|
||||
z_const unsigned char *t4 = op2 ? *op2 : NULL;
|
||||
size_t t5 = len2 ? *len2 : 0;
|
||||
Z_REGISTER int r0 __asm__("r0") = fn;
|
||||
Z_REGISTER void *r1 __asm__("r1") = param;
|
||||
Z_REGISTER unsigned char *r2 __asm__("r2") = t2;
|
||||
Z_REGISTER size_t r3 __asm__("r3") = t3;
|
||||
Z_REGISTER z_const unsigned char *r4 __asm__("r4") = t4;
|
||||
Z_REGISTER size_t r5 __asm__("r5") = t5;
|
||||
int cc;
|
||||
|
||||
__asm__ volatile(
|
||||
#ifdef HAVE_SYS_SDT_H
|
||||
STAP_PROBE_ASM(zlib, dfltcc_entry, STAP_PROBE_ASM_TEMPLATE(5))
|
||||
#endif
|
||||
".insn rrf,0xb9390000,%[r2],%[r4],%[hist],0\n"
|
||||
#ifdef HAVE_SYS_SDT_H
|
||||
STAP_PROBE_ASM(zlib, dfltcc_exit, STAP_PROBE_ASM_TEMPLATE(5))
|
||||
#endif
|
||||
"ipm %[cc]\n"
|
||||
: [r2] "+r" (r2)
|
||||
, [r3] "+r" (r3)
|
||||
, [r4] "+r" (r4)
|
||||
, [r5] "+r" (r5)
|
||||
, [cc] "=r" (cc)
|
||||
: [r0] "r" (r0)
|
||||
, [r1] "r" (r1)
|
||||
, [hist] "r" (hist)
|
||||
#ifdef HAVE_SYS_SDT_H
|
||||
, STAP_PROBE_ASM_OPERANDS(5, r2, r3, r4, r5, hist)
|
||||
#endif
|
||||
: "cc", "memory");
|
||||
t2 = r2; t3 = r3; t4 = r4; t5 = r5;
|
||||
|
||||
#ifdef Z_MEMORY_SANITIZER
|
||||
switch (fn & DFLTCC_FN_MASK) {
|
||||
case DFLTCC_QAF:
|
||||
__msan_unpoison(param, DFLTCC_SIZEOF_QAF);
|
||||
break;
|
||||
case DFLTCC_GDHT:
|
||||
__msan_unpoison(param, DFLTCC_SIZEOF_GDHT_V0);
|
||||
break;
|
||||
case DFLTCC_CMPR:
|
||||
__msan_unpoison(param, DFLTCC_SIZEOF_CMPR_XPND_V0);
|
||||
__msan_unpoison(orig_t2, t2 - orig_t2 + (((struct dfltcc_param_v0 *)param)->sbb == 0 ? 0 : 1));
|
||||
break;
|
||||
case DFLTCC_XPND:
|
||||
__msan_unpoison(param, DFLTCC_SIZEOF_CMPR_XPND_V0);
|
||||
__msan_unpoison(orig_t2, t2 - orig_t2);
|
||||
break;
|
||||
}
|
||||
#endif
|
||||
|
||||
if (op1)
|
||||
*op1 = t2;
|
||||
if (len1)
|
||||
*len1 = t3;
|
||||
if (op2)
|
||||
*op2 = t4;
|
||||
if (len2)
|
||||
*len2 = t5;
|
||||
return (cc >> 28) & 3;
|
||||
}
|
||||
|
||||
/*
|
||||
Extension of inflate_state and deflate_state. Must be doubleword-aligned.
|
||||
*/
|
||||
struct dfltcc_state {
|
||||
struct dfltcc_param_v0 param; /* Parameter block. */
|
||||
struct dfltcc_qaf_param af; /* Available functions. */
|
||||
char msg[64]; /* Buffer for strm->msg */
|
||||
};
|
||||
|
||||
#define ALIGN_UP(p, size) (__typeof__(p))(((uintptr_t)(p) + ((size) - 1)) & ~((size) - 1))
|
||||
|
||||
/* Locate the struct dfltcc_state stored behind `*state`: it starts at the
 * size of `*state` rounded up to a multiple of 8 bytes. The argument is
 * fully parenthesized so that any pointer expression is a valid argument.
 */
#define GET_DFLTCC_STATE(state) ((struct dfltcc_state *)((char *)(state) + ALIGN_UP(sizeof(*(state)), 8)))
|
||||
|
||||
/* Allocate, via ZALLOC, one object of `size` bytes rounded up to a multiple
 * of 8 plus `extension_size` bytes. Returns whatever ZALLOC returns.
 */
static inline void *dfltcc_alloc_state(PREFIX3(streamp) strm, uInt size, uInt extension_size) {
    uInt total = ALIGN_UP(size, 8) + extension_size;

    return ZALLOC(strm, 1, total);
}
|
||||
|
||||
static inline void dfltcc_reset_state(struct dfltcc_state *dfltcc_state) {
|
||||
/* Initialize available functions */
|
||||
if (is_dfltcc_enabled()) {
|
||||
dfltcc(DFLTCC_QAF, &dfltcc_state->param, NULL, NULL, NULL, NULL, NULL);
|
||||
memmove(&dfltcc_state->af, &dfltcc_state->param, sizeof(dfltcc_state->af));
|
||||
} else
|
||||
memset(&dfltcc_state->af, 0, sizeof(dfltcc_state->af));
|
||||
|
||||
/* Initialize parameter block */
|
||||
memset(&dfltcc_state->param, 0, sizeof(dfltcc_state->param));
|
||||
dfltcc_state->param.nt = 1;
|
||||
dfltcc_state->param.ribm = DFLTCC_RIBM;
|
||||
}
|
||||
|
||||
/* Copy a state object together with its extension: `size` rounded up to a
 * multiple of 8, plus `extension_size` bytes, from `src` to `dst`.
 */
static inline void dfltcc_copy_state(void *dst, const void *src, uInt size, uInt extension_size) {
    uInt total = ALIGN_UP(size, 8) + extension_size;

    memcpy(dst, src, total);
}
|
||||
|
||||
/* Append `count` bytes from `buf` to the circular history buffer `history`
 * (HB_SIZE bytes) described by param->ho (offset) and param->hl (length).
 * Only the last HB_SIZE bytes of `buf` are kept; when the buffer overflows,
 * the oldest bytes are dropped by advancing param->ho.
 */
static inline void append_history(struct dfltcc_param_v0 *param, unsigned char *history,
                                  const unsigned char *buf, uInt count) {
    size_t write_pos;
    size_t tail_room;
    size_t total;

    /* Do not use more than 32K */
    if (count > HB_SIZE) {
        buf += count - HB_SIZE;
        count = HB_SIZE;
    }

    /* New data goes right after the existing history, modulo buffer size. */
    write_pos = (param->ho + param->hl) % HB_SIZE;
    tail_room = HB_SIZE - write_pos;
    if (count > tail_room) {
        /* Circular history buffer wraps - copy two chunks */
        memcpy(history + write_pos, buf, tail_room);
        memcpy(history, buf + tail_room, count - tail_room);
    } else {
        /* Circular history buffer does not wrap - copy one chunk */
        memcpy(history + write_pos, buf, count);
    }

    total = param->hl + count;
    if (total > HB_SIZE) {
        /* History does not fit into buffer - discard extra bytes */
        param->ho = (param->ho + (total - HB_SIZE)) % HB_SIZE;
        param->hl = HB_SIZE;
    } else {
        /* All history fits into buffer - no need to discard anything */
        param->hl = total;
    }
}
|
||||
|
||||
static inline void get_history(struct dfltcc_param_v0 *param, const unsigned char *history,
|
||||
unsigned char *buf) {
|
||||
if (param->ho + param->hl <= HB_SIZE)
|
||||
/* Circular history buffer does not wrap - copy one chunk */
|
||||
memcpy(buf, history + param->ho, param->hl);
|
||||
else {
|
||||
/* Circular history buffer wraps - copy two chunks */
|
||||
memcpy(buf, history + param->ho, HB_SIZE - param->ho);
|
||||
memcpy(buf + HB_SIZE - param->ho, history, param->ho + param->hl - HB_SIZE);
|
||||
}
|
||||
}
|
||||
205
deps/zlib-ng/arch/s390/dfltcc_inflate.c
vendored
Normal file
205
deps/zlib-ng/arch/s390/dfltcc_inflate.c
vendored
Normal file
@@ -0,0 +1,205 @@
|
||||
/* dfltcc_inflate.c - IBM Z DEFLATE CONVERSION CALL decompression support. */
|
||||
|
||||
/*
|
||||
Use the following commands to build zlib-ng with DFLTCC decompression support:
|
||||
|
||||
$ ./configure --with-dfltcc-inflate
|
||||
or
|
||||
|
||||
$ cmake -DWITH_DFLTCC_INFLATE=1 .
|
||||
|
||||
and then
|
||||
|
||||
$ make
|
||||
*/
|
||||
|
||||
#include "zbuild.h"
|
||||
#include "zutil.h"
|
||||
#include "inftrees.h"
|
||||
#include "inflate.h"
|
||||
#include "dfltcc_inflate.h"
|
||||
#include "dfltcc_detail.h"
|
||||
|
||||
/* Allocate an inflate_state with room for a trailing dfltcc_state. */
struct inflate_state Z_INTERNAL *PREFIX(dfltcc_alloc_inflate_state)(PREFIX3(streamp) strm) {
    void *mem = dfltcc_alloc_state(strm, sizeof(struct inflate_state), sizeof(struct dfltcc_state));

    return (struct inflate_state *)mem;
}
|
||||
|
||||
/* Reset the dfltcc_state stored behind the stream's inflate_state. */
void Z_INTERNAL PREFIX(dfltcc_reset_inflate_state)(PREFIX3(streamp) strm) {
    struct inflate_state *state = (struct inflate_state *)strm->state;

    dfltcc_reset_state(GET_DFLTCC_STATE(state));
}
|
||||
|
||||
/* Copy an inflate_state together with its trailing dfltcc_state. */
void Z_INTERNAL PREFIX(dfltcc_copy_inflate_state)(struct inflate_state *dst, const struct inflate_state *src) {
    dfltcc_copy_state(dst, src,
                      sizeof(struct inflate_state),
                      sizeof(struct dfltcc_state));
}
|
||||
|
||||
/* Returns 1 when the recorded available-functions block advertises both the
 * DFLTCC_XPND function and the DFLTCC_FMT0 format, 0 otherwise.
 */
int Z_INTERNAL PREFIX(dfltcc_can_inflate)(PREFIX3(streamp) strm) {
    struct inflate_state *state = (struct inflate_state *)strm->state;
    struct dfltcc_state *dfltcc_state = GET_DFLTCC_STATE(state);

    /* Unsupported hardware leaves these bits clear. */
    if (!is_bit_set(dfltcc_state->af.fns, DFLTCC_XPND))
        return 0;
    return is_bit_set(dfltcc_state->af.fmts, DFLTCC_FMT0) != 0;
}
|
||||
|
||||
/* Run one DFLTCC_XPND call (circular history in state->window) on the
 * stream's input and output buffers, then store the remaining lengths back
 * into strm->avail_in / strm->avail_out. Returns the call's condition code.
 */
static inline dfltcc_cc dfltcc_xpnd(PREFIX3(streamp) strm) {
    struct inflate_state *state = (struct inflate_state *)strm->state;
    struct dfltcc_param_v0 *param = &GET_DFLTCC_STATE(state)->param;
    /* dfltcc() works on size_t lengths, so use local copies. */
    size_t in_left = strm->avail_in;
    size_t out_left = strm->avail_out;
    dfltcc_cc result;

    result = dfltcc(DFLTCC_XPND | HBT_CIRCULAR,
                    param, &strm->next_out, &out_left,
                    &strm->next_in, &in_left, state->window);
    strm->avail_in = in_left;
    strm->avail_out = out_left;
    return result;
}
|
||||
|
||||
/* Inflate through DFLTCC.
 *
 * Returns DFLTCC_INFLATE_SOFTWARE when the request (Z_BLOCK / Z_TREES) must
 * be handled by the software path, DFLTCC_INFLATE_BREAK when the caller
 * should stop, and DFLTCC_INFLATE_CONTINUE when it should keep going with
 * the mode stored in state->mode. `*ret` is written only when switching to
 * software fails, in which case it is set to Z_STREAM_ERROR.
 */
dfltcc_inflate_action Z_INTERNAL PREFIX(dfltcc_inflate)(PREFIX3(streamp) strm, int flush, int *ret) {
    struct inflate_state *state = (struct inflate_state *)strm->state;
    struct dfltcc_state *dfltcc_state = GET_DFLTCC_STATE(state);
    struct dfltcc_param_v0 *param = &dfltcc_state->param;
    dfltcc_cc cc;

    if (flush == Z_BLOCK || flush == Z_TREES) {
        /* DFLTCC does not support stopping on block boundaries */
        if (!PREFIX(dfltcc_inflate_disable)(strm))
            return DFLTCC_INFLATE_SOFTWARE;
        *ret = Z_STREAM_ERROR;
        return DFLTCC_INFLATE_BREAK;
    }

    if (state->last) {
        /* Last block already seen: drop a partially used byte, if any, and
         * move on to the check stage.
         */
        if (state->bits != 0) {
            strm->next_in++;
            strm->avail_in--;
            state->bits = 0;
        }
        state->mode = CHECK;
        return DFLTCC_INFLATE_CONTINUE;
    }

    /* Nothing to feed and no continuation pending. */
    if (strm->avail_in == 0 && !param->cf)
        return DFLTCC_INFLATE_BREAK;

    if (PREFIX(inflate_ensure_window)(state)) {
        state->mode = MEM;
        return DFLTCC_INFLATE_CONTINUE;
    }

    /* Translate stream to parameter block */
    param->cvt = ((state->wrap & 4) && state->flags) ? CVT_CRC32 : CVT_ADLER32;
    param->sbb = state->bits;
    if (param->hl)
        param->nt = 0; /* Honor history for the first block */
    if (state->wrap & 4)
        param->cv = state->flags ? ZSWAP32(state->check) : state->check;

    /* Inflate */
    while ((cc = dfltcc_xpnd(strm)) == DFLTCC_CC_AGAIN) {
        /* retry until the call stops asking to be repeated */
    }

    /* Translate parameter block to stream */
    strm->msg = oesc_msg(dfltcc_state->msg, param->oesc);
    state->last = cc == DFLTCC_CC_OK;
    state->bits = param->sbb;
    if (state->wrap & 4)
        strm->adler = state->check = state->flags ? ZSWAP32(param->cv) : param->cv;
    if (cc == DFLTCC_CC_OP2_CORRUPT && param->oesc != 0) {
        /* Report an error if stream is corrupted */
        state->mode = BAD;
        return DFLTCC_INFLATE_CONTINUE;
    }
    state->mode = TYPEDO;

    /* Break if operands are exhausted, otherwise continue looping */
    if (cc == DFLTCC_CC_OP1_TOO_SHORT || cc == DFLTCC_CC_OP2_TOO_SHORT)
        return DFLTCC_INFLATE_BREAK;
    return DFLTCC_INFLATE_CONTINUE;
}
|
||||
|
||||
/* Returns 1 once the parameter block's New Task flag has been cleared,
 * 0 while it is still set.
 */
int Z_INTERNAL PREFIX(dfltcc_was_inflate_used)(PREFIX3(streamp) strm) {
    struct inflate_state *state = (struct inflate_state *)strm->state;

    return GET_DFLTCC_STATE(state)->param.nt == 0;
}
|
||||
|
||||
/*
|
||||
Rotates a circular buffer.
|
||||
The implementation is based on https://cplusplus.com/reference/algorithm/rotate/
|
||||
*/
|
||||
/* Rotate [start, end) in place so that the byte at `pivot` becomes the
 * first byte and the bytes of [start, pivot) follow the rest. Does nothing
 * when pivot == start.
 */
static void rotate(unsigned char *start, unsigned char *pivot, unsigned char *end) {
    unsigned char *next = pivot;

    while (next != start) {
        /* Swap *start with *next and advance both cursors. */
        unsigned char held = *start;

        *start++ = *next;
        *next++ = held;

        if (next == end)
            next = pivot;
        else if (start == pivot)
            pivot = next;
    }
}
|
||||
|
||||
/* Switch the stream from DFLTCC to software inflate.
 *
 * Returns 0 on success (including when DFLTCC was not usable in the first
 * place) and 1 when DFLTCC has already been used on this stream, in which
 * case switching is not possible.
 */
int Z_INTERNAL PREFIX(dfltcc_inflate_disable)(PREFIX3(streamp) strm) {
    struct inflate_state *state = (struct inflate_state *)strm->state;
    struct dfltcc_state *dfltcc_state = GET_DFLTCC_STATE(state);
    struct dfltcc_param_v0 *param = &dfltcc_state->param;
    unsigned char *window_end;

    if (!PREFIX(dfltcc_can_inflate)(strm))
        return 0;
    if (PREFIX(dfltcc_was_inflate_used)(strm)) {
        /* DFLTCC has already decompressed some data. Since there is not
         * enough information to resume decompression in software, the call
         * must fail.
         */
        return 1;
    }

    /* DFLTCC was not used yet - decompress in software */
    memset(&dfltcc_state->af, 0, sizeof(dfltcc_state->af));

    /* Convert the window from the hardware to the software format */
    window_end = state->window + HB_SIZE;
    rotate(state->window, state->window + param->ho, window_end);
    state->whave = state->wnext = MIN(param->hl, state->wsize);
    return 0;
}
|
||||
|
||||
/*
|
||||
Preloading history.
|
||||
*/
|
||||
/* Preload the history with `dict_length` bytes of `dictionary`.
 *
 * Returns Z_MEM_ERROR (and sets state->mode to MEM) when
 * inflate_ensure_window() reports failure; otherwise appends the dictionary
 * to the history, sets state->havedict and returns Z_OK.
 */
int Z_INTERNAL PREFIX(dfltcc_inflate_set_dictionary)(PREFIX3(streamp) strm,
                                                     const unsigned char *dictionary, uInt dict_length) {
    struct inflate_state *state = (struct inflate_state *)strm->state;
    struct dfltcc_param_v0 *param = &GET_DFLTCC_STATE(state)->param;

    if (PREFIX(inflate_ensure_window)(state)) {
        state->mode = MEM;
        return Z_MEM_ERROR;
    }

    append_history(param, state->window, dictionary, dict_length);
    state->havedict = 1;
    return Z_OK;
}
|
||||
|
||||
/* Report the current history.
 *
 * When both `dictionary` and the window exist, the history is copied into
 * `dictionary`. When `dict_length` is non-NULL it receives the history
 * length (param->hl). Always returns Z_OK.
 */
int Z_INTERNAL PREFIX(dfltcc_inflate_get_dictionary)(PREFIX3(streamp) strm,
                                                     unsigned char *dictionary, uInt *dict_length) {
    struct inflate_state *state = (struct inflate_state *)strm->state;
    struct dfltcc_param_v0 *param = &GET_DFLTCC_STATE(state)->param;

    if (dictionary != NULL && state->window != NULL)
        get_history(param, state->window, dictionary);
    if (dict_length != NULL)
        *dict_length = param->hl;
    return Z_OK;
}
|
||||
70
deps/zlib-ng/arch/s390/dfltcc_inflate.h
vendored
Normal file
70
deps/zlib-ng/arch/s390/dfltcc_inflate.h
vendored
Normal file
@@ -0,0 +1,70 @@
|
||||
#ifndef DFLTCC_INFLATE_H
|
||||
#define DFLTCC_INFLATE_H
|
||||
|
||||
#include "dfltcc_common.h"
|
||||
|
||||
struct inflate_state Z_INTERNAL *PREFIX(dfltcc_alloc_inflate_state)(PREFIX3(streamp) strm);
|
||||
void Z_INTERNAL PREFIX(dfltcc_reset_inflate_state)(PREFIX3(streamp) strm);
|
||||
void Z_INTERNAL PREFIX(dfltcc_copy_inflate_state)(struct inflate_state *dst, const struct inflate_state *src);
|
||||
int Z_INTERNAL PREFIX(dfltcc_can_inflate)(PREFIX3(streamp) strm);
|
||||
/* What the caller of dfltcc_inflate() should do next. */
typedef enum {
    DFLTCC_INFLATE_CONTINUE = 0,    /* keep looping with the updated state */
    DFLTCC_INFLATE_BREAK    = 1,    /* leave the inflate loop */
    DFLTCC_INFLATE_SOFTWARE = 2,    /* fall through to the software path */
} dfltcc_inflate_action;
|
||||
dfltcc_inflate_action Z_INTERNAL PREFIX(dfltcc_inflate)(PREFIX3(streamp) strm, int flush, int *ret);
|
||||
int Z_INTERNAL PREFIX(dfltcc_was_inflate_used)(PREFIX3(streamp) strm);
|
||||
int Z_INTERNAL PREFIX(dfltcc_inflate_disable)(PREFIX3(streamp) strm);
|
||||
int Z_INTERNAL PREFIX(dfltcc_inflate_set_dictionary)(PREFIX3(streamp) strm,
|
||||
const unsigned char *dictionary, uInt dict_length);
|
||||
int Z_INTERNAL PREFIX(dfltcc_inflate_get_dictionary)(PREFIX3(streamp) strm,
|
||||
unsigned char *dictionary, uInt* dict_length);
|
||||
|
||||
#define ZALLOC_INFLATE_STATE PREFIX(dfltcc_alloc_inflate_state)
|
||||
#define ZCOPY_INFLATE_STATE PREFIX(dfltcc_copy_inflate_state)
|
||||
|
||||
#define INFLATE_RESET_KEEP_HOOK PREFIX(dfltcc_reset_inflate_state)
|
||||
|
||||
/* Try to switch the stream to software inflate; if dfltcc_inflate_disable()
 * reports failure, return Z_STREAM_ERROR from the enclosing function.
 * `bits` and `value` are accepted but not used.
 */
#define INFLATE_PRIME_HOOK(strm, bits, value) \
    do { \
        if (PREFIX(dfltcc_inflate_disable)((strm))) { \
            return Z_STREAM_ERROR; \
        } \
    } while (0)
|
||||
|
||||
/* When DFLTCC can inflate the stream, run dfltcc_inflate() between
 * RESTORE() and LOAD() and act on its verdict: `break` on CONTINUE,
 * `goto inf_leave` on BREAK, and fall through on anything else.
 * Deliberately not wrapped in do { } while (0): the `break` must apply to
 * the construct enclosing the expansion site. Relies on `ret` and the
 * `inf_leave` label being in scope there.
 */
#define INFLATE_TYPEDO_HOOK(strm, flush) \
    if (PREFIX(dfltcc_can_inflate)((strm))) { \
        dfltcc_inflate_action action; \
        \
        RESTORE(); \
        action = PREFIX(dfltcc_inflate)((strm), (flush), &ret); \
        LOAD(); \
        if (action == DFLTCC_INFLATE_BREAK) { \
            goto inf_leave; \
        } else if (action == DFLTCC_INFLATE_CONTINUE) { \
            break; \
        } \
    }
|
||||
|
||||
#define INFLATE_NEED_CHECKSUM(strm) (!PREFIX(dfltcc_can_inflate)((strm)))
|
||||
|
||||
#define INFLATE_NEED_UPDATEWINDOW(strm) (!PREFIX(dfltcc_can_inflate)((strm)))
|
||||
|
||||
/* Once DFLTCC has been used on the stream, return -(1L << 16) from the
 * enclosing function.
 */
#define INFLATE_MARK_HOOK(strm) \
    do { \
        if (PREFIX(dfltcc_was_inflate_used)((strm))) { \
            return -(1L << 16); \
        } \
    } while (0)
|
||||
|
||||
/* Once DFLTCC has been used on the stream, return Z_STREAM_ERROR from the
 * enclosing function.
 */
#define INFLATE_SYNC_POINT_HOOK(strm) \
    do { \
        if (PREFIX(dfltcc_was_inflate_used)((strm))) { \
            return Z_STREAM_ERROR; \
        } \
    } while (0)
|
||||
|
||||
/* When DFLTCC can inflate the stream, return the result of
 * dfltcc_inflate_set_dictionary() from the enclosing function.
 */
#define INFLATE_SET_DICTIONARY_HOOK(strm, dict, dict_len) \
    do { \
        if (PREFIX(dfltcc_can_inflate)((strm))) { \
            return PREFIX(dfltcc_inflate_set_dictionary)((strm), (dict), (dict_len)); \
        } \
    } while (0)
|
||||
|
||||
/* When DFLTCC can inflate the stream, return the result of
 * dfltcc_inflate_get_dictionary() from the enclosing function.
 */
#define INFLATE_GET_DICTIONARY_HOOK(strm, dict, dict_len) \
    do { \
        if (PREFIX(dfltcc_can_inflate)((strm))) { \
            return PREFIX(dfltcc_inflate_get_dictionary)((strm), (dict), (dict_len)); \
        } \
    } while (0)
|
||||
|
||||
#endif
|
||||
14
deps/zlib-ng/arch/s390/s390_features.c
vendored
Normal file
14
deps/zlib-ng/arch/s390/s390_features.c
vendored
Normal file
@@ -0,0 +1,14 @@
|
||||
#include "../../zbuild.h"
|
||||
#include "s390_features.h"
|
||||
|
||||
#ifdef HAVE_SYS_AUXV_H
|
||||
# include <sys/auxv.h>
|
||||
#endif
|
||||
|
||||
#ifndef HWCAP_S390_VXRS
|
||||
#define HWCAP_S390_VXRS HWCAP_S390_VX
|
||||
#endif
|
||||
|
||||
/* Fill in `features` for the running CPU.
 *
 * has_vx is set from the HWCAP_S390_VXRS bit of getauxval(AT_HWCAP). The
 * <sys/auxv.h> include in this file is conditional on HAVE_SYS_AUXV_H, so
 * the query is guarded the same way; when the header is unavailable the
 * capability cannot be probed and has_vx is reported as 0.
 */
void Z_INTERNAL s390_check_features(struct s390_cpu_features *features) {
#ifdef HAVE_SYS_AUXV_H
    features->has_vx = getauxval(AT_HWCAP) & HWCAP_S390_VXRS;
#else
    features->has_vx = 0;
#endif
}
|
||||
10
deps/zlib-ng/arch/s390/s390_features.h
vendored
Normal file
10
deps/zlib-ng/arch/s390/s390_features.h
vendored
Normal file
@@ -0,0 +1,10 @@
|
||||
#ifndef S390_FEATURES_H_
|
||||
#define S390_FEATURES_H_
|
||||
|
||||
/* CPU capabilities filled in by s390_check_features(). */
struct s390_cpu_features {
    int has_vx;     /* non-zero when the HWCAP vector bit is reported */
};
|
||||
|
||||
void Z_INTERNAL s390_check_features(struct s390_cpu_features *features);
|
||||
|
||||
#endif
|
||||
45
deps/zlib-ng/arch/s390/self-hosted-builder/actions-runner.Dockerfile
vendored
Normal file
45
deps/zlib-ng/arch/s390/self-hosted-builder/actions-runner.Dockerfile
vendored
Normal file
@@ -0,0 +1,45 @@
|
||||
# Self-Hosted IBM Z GitHub Actions Runner.
|
||||
|
||||
# Temporary image: amd64 dependencies.
|
||||
FROM amd64/ubuntu:20.04 as ld-prefix
|
||||
ENV DEBIAN_FRONTEND=noninteractive
|
||||
RUN apt-get update && apt-get -y install ca-certificates libicu66 libssl1.1
|
||||
|
||||
# Main image.
|
||||
FROM s390x/ubuntu:20.04
|
||||
|
||||
# Packages for zlib-ng testing.
|
||||
ENV DEBIAN_FRONTEND=noninteractive
|
||||
RUN apt-get update && apt-get -y install \
|
||||
clang-11 \
|
||||
cmake \
|
||||
curl \
|
||||
gcc \
|
||||
git \
|
||||
jq \
|
||||
libxml2-dev \
|
||||
libxslt-dev \
|
||||
llvm-11-tools \
|
||||
ninja-build \
|
||||
python-is-python3 \
|
||||
python3 \
|
||||
python3-dev \
|
||||
python3-pip
|
||||
|
||||
# amd64 dependencies.
|
||||
COPY --from=ld-prefix / /usr/x86_64-linux-gnu/
|
||||
RUN ln -fs ../lib/x86_64-linux-gnu/ld-linux-x86-64.so.2 /usr/x86_64-linux-gnu/lib64/
|
||||
RUN ln -fs /etc/resolv.conf /usr/x86_64-linux-gnu/etc/
|
||||
ENV QEMU_LD_PREFIX=/usr/x86_64-linux-gnu
|
||||
|
||||
# amd64 GitHub Actions Runner.
|
||||
RUN useradd -m actions-runner
|
||||
USER actions-runner
|
||||
WORKDIR /home/actions-runner
|
||||
RUN curl -L https://github.com/actions/runner/releases/download/v2.287.1/actions-runner-linux-x64-2.287.1.tar.gz | tar -xz
|
||||
VOLUME /home/actions-runner
|
||||
|
||||
# Scripts.
|
||||
COPY fs/ /
|
||||
ENTRYPOINT ["/usr/bin/entrypoint"]
|
||||
CMD ["/usr/bin/actions-runner"]
|
||||
24
deps/zlib-ng/arch/s390/self-hosted-builder/actions-runner.service
vendored
Normal file
24
deps/zlib-ng/arch/s390/self-hosted-builder/actions-runner.service
vendored
Normal file
@@ -0,0 +1,24 @@
|
||||
[Unit]
|
||||
Description=Self-Hosted IBM Z Github Actions Runner
|
||||
Wants=qemu-user-static
|
||||
After=qemu-user-static
|
||||
StartLimitIntervalSec=0
|
||||
|
||||
[Service]
|
||||
Type=simple
|
||||
Restart=always
|
||||
ExecStartPre=-/usr/bin/docker rm --force actions-runner
|
||||
ExecStart=/usr/bin/docker run \
|
||||
--env-file=/etc/actions-runner \
|
||||
--init \
|
||||
--interactive \
|
||||
--name=actions-runner \
|
||||
--rm \
|
||||
--volume=actions-runner:/home/actions-runner \
|
||||
iiilinuxibmcom/actions-runner
|
||||
ExecStop=/bin/sh -c "docker exec actions-runner kill -INT -- -1"
|
||||
ExecStop=/bin/sh -c "docker wait actions-runner"
|
||||
ExecStop=/bin/sh -c "docker rm actions-runner"
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
40
deps/zlib-ng/arch/s390/self-hosted-builder/fs/usr/bin/actions-runner
vendored
Executable file
40
deps/zlib-ng/arch/s390/self-hosted-builder/fs/usr/bin/actions-runner
vendored
Executable file
@@ -0,0 +1,40 @@
|
||||
#!/bin/bash

#
# Ephemeral runner startup script.
#
# Expects the following environment variables:
#
# - repo=<owner>/<name>
# - access_token=<ghp_***>
#

set -e -u

# Check the cached registration token.
# --exit-status makes jq fail when .expires_at is missing or null (for
# example when the cache file holds an API error response instead of a
# token), so that a bad cache triggers a refresh instead of being reused.
token_file=registration-token.json
set +e
expires_at=$(jq --exit-status --raw-output .expires_at "$token_file" 2>/dev/null)
status=$?
set -e
if [[ $status -ne 0 || $(date +%s) -ge $(date -d "$expires_at" +%s) ]]; then
    # Refresh the cached registration token.
    # --fail turns an HTTP error into a non-zero exit status, which aborts
    # the script through `set -e` rather than caching the error body as if
    # it were a token.
    curl \
        --fail \
        -X POST \
        -H "Accept: application/vnd.github.v3+json" \
        -H "Authorization: token $access_token" \
        "https://api.github.com/repos/$repo/actions/runners/registration-token" \
        -o "$token_file"
fi

# (Re-)register the runner.
# Removing a previous registration is best-effort: it may not exist.
registration_token=$(jq --raw-output .token "$token_file")
./config.sh remove --token "$registration_token" || true
./config.sh \
    --url "https://github.com/$repo" \
    --token "$registration_token" \
    --labels z15 \
    --ephemeral

# Run one job.
./run.sh
|
||||
30
deps/zlib-ng/arch/s390/self-hosted-builder/fs/usr/bin/entrypoint
vendored
Executable file
30
deps/zlib-ng/arch/s390/self-hosted-builder/fs/usr/bin/entrypoint
vendored
Executable file
@@ -0,0 +1,30 @@
|
||||
#!/bin/bash

#
# Container entrypoint that waits for all spawned processes.
#

set -e -u

# Set up a FIFO inside a private temporary directory (removed when this
# script exits) and keep a background reader attached to its read end.
tempdir=$(mktemp -d "/tmp/done.XXXXXXXXXX")
trap 'rm -r "$tempdir"' EXIT
fifo="$tempdir/pipe"
mkfifo "$fifo"
cat "$fifo" &
reader=$!

# Start the workload with the FIFO's write end open on file descriptor 9, so
# that every descendant inherits it. With no arguments, run a shell instead.
# A failing workload must not abort this script; remember its status.
rc=0
if [ "$#" -gt 0 ]; then
    "$@" 9>"$fifo" || rc=$?
else
    bash 9>"$fifo" || rc=$?
fi

# The reader only sees end-of-file once the workload and all of its
# descendants have exited and thereby closed the write end. Waiting for it
# covers SelfUpdater, which the workload may start in background before
# exiting.
wait "$reader"

exit "$rc"
|
||||
11
deps/zlib-ng/arch/s390/self-hosted-builder/qemu-user-static.service
vendored
Normal file
11
deps/zlib-ng/arch/s390/self-hosted-builder/qemu-user-static.service
vendored
Normal file
@@ -0,0 +1,11 @@
|
||||
[Unit]
|
||||
Description=Support for transparent execution of non-native binaries with QEMU user emulation
|
||||
|
||||
[Service]
|
||||
Type=oneshot
|
||||
# The source code for iiilinuxibmcom/qemu-user-static is at https://github.com/iii-i/qemu-user-static/tree/v6.1.0-1
|
||||
# TODO: replace it with multiarch/qemu-user-static once version >6.1 is available
|
||||
ExecStart=/usr/bin/docker run --rm --interactive --privileged iiilinuxibmcom/qemu-user-static --reset -p yes
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
147
deps/zlib-ng/arch/x86/Makefile.in
vendored
Normal file
147
deps/zlib-ng/arch/x86/Makefile.in
vendored
Normal file
@@ -0,0 +1,147 @@
|
||||
# Makefile for zlib
|
||||
# Copyright (C) 1995-2013 Jean-loup Gailly, Mark Adler
|
||||
# For conditions of distribution and use, see copyright notice in zlib.h
|
||||
|
||||
CC=
|
||||
CFLAGS=
|
||||
SFLAGS=
|
||||
INCLUDES=
|
||||
SUFFIX=
|
||||
|
||||
AVX512FLAG=-mavx512f -mavx512dq -mavx512vl -mavx512bw
|
||||
AVX512VNNIFLAG=-mavx512vnni
|
||||
AVX2FLAG=-mavx2
|
||||
SSE2FLAG=-msse2
|
||||
SSSE3FLAG=-mssse3
|
||||
SSE42FLAG=-msse4.2
|
||||
PCLMULFLAG=-mpclmul
|
||||
VPCLMULFLAG=-mvpclmulqdq
|
||||
XSAVEFLAG=-mxsave
|
||||
NOLTOFLAG=
|
||||
|
||||
SRCDIR=.
|
||||
SRCTOP=../..
|
||||
TOPDIR=$(SRCTOP)
|
||||
|
||||
all: \
|
||||
x86_features.o x86_features.lo \
|
||||
adler32_avx2.o adler32_avx2.lo \
|
||||
adler32_avx512.o adler32_avx512.lo \
|
||||
adler32_avx512_vnni.o adler32_avx512_vnni.lo \
|
||||
adler32_sse42.o adler32_sse42.lo \
|
||||
adler32_ssse3.o adler32_ssse3.lo \
|
||||
chunkset_avx2.o chunkset_avx2.lo \
|
||||
chunkset_sse2.o chunkset_sse2.lo \
|
||||
chunkset_ssse3.o chunkset_ssse3.lo \
|
||||
compare256_avx2.o compare256_avx2.lo \
|
||||
compare256_sse2.o compare256_sse2.lo \
|
||||
insert_string_sse42.o insert_string_sse42.lo \
|
||||
crc32_pclmulqdq.o crc32_pclmulqdq.lo \
|
||||
crc32_vpclmulqdq.o crc32_vpclmulqdq.lo \
|
||||
slide_hash_avx2.o slide_hash_avx2.lo \
|
||||
slide_hash_sse2.o slide_hash_sse2.lo
|
||||
|
||||
# Each object names its source file as a prerequisite, matching the
# adler32_* rules in this file, so that editing a source triggers a rebuild.

x86_features.o: $(SRCDIR)/x86_features.c
	$(CC) $(CFLAGS) $(XSAVEFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/x86_features.c

x86_features.lo: $(SRCDIR)/x86_features.c
	$(CC) $(SFLAGS) $(XSAVEFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/x86_features.c

chunkset_avx2.o: $(SRCDIR)/chunkset_avx2.c
	$(CC) $(CFLAGS) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_avx2.c

chunkset_avx2.lo: $(SRCDIR)/chunkset_avx2.c
	$(CC) $(SFLAGS) $(AVX2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_avx2.c

chunkset_sse2.o: $(SRCDIR)/chunkset_sse2.c
	$(CC) $(CFLAGS) $(SSE2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_sse2.c

chunkset_sse2.lo: $(SRCDIR)/chunkset_sse2.c
	$(CC) $(SFLAGS) $(SSE2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_sse2.c

chunkset_ssse3.o: $(SRCDIR)/chunkset_ssse3.c
	$(CC) $(CFLAGS) $(SSSE3FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_ssse3.c

chunkset_ssse3.lo: $(SRCDIR)/chunkset_ssse3.c
	$(CC) $(SFLAGS) $(SSSE3FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_ssse3.c

compare256_avx2.o: $(SRCDIR)/compare256_avx2.c
	$(CC) $(CFLAGS) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_avx2.c

compare256_avx2.lo: $(SRCDIR)/compare256_avx2.c
	$(CC) $(SFLAGS) $(AVX2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_avx2.c

compare256_sse2.o: $(SRCDIR)/compare256_sse2.c
	$(CC) $(CFLAGS) $(SSE2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_sse2.c

compare256_sse2.lo: $(SRCDIR)/compare256_sse2.c
	$(CC) $(SFLAGS) $(SSE2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_sse2.c

insert_string_sse42.o: $(SRCDIR)/insert_string_sse42.c
	$(CC) $(CFLAGS) $(SSE42FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_sse42.c

insert_string_sse42.lo: $(SRCDIR)/insert_string_sse42.c
	$(CC) $(SFLAGS) $(SSE42FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_sse42.c

crc32_pclmulqdq.o: $(SRCDIR)/crc32_pclmulqdq.c
	$(CC) $(CFLAGS) $(PCLMULFLAG) $(SSE42FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_pclmulqdq.c

crc32_pclmulqdq.lo: $(SRCDIR)/crc32_pclmulqdq.c
	$(CC) $(SFLAGS) $(PCLMULFLAG) $(SSE42FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_pclmulqdq.c

crc32_vpclmulqdq.o: $(SRCDIR)/crc32_vpclmulqdq.c
	$(CC) $(CFLAGS) $(PCLMULFLAG) $(SSE42FLAG) $(VPCLMULFLAG) $(AVX512FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_vpclmulqdq.c

crc32_vpclmulqdq.lo: $(SRCDIR)/crc32_vpclmulqdq.c
	$(CC) $(SFLAGS) $(PCLMULFLAG) $(SSE42FLAG) $(VPCLMULFLAG) $(AVX512FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_vpclmulqdq.c

slide_hash_avx2.o: $(SRCDIR)/slide_hash_avx2.c
	$(CC) $(CFLAGS) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_avx2.c

slide_hash_avx2.lo: $(SRCDIR)/slide_hash_avx2.c
	$(CC) $(SFLAGS) $(AVX2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_avx2.c

slide_hash_sse2.o: $(SRCDIR)/slide_hash_sse2.c
	$(CC) $(CFLAGS) $(SSE2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_sse2.c

slide_hash_sse2.lo: $(SRCDIR)/slide_hash_sse2.c
	$(CC) $(SFLAGS) $(SSE2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_sse2.c
|
||||
|
||||
adler32_avx2.o: $(SRCDIR)/adler32_avx2.c
|
||||
$(CC) $(CFLAGS) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx2.c
|
||||
|
||||
adler32_avx2.lo: $(SRCDIR)/adler32_avx2.c
|
||||
$(CC) $(SFLAGS) $(AVX2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx2.c
|
||||
|
||||
adler32_avx512.o: $(SRCDIR)/adler32_avx512.c
|
||||
$(CC) $(CFLAGS) $(AVX512FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx512.c
|
||||
|
||||
adler32_avx512.lo: $(SRCDIR)/adler32_avx512.c
|
||||
$(CC) $(SFLAGS) $(AVX512FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx512.c
|
||||
|
||||
adler32_avx512_vnni.o: $(SRCDIR)/adler32_avx512_vnni.c
|
||||
$(CC) $(CFLAGS) $(AVX512VNNIFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx512_vnni.c
|
||||
|
||||
adler32_avx512_vnni.lo: $(SRCDIR)/adler32_avx512_vnni.c
|
||||
$(CC) $(SFLAGS) $(AVX512VNNIFLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx512_vnni.c
|
||||
|
||||
adler32_ssse3.o: $(SRCDIR)/adler32_ssse3.c
|
||||
$(CC) $(CFLAGS) $(SSSE3FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_ssse3.c
|
||||
|
||||
adler32_ssse3.lo: $(SRCDIR)/adler32_ssse3.c
|
||||
$(CC) $(SFLAGS) $(SSSE3FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_ssse3.c
|
||||
|
||||
adler32_sse42.o: $(SRCDIR)/adler32_sse42.c
|
||||
$(CC) $(CFLAGS) $(SSE42FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_sse42.c
|
||||
|
||||
adler32_sse42.lo: $(SRCDIR)/adler32_sse42.c
|
||||
$(CC) $(SFLAGS) $(SSE42FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_sse42.c
|
||||
|
||||
mostlyclean: clean
|
||||
clean:
|
||||
rm -f *.o *.lo *~
|
||||
rm -rf objs
|
||||
rm -f *.gcda *.gcno *.gcov
|
||||
|
||||
distclean: clean
|
||||
rm -f Makefile
|
||||
17
deps/zlib-ng/arch/x86/adler32_avx2.c
vendored
Normal file
17
deps/zlib-ng/arch/x86/adler32_avx2.c
vendored
Normal file
@@ -0,0 +1,17 @@
|
||||
/* adler32_avx2.c -- compute the Adler-32 checksum of a data stream
|
||||
* Copyright (C) 1995-2011 Mark Adler
|
||||
* Authors:
|
||||
* Brian Bockelman <bockelman@gmail.com>
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#include <immintrin.h>
|
||||
|
||||
#ifdef X86_AVX2
|
||||
|
||||
#include "adler32_avx2_tpl.h"
|
||||
|
||||
#define COPY
|
||||
#include "adler32_avx2_tpl.h"
|
||||
|
||||
#endif
|
||||
32
deps/zlib-ng/arch/x86/adler32_avx2_p.h
vendored
Normal file
32
deps/zlib-ng/arch/x86/adler32_avx2_p.h
vendored
Normal file
@@ -0,0 +1,32 @@
|
||||
/* adler32_avx2_p.h -- adler32 avx2 utility functions
|
||||
* Copyright (C) 2022 Adam Stylinski
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#ifndef ADLER32_AVX2_P_H_
|
||||
#define ADLER32_AVX2_P_H_
|
||||
|
||||
#if defined(X86_AVX2) || defined(X86_AVX512VNNI)
|
||||
|
||||
/* 32 bit horizontal sum, adapted from Agner Fog's vector library. */
|
||||
static inline uint32_t hsum256(__m256i x) {
|
||||
__m128i sum1 = _mm_add_epi32(_mm256_extracti128_si256(x, 1),
|
||||
_mm256_castsi256_si128(x));
|
||||
__m128i sum2 = _mm_add_epi32(sum1, _mm_unpackhi_epi64(sum1, sum1));
|
||||
__m128i sum3 = _mm_add_epi32(sum2, _mm_shuffle_epi32(sum2, 1));
|
||||
return (uint32_t)_mm_cvtsi128_si32(sum3);
|
||||
}
|
||||
|
||||
static inline uint32_t partial_hsum256(__m256i x) {
|
||||
/* We need a permutation vector to extract every other integer. The
|
||||
* rest are going to be zeros */
|
||||
const __m256i perm_vec = _mm256_setr_epi32(0, 2, 4, 6, 1, 1, 1, 1);
|
||||
__m256i non_zero = _mm256_permutevar8x32_epi32(x, perm_vec);
|
||||
__m128i non_zero_sse = _mm256_castsi256_si128(non_zero);
|
||||
__m128i sum2 = _mm_add_epi32(non_zero_sse,_mm_unpackhi_epi64(non_zero_sse, non_zero_sse));
|
||||
__m128i sum3 = _mm_add_epi32(sum2, _mm_shuffle_epi32(sum2, 1));
|
||||
return (uint32_t)_mm_cvtsi128_si32(sum3);
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
141
deps/zlib-ng/arch/x86/adler32_avx2_tpl.h
vendored
Normal file
141
deps/zlib-ng/arch/x86/adler32_avx2_tpl.h
vendored
Normal file
@@ -0,0 +1,141 @@
|
||||
/* adler32_avx2_tpl.h -- adler32 avx2 vectorized function templates
|
||||
* Copyright (C) 2022 Adam Stylinski
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#include "../../zbuild.h"
|
||||
#include <immintrin.h>
|
||||
#include "../../adler32_fold.h"
|
||||
#include "../../adler32_p.h"
|
||||
#include "../../fallback_builtins.h"
|
||||
#include "adler32_avx2_p.h"
|
||||
|
||||
#ifdef X86_SSE42
|
||||
extern uint32_t adler32_fold_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
|
||||
extern uint32_t adler32_ssse3(uint32_t adler, const uint8_t *src, size_t len);
|
||||
|
||||
#define copy_sub32(a, b, c, d) adler32_fold_copy_sse42(a, b, c, d)
|
||||
#define sub32(a, b, c) adler32_ssse3(a, b, c)
|
||||
#else
|
||||
#define copy_sub32(a, b, c, d) adler32_copy_len_16(adler0, c, b, d, adler1)
|
||||
#define sub32(a, b, c) adler32_len_16(adler0, b, c, adler1)
|
||||
#endif
|
||||
|
||||
#ifdef COPY
|
||||
Z_INTERNAL uint32_t adler32_fold_copy_avx2(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
|
||||
#else
|
||||
Z_INTERNAL uint32_t adler32_avx2(uint32_t adler, const uint8_t *src, size_t len) {
|
||||
#endif
|
||||
if (src == NULL) return 1L;
|
||||
if (len == 0) return adler;
|
||||
|
||||
uint32_t adler0, adler1;
|
||||
adler1 = (adler >> 16) & 0xffff;
|
||||
adler0 = adler & 0xffff;
|
||||
|
||||
rem_peel:
|
||||
if (len < 16) {
|
||||
#ifdef COPY
|
||||
return adler32_copy_len_16(adler0, src, dst, len, adler1);
|
||||
#else
|
||||
return adler32_len_16(adler0, src, len, adler1);
|
||||
#endif
|
||||
} else if (len < 32) {
|
||||
#ifdef COPY
|
||||
return copy_sub32(adler, dst, src, len);
|
||||
#else
|
||||
return sub32(adler, src, len);
|
||||
#endif
|
||||
}
|
||||
|
||||
__m256i vs1, vs2;
|
||||
|
||||
const __m256i dot2v = _mm256_setr_epi8(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15,
|
||||
14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
|
||||
const __m256i dot3v = _mm256_set1_epi16(1);
|
||||
const __m256i zero = _mm256_setzero_si256();
|
||||
|
||||
while (len >= 32) {
|
||||
vs1 = _mm256_zextsi128_si256(_mm_cvtsi32_si128(adler0));
|
||||
vs2 = _mm256_zextsi128_si256(_mm_cvtsi32_si128(adler1));
|
||||
__m256i vs1_0 = vs1;
|
||||
__m256i vs3 = _mm256_setzero_si256();
|
||||
|
||||
size_t k = MIN(len, NMAX);
|
||||
k -= k % 32;
|
||||
len -= k;
|
||||
|
||||
while (k >= 32) {
|
||||
/*
|
||||
vs1 = adler + sum(c[i])
|
||||
vs2 = sum2 + 32 vs1 + sum( (32-i+1) c[i] )
|
||||
*/
|
||||
__m256i vbuf = _mm256_loadu_si256((__m256i*)src);
|
||||
src += 32;
|
||||
k -= 32;
|
||||
|
||||
__m256i vs1_sad = _mm256_sad_epu8(vbuf, zero); // Sum of abs diff, resulting in 2 x int32's
|
||||
//
|
||||
#ifdef COPY
|
||||
_mm256_storeu_si256((__m256i*)dst, vbuf);
|
||||
dst += 32;
|
||||
#endif
|
||||
vs1 = _mm256_add_epi32(vs1, vs1_sad);
|
||||
vs3 = _mm256_add_epi32(vs3, vs1_0);
|
||||
__m256i v_short_sum2 = _mm256_maddubs_epi16(vbuf, dot2v); // sum 32 uint8s to 16 shorts
|
||||
__m256i vsum2 = _mm256_madd_epi16(v_short_sum2, dot3v); // sum 16 shorts to 8 uint32s
|
||||
vs2 = _mm256_add_epi32(vsum2, vs2);
|
||||
vs1_0 = vs1;
|
||||
}
|
||||
|
||||
/* Defer the multiplication with 32 to outside of the loop */
|
||||
vs3 = _mm256_slli_epi32(vs3, 5);
|
||||
vs2 = _mm256_add_epi32(vs2, vs3);
|
||||
|
||||
/* The compiler is generating the following sequence for this integer modulus
|
||||
* when done the scalar way, in GPRs:
|
||||
|
||||
adler = (s1_unpack[0] % BASE) + (s1_unpack[1] % BASE) + (s1_unpack[2] % BASE) + (s1_unpack[3] % BASE) +
|
||||
(s1_unpack[4] % BASE) + (s1_unpack[5] % BASE) + (s1_unpack[6] % BASE) + (s1_unpack[7] % BASE);
|
||||
|
||||
mov $0x80078071,%edi // move magic constant into 32 bit register %edi
|
||||
...
|
||||
vmovd %xmm1,%esi // move vector lane 0 to 32 bit register %esi
|
||||
mov %rsi,%rax // zero-extend this value to 64 bit precision in %rax
|
||||
imul %rdi,%rsi // do a signed multiplication with magic constant and vector element
|
||||
shr $0x2f,%rsi // shift right by 47
|
||||
imul $0xfff1,%esi,%esi // do a signed multiplication with value truncated to 32 bits with 0xfff1
|
||||
sub %esi,%eax // subtract lower 32 bits of original vector value from modified one above
|
||||
...
|
||||
// repeats for each element with vpextract instructions
|
||||
|
||||
This is tricky with AVX2 for a number of reasons:
|
||||
1.) There's no 64 bit multiplication instruction, but there is a sequence to get there
|
||||
2.) There's ways to extend vectors to 64 bit precision, but no simple way to truncate
|
||||
back down to 32 bit precision later (there is in AVX512)
|
||||
3.) Full width integer multiplications aren't cheap
|
||||
|
||||
We can, however, and do a relatively cheap sequence for horizontal sums.
|
||||
Then, we simply do the integer modulus on the resulting 64 bit GPR, on a scalar value. It was
|
||||
previously thought that casting to 64 bit precision was needed prior to the horizontal sum, but
|
||||
that is simply not the case, as NMAX is defined as the maximum number of scalar sums that can be
|
||||
performed on the maximum possible inputs before overflow
|
||||
*/
|
||||
|
||||
|
||||
/* In AVX2-land, this trip through GPRs will probably be unvoidable, as there's no cheap and easy
|
||||
* conversion from 64 bit integer to 32 bit (needed for the inexpensive modulus with a constant).
|
||||
* This casting to 32 bit is cheap through GPRs (just register aliasing). See above for exactly
|
||||
* what the compiler is doing to avoid integer divisions. */
|
||||
adler0 = partial_hsum256(vs1) % BASE;
|
||||
adler1 = hsum256(vs2) % BASE;
|
||||
}
|
||||
|
||||
adler = adler0 | (adler1 << 16);
|
||||
|
||||
if (len) {
|
||||
goto rem_peel;
|
||||
}
|
||||
|
||||
return adler;
|
||||
}
|
||||
16
deps/zlib-ng/arch/x86/adler32_avx512.c
vendored
Normal file
16
deps/zlib-ng/arch/x86/adler32_avx512.c
vendored
Normal file
@@ -0,0 +1,16 @@
|
||||
/* adler32_avx512.c -- compute the Adler-32 checksum of a data stream
|
||||
* Copyright (C) 1995-2011 Mark Adler
|
||||
* Authors:
|
||||
* Adam Stylinski <kungfujesus06@gmail.com>
|
||||
* Brian Bockelman <bockelman@gmail.com>
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#ifdef X86_AVX512
|
||||
|
||||
#include "adler32_avx512_tpl.h"
|
||||
|
||||
#define COPY
|
||||
#include "adler32_avx512_tpl.h"
|
||||
|
||||
#endif
|
||||
46
deps/zlib-ng/arch/x86/adler32_avx512_p.h
vendored
Normal file
46
deps/zlib-ng/arch/x86/adler32_avx512_p.h
vendored
Normal file
@@ -0,0 +1,46 @@
|
||||
#ifndef AVX512_FUNCS_H
|
||||
#define AVX512_FUNCS_H
|
||||
|
||||
#include <immintrin.h>
|
||||
#include <stdint.h>
|
||||
/* Written because *_add_epi32(a) sets off ubsan */
|
||||
static inline uint32_t _mm512_reduce_add_epu32(__m512i x) {
|
||||
__m256i a = _mm512_extracti64x4_epi64(x, 1);
|
||||
__m256i b = _mm512_extracti64x4_epi64(x, 0);
|
||||
|
||||
__m256i a_plus_b = _mm256_add_epi32(a, b);
|
||||
__m128i c = _mm256_extracti128_si256(a_plus_b, 1);
|
||||
__m128i d = _mm256_extracti128_si256(a_plus_b, 0);
|
||||
__m128i c_plus_d = _mm_add_epi32(c, d);
|
||||
|
||||
__m128i sum1 = _mm_unpackhi_epi64(c_plus_d, c_plus_d);
|
||||
__m128i sum2 = _mm_add_epi32(sum1, c_plus_d);
|
||||
__m128i sum3 = _mm_shuffle_epi32(sum2, 0x01);
|
||||
__m128i sum4 = _mm_add_epi32(sum2, sum3);
|
||||
|
||||
return _mm_cvtsi128_si32(sum4);
|
||||
}
|
||||
|
||||
static inline uint32_t partial_hsum(__m512i x) {
|
||||
/* We need a permutation vector to extract every other integer. The
|
||||
* rest are going to be zeros. Marking this const so the compiler stands
|
||||
* a better chance of keeping this resident in a register through entire
|
||||
* loop execution. We certainly have enough zmm registers (32) */
|
||||
const __m512i perm_vec = _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14,
|
||||
1, 1, 1, 1, 1, 1, 1, 1);
|
||||
|
||||
__m512i non_zero = _mm512_permutexvar_epi32(perm_vec, x);
|
||||
|
||||
/* From here, it's a simple 256 bit wide reduction sum */
|
||||
__m256i non_zero_avx = _mm512_castsi512_si256(non_zero);
|
||||
|
||||
/* See Agner Fog's vectorclass for a decent reference. Essentially, phadd is
|
||||
* pretty slow, much slower than the longer instruction sequence below */
|
||||
__m128i sum1 = _mm_add_epi32(_mm256_extracti128_si256(non_zero_avx, 1),
|
||||
_mm256_castsi256_si128(non_zero_avx));
|
||||
__m128i sum2 = _mm_add_epi32(sum1,_mm_unpackhi_epi64(sum1, sum1));
|
||||
__m128i sum3 = _mm_add_epi32(sum2,_mm_shuffle_epi32(sum2, 1));
|
||||
return (uint32_t)_mm_cvtsi128_si32(sum3);
|
||||
}
|
||||
|
||||
#endif
|
||||
106
deps/zlib-ng/arch/x86/adler32_avx512_tpl.h
vendored
Normal file
106
deps/zlib-ng/arch/x86/adler32_avx512_tpl.h
vendored
Normal file
@@ -0,0 +1,106 @@
|
||||
/* adler32_avx512_tpl.h -- adler32 avx512 vectorized function templates
|
||||
* Copyright (C) 2022 Adam Stylinski
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#include "../../zbuild.h"
|
||||
#include "../../adler32_p.h"
|
||||
#include "../../adler32_fold.h"
|
||||
#include "../../cpu_features.h"
|
||||
#include "../../fallback_builtins.h"
|
||||
#include <immintrin.h>
|
||||
#include "adler32_avx512_p.h"
|
||||
|
||||
#ifdef X86_AVX512
|
||||
|
||||
#ifdef COPY
|
||||
Z_INTERNAL uint32_t adler32_fold_copy_avx512(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
|
||||
#else
|
||||
Z_INTERNAL uint32_t adler32_avx512(uint32_t adler, const uint8_t *src, size_t len) {
|
||||
#endif
|
||||
|
||||
if (src == NULL) return 1L;
|
||||
if (len == 0) return adler;
|
||||
|
||||
uint32_t adler0, adler1;
|
||||
adler1 = (adler >> 16) & 0xffff;
|
||||
adler0 = adler & 0xffff;
|
||||
|
||||
rem_peel:
|
||||
if (len < 64) {
|
||||
/* This handles the remaining copies, just call normal adler checksum after this */
|
||||
#ifdef COPY
|
||||
__mmask64 storemask = (0xFFFFFFFFFFFFFFFFUL >> (64 - len));
|
||||
__m512i copy_vec = _mm512_maskz_loadu_epi8(storemask, src);
|
||||
_mm512_mask_storeu_epi8(dst, storemask, copy_vec);
|
||||
#endif
|
||||
|
||||
#ifdef X86_AVX2
|
||||
return adler32_avx2(adler, src, len);
|
||||
#elif defined(X86_SSSE3)
|
||||
return adler32_ssse3(adler, src, len);
|
||||
#else
|
||||
return adler32_len_16(adler0, src, len, adler1);
|
||||
#endif
|
||||
}
|
||||
|
||||
__m512i vbuf, vs1_0, vs3;
|
||||
|
||||
const __m512i dot2v = _mm512_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
|
||||
20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37,
|
||||
38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55,
|
||||
56, 57, 58, 59, 60, 61, 62, 63, 64);
|
||||
const __m512i dot3v = _mm512_set1_epi16(1);
|
||||
const __m512i zero = _mm512_setzero_si512();
|
||||
size_t k;
|
||||
|
||||
while (len >= 64) {
|
||||
__m512i vs1 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(adler0));
|
||||
__m512i vs2 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(adler1));
|
||||
vs1_0 = vs1;
|
||||
vs3 = _mm512_setzero_si512();
|
||||
|
||||
k = MIN(len, NMAX);
|
||||
k -= k % 64;
|
||||
len -= k;
|
||||
|
||||
while (k >= 64) {
|
||||
/*
|
||||
vs1 = adler + sum(c[i])
|
||||
vs2 = sum2 + 64 vs1 + sum( (64-i+1) c[i] )
|
||||
*/
|
||||
vbuf = _mm512_loadu_si512(src);
|
||||
#ifdef COPY
|
||||
_mm512_storeu_si512(dst, vbuf);
|
||||
dst += 64;
|
||||
#endif
|
||||
src += 64;
|
||||
k -= 64;
|
||||
|
||||
__m512i vs1_sad = _mm512_sad_epu8(vbuf, zero);
|
||||
__m512i v_short_sum2 = _mm512_maddubs_epi16(vbuf, dot2v);
|
||||
vs1 = _mm512_add_epi32(vs1_sad, vs1);
|
||||
vs3 = _mm512_add_epi32(vs3, vs1_0);
|
||||
__m512i vsum2 = _mm512_madd_epi16(v_short_sum2, dot3v);
|
||||
vs2 = _mm512_add_epi32(vsum2, vs2);
|
||||
vs1_0 = vs1;
|
||||
}
|
||||
|
||||
vs3 = _mm512_slli_epi32(vs3, 6);
|
||||
vs2 = _mm512_add_epi32(vs2, vs3);
|
||||
|
||||
adler0 = partial_hsum(vs1) % BASE;
|
||||
adler1 = _mm512_reduce_add_epu32(vs2) % BASE;
|
||||
}
|
||||
|
||||
adler = adler0 | (adler1 << 16);
|
||||
|
||||
/* Process tail (len < 64). */
|
||||
if (len) {
|
||||
goto rem_peel;
|
||||
}
|
||||
|
||||
return adler;
|
||||
}
|
||||
|
||||
#endif
|
||||
225
deps/zlib-ng/arch/x86/adler32_avx512_vnni.c
vendored
Normal file
225
deps/zlib-ng/arch/x86/adler32_avx512_vnni.c
vendored
Normal file
@@ -0,0 +1,225 @@
|
||||
/* adler32_avx512_vnni.c -- compute the Adler-32 checksum of a data stream
|
||||
* Based on Brian Bockelman's AVX2 version
|
||||
* Copyright (C) 1995-2011 Mark Adler
|
||||
* Authors:
|
||||
* Adam Stylinski <kungfujesus06@gmail.com>
|
||||
* Brian Bockelman <bockelman@gmail.com>
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#ifdef X86_AVX512VNNI
|
||||
|
||||
#include "../../zbuild.h"
|
||||
#include "../../adler32_p.h"
|
||||
#include "../../cpu_features.h"
|
||||
#include "../../fallback_builtins.h"
|
||||
#include <immintrin.h>
|
||||
#include "../../adler32_fold.h"
|
||||
#include "adler32_avx512_p.h"
|
||||
#include "adler32_avx2_p.h"
|
||||
|
||||
/* Adler-32 over src[0..len), continuing from the running checksum `adler`,
 * using the AVX-512 VNNI dot-product instruction for the weighted sum.
 *
 * Returns 1 when src is NULL and `adler` unchanged when len is 0.
 * Inputs (or tails) shorter than 64 bytes are handed to a narrower routine. */
Z_INTERNAL uint32_t adler32_avx512_vnni(uint32_t adler, const uint8_t *src, size_t len) {
    if (src == NULL) return 1L;
    if (len == 0) return adler;

    /* adler0 = low 16-bit sum, adler1 = high 16-bit sum. */
    uint32_t adler0, adler1;
    adler1 = (adler >> 16) & 0xffff;
    adler0 = adler & 0xffff;

rem_peel:
    if (len < 32)
#if defined(X86_SSSE3)
        return adler32_ssse3(adler, src, len);
#else
        return adler32_len_16(adler0, src, len, adler1);
#endif

    if (len < 64)
#ifdef X86_AVX2
        return adler32_avx2(adler, src, len);
#elif defined(X86_SSSE3)
        /* This used to test X86_SSE3, which appears nowhere else in these
         * sources, so the SSSE3 path was skipped in non-AVX2 builds. */
        return adler32_ssse3(adler, src, len);
#else
        return adler32_len_16(adler0, src, len, adler1);
#endif

    /* _mm512_set_epi8 lists bytes from highest to lowest, so the first byte
     * of each 64-byte block is weighted 64 and the last one 1. */
    const __m512i dot2v = _mm512_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
                                          20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37,
                                          38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55,
                                          56, 57, 58, 59, 60, 61, 62, 63, 64);

    const __m512i zero = _mm512_setzero_si512();
    __m512i vs1, vs2;

    while (len >= 64) {
        vs1 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(adler0));
        vs2 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(adler1));

        /* At most NMAX bytes, rounded down to whole 64-byte blocks, per reduction. */
        size_t k = MIN(len, NMAX);
        k -= k % 64;
        len -= k;

        __m512i vs1_0 = vs1;                     /* vs1 before the current block */
        __m512i vs3 = _mm512_setzero_si512();    /* sum of the "before" vs1 values */
        /* We might get a tad bit more ILP here if we sum to a second register in the loop */
        __m512i vs2_1 = _mm512_setzero_si512();
        __m512i vbuf0, vbuf1;

        /* Remainder peeling: consume one block so that k is a multiple of 128. */
        if (k % 128) {
            vbuf1 = _mm512_loadu_si512((__m512i*)src);

            src += 64;
            k -= 64;

            __m512i vs1_sad = _mm512_sad_epu8(vbuf1, zero);
            vs1 = _mm512_add_epi32(vs1, vs1_sad);
            vs3 = _mm512_add_epi32(vs3, vs1_0);
            vs2 = _mm512_dpbusd_epi32(vs2, vbuf1, dot2v);
            vs1_0 = vs1;
        }

        /* Manually unrolled this loop by 2 for a decent amount of ILP */
        while (k >= 128) {
            /*
               vs1 = adler + sum(c[i])
               vs2 = sum2 + 64 vs1 + sum( (64-i+1) c[i] )
            */
            vbuf0 = _mm512_loadu_si512((__m512i*)src);
            vbuf1 = _mm512_loadu_si512((__m512i*)(src + 64));
            src += 128;
            k -= 128;

            __m512i vs1_sad = _mm512_sad_epu8(vbuf0, zero);
            vs1 = _mm512_add_epi32(vs1, vs1_sad);
            vs3 = _mm512_add_epi32(vs3, vs1_0);
            /* multiply-add, resulting in 16 ints. Fuse with sum stage from prior versions, as we now have the dp
             * instructions to eliminate them */
            vs2 = _mm512_dpbusd_epi32(vs2, vbuf0, dot2v);

            /* Second block: the "before" value is the vs1 just updated above. */
            vs3 = _mm512_add_epi32(vs3, vs1);
            vs1_sad = _mm512_sad_epu8(vbuf1, zero);
            vs1 = _mm512_add_epi32(vs1, vs1_sad);
            vs2_1 = _mm512_dpbusd_epi32(vs2_1, vbuf1, dot2v);
            vs1_0 = vs1;
        }

        /* The "64 * vs1" term is applied once here instead of per block. */
        vs3 = _mm512_slli_epi32(vs3, 6);
        vs2 = _mm512_add_epi32(vs2, vs3);
        vs2 = _mm512_add_epi32(vs2, vs2_1);

        adler0 = partial_hsum(vs1) % BASE;
        adler1 = _mm512_reduce_add_epu32(vs2) % BASE;
    }

    adler = adler0 | (adler1 << 16);

    /* Process tail (len < 64). */
    if (len) {
        goto rem_peel;
    }

    return adler;
}
|
||||
|
||||
/* Adler-32 over src[0..len) while also copying those bytes to dst.
 * This variant works on 256-bit vectors (32-byte blocks) with the VNNI
 * dot-product instruction; tails shorter than 32 bytes are copied with a
 * masked store and checksummed by a narrower routine. */
Z_INTERNAL uint32_t adler32_fold_copy_avx512_vnni(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
    uint32_t adler0, adler1;   /* low / high 16-bit halves of the checksum */

    if (src == NULL)
        return 1L;
    if (len == 0)
        return adler;

    adler0 = adler & 0xffff;
    adler1 = (adler >> 16) & 0xffff;

    if (len >= 32) {
        /* _mm256_set_epi8 lists bytes from highest to lowest, so the first
         * byte of each 32-byte block is weighted 32 and the last one 1. */
        const __m256i weights = _mm256_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
                                                20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32);
        const __m256i zero = _mm256_setzero_si256();

        do {
            __m256i vs1 = _mm256_zextsi128_si256(_mm_cvtsi32_si128(adler0));
            __m256i vs2 = _mm256_zextsi128_si256(_mm_cvtsi32_si128(adler1));

            /* At most NMAX bytes, rounded down to whole 32-byte blocks, per reduction. */
            size_t k = MIN(len, NMAX);
            k -= k % 32;
            len -= k;

            __m256i vs1_prev = vs1;                    /* vs1 before the current block */
            __m256i vs1_hist = _mm256_setzero_si256(); /* sum of the "before" values */
            /* Second weighted-sum accumulator, used by the unrolled loop for extra ILP. */
            __m256i vs2_alt = _mm256_setzero_si256();

            /* Peel one block so that the unrolled loop sees a multiple of 64 bytes. */
            if (k % 64) {
                const __m256i blk = _mm256_loadu_si256((__m256i*)src);
                _mm256_storeu_si256((__m256i*)dst, blk);
                dst += 32;
                src += 32;
                k -= 32;

                vs1 = _mm256_add_epi32(vs1, _mm256_sad_epu8(blk, zero));
                vs1_hist = _mm256_add_epi32(vs1_hist, vs1_prev);
                vs2 = _mm256_dpbusd_epi32(vs2, blk, weights);
                vs1_prev = vs1;
            }

            /* Two 32-byte blocks per iteration. */
            while (k >= 64) {
                /*
                   vs1 = adler + sum(c[i])
                   vs2 = sum2 + 32 vs1 + sum( (32-i+1) c[i] )
                */
                /* Both loads are issued before either store, as in the original. */
                const __m256i blk0 = _mm256_loadu_si256((__m256i*)src);
                const __m256i blk1 = _mm256_loadu_si256((__m256i*)(src + 32));
                _mm256_storeu_si256((__m256i*)dst, blk0);
                _mm256_storeu_si256((__m256i*)(dst + 32), blk1);
                dst += 64;
                src += 64;
                k -= 64;

                /* First block. dpbusd fuses the multiply and the widening add,
                 * producing 8 ints directly. */
                vs1 = _mm256_add_epi32(vs1, _mm256_sad_epu8(blk0, zero));
                vs1_hist = _mm256_add_epi32(vs1_hist, vs1_prev);
                vs2 = _mm256_dpbusd_epi32(vs2, blk0, weights);

                /* Second block: its "before" value is the vs1 just updated. */
                vs1_hist = _mm256_add_epi32(vs1_hist, vs1);
                vs1 = _mm256_add_epi32(vs1, _mm256_sad_epu8(blk1, zero));
                vs2_alt = _mm256_dpbusd_epi32(vs2_alt, blk1, weights);
                vs1_prev = vs1;
            }

            /* Apply the deferred "32 * vs1" term and merge both accumulators. */
            vs2 = _mm256_add_epi32(vs2, _mm256_slli_epi32(vs1_hist, 5));
            vs2 = _mm256_add_epi32(vs2, vs2_alt);

            adler0 = partial_hsum256(vs1) % BASE;
            adler1 = hsum256(vs2) % BASE;
        } while (len >= 32);

        adler = adler0 | (adler1 << 16);
        if (len == 0)
            return adler;
    }

    /* Tail: 1..31 bytes remain. Copy them with a masked load/store, then
     * compute their checksum with the narrower routine. */
    const __mmask32 tail_mask = (0xFFFFFFFFUL >> (32 - len));
    const __m256i tail = _mm256_maskz_loadu_epi8(tail_mask, src);
    _mm256_mask_storeu_epi8(dst, tail_mask, tail);

#if defined(X86_SSSE3)
    return adler32_ssse3(adler, src, len);
#else
    return adler32_len_16(adler0, src, len, adler1);
#endif
}
|
||||
|
||||
#endif
|
||||
121
deps/zlib-ng/arch/x86/adler32_sse42.c
vendored
Normal file
121
deps/zlib-ng/arch/x86/adler32_sse42.c
vendored
Normal file
@@ -0,0 +1,121 @@
|
||||
/* adler32_sse42.c -- compute the Adler-32 checksum of a data stream
|
||||
* Copyright (C) 1995-2011 Mark Adler
|
||||
* Authors:
|
||||
* Adam Stylinski <kungfujesus06@gmail.com>
|
||||
* Brian Bockelman <bockelman@gmail.com>
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#include "../../zbuild.h"
|
||||
#include "../../adler32_p.h"
|
||||
#include "../../adler32_fold.h"
|
||||
#include "adler32_ssse3_p.h"
|
||||
#include <immintrin.h>
|
||||
|
||||
#ifdef X86_SSE42
|
||||
|
||||
/* Adler-32 over src[0..len) while also copying those bytes to dst, using
 * 128-bit vectors. Fewer than 16 bytes (initially or as a tail) are handled
 * by adler32_copy_len_16(). */
Z_INTERNAL uint32_t adler32_fold_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
    uint32_t adler0 = adler & 0xffff;          /* low 16-bit sum */
    uint32_t adler1 = (adler >> 16) & 0xffff;  /* high 16-bit sum */

    if (len >= 16) {
        const __m128i zero = _mm_setzero_si128();
        /* Weights for a 32-byte step: 32..17 for the first 16 bytes, 16..1 for
         * the second. The 16..1 set is also used alone for a single 16-byte block. */
        const __m128i w_hi = _mm_setr_epi8(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17);
        const __m128i w_lo = _mm_setr_epi8(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
        const __m128i ones = _mm_set1_epi16(1);

        do {
            __m128i vs1, vs2, vs2_alt, vs1_hist, vs1_prev;

            /* At most NMAX bytes, rounded down to whole 16-byte blocks, per reduction. */
            size_t k = MIN(len, NMAX);
            k -= k % 16;
            len -= k;

            vs1 = _mm_cvtsi32_si128(adler0);
            vs2 = _mm_cvtsi32_si128(adler1);
            vs2_alt = _mm_setzero_si128();
            vs1_hist = _mm_setzero_si128();
            vs1_prev = vs1;

            /* 32 bytes per iteration. */
            while (k >= 32) {
                /*
                   vs1 = adler + sum(c[i])
                   vs2 = sum2 + 32 vs1 + sum( (32-i+1) c[i] )
                */
                const __m128i blk0 = _mm_loadu_si128((__m128i*)src);
                const __m128i blk1 = _mm_loadu_si128((__m128i*)(src + 16));
                src += 32;
                k -= 32;

                _mm_storeu_si128((__m128i*)dst, blk0);
                _mm_storeu_si128((__m128i*)(dst + 16), blk1);
                dst += 32;

                /* Record vs1 as it was before this 32-byte step. */
                vs1_hist = _mm_add_epi32(vs1_prev, vs1_hist);

                vs1 = _mm_add_epi32(_mm_sad_epu8(blk0, zero), vs1);
                vs1 = _mm_add_epi32(_mm_sad_epu8(blk1, zero), vs1);

                vs2 = _mm_add_epi32(_mm_madd_epi16(_mm_maddubs_epi16(blk0, w_hi), ones), vs2);
                vs2_alt = _mm_add_epi32(_mm_madd_epi16(_mm_maddubs_epi16(blk1, w_lo), ones), vs2_alt);
                vs1_prev = vs1;
            }

            /* Merge the accumulators and apply the deferred "32 * vs1" term. */
            vs2 = _mm_add_epi32(vs2_alt, vs2);
            vs2 = _mm_add_epi32(_mm_slli_epi32(vs1_hist, 5), vs2);
            vs1_hist = _mm_setzero_si128();

            /* At most one remaining 16-byte block. */
            while (k >= 16) {
                /*
                   vs1 = adler + sum(c[i])
                   vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] )
                */
                const __m128i blk = _mm_loadu_si128((__m128i*)src);
                src += 16;
                k -= 16;

                vs1_hist = _mm_add_epi32(vs1_prev, vs1_hist);
                vs1 = _mm_add_epi32(_mm_sad_epu8(blk, zero), vs1);
                vs2 = _mm_add_epi32(_mm_madd_epi16(_mm_maddubs_epi16(blk, w_lo), ones), vs2);
                vs1_prev = vs1;

                _mm_storeu_si128((__m128i*)dst, blk);
                dst += 16;
            }

            /* Deferred "16 * vs1" term for the single-block loop. */
            vs2 = _mm_add_epi32(vs2, _mm_slli_epi32(vs1_hist, 4));

            adler0 = partial_hsum(vs1) % BASE;
            adler1 = hsum(vs2) % BASE;
        } while (len >= 16);

        if (len == 0)
            return adler0 | (adler1 << 16);
    }

    /* Fewer than 16 bytes remain. */
    return adler32_copy_len_16(adler0, src, dst, len, adler1);
}
|
||||
|
||||
#endif
|
||||
156
deps/zlib-ng/arch/x86/adler32_ssse3.c
vendored
Normal file
156
deps/zlib-ng/arch/x86/adler32_ssse3.c
vendored
Normal file
@@ -0,0 +1,156 @@
|
||||
/* adler32_ssse3.c -- compute the Adler-32 checksum of a data stream
|
||||
* Copyright (C) 1995-2011 Mark Adler
|
||||
* Authors:
|
||||
* Adam Stylinski <kungfujesus06@gmail.com>
|
||||
* Brian Bockelman <bockelman@gmail.com>
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#include "../../zbuild.h"
|
||||
#include "../../adler32_p.h"
|
||||
#include "adler32_ssse3_p.h"
|
||||
|
||||
#ifdef X86_SSSE3
|
||||
|
||||
#include <immintrin.h>
|
||||
|
||||
/* Adler-32 over buf[0..len), continuing from the running checksum `adler`,
 * using 128-bit vectors with aligned loads in the main loops. */
Z_INTERNAL uint32_t adler32_ssse3(uint32_t adler, const uint8_t *buf, size_t len) {
    /* Split the packed checksum into its two 16-bit component sums. */
    uint32_t sum2 = (adler >> 16) & 0xffff;
    adler &= 0xffff;

    /* Single-byte updates get a dedicated fast path. */
    if (UNLIKELY(len == 1))
        return adler32_len_1(adler, buf, sum2);

    /* NULL buffer yields the initial Adler-32 value (checked after the
     * len == 1 path, to keep that path fast). */
    if (UNLIKELY(buf == NULL))
        return 1L;

    /* Short inputs go straight to the scalar helper. */
    if (UNLIKELY(len < 16))
        return adler32_len_16(adler, buf, len, sum2);

    /* Weights for a 32-byte step (32..17, then 16..1); the 16..1 set is also
     * used on its own for a single 16-byte block. */
    const __m128i w_hi = _mm_setr_epi8(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17);
    const __m128i w_lo = _mm_setr_epi8(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
    const __m128i ones = _mm_set1_epi16(1);
    const __m128i zero = _mm_setzero_si128();

    /* All vector temporaries live at function scope because the unaligned
     * shortcut below jumps into the middle of the single-block loop. */
    __m128i blk, blk_next, vs1, vs2, vs2_alt, vs1_prev, vs1_hist,
            sad_a, sad_b, prod_a, prod_b, wsum_a, wsum_b;

    /* Decide how to deal with an unaligned buffer: either do scalar sums up
     * to the next 16-byte boundary, or, if that would leave fewer than 16
     * bytes for the vector loop, accept one unaligned load instead. */
    size_t budget = NMAX;                          /* bytes allowed before the next modulo */
    const size_t misalign = (uintptr_t)buf & 15;
    const size_t to_align = 16 - misalign;
    size_t k = 0;

    if (misalign) {
        if (len < 16 + to_align) {
            /* One unaligned 16-byte load beats skipping vectorization: it is
             * better than 16 + up to 15 scalar sums. */
            blk = _mm_loadu_si128((__m128i*)buf);
            len -= 16;
            buf += 16;
            vs1 = _mm_cvtsi32_si128(adler);
            vs2 = _mm_cvtsi32_si128(sum2);
            vs1_hist = _mm_setzero_si128();
            vs1_prev = vs1;
            goto unaligned_jmp;
        }

        /* Scalar sums up to the alignment boundary. */
        for (size_t n = 0; n < to_align; ++n) {
            adler += *(buf++);
            sum2 += adler;
        }

        /* Those scalar sums count against the first batch's budget. */
        len -= to_align;
        budget -= to_align;
    }

    while (len >= 16) {
        vs1 = _mm_cvtsi32_si128(adler);
        vs2 = _mm_cvtsi32_si128(sum2);
        vs1_hist = _mm_setzero_si128();
        vs2_alt = _mm_setzero_si128();
        vs1_prev = vs1;

        /* Whole 16-byte blocks only, limited by the remaining budget. */
        k = (len < budget ? len : budget);
        k -= k % 16;
        len -= k;

        /* 32 bytes per iteration. */
        while (k >= 32) {
            /*
               vs1 = adler + sum(c[i])
               vs2 = sum2 + 32 vs1 + sum( (32-i+1) c[i] )
            */
            blk = _mm_load_si128((__m128i*)buf);
            blk_next = _mm_load_si128((__m128i*)(buf + 16));
            buf += 32;
            k -= 32;

            sad_a = _mm_sad_epu8(blk, zero);
            sad_b = _mm_sad_epu8(blk_next, zero);
            vs1 = _mm_add_epi32(sad_a, vs1);
            vs1_hist = _mm_add_epi32(vs1_prev, vs1_hist);

            vs1 = _mm_add_epi32(sad_b, vs1);
            prod_a = _mm_maddubs_epi16(blk, w_hi);
            wsum_a = _mm_madd_epi16(prod_a, ones);
            prod_b = _mm_maddubs_epi16(blk_next, w_lo);
            vs2 = _mm_add_epi32(wsum_a, vs2);
            wsum_b = _mm_madd_epi16(prod_b, ones);
            vs2_alt = _mm_add_epi32(wsum_b, vs2_alt);
            vs1_prev = vs1;
        }

        /* Merge the accumulators and apply the deferred "32 * vs1" term. */
        vs2 = _mm_add_epi32(vs2_alt, vs2);
        vs1_hist = _mm_slli_epi32(vs1_hist, 5);
        vs2 = _mm_add_epi32(vs1_hist, vs2);
        vs1_hist = _mm_setzero_si128();

        while (k >= 16) {
            /*
               vs1 = adler + sum(c[i])
               vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] )
            */
            blk = _mm_load_si128((__m128i*)buf);
            buf += 16;
            k -= 16;

unaligned_jmp:
            /* Entry point for the unaligned shortcut: blk, vs1, vs2, vs1_hist
             * and vs1_prev were set up before the jump, and k is 0. */
            sad_a = _mm_sad_epu8(blk, zero);
            vs1 = _mm_add_epi32(sad_a, vs1);
            vs1_hist = _mm_add_epi32(vs1_prev, vs1_hist);
            prod_a = _mm_maddubs_epi16(blk, w_lo);
            wsum_a = _mm_madd_epi16(prod_a, ones);
            vs2 = _mm_add_epi32(wsum_a, vs2);
            vs1_prev = vs1;
        }

        /* Deferred "16 * vs1" term for the single-block loop. */
        vs1_hist = _mm_slli_epi32(vs1_hist, 4);
        vs2 = _mm_add_epi32(vs2, vs1_hist);

        /* vs1 needs only a partial horizontal sum: psadbw already leaves its
         * results in lanes 0 and 2 only, which saves shuffle-port pressure. */
        adler = partial_hsum(vs1) % BASE;
        sum2 = hsum(vs2) % BASE;
        budget = NMAX;
    }

    /* Process tail (len < 16). */
    return adler32_len_16(adler, buf, len, sum2);
}
|
||||
|
||||
#endif
|
||||
29
deps/zlib-ng/arch/x86/adler32_ssse3_p.h
vendored
Normal file
29
deps/zlib-ng/arch/x86/adler32_ssse3_p.h
vendored
Normal file
@@ -0,0 +1,29 @@
|
||||
/* adler32_ssse3_p.h -- adler32 ssse3 utility functions
|
||||
* Copyright (C) 2022 Adam Stylinski
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#ifndef ADLER32_SSSE3_P_H_
|
||||
#define ADLER32_SSSE3_P_H_
|
||||
|
||||
#ifdef X86_SSSE3
|
||||
|
||||
#include <immintrin.h>
|
||||
#include <stdint.h>
|
||||
|
||||
/* Return the sum of 32-bit lanes 0 and 2 of x (wrapping modulo 2^32).
 * Lanes 1 and 3 take part in the vector add but are never extracted. */
static inline uint32_t partial_hsum(__m128i x) {
    /* A byte shift right by 8 moves lane 2 down into lane 0. */
    const __m128i upper_half = _mm_srli_si128(x, 8);
    return _mm_cvtsi128_si32(_mm_add_epi32(x, upper_half));
}
|
||||
|
||||
/* Horizontal sum: add all four 32-bit lanes of x (wrapping modulo 2^32) and return the total. */
static inline uint32_t hsum(__m128i x) {
    /* Fold the upper 64 bits onto the lower: lane 0 = x0 + x2, lane 1 = x1 + x3. */
    const __m128i pairwise = _mm_add_epi32(x, _mm_unpackhi_epi64(x, x));
    /* Shuffle control 0x01 places lane 1 in lane 0, so lane 0 of the add is the full total. */
    const __m128i total = _mm_add_epi32(pairwise, _mm_shuffle_epi32(pairwise, 0x01));
    return _mm_cvtsi128_si32(total);
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
135
deps/zlib-ng/arch/x86/chunkset_avx2.c
vendored
Normal file
135
deps/zlib-ng/arch/x86/chunkset_avx2.c
vendored
Normal file
@@ -0,0 +1,135 @@
|
||||
/* chunkset_avx2.c -- AVX2 inline functions to copy small data chunks.
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
#include "zbuild.h"
|
||||
|
||||
#ifdef X86_AVX2
|
||||
#include <immintrin.h>
|
||||
#include "../generic/chunk_permute_table.h"
|
||||
|
||||
typedef __m256i chunk_t;
|
||||
|
||||
#define CHUNK_SIZE 32
|
||||
|
||||
#define HAVE_CHUNKMEMSET_2
|
||||
#define HAVE_CHUNKMEMSET_4
|
||||
#define HAVE_CHUNKMEMSET_8
|
||||
#define HAVE_CHUNK_MAG
|
||||
|
||||
/* Populate don't cares so that this is a direct lookup (with some indirection into the permute table), because dist can
|
||||
* never be 0 - 2, we'll start with an offset, subtracting 3 from the input.
*
* Each entry is {idx, remval}: GET_CHUNK_MAG reads perm_idx_lut[dist - 3], uses idx as an offset into
* permute_table and stores remval into *chunk_rem. The "don't care" slots sit at dist 4, 8 and 16; dist 16 is
* handled by a dedicated branch in GET_CHUNK_MAG (4 and 8 are presumably served by chunkmemset_4/8 -- confirm
* against chunkset_tpl.h). For dist 17..31 the idx advances in 16-byte steps and remval equals 32 - dist. */
|
||||
static const lut_rem_pair perm_idx_lut[29] = {
|
||||
{ 0, 2}, /* 3 */
|
||||
{ 0, 0}, /* don't care */
|
||||
{ 1 * 32, 2}, /* 5 */
|
||||
{ 2 * 32, 2}, /* 6 */
|
||||
{ 3 * 32, 4}, /* 7 */
|
||||
{ 0 * 32, 0}, /* don't care */
|
||||
{ 4 * 32, 5}, /* 9 */
|
||||
{ 5 * 32, 22}, /* 10 */
|
||||
{ 6 * 32, 21}, /* 11 */
|
||||
{ 7 * 32, 20}, /* 12 */
|
||||
{ 8 * 32, 6}, /* 13 */
|
||||
{ 9 * 32, 4}, /* 14 */
|
||||
{10 * 32, 2}, /* 15 */
|
||||
{ 0 * 32, 0}, /* don't care */
|
||||
{11 * 32, 15}, /* 17 */
|
||||
{11 * 32 + 16, 14}, /* 18 */
|
||||
{11 * 32 + 16 * 2, 13}, /* 19 */
|
||||
{11 * 32 + 16 * 3, 12}, /* 20 */
|
||||
{11 * 32 + 16 * 4, 11}, /* 21 */
|
||||
{11 * 32 + 16 * 5, 10}, /* 22 */
|
||||
{11 * 32 + 16 * 6, 9}, /* 23 */
|
||||
{11 * 32 + 16 * 7, 8}, /* 24 */
|
||||
{11 * 32 + 16 * 8, 7}, /* 25 */
|
||||
{11 * 32 + 16 * 9, 6}, /* 26 */
|
||||
{11 * 32 + 16 * 10, 5}, /* 27 */
|
||||
{11 * 32 + 16 * 11, 4}, /* 28 */
|
||||
{11 * 32 + 16 * 12, 3}, /* 29 */
|
||||
{11 * 32 + 16 * 13, 2}, /* 30 */
|
||||
{11 * 32 + 16 * 14, 1} /* 31 */
|
||||
};
|
||||
|
||||
/* Fill *chunk with the 2-byte pattern at `from`, repeated across the whole 256-bit vector. */
static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
    int16_t pattern;

    /* memcpy has no alignment requirement, so `from` may point anywhere. */
    memcpy(&pattern, from, sizeof(pattern));
    *chunk = _mm256_set1_epi16(pattern);
}
|
||||
|
||||
/* Fill *chunk with the 4-byte pattern at `from`, repeated across the whole 256-bit vector. */
static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
    int32_t pattern;

    /* memcpy has no alignment requirement, so `from` may point anywhere. */
    memcpy(&pattern, from, sizeof(pattern));
    *chunk = _mm256_set1_epi32(pattern);
}
|
||||
|
||||
/* Fill *chunk with the 8-byte pattern at `from`, repeated across the whole 256-bit vector. */
static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
    int64_t pattern;

    /* memcpy has no alignment requirement, so `from` may point anywhere. */
    memcpy(&pattern, from, sizeof(pattern));
    *chunk = _mm256_set1_epi64x(pattern);
}
|
||||
|
||||
/* Read 32 bytes starting at `s` into *chunk using an unaligned load. */
static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
    const __m256i loaded = _mm256_loadu_si256((__m256i *)s);

    *chunk = loaded;
}
|
||||
|
||||
/* Write the 32 bytes held in *chunk to `out` using an unaligned store. */
static inline void storechunk(uint8_t *out, chunk_t *chunk) {
    const __m256i value = *chunk;

    _mm256_storeu_si256((__m256i *)out, value);
}
|
||||
|
||||
/* Build a 256-bit chunk from the bytes at `buf` for a given `dist`, and report a remainder through *chunk_rem.
 *
 * buf       - source bytes; 16 bytes are read when dist <= 16, 32 bytes otherwise.
 * chunk_rem - receives the remval field of the perm_idx_lut entry for this dist.
 * dist      - used as perm_idx_lut[dist - 3], so it must lie in 3..31 (the table has 29 entries).
 *
 * Returns the assembled vector. NOTE(review): presumably the result is the first `dist` bytes repeated to fill the
 * chunk -- that depends on the contents of permute_table, which is not visible here; confirm. */
static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t dist) {
|
||||
lut_rem_pair lut_rem = perm_idx_lut[dist - 3];
|
||||
__m256i ret_vec;
|
||||
/* While technically we only need to read 4 or 8 bytes into this vector register for a lot of cases, GCC is
|
||||
* compiling this to a shared load for all branches, preferring the simpler code. Given that the buf value isn't in
|
||||
* GPRs to begin with the 256 bit load is _probably_ just as inexpensive */
|
||||
*chunk_rem = lut_rem.remval;
|
||||
|
||||
#ifdef Z_MEMORY_SANITIZER
|
||||
/* See note in chunkset_ssse3.c for why this is ok */
|
||||
__msan_unpoison(buf + dist, 32 - dist);
|
||||
#endif
|
||||
|
||||
if (dist < 16) {
|
||||
/* This simpler case still requires us to shuffle in 128 bit lanes, so we must apply a static offset after
|
||||
* broadcasting the first vector register to both halves. This is _marginally_ faster than doing two separate
|
||||
* shuffles and combining the halves later */
|
||||
const __m256i permute_xform =
|
||||
_mm256_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16);
|
||||
__m256i perm_vec = _mm256_load_si256((__m256i*)(permute_table+lut_rem.idx));
|
||||
__m128i ret_vec0 = _mm_loadu_si128((__m128i*)buf);
|
||||
perm_vec = _mm256_add_epi8(perm_vec, permute_xform);
|
||||
ret_vec = _mm256_inserti128_si256(_mm256_castsi128_si256(ret_vec0), ret_vec0, 1);
|
||||
ret_vec = _mm256_shuffle_epi8(ret_vec, perm_vec);
|
||||
} else if (dist == 16) {
|
||||
/* Exactly one 128-bit lane of data: duplicate it into both halves, no shuffle or table lookup needed */
__m128i ret_vec0 = _mm_loadu_si128((__m128i*)buf);
|
||||
return _mm256_inserti128_si256(_mm256_castsi128_si256(ret_vec0), ret_vec0, 1);
|
||||
} else {
|
||||
__m128i ret_vec0 = _mm_loadu_si128((__m128i*)buf);
|
||||
__m128i ret_vec1 = _mm_loadu_si128((__m128i*)(buf + 16));
|
||||
/* Take advantage of the fact that only the latter half of the 256 bit vector will actually differ */
|
||||
__m128i perm_vec1 = _mm_load_si128((__m128i*)(permute_table + lut_rem.idx));
|
||||
/* Mask is all-ones in every byte whose permute index is below 16 (signed compare) */
__m128i xlane_permutes = _mm_cmpgt_epi8(_mm_set1_epi8(16), perm_vec1);
|
||||
__m128i xlane_res = _mm_shuffle_epi8(ret_vec0, perm_vec1);
|
||||
/* Since we can't wrap twice, we can simply keep the later half exactly how it is instead of having to _also_
|
||||
* shuffle those values */
|
||||
/* Masked bytes take the shuffled value from the first 16 bytes; the rest keep the bytes loaded from buf + 16 */
__m128i latter_half = _mm_blendv_epi8(ret_vec1, xlane_res, xlane_permutes);
|
||||
ret_vec = _mm256_inserti128_si256(_mm256_castsi128_si256(ret_vec0), latter_half, 1);
|
||||
}
|
||||
|
||||
return ret_vec;
|
||||
}
|
||||
|
||||
#define CHUNKSIZE chunksize_avx2
|
||||
#define CHUNKCOPY chunkcopy_avx2
|
||||
#define CHUNKUNROLL chunkunroll_avx2
|
||||
#define CHUNKMEMSET chunkmemset_avx2
|
||||
#define CHUNKMEMSET_SAFE chunkmemset_safe_avx2
|
||||
|
||||
#include "chunkset_tpl.h"
|
||||
|
||||
#define INFLATE_FAST inflate_fast_avx2
|
||||
|
||||
#include "inffast_tpl.h"
|
||||
|
||||
#endif
|
||||
56
deps/zlib-ng/arch/x86/chunkset_sse2.c
vendored
Normal file
56
deps/zlib-ng/arch/x86/chunkset_sse2.c
vendored
Normal file
@@ -0,0 +1,56 @@
|
||||
/* chunkset_sse2.c -- SSE2 inline functions to copy small data chunks.
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#include "zbuild.h"
|
||||
|
||||
#ifdef X86_SSE2
|
||||
#include <immintrin.h>
|
||||
|
||||
typedef __m128i chunk_t;
|
||||
|
||||
#define CHUNK_SIZE 16
|
||||
|
||||
#define HAVE_CHUNKMEMSET_2
|
||||
#define HAVE_CHUNKMEMSET_4
|
||||
#define HAVE_CHUNKMEMSET_8
|
||||
|
||||
/* Fill *chunk with the 2-byte pattern at `from`, repeated across the whole 128-bit vector. */
static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
    int16_t pattern;

    /* memcpy has no alignment requirement, so `from` may point anywhere. */
    memcpy(&pattern, from, sizeof(pattern));
    *chunk = _mm_set1_epi16(pattern);
}
|
||||
|
||||
/* Fill *chunk with the 4-byte pattern at `from`, repeated across the whole 128-bit vector. */
static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
    int32_t pattern;

    /* memcpy has no alignment requirement, so `from` may point anywhere. */
    memcpy(&pattern, from, sizeof(pattern));
    *chunk = _mm_set1_epi32(pattern);
}
|
||||
|
||||
/* Fill *chunk with the 8-byte pattern at `from`, repeated across the whole 128-bit vector. */
static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
    int64_t pattern;

    /* memcpy has no alignment requirement, so `from` may point anywhere. */
    memcpy(&pattern, from, sizeof(pattern));
    *chunk = _mm_set1_epi64x(pattern);
}
|
||||
|
||||
/* Read 16 bytes starting at `s` into *chunk using an unaligned load. */
static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
    const __m128i loaded = _mm_loadu_si128((__m128i *)s);

    *chunk = loaded;
}
|
||||
|
||||
/* Write the 16 bytes held in *chunk to `out` using an unaligned store. */
static inline void storechunk(uint8_t *out, chunk_t *chunk) {
    const __m128i value = *chunk;

    _mm_storeu_si128((__m128i *)out, value);
}
|
||||
|
||||
#define CHUNKSIZE chunksize_sse2
|
||||
#define CHUNKCOPY chunkcopy_sse2
|
||||
#define CHUNKUNROLL chunkunroll_sse2
|
||||
#define CHUNKMEMSET chunkmemset_sse2
|
||||
#define CHUNKMEMSET_SAFE chunkmemset_safe_sse2
|
||||
|
||||
#include "chunkset_tpl.h"
|
||||
|
||||
#define INFLATE_FAST inflate_fast_sse2
|
||||
|
||||
#include "inffast_tpl.h"
|
||||
|
||||
#endif
|
||||
103
deps/zlib-ng/arch/x86/chunkset_ssse3.c
vendored
Normal file
103
deps/zlib-ng/arch/x86/chunkset_ssse3.c
vendored
Normal file
@@ -0,0 +1,103 @@
|
||||
/* chunkset_ssse3.c -- SSSE3 inline functions to copy small data chunks.
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#include "zbuild.h"
|
||||
|
||||
/* This requires SSE2 support. While it's implicit with SSSE3, we can minimize
|
||||
* code size by sharing the chunkcopy functions, which will certainly compile
|
||||
* to identical machine code */
|
||||
#if defined(X86_SSSE3) && defined(X86_SSE2)
|
||||
#include <immintrin.h>
|
||||
#include "../generic/chunk_permute_table.h"
|
||||
|
||||
typedef __m128i chunk_t;
|
||||
|
||||
#define CHUNK_SIZE 16
|
||||
|
||||
#define HAVE_CHUNKMEMSET_2
|
||||
#define HAVE_CHUNKMEMSET_4
|
||||
#define HAVE_CHUNKMEMSET_8
|
||||
#define HAVE_CHUNK_MAG
|
||||
#define HAVE_CHUNKCOPY
|
||||
#define HAVE_CHUNKUNROLL
|
||||
|
||||
/* Lookup table for GET_CHUNK_MAG, indexed by dist - 3 (dist 3..15). Each entry is {idx, remval}: idx is an offset
 * into permute_table and remval is the value stored into *chunk_rem. For every populated entry remval equals
 * 16 % dist. The "don't care" slots sit at dist 4 and 8 (presumably served by chunkmemset_4/8 -- confirm against
 * chunkset_tpl.h). */
static const lut_rem_pair perm_idx_lut[13] = {
|
||||
{0, 1}, /* 3 */
|
||||
{0, 0}, /* don't care */
|
||||
{1 * 32, 1}, /* 5 */
|
||||
{2 * 32, 4}, /* 6 */
|
||||
{3 * 32, 2}, /* 7 */
|
||||
{0 * 32, 0}, /* don't care */
|
||||
{4 * 32, 7}, /* 9 */
|
||||
{5 * 32, 6}, /* 10 */
|
||||
{6 * 32, 5}, /* 11 */
|
||||
{7 * 32, 4}, /* 12 */
|
||||
{8 * 32, 3}, /* 13 */
|
||||
{9 * 32, 2}, /* 14 */
|
||||
{10 * 32, 1},/* 15 */
|
||||
};
|
||||
|
||||
|
||||
/* Fill *chunk with the 2-byte pattern at `from`, repeated across the whole 128-bit vector. */
static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
    int16_t pattern;

    /* memcpy has no alignment requirement, so `from` may point anywhere. */
    memcpy(&pattern, from, sizeof(pattern));
    *chunk = _mm_set1_epi16(pattern);
}
|
||||
|
||||
/* Fill *chunk with the 4-byte pattern at `from`, repeated across the whole 128-bit vector. */
static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
    int32_t pattern;

    /* memcpy has no alignment requirement, so `from` may point anywhere. */
    memcpy(&pattern, from, sizeof(pattern));
    *chunk = _mm_set1_epi32(pattern);
}
|
||||
|
||||
/* Fill *chunk with the 8-byte pattern at `from`, repeated across the whole 128-bit vector. */
static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
    int64_t pattern;

    /* memcpy has no alignment requirement, so `from` may point anywhere. */
    memcpy(&pattern, from, sizeof(pattern));
    *chunk = _mm_set1_epi64x(pattern);
}
|
||||
|
||||
/* Read 16 bytes starting at `s` into *chunk using an unaligned load. */
static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
    const __m128i loaded = _mm_loadu_si128((__m128i *)s);

    *chunk = loaded;
}
|
||||
|
||||
/* Write the 16 bytes held in *chunk to `out` using an unaligned store. */
static inline void storechunk(uint8_t *out, chunk_t *chunk) {
    const __m128i value = *chunk;

    _mm_storeu_si128((__m128i *)out, value);
}
|
||||
|
||||
static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t dist) {
|
||||
lut_rem_pair lut_rem = perm_idx_lut[dist - 3];
|
||||
__m128i perm_vec, ret_vec;
|
||||
#ifdef Z_MEMORY_SANITIZER
|
||||
/* Important to note:
|
||||
* This is _not_ to subvert the memory sanitizer but to instead unpoison some
|
||||
* bytes we willingly and purposefully load uninitialized that we swizzle over
|
||||
* in a vector register, anyway. If what we assume is wrong about what is used,
|
||||
* the memory sanitizer will still usefully flag it */
|
||||
__msan_unpoison(buf + dist, 16 - dist);
|
||||
#endif
|
||||
ret_vec = _mm_loadu_si128((__m128i*)buf);
|
||||
*chunk_rem = lut_rem.remval;
|
||||
|
||||
perm_vec = _mm_load_si128((__m128i*)(permute_table + lut_rem.idx));
|
||||
ret_vec = _mm_shuffle_epi8(ret_vec, perm_vec);
|
||||
|
||||
return ret_vec;
|
||||
}
|
||||
|
||||
extern uint8_t* chunkcopy_sse2(uint8_t *out, uint8_t const *from, unsigned len);
|
||||
extern uint8_t* chunkunroll_sse2(uint8_t *out, unsigned *dist, unsigned *len);
|
||||
|
||||
#define CHUNKSIZE chunksize_ssse3
|
||||
#define CHUNKMEMSET chunkmemset_ssse3
|
||||
#define CHUNKMEMSET_SAFE chunkmemset_safe_ssse3
|
||||
#define CHUNKCOPY chunkcopy_sse2
|
||||
#define CHUNKUNROLL chunkunroll_sse2
|
||||
|
||||
#include "chunkset_tpl.h"
|
||||
|
||||
#define INFLATE_FAST inflate_fast_ssse3
|
||||
|
||||
#include "inffast_tpl.h"
|
||||
|
||||
#endif
|
||||
63
deps/zlib-ng/arch/x86/compare256_avx2.c
vendored
Normal file
63
deps/zlib-ng/arch/x86/compare256_avx2.c
vendored
Normal file
@@ -0,0 +1,63 @@
|
||||
/* compare256_avx2.c -- AVX2 version of compare256
|
||||
* Copyright Mika T. Lindqvist <postmaster@raasu.org>
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#include "../../zbuild.h"
|
||||
|
||||
#include "fallback_builtins.h"
|
||||
|
||||
#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
|
||||
|
||||
#include <immintrin.h>
|
||||
#ifdef _MSC_VER
|
||||
# include <nmmintrin.h>
|
||||
#endif
|
||||
|
||||
static inline uint32_t compare256_avx2_static(const uint8_t *src0, const uint8_t *src1) {
|
||||
uint32_t len = 0;
|
||||
|
||||
do {
|
||||
__m256i ymm_src0, ymm_src1, ymm_cmp;
|
||||
ymm_src0 = _mm256_loadu_si256((__m256i*)src0);
|
||||
ymm_src1 = _mm256_loadu_si256((__m256i*)src1);
|
||||
ymm_cmp = _mm256_cmpeq_epi8(ymm_src0, ymm_src1); /* non-identical bytes = 00, identical bytes = FF */
|
||||
unsigned mask = (unsigned)_mm256_movemask_epi8(ymm_cmp);
|
||||
if (mask != 0xFFFFFFFF) {
|
||||
uint32_t match_byte = (uint32_t)__builtin_ctz(~mask); /* Invert bits so identical = 0 */
|
||||
return len + match_byte;
|
||||
}
|
||||
|
||||
src0 += 32, src1 += 32, len += 32;
|
||||
|
||||
ymm_src0 = _mm256_loadu_si256((__m256i*)src0);
|
||||
ymm_src1 = _mm256_loadu_si256((__m256i*)src1);
|
||||
ymm_cmp = _mm256_cmpeq_epi8(ymm_src0, ymm_src1);
|
||||
mask = (unsigned)_mm256_movemask_epi8(ymm_cmp);
|
||||
if (mask != 0xFFFFFFFF) {
|
||||
uint32_t match_byte = (uint32_t)__builtin_ctz(~mask);
|
||||
return len + match_byte;
|
||||
}
|
||||
|
||||
src0 += 32, src1 += 32, len += 32;
|
||||
} while (len < 256);
|
||||
|
||||
return 256;
|
||||
}
|
||||
|
||||
/* Non-static entry point: forwards to the inlinable compare256_avx2_static and returns its result unchanged. */
Z_INTERNAL uint32_t compare256_avx2(const uint8_t *src0, const uint8_t *src1) {
    const uint32_t match_len = compare256_avx2_static(src0, src1);

    return match_len;
}
|
||||
|
||||
#define LONGEST_MATCH longest_match_avx2
|
||||
#define COMPARE256 compare256_avx2_static
|
||||
|
||||
#include "match_tpl.h"
|
||||
|
||||
#define LONGEST_MATCH_SLOW
|
||||
#define LONGEST_MATCH longest_match_slow_avx2
|
||||
#define COMPARE256 compare256_avx2_static
|
||||
|
||||
#include "match_tpl.h"
|
||||
|
||||
#endif
|
||||
96
deps/zlib-ng/arch/x86/compare256_sse2.c
vendored
Normal file
96
deps/zlib-ng/arch/x86/compare256_sse2.c
vendored
Normal file
@@ -0,0 +1,96 @@
|
||||
/* compare256_sse2.c -- SSE2 version of compare256
|
||||
* Copyright Adam Stylinski <kungfujesus06@gmail.com>
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#include "../../zbuild.h"
|
||||
|
||||
#include "fallback_builtins.h"
|
||||
|
||||
#if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ)
|
||||
|
||||
#include <emmintrin.h>
|
||||
|
||||
/* Compare up to 256 bytes of src0 and src1, 16 bytes at a time.
 * Returns the index of the first differing byte, or 256 when all 256 bytes are equal.
 *
 * The first comparison uses unaligned loads; the position is then advanced so that every later load from
 * src0 is 16-byte aligned (src1 stays unaligned). When src0 was not aligned to begin with, one final
 * unaligned comparison covers the last 16 bytes. */
static inline uint32_t compare256_sse2_static(const uint8_t *src0, const uint8_t *src1) {
    /* Start of the last 16-byte window of each input. */
    const uint8_t *const last0 = src0 + 256 - 16;
    const uint8_t *const last1 = src1 + 256 - 16;
    const int misalign = ((uintptr_t)src0) & 15;
    uint32_t pos = 0;
    __m128i lhs, rhs;
    unsigned equal_bits;

    /* Head: both loads unaligned. */
    lhs = _mm_loadu_si128((__m128i*)src0);
    rhs = _mm_loadu_si128((__m128i*)src1);
    equal_bits = (unsigned)_mm_movemask_epi8(_mm_cmpeq_epi8(lhs, rhs));

    /* One mask bit per byte, set where equal; inverted, the lowest set bit is the first mismatch. */
    if (equal_bits != 0xFFFF)
        return pos + (uint32_t)__builtin_ctz(~equal_bits);

    {
        /* Step forward to the next 16-byte boundary of src0 (a full 16 when already aligned). */
        const int skip = 16 - misalign;
        pos += skip;
        src0 += skip;
        src1 += skip;
    }

    /* Body: as many whole 16-byte windows as still fit (flooring division). */
    for (int remaining = (256 - pos) / 16; remaining > 0; --remaining) {
        lhs = _mm_load_si128((__m128i*)src0);
        rhs = _mm_loadu_si128((__m128i*)src1);
        equal_bits = (unsigned)_mm_movemask_epi8(_mm_cmpeq_epi8(lhs, rhs));

        if (equal_bits != 0xFFFF)
            return pos + (uint32_t)__builtin_ctz(~equal_bits);

        pos += 16;
        src0 += 16;
        src1 += 16;
    }

    /* Tail: only needed when the aligned windows stopped short of byte 256. */
    if (misalign) {
        pos = 256 - 16;
        lhs = _mm_loadu_si128((__m128i*)last0);
        rhs = _mm_loadu_si128((__m128i*)last1);
        equal_bits = (unsigned)_mm_movemask_epi8(_mm_cmpeq_epi8(lhs, rhs));

        if (equal_bits != 0xFFFF)
            return pos + (uint32_t)__builtin_ctz(~equal_bits);
    }

    return 256;
}
|
||||
|
||||
/* Non-static entry point: forwards to the inlinable compare256_sse2_static and returns its result unchanged. */
Z_INTERNAL uint32_t compare256_sse2(const uint8_t *src0, const uint8_t *src1) {
    const uint32_t match_len = compare256_sse2_static(src0, src1);

    return match_len;
}
|
||||
|
||||
#define LONGEST_MATCH longest_match_sse2
|
||||
#define COMPARE256 compare256_sse2_static
|
||||
|
||||
#include "match_tpl.h"
|
||||
|
||||
#define LONGEST_MATCH_SLOW
|
||||
#define LONGEST_MATCH longest_match_slow_sse2
|
||||
#define COMPARE256 compare256_sse2_static
|
||||
|
||||
#include "match_tpl.h"
|
||||
|
||||
#endif
|
||||
186
deps/zlib-ng/arch/x86/crc32_fold_pclmulqdq_tpl.h
vendored
Normal file
186
deps/zlib-ng/arch/x86/crc32_fold_pclmulqdq_tpl.h
vendored
Normal file
@@ -0,0 +1,186 @@
|
||||
/*
|
||||
* Compute the CRC32 using a parallelized folding approach with the PCLMULQDQ
|
||||
* instruction.
|
||||
*
|
||||
* A white paper describing this algorithm can be found at:
|
||||
* doc/crc-pclmulqdq.pdf
|
||||
*
|
||||
* Copyright (C) 2013 Intel Corporation. All rights reserved.
|
||||
* Copyright (C) 2016 Marian Beermann (support for initial value)
|
||||
* Authors:
|
||||
* Wajdi Feghali <wajdi.k.feghali@intel.com>
|
||||
* Jim Guilford <james.guilford@intel.com>
|
||||
* Vinodh Gopal <vinodh.gopal@intel.com>
|
||||
* Erdinc Ozturk <erdinc.ozturk@intel.com>
|
||||
* Jim Kukunas <james.t.kukunas@linux.intel.com>
|
||||
*
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
/* Template body shared by two functions, selected by whether COPY is defined when this file is included:
 *   CRC32_FOLD_COPY(crc, dst, src, len)  - folds src into the CRC state and also copies the bytes to dst.
 *   CRC32_FOLD(crc, src, len, init_crc)  - folds src into the CRC state; init_crc is handed to the
 *                                          XOR_INITIAL128 macro (defined outside this template; it presumably
 *                                          mixes init_crc into the first vector processed -- confirm).
 * In both variants the four-vector state is loaded from crc->fold on entry and written back before returning
 * (the COPY variant returns early, without writing back, when len == 0).
 * Flow: inputs shorter than 16 bytes jump straight to the `partial` tail; otherwise src is first advanced to a
 * 16-byte boundary, then consumed in 64-byte blocks, then one 48/32/16-byte block, then a final partial block. */
#ifdef COPY
|
||||
Z_INTERNAL void CRC32_FOLD_COPY(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len) {
|
||||
#else
|
||||
Z_INTERNAL void CRC32_FOLD(crc32_fold *crc, const uint8_t *src, size_t len, uint32_t init_crc) {
|
||||
#endif
|
||||
unsigned long algn_diff;
|
||||
__m128i xmm_t0, xmm_t1, xmm_t2, xmm_t3;
|
||||
__m128i xmm_crc0, xmm_crc1, xmm_crc2, xmm_crc3;
|
||||
__m128i xmm_crc_part = _mm_setzero_si128();
|
||||
#ifdef COPY
|
||||
char ALIGNED_(16) partial_buf[16] = { 0 };
|
||||
#else
|
||||
__m128i xmm_initial = _mm_cvtsi32_si128(init_crc);
|
||||
int32_t first = init_crc != 0;
|
||||
|
||||
/* Technically the CRC functions don't even call this for input < 64, but a bare minimum of 31
|
||||
* bytes of input is needed for the aligning load that occurs. If there's an initial CRC, to
|
||||
* carry it forward through the folded CRC there must be 16 - src % 16 + 16 bytes available, which
|
||||
* by definition can be up to 15 bytes + one full vector load. */
|
||||
assert(len >= 31 || first == 0);
|
||||
#endif
|
||||
crc32_fold_load((__m128i *)crc->fold, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
|
||||
|
||||
if (len < 16) {
|
||||
#ifdef COPY
|
||||
if (len == 0)
|
||||
return;
|
||||
|
||||
memcpy(partial_buf, src, len);
|
||||
xmm_crc_part = _mm_load_si128((const __m128i *)partial_buf);
|
||||
memcpy(dst, partial_buf, len);
|
||||
#endif
|
||||
goto partial;
|
||||
}
|
||||
|
||||
/* Number of bytes (0..15) needed to bring src up to the next 16-byte boundary */
algn_diff = ((uintptr_t)16 - ((uintptr_t)src & 0xF)) & 0xF;
|
||||
if (algn_diff) {
|
||||
xmm_crc_part = _mm_loadu_si128((__m128i *)src);
|
||||
#ifdef COPY
|
||||
_mm_storeu_si128((__m128i *)dst, xmm_crc_part);
|
||||
dst += algn_diff;
|
||||
#else
|
||||
XOR_INITIAL128(xmm_crc_part);
|
||||
|
||||
/* Fewer than 4 alignment bytes and a non-zero init_crc: fold one whole 16-byte vector first, then
 * let partial_fold below take algn_diff bytes from the following vector (see the assert comment above) */
if (algn_diff < 4 && init_crc != 0) {
|
||||
xmm_t0 = xmm_crc_part;
|
||||
xmm_crc_part = _mm_loadu_si128((__m128i*)src + 1);
|
||||
fold_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
|
||||
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0);
|
||||
src += 16;
|
||||
len -= 16;
|
||||
}
|
||||
#endif
|
||||
|
||||
partial_fold(algn_diff, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, &xmm_crc_part);
|
||||
|
||||
src += algn_diff;
|
||||
len -= algn_diff;
|
||||
}
|
||||
|
||||
#ifdef X86_VPCLMULQDQ
|
||||
/* Wide path: the fold_16 helper returns how many bytes it consumed, which are then skipped here */
if (len >= 256) {
|
||||
#ifdef COPY
|
||||
size_t n = fold_16_vpclmulqdq_copy(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, dst, src, len);
|
||||
dst += n;
|
||||
#else
|
||||
size_t n = fold_16_vpclmulqdq(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, src, len,
|
||||
xmm_initial, first);
|
||||
first = 0;
|
||||
#endif
|
||||
len -= n;
|
||||
src += n;
|
||||
}
|
||||
#endif
|
||||
|
||||
/* Main loop: 64 bytes (four aligned 128-bit loads) per iteration */
while (len >= 64) {
|
||||
len -= 64;
|
||||
xmm_t0 = _mm_load_si128((__m128i *)src);
|
||||
xmm_t1 = _mm_load_si128((__m128i *)src + 1);
|
||||
xmm_t2 = _mm_load_si128((__m128i *)src + 2);
|
||||
xmm_t3 = _mm_load_si128((__m128i *)src + 3);
|
||||
src += 64;
|
||||
|
||||
fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
|
||||
#ifdef COPY
|
||||
_mm_storeu_si128((__m128i *)dst, xmm_t0);
|
||||
_mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
|
||||
_mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
|
||||
_mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
|
||||
dst += 64;
|
||||
#else
|
||||
XOR_INITIAL128(xmm_t0);
|
||||
#endif
|
||||
|
||||
xmm_crc0 = _mm_xor_si128(xmm_crc0, xmm_t0);
|
||||
xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t1);
|
||||
xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t2);
|
||||
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t3);
|
||||
}
|
||||
|
||||
/*
|
||||
* len = num bytes left - 64
|
||||
*/
|
||||
if (len >= 48) {
|
||||
len -= 48;
|
||||
|
||||
xmm_t0 = _mm_load_si128((__m128i *)src);
|
||||
xmm_t1 = _mm_load_si128((__m128i *)src + 1);
|
||||
xmm_t2 = _mm_load_si128((__m128i *)src + 2);
|
||||
src += 48;
|
||||
#ifdef COPY
|
||||
_mm_storeu_si128((__m128i *)dst, xmm_t0);
|
||||
_mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
|
||||
_mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
|
||||
dst += 48;
|
||||
#else
|
||||
XOR_INITIAL128(xmm_t0);
|
||||
#endif
|
||||
fold_3(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
|
||||
|
||||
xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t0);
|
||||
xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t1);
|
||||
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t2);
|
||||
} else if (len >= 32) {
|
||||
len -= 32;
|
||||
|
||||
xmm_t0 = _mm_load_si128((__m128i *)src);
|
||||
xmm_t1 = _mm_load_si128((__m128i *)src + 1);
|
||||
src += 32;
|
||||
#ifdef COPY
|
||||
_mm_storeu_si128((__m128i *)dst, xmm_t0);
|
||||
_mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
|
||||
dst += 32;
|
||||
#else
|
||||
XOR_INITIAL128(xmm_t0);
|
||||
#endif
|
||||
fold_2(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
|
||||
|
||||
xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t0);
|
||||
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t1);
|
||||
} else if (len >= 16) {
|
||||
len -= 16;
|
||||
xmm_t0 = _mm_load_si128((__m128i *)src);
|
||||
src += 16;
|
||||
#ifdef COPY
|
||||
_mm_storeu_si128((__m128i *)dst, xmm_t0);
|
||||
dst += 16;
|
||||
#else
|
||||
XOR_INITIAL128(xmm_t0);
|
||||
#endif
|
||||
fold_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
|
||||
|
||||
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0);
|
||||
}
|
||||
|
||||
/* Tail: fewer than 16 bytes remain; also the jump target for inputs that were shorter than 16 bytes */
partial:
|
||||
if (len) {
|
||||
memcpy(&xmm_crc_part, src, len);
|
||||
#ifdef COPY
|
||||
_mm_storeu_si128((__m128i *)partial_buf, xmm_crc_part);
|
||||
memcpy(dst, partial_buf, len);
|
||||
#endif
|
||||
partial_fold(len, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, &xmm_crc_part);
|
||||
}
|
||||
|
||||
crc32_fold_save((__m128i *)crc->fold, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
|
||||
}
|
||||
107
deps/zlib-ng/arch/x86/crc32_fold_vpclmulqdq_tpl.h
vendored
Normal file
107
deps/zlib-ng/arch/x86/crc32_fold_vpclmulqdq_tpl.h
vendored
Normal file
@@ -0,0 +1,107 @@
|
||||
/* crc32_fold_vpclmulqdq_tpl.h -- VPCLMULQDQ-based CRC32 folding template.
|
||||
* Copyright Wangyang Guo (wangyang.guo@intel.com)
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
/* Template body shared by fold_16_vpclmulqdq_copy (COPY defined) and fold_16_vpclmulqdq (COPY undefined).
 *
 * Folds input into the CRC state 256 bytes at a time using 512-bit carry-less multiplies.
 *   xmm_crc0..3 - in/out: the four 128-bit state vectors, packed into one 512-bit register on entry and
 *                 unpacked again on exit.
 *   dst         - (COPY only) every 256-byte block that is read is also stored here.
 *   src, len    - input; 256 bytes are read unconditionally, so len must be at least 256.
 *   init_crc, first - (non-COPY only) not referenced directly in this body; presumably consumed by the
 *                 XOR_INITIAL512 macro, which is defined elsewhere -- confirm.
 * Returns the number of bytes consumed, always a multiple of 256; any remainder is left for the caller. */
#ifdef COPY
|
||||
static size_t fold_16_vpclmulqdq_copy(__m128i *xmm_crc0, __m128i *xmm_crc1,
|
||||
__m128i *xmm_crc2, __m128i *xmm_crc3, uint8_t *dst, const uint8_t *src, size_t len) {
|
||||
#else
|
||||
static size_t fold_16_vpclmulqdq(__m128i *xmm_crc0, __m128i *xmm_crc1,
|
||||
__m128i *xmm_crc2, __m128i *xmm_crc3, const uint8_t *src, size_t len,
|
||||
__m128i init_crc, int32_t first) {
|
||||
__m512i zmm_initial = _mm512_zextsi128_si512(init_crc);
|
||||
#endif
|
||||
__m512i zmm_t0, zmm_t1, zmm_t2, zmm_t3;
|
||||
__m512i zmm_crc0, zmm_crc1, zmm_crc2, zmm_crc3;
|
||||
__m512i z0, z1, z2, z3;
|
||||
size_t len_tmp = len;
|
||||
const __m512i zmm_fold4 = _mm512_set4_epi32(
|
||||
0x00000001, 0x54442bd4, 0x00000001, 0xc6e41596);
|
||||
const __m512i zmm_fold16 = _mm512_set4_epi32(
|
||||
0x00000001, 0x1542778a, 0x00000001, 0x322d1430);
|
||||
|
||||
// zmm register init
|
||||
zmm_crc0 = _mm512_setzero_si512();
|
||||
zmm_t0 = _mm512_loadu_si512((__m512i *)src);
|
||||
#ifndef COPY
|
||||
XOR_INITIAL512(zmm_t0);
|
||||
#endif
|
||||
zmm_crc1 = _mm512_loadu_si512((__m512i *)src + 1);
|
||||
zmm_crc2 = _mm512_loadu_si512((__m512i *)src + 2);
|
||||
zmm_crc3 = _mm512_loadu_si512((__m512i *)src + 3);
|
||||
|
||||
/* already have intermediate CRC in xmm registers
|
||||
* fold4 with 4 xmm_crc to get zmm_crc0
|
||||
*/
|
||||
zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc0, 0);
|
||||
zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc1, 1);
|
||||
zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc2, 2);
|
||||
zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc3, 3);
|
||||
z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
|
||||
zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
|
||||
/* ternarylogic with truth table 0x96 is a three-way XOR of its operands */
zmm_crc0 = _mm512_ternarylogic_epi32(zmm_crc0, z0, zmm_t0, 0x96);
|
||||
|
||||
#ifdef COPY
|
||||
_mm512_storeu_si512((__m512i *)dst, zmm_t0);
|
||||
_mm512_storeu_si512((__m512i *)dst + 1, zmm_crc1);
|
||||
_mm512_storeu_si512((__m512i *)dst + 2, zmm_crc2);
|
||||
_mm512_storeu_si512((__m512i *)dst + 3, zmm_crc3);
|
||||
dst += 256;
|
||||
#endif
|
||||
len -= 256;
|
||||
src += 256;
|
||||
|
||||
// fold-16 loops
|
||||
while (len >= 256) {
|
||||
zmm_t0 = _mm512_loadu_si512((__m512i *)src);
|
||||
zmm_t1 = _mm512_loadu_si512((__m512i *)src + 1);
|
||||
zmm_t2 = _mm512_loadu_si512((__m512i *)src + 2);
|
||||
zmm_t3 = _mm512_loadu_si512((__m512i *)src + 3);
|
||||
|
||||
z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold16, 0x01);
|
||||
z1 = _mm512_clmulepi64_epi128(zmm_crc1, zmm_fold16, 0x01);
|
||||
z2 = _mm512_clmulepi64_epi128(zmm_crc2, zmm_fold16, 0x01);
|
||||
z3 = _mm512_clmulepi64_epi128(zmm_crc3, zmm_fold16, 0x01);
|
||||
|
||||
zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold16, 0x10);
|
||||
zmm_crc1 = _mm512_clmulepi64_epi128(zmm_crc1, zmm_fold16, 0x10);
|
||||
zmm_crc2 = _mm512_clmulepi64_epi128(zmm_crc2, zmm_fold16, 0x10);
|
||||
zmm_crc3 = _mm512_clmulepi64_epi128(zmm_crc3, zmm_fold16, 0x10);
|
||||
|
||||
zmm_crc0 = _mm512_ternarylogic_epi32(zmm_crc0, z0, zmm_t0, 0x96);
|
||||
zmm_crc1 = _mm512_ternarylogic_epi32(zmm_crc1, z1, zmm_t1, 0x96);
|
||||
zmm_crc2 = _mm512_ternarylogic_epi32(zmm_crc2, z2, zmm_t2, 0x96);
|
||||
zmm_crc3 = _mm512_ternarylogic_epi32(zmm_crc3, z3, zmm_t3, 0x96);
|
||||
|
||||
#ifdef COPY
|
||||
_mm512_storeu_si512((__m512i *)dst, zmm_t0);
|
||||
_mm512_storeu_si512((__m512i *)dst + 1, zmm_t1);
|
||||
_mm512_storeu_si512((__m512i *)dst + 2, zmm_t2);
|
||||
_mm512_storeu_si512((__m512i *)dst + 3, zmm_t3);
|
||||
dst += 256;
|
||||
#endif
|
||||
len -= 256;
|
||||
src += 256;
|
||||
}
|
||||
// zmm_crc[0,1,2,3] -> zmm_crc0
|
||||
z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
|
||||
zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
|
||||
zmm_crc0 = _mm512_ternarylogic_epi32(zmm_crc0, z0, zmm_crc1, 0x96);
|
||||
|
||||
z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
|
||||
zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
|
||||
zmm_crc0 = _mm512_ternarylogic_epi32(zmm_crc0, z0, zmm_crc2, 0x96);
|
||||
|
||||
z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
|
||||
zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
|
||||
zmm_crc0 = _mm512_ternarylogic_epi32(zmm_crc0, z0, zmm_crc3, 0x96);
|
||||
|
||||
// zmm_crc0 -> xmm_crc[0, 1, 2, 3]
|
||||
*xmm_crc0 = _mm512_extracti32x4_epi32(zmm_crc0, 0);
|
||||
*xmm_crc1 = _mm512_extracti32x4_epi32(zmm_crc0, 1);
|
||||
*xmm_crc2 = _mm512_extracti32x4_epi32(zmm_crc0, 2);
|
||||
*xmm_crc3 = _mm512_extracti32x4_epi32(zmm_crc0, 3);
|
||||
|
||||
return (len_tmp - len); // return n bytes processed
|
||||
}
|
||||
30
deps/zlib-ng/arch/x86/crc32_pclmulqdq.c
vendored
Normal file
30
deps/zlib-ng/arch/x86/crc32_pclmulqdq.c
vendored
Normal file
@@ -0,0 +1,30 @@
|
||||
/*
|
||||
* Compute the CRC32 using a parallelized folding approach with the PCLMULQDQ
|
||||
* instruction.
|
||||
*
|
||||
* A white paper describing this algorithm can be found at:
|
||||
* doc/crc-pclmulqdq.pdf
|
||||
*
|
||||
* Copyright (C) 2013 Intel Corporation. All rights reserved.
|
||||
* Copyright (C) 2016 Marian Beermann (support for initial value)
|
||||
* Authors:
|
||||
* Wajdi Feghali <wajdi.k.feghali@intel.com>
|
||||
* Jim Guilford <james.guilford@intel.com>
|
||||
* Vinodh Gopal <vinodh.gopal@intel.com>
|
||||
* Erdinc Ozturk <erdinc.ozturk@intel.com>
|
||||
* Jim Kukunas <james.t.kukunas@linux.intel.com>
|
||||
*
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#ifdef X86_PCLMULQDQ_CRC
|
||||
|
||||
#define CRC32_FOLD_COPY crc32_fold_pclmulqdq_copy
|
||||
#define CRC32_FOLD crc32_fold_pclmulqdq
|
||||
#define CRC32_FOLD_RESET crc32_fold_pclmulqdq_reset
|
||||
#define CRC32_FOLD_FINAL crc32_fold_pclmulqdq_final
|
||||
#define CRC32 crc32_pclmulqdq
|
||||
|
||||
#include "crc32_pclmulqdq_tpl.h"
|
||||
|
||||
#endif
|
||||
363
deps/zlib-ng/arch/x86/crc32_pclmulqdq_tpl.h
vendored
Normal file
363
deps/zlib-ng/arch/x86/crc32_pclmulqdq_tpl.h
vendored
Normal file
@@ -0,0 +1,363 @@
|
||||
/*
|
||||
* Compute the CRC32 using a parallelized folding approach with the PCLMULQDQ
|
||||
* instruction.
|
||||
*
|
||||
* A white paper describing this algorithm can be found at:
|
||||
* doc/crc-pclmulqdq.pdf
|
||||
*
|
||||
* Copyright (C) 2013 Intel Corporation. All rights reserved.
|
||||
* Copyright (C) 2016 Marian Beermann (support for initial value)
|
||||
* Authors:
|
||||
* Wajdi Feghali <wajdi.k.feghali@intel.com>
|
||||
* Jim Guilford <james.guilford@intel.com>
|
||||
* Vinodh Gopal <vinodh.gopal@intel.com>
|
||||
* Erdinc Ozturk <erdinc.ozturk@intel.com>
|
||||
* Jim Kukunas <james.t.kukunas@linux.intel.com>
|
||||
*
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#include "../../zbuild.h"
|
||||
|
||||
#include <immintrin.h>
|
||||
#include <wmmintrin.h>
|
||||
#include <smmintrin.h> // _mm_extract_epi32
|
||||
#ifdef X86_VPCLMULQDQ
|
||||
# include <immintrin.h>
|
||||
#endif
|
||||
|
||||
#include "../../crc32_fold.h"
|
||||
#include "../../crc32_braid_p.h"
|
||||
#include "../../fallback_builtins.h"
|
||||
#include <assert.h>
|
||||
|
||||
#ifdef X86_VPCLMULQDQ
|
||||
static size_t fold_16_vpclmulqdq(__m128i *xmm_crc0, __m128i *xmm_crc1,
|
||||
__m128i *xmm_crc2, __m128i *xmm_crc3, const uint8_t *src, size_t len, __m128i init_crc,
|
||||
int32_t first);
|
||||
static size_t fold_16_vpclmulqdq_copy(__m128i *xmm_crc0, __m128i *xmm_crc1,
|
||||
__m128i *xmm_crc2, __m128i *xmm_crc3, uint8_t *dst, const uint8_t *src, size_t len);
|
||||
#endif
|
||||
|
||||
static void fold_1(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3) {
|
||||
const __m128i xmm_fold4 = _mm_set_epi32( 0x00000001, 0x54442bd4,
|
||||
0x00000001, 0xc6e41596);
|
||||
__m128i x_tmp3;
|
||||
__m128 ps_crc0, ps_crc3, ps_res;
|
||||
|
||||
x_tmp3 = *xmm_crc3;
|
||||
|
||||
*xmm_crc3 = *xmm_crc0;
|
||||
*xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
|
||||
*xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x10);
|
||||
ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
|
||||
ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
|
||||
ps_res = _mm_xor_ps(ps_crc0, ps_crc3);
|
||||
|
||||
*xmm_crc0 = *xmm_crc1;
|
||||
*xmm_crc1 = *xmm_crc2;
|
||||
*xmm_crc2 = x_tmp3;
|
||||
*xmm_crc3 = _mm_castps_si128(ps_res);
|
||||
}
|
||||
|
||||
static void fold_2(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3) {
|
||||
const __m128i xmm_fold4 = _mm_set_epi32( 0x00000001, 0x54442bd4,
|
||||
0x00000001, 0xc6e41596);
|
||||
__m128i x_tmp3, x_tmp2;
|
||||
__m128 ps_crc0, ps_crc1, ps_crc2, ps_crc3, ps_res31, ps_res20;
|
||||
|
||||
x_tmp3 = *xmm_crc3;
|
||||
x_tmp2 = *xmm_crc2;
|
||||
|
||||
*xmm_crc3 = *xmm_crc1;
|
||||
*xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01);
|
||||
*xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x10);
|
||||
ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
|
||||
ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
|
||||
ps_res31 = _mm_xor_ps(ps_crc3, ps_crc1);
|
||||
|
||||
*xmm_crc2 = *xmm_crc0;
|
||||
*xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
|
||||
*xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x10);
|
||||
ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
|
||||
ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
|
||||
ps_res20 = _mm_xor_ps(ps_crc0, ps_crc2);
|
||||
|
||||
*xmm_crc0 = x_tmp2;
|
||||
*xmm_crc1 = x_tmp3;
|
||||
*xmm_crc2 = _mm_castps_si128(ps_res20);
|
||||
*xmm_crc3 = _mm_castps_si128(ps_res31);
|
||||
}
|
||||
|
||||
static void fold_3(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3) {
|
||||
const __m128i xmm_fold4 = _mm_set_epi32( 0x00000001, 0x54442bd4,
|
||||
0x00000001, 0xc6e41596);
|
||||
__m128i x_tmp3;
|
||||
__m128 ps_crc0, ps_crc1, ps_crc2, ps_crc3, ps_res32, ps_res21, ps_res10;
|
||||
|
||||
x_tmp3 = *xmm_crc3;
|
||||
|
||||
*xmm_crc3 = *xmm_crc2;
|
||||
*xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x01);
|
||||
*xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x10);
|
||||
ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
|
||||
ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
|
||||
ps_res32 = _mm_xor_ps(ps_crc2, ps_crc3);
|
||||
|
||||
*xmm_crc2 = *xmm_crc1;
|
||||
*xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01);
|
||||
*xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x10);
|
||||
ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
|
||||
ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
|
||||
ps_res21 = _mm_xor_ps(ps_crc1, ps_crc2);
|
||||
|
||||
*xmm_crc1 = *xmm_crc0;
|
||||
*xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
|
||||
*xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x10);
|
||||
ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
|
||||
ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
|
||||
ps_res10 = _mm_xor_ps(ps_crc0, ps_crc1);
|
||||
|
||||
*xmm_crc0 = x_tmp3;
|
||||
*xmm_crc1 = _mm_castps_si128(ps_res10);
|
||||
*xmm_crc2 = _mm_castps_si128(ps_res21);
|
||||
*xmm_crc3 = _mm_castps_si128(ps_res32);
|
||||
}
|
||||
|
||||
static void fold_4(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3) {
|
||||
const __m128i xmm_fold4 = _mm_set_epi32( 0x00000001, 0x54442bd4,
|
||||
0x00000001, 0xc6e41596);
|
||||
__m128i x_tmp0, x_tmp1, x_tmp2, x_tmp3;
|
||||
__m128 ps_crc0, ps_crc1, ps_crc2, ps_crc3;
|
||||
__m128 ps_t0, ps_t1, ps_t2, ps_t3;
|
||||
__m128 ps_res0, ps_res1, ps_res2, ps_res3;
|
||||
|
||||
x_tmp0 = *xmm_crc0;
|
||||
x_tmp1 = *xmm_crc1;
|
||||
x_tmp2 = *xmm_crc2;
|
||||
x_tmp3 = *xmm_crc3;
|
||||
|
||||
*xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
|
||||
x_tmp0 = _mm_clmulepi64_si128(x_tmp0, xmm_fold4, 0x10);
|
||||
ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
|
||||
ps_t0 = _mm_castsi128_ps(x_tmp0);
|
||||
ps_res0 = _mm_xor_ps(ps_crc0, ps_t0);
|
||||
|
||||
*xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01);
|
||||
x_tmp1 = _mm_clmulepi64_si128(x_tmp1, xmm_fold4, 0x10);
|
||||
ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
|
||||
ps_t1 = _mm_castsi128_ps(x_tmp1);
|
||||
ps_res1 = _mm_xor_ps(ps_crc1, ps_t1);
|
||||
|
||||
*xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x01);
|
||||
x_tmp2 = _mm_clmulepi64_si128(x_tmp2, xmm_fold4, 0x10);
|
||||
ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
|
||||
ps_t2 = _mm_castsi128_ps(x_tmp2);
|
||||
ps_res2 = _mm_xor_ps(ps_crc2, ps_t2);
|
||||
|
||||
*xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x01);
|
||||
x_tmp3 = _mm_clmulepi64_si128(x_tmp3, xmm_fold4, 0x10);
|
||||
ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
|
||||
ps_t3 = _mm_castsi128_ps(x_tmp3);
|
||||
ps_res3 = _mm_xor_ps(ps_crc3, ps_t3);
|
||||
|
||||
*xmm_crc0 = _mm_castps_si128(ps_res0);
|
||||
*xmm_crc1 = _mm_castps_si128(ps_res1);
|
||||
*xmm_crc2 = _mm_castps_si128(ps_res2);
|
||||
*xmm_crc3 = _mm_castps_si128(ps_res3);
|
||||
}
|
||||
|
||||
static const unsigned ALIGNED_(32) pshufb_shf_table[60] = {
|
||||
0x84838281, 0x88878685, 0x8c8b8a89, 0x008f8e8d, /* shl 15 (16 - 1)/shr1 */
|
||||
0x85848382, 0x89888786, 0x8d8c8b8a, 0x01008f8e, /* shl 14 (16 - 2)/shr2 */
|
||||
0x86858483, 0x8a898887, 0x8e8d8c8b, 0x0201008f, /* shl 13 (16 - 3)/shr3 */
|
||||
0x87868584, 0x8b8a8988, 0x8f8e8d8c, 0x03020100, /* shl 12 (16 - 4)/shr4 */
|
||||
0x88878685, 0x8c8b8a89, 0x008f8e8d, 0x04030201, /* shl 11 (16 - 5)/shr5 */
|
||||
0x89888786, 0x8d8c8b8a, 0x01008f8e, 0x05040302, /* shl 10 (16 - 6)/shr6 */
|
||||
0x8a898887, 0x8e8d8c8b, 0x0201008f, 0x06050403, /* shl 9 (16 - 7)/shr7 */
|
||||
0x8b8a8988, 0x8f8e8d8c, 0x03020100, 0x07060504, /* shl 8 (16 - 8)/shr8 */
|
||||
0x8c8b8a89, 0x008f8e8d, 0x04030201, 0x08070605, /* shl 7 (16 - 9)/shr9 */
|
||||
0x8d8c8b8a, 0x01008f8e, 0x05040302, 0x09080706, /* shl 6 (16 -10)/shr10*/
|
||||
0x8e8d8c8b, 0x0201008f, 0x06050403, 0x0a090807, /* shl 5 (16 -11)/shr11*/
|
||||
0x8f8e8d8c, 0x03020100, 0x07060504, 0x0b0a0908, /* shl 4 (16 -12)/shr12*/
|
||||
0x008f8e8d, 0x04030201, 0x08070605, 0x0c0b0a09, /* shl 3 (16 -13)/shr13*/
|
||||
0x01008f8e, 0x05040302, 0x09080706, 0x0d0c0b0a, /* shl 2 (16 -14)/shr14*/
|
||||
0x0201008f, 0x06050403, 0x0a090807, 0x0e0d0c0b /* shl 1 (16 -15)/shr15*/
|
||||
};
|
||||
|
||||
static void partial_fold(const size_t len, __m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2,
|
||||
__m128i *xmm_crc3, __m128i *xmm_crc_part) {
|
||||
const __m128i xmm_fold4 = _mm_set_epi32( 0x00000001, 0x54442bd4,
|
||||
0x00000001, 0xc6e41596);
|
||||
const __m128i xmm_mask3 = _mm_set1_epi32((int32_t)0x80808080);
|
||||
|
||||
__m128i xmm_shl, xmm_shr, xmm_tmp1, xmm_tmp2, xmm_tmp3;
|
||||
__m128i xmm_a0_0, xmm_a0_1;
|
||||
__m128 ps_crc3, psa0_0, psa0_1, ps_res;
|
||||
|
||||
xmm_shl = _mm_load_si128((__m128i *)(pshufb_shf_table + (4 * (len - 1))));
|
||||
xmm_shr = xmm_shl;
|
||||
xmm_shr = _mm_xor_si128(xmm_shr, xmm_mask3);
|
||||
|
||||
xmm_a0_0 = _mm_shuffle_epi8(*xmm_crc0, xmm_shl);
|
||||
|
||||
*xmm_crc0 = _mm_shuffle_epi8(*xmm_crc0, xmm_shr);
|
||||
xmm_tmp1 = _mm_shuffle_epi8(*xmm_crc1, xmm_shl);
|
||||
*xmm_crc0 = _mm_or_si128(*xmm_crc0, xmm_tmp1);
|
||||
|
||||
*xmm_crc1 = _mm_shuffle_epi8(*xmm_crc1, xmm_shr);
|
||||
xmm_tmp2 = _mm_shuffle_epi8(*xmm_crc2, xmm_shl);
|
||||
*xmm_crc1 = _mm_or_si128(*xmm_crc1, xmm_tmp2);
|
||||
|
||||
*xmm_crc2 = _mm_shuffle_epi8(*xmm_crc2, xmm_shr);
|
||||
xmm_tmp3 = _mm_shuffle_epi8(*xmm_crc3, xmm_shl);
|
||||
*xmm_crc2 = _mm_or_si128(*xmm_crc2, xmm_tmp3);
|
||||
|
||||
*xmm_crc3 = _mm_shuffle_epi8(*xmm_crc3, xmm_shr);
|
||||
*xmm_crc_part = _mm_shuffle_epi8(*xmm_crc_part, xmm_shl);
|
||||
*xmm_crc3 = _mm_or_si128(*xmm_crc3, *xmm_crc_part);
|
||||
|
||||
xmm_a0_1 = _mm_clmulepi64_si128(xmm_a0_0, xmm_fold4, 0x10);
|
||||
xmm_a0_0 = _mm_clmulepi64_si128(xmm_a0_0, xmm_fold4, 0x01);
|
||||
|
||||
ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
|
||||
psa0_0 = _mm_castsi128_ps(xmm_a0_0);
|
||||
psa0_1 = _mm_castsi128_ps(xmm_a0_1);
|
||||
|
||||
ps_res = _mm_xor_ps(ps_crc3, psa0_0);
|
||||
ps_res = _mm_xor_ps(ps_res, psa0_1);
|
||||
|
||||
*xmm_crc3 = _mm_castps_si128(ps_res);
|
||||
}
|
||||
|
||||
static inline void crc32_fold_load(__m128i *fold, __m128i *fold0, __m128i *fold1, __m128i *fold2, __m128i *fold3) {
|
||||
*fold0 = _mm_load_si128(fold + 0);
|
||||
*fold1 = _mm_load_si128(fold + 1);
|
||||
*fold2 = _mm_load_si128(fold + 2);
|
||||
*fold3 = _mm_load_si128(fold + 3);
|
||||
}
|
||||
|
||||
static inline void crc32_fold_save(__m128i *fold, const __m128i *fold0, const __m128i *fold1,
|
||||
const __m128i *fold2, const __m128i *fold3) {
|
||||
_mm_storeu_si128(fold + 0, *fold0);
|
||||
_mm_storeu_si128(fold + 1, *fold1);
|
||||
_mm_storeu_si128(fold + 2, *fold2);
|
||||
_mm_storeu_si128(fold + 3, *fold3);
|
||||
}
|
||||
|
||||
Z_INTERNAL uint32_t CRC32_FOLD_RESET(crc32_fold *crc) {
|
||||
__m128i xmm_crc0 = _mm_cvtsi32_si128(0x9db42487);
|
||||
__m128i xmm_zero = _mm_setzero_si128();
|
||||
crc32_fold_save((__m128i *)crc->fold, &xmm_crc0, &xmm_zero, &xmm_zero, &xmm_zero);
|
||||
return 0;
|
||||
}
|
||||
|
||||
#define ONCE(op) if (first) { first = 0; op; }
|
||||
#define XOR_INITIAL128(where) ONCE(where = _mm_xor_si128(where, xmm_initial))
|
||||
#ifdef X86_VPCLMULQDQ
|
||||
# define XOR_INITIAL512(where) ONCE(where = _mm512_xor_si512(where, zmm_initial))
|
||||
#endif
|
||||
|
||||
#ifdef X86_VPCLMULQDQ
|
||||
# include "crc32_fold_vpclmulqdq_tpl.h"
|
||||
#endif
|
||||
#include "crc32_fold_pclmulqdq_tpl.h"
|
||||
#define COPY
|
||||
#ifdef X86_VPCLMULQDQ
|
||||
# include "crc32_fold_vpclmulqdq_tpl.h"
|
||||
#endif
|
||||
#include "crc32_fold_pclmulqdq_tpl.h"
|
||||
|
||||
static const unsigned ALIGNED_(16) crc_k[] = {
|
||||
0xccaa009e, 0x00000000, /* rk1 */
|
||||
0x751997d0, 0x00000001, /* rk2 */
|
||||
0xccaa009e, 0x00000000, /* rk5 */
|
||||
0x63cd6124, 0x00000001, /* rk6 */
|
||||
0xf7011640, 0x00000001, /* rk7 */
|
||||
0xdb710640, 0x00000001 /* rk8 */
|
||||
};
|
||||
|
||||
static const unsigned ALIGNED_(16) crc_mask[4] = {
|
||||
0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000
|
||||
};
|
||||
|
||||
static const unsigned ALIGNED_(16) crc_mask2[4] = {
|
||||
0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF
|
||||
};
|
||||
|
||||
Z_INTERNAL uint32_t CRC32_FOLD_FINAL(crc32_fold *crc) {
|
||||
const __m128i xmm_mask = _mm_load_si128((__m128i *)crc_mask);
|
||||
const __m128i xmm_mask2 = _mm_load_si128((__m128i *)crc_mask2);
|
||||
__m128i xmm_crc0, xmm_crc1, xmm_crc2, xmm_crc3;
|
||||
__m128i x_tmp0, x_tmp1, x_tmp2, crc_fold;
|
||||
|
||||
crc32_fold_load((__m128i *)crc->fold, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
|
||||
|
||||
/*
|
||||
* k1
|
||||
*/
|
||||
crc_fold = _mm_load_si128((__m128i *)crc_k);
|
||||
|
||||
x_tmp0 = _mm_clmulepi64_si128(xmm_crc0, crc_fold, 0x10);
|
||||
xmm_crc0 = _mm_clmulepi64_si128(xmm_crc0, crc_fold, 0x01);
|
||||
xmm_crc1 = _mm_xor_si128(xmm_crc1, x_tmp0);
|
||||
xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_crc0);
|
||||
|
||||
x_tmp1 = _mm_clmulepi64_si128(xmm_crc1, crc_fold, 0x10);
|
||||
xmm_crc1 = _mm_clmulepi64_si128(xmm_crc1, crc_fold, 0x01);
|
||||
xmm_crc2 = _mm_xor_si128(xmm_crc2, x_tmp1);
|
||||
xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_crc1);
|
||||
|
||||
x_tmp2 = _mm_clmulepi64_si128(xmm_crc2, crc_fold, 0x10);
|
||||
xmm_crc2 = _mm_clmulepi64_si128(xmm_crc2, crc_fold, 0x01);
|
||||
xmm_crc3 = _mm_xor_si128(xmm_crc3, x_tmp2);
|
||||
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc2);
|
||||
|
||||
/*
|
||||
* k5
|
||||
*/
|
||||
crc_fold = _mm_load_si128((__m128i *)(crc_k + 4));
|
||||
|
||||
xmm_crc0 = xmm_crc3;
|
||||
xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0);
|
||||
xmm_crc0 = _mm_srli_si128(xmm_crc0, 8);
|
||||
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc0);
|
||||
|
||||
xmm_crc0 = xmm_crc3;
|
||||
xmm_crc3 = _mm_slli_si128(xmm_crc3, 4);
|
||||
xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0x10);
|
||||
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc0);
|
||||
xmm_crc3 = _mm_and_si128(xmm_crc3, xmm_mask2);
|
||||
|
||||
/*
|
||||
* k7
|
||||
*/
|
||||
xmm_crc1 = xmm_crc3;
|
||||
xmm_crc2 = xmm_crc3;
|
||||
crc_fold = _mm_load_si128((__m128i *)(crc_k + 8));
|
||||
|
||||
xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0);
|
||||
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc2);
|
||||
xmm_crc3 = _mm_and_si128(xmm_crc3, xmm_mask);
|
||||
|
||||
xmm_crc2 = xmm_crc3;
|
||||
xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0x10);
|
||||
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc2);
|
||||
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc1);
|
||||
|
||||
crc->value = ~((uint32_t)_mm_extract_epi32(xmm_crc3, 2));
|
||||
|
||||
return crc->value;
|
||||
}
|
||||
|
||||
Z_INTERNAL uint32_t CRC32(uint32_t crc32, const uint8_t *buf, size_t len) {
|
||||
/* For lens < 64, crc32_braid method is faster. The CRC32 instruction for
|
||||
* these short lengths might also prove to be effective */
|
||||
if (len < 64)
|
||||
return PREFIX(crc32_braid)(crc32, buf, len);
|
||||
|
||||
crc32_fold ALIGNED_(16) crc_state;
|
||||
CRC32_FOLD_RESET(&crc_state);
|
||||
CRC32_FOLD(&crc_state, buf, len, crc32);
|
||||
return CRC32_FOLD_FINAL(&crc_state);
|
||||
}
|
||||
17
deps/zlib-ng/arch/x86/crc32_vpclmulqdq.c
vendored
Normal file
17
deps/zlib-ng/arch/x86/crc32_vpclmulqdq.c
vendored
Normal file
@@ -0,0 +1,17 @@
|
||||
/* crc32_vpclmulqdq.c -- VPCLMULQDQ-based CRC32 folding implementation.
|
||||
* Copyright Wangyang Guo (wangyang.guo@intel.com)
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#if defined(X86_PCLMULQDQ_CRC) && defined(X86_VPCLMULQDQ_CRC)
|
||||
|
||||
#define X86_VPCLMULQDQ
|
||||
#define CRC32_FOLD_COPY crc32_fold_vpclmulqdq_copy
|
||||
#define CRC32_FOLD crc32_fold_vpclmulqdq
|
||||
#define CRC32_FOLD_RESET crc32_fold_vpclmulqdq_reset
|
||||
#define CRC32_FOLD_FINAL crc32_fold_vpclmulqdq_final
|
||||
#define CRC32 crc32_vpclmulqdq
|
||||
|
||||
#include "crc32_pclmulqdq_tpl.h"
|
||||
|
||||
#endif
|
||||
50
deps/zlib-ng/arch/x86/insert_string_sse42.c
vendored
Normal file
50
deps/zlib-ng/arch/x86/insert_string_sse42.c
vendored
Normal file
@@ -0,0 +1,50 @@
|
||||
/* insert_string_sse42.c -- insert_string integer hash variant using SSE4.2's CRC instructions
|
||||
*
|
||||
* Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*
|
||||
*/
|
||||
|
||||
#include "../../zbuild.h"
|
||||
#include <immintrin.h>
|
||||
#ifdef _MSC_VER
|
||||
# include <nmmintrin.h>
|
||||
#endif
|
||||
#include "../../deflate.h"
|
||||
|
||||
#ifdef X86_SSE42_CRC_INTRIN
|
||||
# ifdef _MSC_VER
|
||||
# define HASH_CALC(s, h, val)\
|
||||
h = _mm_crc32_u32(h, val)
|
||||
# else
|
||||
# define HASH_CALC(s, h, val)\
|
||||
h = __builtin_ia32_crc32si(h, val)
|
||||
# endif
|
||||
#else
|
||||
# ifdef _MSC_VER
|
||||
# define HASH_CALC(s, h, val) {\
|
||||
__asm mov edx, h\
|
||||
__asm mov eax, val\
|
||||
__asm crc32 eax, edx\
|
||||
__asm mov h, eax\
|
||||
}
|
||||
# else
|
||||
# define HASH_CALC(s, h, val) \
|
||||
__asm__ __volatile__ (\
|
||||
"crc32 %1,%0\n\t"\
|
||||
: "+r" (h)\
|
||||
: "r" (val)\
|
||||
);
|
||||
# endif
|
||||
#endif
|
||||
|
||||
#define HASH_CALC_VAR h
|
||||
#define HASH_CALC_VAR_INIT uint32_t h = 0
|
||||
|
||||
#define UPDATE_HASH update_hash_sse42
|
||||
#define INSERT_STRING insert_string_sse42
|
||||
#define QUICK_INSERT_STRING quick_insert_string_sse42
|
||||
|
||||
#ifdef X86_SSE42
|
||||
# include "../../insert_string_tpl.h"
|
||||
#endif
|
||||
39
deps/zlib-ng/arch/x86/slide_hash_avx2.c
vendored
Normal file
39
deps/zlib-ng/arch/x86/slide_hash_avx2.c
vendored
Normal file
@@ -0,0 +1,39 @@
|
||||
/*
|
||||
* AVX2 optimized hash slide, based on Intel's slide_sse implementation
|
||||
*
|
||||
* Copyright (C) 2017 Intel Corporation
|
||||
* Authors:
|
||||
* Arjan van de Ven <arjan@linux.intel.com>
|
||||
* Jim Kukunas <james.t.kukunas@linux.intel.com>
|
||||
* Mika T. Lindqvist <postmaster@raasu.org>
|
||||
*
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
#include "../../zbuild.h"
|
||||
#include "../../deflate.h"
|
||||
|
||||
#include <immintrin.h>
|
||||
|
||||
static inline void slide_hash_chain(Pos *table, uint32_t entries, const __m256i wsize) {
|
||||
table += entries;
|
||||
table -= 16;
|
||||
|
||||
do {
|
||||
__m256i value, result;
|
||||
|
||||
value = _mm256_loadu_si256((__m256i *)table);
|
||||
result = _mm256_subs_epu16(value, wsize);
|
||||
_mm256_storeu_si256((__m256i *)table, result);
|
||||
|
||||
table -= 16;
|
||||
entries -= 16;
|
||||
} while (entries > 0);
|
||||
}
|
||||
|
||||
Z_INTERNAL void slide_hash_avx2(deflate_state *s) {
|
||||
uint16_t wsize = (uint16_t)s->w_size;
|
||||
const __m256i ymm_wsize = _mm256_set1_epi16((short)wsize);
|
||||
|
||||
slide_hash_chain(s->head, HASH_SIZE, ymm_wsize);
|
||||
slide_hash_chain(s->prev, wsize, ymm_wsize);
|
||||
}
|
||||
62
deps/zlib-ng/arch/x86/slide_hash_sse2.c
vendored
Normal file
62
deps/zlib-ng/arch/x86/slide_hash_sse2.c
vendored
Normal file
@@ -0,0 +1,62 @@
|
||||
/*
|
||||
* SSE optimized hash slide
|
||||
*
|
||||
* Copyright (C) 2017 Intel Corporation
|
||||
* Authors:
|
||||
* Arjan van de Ven <arjan@linux.intel.com>
|
||||
* Jim Kukunas <james.t.kukunas@linux.intel.com>
|
||||
*
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
#include "../../zbuild.h"
|
||||
#include "../../deflate.h"
|
||||
|
||||
#include <immintrin.h>
|
||||
#include <assert.h>
|
||||
|
||||
static inline void slide_hash_chain(Pos *table0, Pos *table1, uint32_t entries0,
|
||||
uint32_t entries1, const __m128i wsize) {
|
||||
uint32_t entries;
|
||||
Pos *table;
|
||||
__m128i value0, value1, result0, result1;
|
||||
|
||||
int on_chain = 0;
|
||||
|
||||
next_chain:
|
||||
table = (on_chain) ? table1 : table0;
|
||||
entries = (on_chain) ? entries1 : entries0;
|
||||
|
||||
table += entries;
|
||||
table -= 16;
|
||||
|
||||
/* ZALLOC allocates this pointer unless the user chose a custom allocator.
|
||||
* Our alloc function is aligned to 64 byte boundaries */
|
||||
do {
|
||||
value0 = _mm_load_si128((__m128i *)table);
|
||||
value1 = _mm_load_si128((__m128i *)(table + 8));
|
||||
result0 = _mm_subs_epu16(value0, wsize);
|
||||
result1 = _mm_subs_epu16(value1, wsize);
|
||||
_mm_store_si128((__m128i *)table, result0);
|
||||
_mm_store_si128((__m128i *)(table + 8), result1);
|
||||
|
||||
table -= 16;
|
||||
entries -= 16;
|
||||
} while (entries > 0);
|
||||
|
||||
++on_chain;
|
||||
if (on_chain > 1) {
|
||||
return;
|
||||
} else {
|
||||
goto next_chain;
|
||||
}
|
||||
}
|
||||
|
||||
Z_INTERNAL void slide_hash_sse2(deflate_state *s) {
|
||||
uint16_t wsize = (uint16_t)s->w_size;
|
||||
const __m128i xmm_wsize = _mm_set1_epi16((short)wsize);
|
||||
|
||||
assert(((uintptr_t)s->head & 15) == 0);
|
||||
assert(((uintptr_t)s->prev & 15) == 0);
|
||||
|
||||
slide_hash_chain(s->head, s->prev, HASH_SIZE, wsize, xmm_wsize);
|
||||
}
|
||||
97
deps/zlib-ng/arch/x86/x86_features.c
vendored
Normal file
97
deps/zlib-ng/arch/x86/x86_features.c
vendored
Normal file
@@ -0,0 +1,97 @@
|
||||
/* x86_features.c - x86 feature check
|
||||
*
|
||||
* Copyright (C) 2013 Intel Corporation. All rights reserved.
|
||||
* Author:
|
||||
* Jim Kukunas
|
||||
*
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#include "../../zbuild.h"
|
||||
#include "x86_features.h"
|
||||
|
||||
#ifdef _WIN32
|
||||
# include <intrin.h>
|
||||
#else
|
||||
// Newer versions of GCC and clang come with cpuid.h
|
||||
# include <cpuid.h>
|
||||
#endif
|
||||
|
||||
#include <string.h>
|
||||
|
||||
static inline void cpuid(int info, unsigned* eax, unsigned* ebx, unsigned* ecx, unsigned* edx) {
|
||||
#ifdef _WIN32
|
||||
unsigned int registers[4];
|
||||
__cpuid((int *)registers, info);
|
||||
|
||||
*eax = registers[0];
|
||||
*ebx = registers[1];
|
||||
*ecx = registers[2];
|
||||
*edx = registers[3];
|
||||
#else
|
||||
__cpuid(info, *eax, *ebx, *ecx, *edx);
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline void cpuidex(int info, int subinfo, unsigned* eax, unsigned* ebx, unsigned* ecx, unsigned* edx) {
|
||||
#ifdef _WIN32
|
||||
unsigned int registers[4];
|
||||
__cpuidex((int *)registers, info, subinfo);
|
||||
|
||||
*eax = registers[0];
|
||||
*ebx = registers[1];
|
||||
*ecx = registers[2];
|
||||
*edx = registers[3];
|
||||
#else
|
||||
__cpuid_count(info, subinfo, *eax, *ebx, *ecx, *edx);
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline uint64_t xgetbv(unsigned int xcr) {
|
||||
#ifdef _WIN32
|
||||
return _xgetbv(xcr);
|
||||
#else
|
||||
uint32_t eax, edx;
|
||||
__asm__ ( ".byte 0x0f, 0x01, 0xd0" : "=a"(eax), "=d"(edx) : "c"(xcr));
|
||||
return (uint64_t)(edx) << 32 | eax;
|
||||
#endif
|
||||
}
|
||||
|
||||
void Z_INTERNAL x86_check_features(struct x86_cpu_features *features) {
|
||||
unsigned eax, ebx, ecx, edx;
|
||||
unsigned maxbasic;
|
||||
|
||||
cpuid(0, &maxbasic, &ebx, &ecx, &edx);
|
||||
cpuid(1 /*CPU_PROCINFO_AND_FEATUREBITS*/, &eax, &ebx, &ecx, &edx);
|
||||
|
||||
features->has_sse2 = edx & 0x4000000;
|
||||
features->has_ssse3 = ecx & 0x200;
|
||||
features->has_sse42 = ecx & 0x100000;
|
||||
features->has_pclmulqdq = ecx & 0x2;
|
||||
|
||||
if (ecx & 0x08000000) {
|
||||
uint64_t xfeature = xgetbv(0);
|
||||
|
||||
features->has_os_save_ymm = ((xfeature & 0x06) == 0x06);
|
||||
features->has_os_save_zmm = ((xfeature & 0xe6) == 0xe6);
|
||||
}
|
||||
|
||||
if (maxbasic >= 7) {
|
||||
cpuidex(7, 0, &eax, &ebx, &ecx, &edx);
|
||||
|
||||
// check VPCLMULQDQ bit (CPUID leaf 7, subleaf 0: ECX bit 10)
|
||||
// Reference: https://software.intel.com/sites/default/files/article/405250/how-to-detect-new-instruction-support-in-the-4th-generation-intel-core-processor-family.pdf
|
||||
features->has_vpclmulqdq = ecx & 0x400;
|
||||
|
||||
// check AVX2 bit if the OS supports saving YMM registers
|
||||
if (features->has_os_save_ymm) {
|
||||
features->has_avx2 = ebx & 0x20;
|
||||
}
|
||||
|
||||
// check AVX512 bits if the OS supports saving ZMM registers
|
||||
if (features->has_os_save_zmm) {
|
||||
features->has_avx512 = ebx & 0x00010000;
|
||||
features->has_avx512vnni = ecx & 0x800;
|
||||
}
|
||||
}
|
||||
}
|
||||
24
deps/zlib-ng/arch/x86/x86_features.h
vendored
Normal file
24
deps/zlib-ng/arch/x86/x86_features.h
vendored
Normal file
@@ -0,0 +1,24 @@
|
||||
/* x86_features.h -- check for CPU features
|
||||
* Copyright (C) 2013 Intel Corporation Jim Kukunas
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#ifndef X86_FEATURES_H_
|
||||
#define X86_FEATURES_H_
|
||||
|
||||
struct x86_cpu_features {
|
||||
int has_avx2;
|
||||
int has_avx512;
|
||||
int has_avx512vnni;
|
||||
int has_sse2;
|
||||
int has_ssse3;
|
||||
int has_sse42;
|
||||
int has_pclmulqdq;
|
||||
int has_vpclmulqdq;
|
||||
int has_os_save_ymm;
|
||||
int has_os_save_zmm;
|
||||
};
|
||||
|
||||
void Z_INTERNAL x86_check_features(struct x86_cpu_features *features);
|
||||
|
||||
#endif /* X86_FEATURES_H_ */
|
||||
42
deps/zlib-ng/chunkset.c
vendored
Normal file
42
deps/zlib-ng/chunkset.c
vendored
Normal file
@@ -0,0 +1,42 @@
|
||||
/* chunkset.c -- inline functions to copy small data chunks.
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#include "zbuild.h"
|
||||
|
||||
typedef uint64_t chunk_t;
|
||||
|
||||
#define CHUNK_SIZE 8
|
||||
|
||||
#define HAVE_CHUNKMEMSET_4
|
||||
#define HAVE_CHUNKMEMSET_8
|
||||
|
||||
static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
|
||||
uint8_t *dest = (uint8_t *)chunk;
|
||||
memcpy(dest, from, sizeof(uint32_t));
|
||||
memcpy(dest+4, from, sizeof(uint32_t));
|
||||
}
|
||||
|
||||
static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
|
||||
memcpy(chunk, from, sizeof(uint64_t));
|
||||
}
|
||||
|
||||
static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
|
||||
memcpy(chunk, (uint8_t *)s, sizeof(uint64_t));
|
||||
}
|
||||
|
||||
static inline void storechunk(uint8_t *out, chunk_t *chunk) {
|
||||
memcpy(out, chunk, sizeof(uint64_t));
|
||||
}
|
||||
|
||||
#define CHUNKSIZE chunksize_c
|
||||
#define CHUNKCOPY chunkcopy_c
|
||||
#define CHUNKUNROLL chunkunroll_c
|
||||
#define CHUNKMEMSET chunkmemset_c
|
||||
#define CHUNKMEMSET_SAFE chunkmemset_safe_c
|
||||
|
||||
#include "chunkset_tpl.h"
|
||||
|
||||
#define INFLATE_FAST inflate_fast_c
|
||||
|
||||
#include "inffast_tpl.h"
|
||||
200
deps/zlib-ng/chunkset_tpl.h
vendored
Normal file
200
deps/zlib-ng/chunkset_tpl.h
vendored
Normal file
@@ -0,0 +1,200 @@
|
||||
/* chunkset_tpl.h -- inline functions to copy small data chunks.
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#include "zbuild.h"
|
||||
#include <stdlib.h>
|
||||
|
||||
#if CHUNK_SIZE == 32 && defined(X86_SSSE3) && defined(X86_SSE2)
|
||||
extern uint8_t* chunkmemset_ssse3(uint8_t *out, unsigned dist, unsigned len);
|
||||
#endif
|
||||
|
||||
/* Returns the chunk size */
|
||||
Z_INTERNAL uint32_t CHUNKSIZE(void) {
|
||||
return sizeof(chunk_t);
|
||||
}
|
||||
|
||||
/* Behave like memcpy, but assume that it's OK to overwrite at least
|
||||
chunk_t bytes of output even if the length is shorter than this,
|
||||
that the length is non-zero, and that `from` lags `out` by at least
|
||||
sizeof chunk_t bytes (or that they don't overlap at all or simply that
|
||||
the distance is less than the length of the copy).
|
||||
|
||||
Aside from better memory bus utilisation, this means that short copies
|
||||
(chunk_t bytes or fewer) will fall straight through the loop
|
||||
without iteration, which will hopefully make the branch prediction more
|
||||
reliable. */
|
||||
#ifndef HAVE_CHUNKCOPY
|
||||
Z_INTERNAL uint8_t* CHUNKCOPY(uint8_t *out, uint8_t const *from, unsigned len) {
|
||||
Assert(len > 0, "chunkcopy should never have a length 0");
|
||||
chunk_t chunk;
|
||||
int32_t align = ((len - 1) % sizeof(chunk_t)) + 1;
|
||||
loadchunk(from, &chunk);
|
||||
storechunk(out, &chunk);
|
||||
out += align;
|
||||
from += align;
|
||||
len -= align;
|
||||
while (len > 0) {
|
||||
loadchunk(from, &chunk);
|
||||
storechunk(out, &chunk);
|
||||
out += sizeof(chunk_t);
|
||||
from += sizeof(chunk_t);
|
||||
len -= sizeof(chunk_t);
|
||||
}
|
||||
return out;
|
||||
}
|
||||
#endif
|
||||
|
||||
/* Perform short copies until distance can be rewritten as being at least
|
||||
sizeof chunk_t.
|
||||
|
||||
This assumes that it's OK to overwrite at least the first
|
||||
2*sizeof(chunk_t) bytes of output even if the copy is shorter than this.
|
||||
This assumption holds because inflate_fast() starts every iteration with at
|
||||
least 258 bytes of output space available (258 being the maximum length
|
||||
output from a single token; see inflate_fast()'s assumptions below). */
|
||||
#ifndef HAVE_CHUNKUNROLL
|
||||
Z_INTERNAL uint8_t* CHUNKUNROLL(uint8_t *out, unsigned *dist, unsigned *len) {
|
||||
unsigned char const *from = out - *dist;
|
||||
chunk_t chunk;
|
||||
while (*dist < *len && *dist < sizeof(chunk_t)) {
|
||||
loadchunk(from, &chunk);
|
||||
storechunk(out, &chunk);
|
||||
out += *dist;
|
||||
*len -= *dist;
|
||||
*dist += *dist;
|
||||
}
|
||||
return out;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifndef HAVE_CHUNK_MAG
|
||||
/* Loads a magazine to feed into memory of the pattern */
|
||||
static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t dist) {
|
||||
/* This code takes string of length dist from "from" and repeats
|
||||
* it for as many times as can fit in a chunk_t (vector register) */
|
||||
uint32_t cpy_dist;
|
||||
uint32_t bytes_remaining = sizeof(chunk_t);
|
||||
chunk_t chunk_load;
|
||||
uint8_t *cur_chunk = (uint8_t *)&chunk_load;
|
||||
while (bytes_remaining) {
|
||||
cpy_dist = MIN(dist, bytes_remaining);
|
||||
memcpy(cur_chunk, buf, cpy_dist);
|
||||
bytes_remaining -= cpy_dist;
|
||||
cur_chunk += cpy_dist;
|
||||
/* This allows us to bypass an expensive integer division since we're effectively
|
||||
* counting in this loop, anyway */
|
||||
*chunk_rem = cpy_dist;
|
||||
}
|
||||
|
||||
return chunk_load;
|
||||
}
|
||||
#endif
|
||||
|
||||
/* Copy DIST bytes from OUT - DIST into OUT + DIST * k, for 0 <= k < LEN/DIST.
|
||||
Return OUT + LEN. */
|
||||
Z_INTERNAL uint8_t* CHUNKMEMSET(uint8_t *out, unsigned dist, unsigned len) {
|
||||
/* Debug performance related issues when len < sizeof(uint64_t):
|
||||
Assert(len >= sizeof(uint64_t), "chunkmemset should be called on larger chunks"); */
|
||||
Assert(dist > 0, "chunkmemset cannot have a distance 0");
|
||||
/* Only AVX2 */
|
||||
#if CHUNK_SIZE == 32 && defined(X86_SSSE3) && defined(X86_SSE2)
|
||||
if (len <= 16) {
|
||||
return chunkmemset_ssse3(out, dist, len);
|
||||
}
|
||||
#endif
|
||||
|
||||
uint8_t *from = out - dist;
|
||||
|
||||
if (dist == 1) {
|
||||
memset(out, *from, len);
|
||||
return out + len;
|
||||
} else if (dist > sizeof(chunk_t)) {
|
||||
return CHUNKCOPY(out, out - dist, len);
|
||||
}
|
||||
|
||||
chunk_t chunk_load;
|
||||
uint32_t chunk_mod = 0;
|
||||
|
||||
/* TODO: possibly build up a permutation table for this if not an even modulus */
|
||||
#ifdef HAVE_CHUNKMEMSET_2
|
||||
if (dist == 2) {
|
||||
chunkmemset_2(from, &chunk_load);
|
||||
} else
|
||||
#endif
|
||||
#ifdef HAVE_CHUNKMEMSET_4
|
||||
if (dist == 4) {
|
||||
chunkmemset_4(from, &chunk_load);
|
||||
} else
|
||||
#endif
|
||||
#ifdef HAVE_CHUNKMEMSET_8
|
||||
if (dist == 8) {
|
||||
chunkmemset_8(from, &chunk_load);
|
||||
} else if (dist == sizeof(chunk_t)) {
|
||||
loadchunk(from, &chunk_load);
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
chunk_load = GET_CHUNK_MAG(from, &chunk_mod, dist);
|
||||
}
|
||||
|
||||
/* If we're lucky enough and dist happens to be an even modulus of our vector length,
|
||||
* we can do two stores per loop iteration, which for most ISAs, especially x86, is beneficial */
|
||||
if (chunk_mod == 0) {
|
||||
while (len >= (2 * sizeof(chunk_t))) {
|
||||
storechunk(out, &chunk_load);
|
||||
storechunk(out + sizeof(chunk_t), &chunk_load);
|
||||
out += 2 * sizeof(chunk_t);
|
||||
len -= 2 * sizeof(chunk_t);
|
||||
}
|
||||
}
|
||||
|
||||
/* If we don't have a "dist" length that divides evenly into a vector
|
||||
* register, we can write the whole vector register but we need only
|
||||
* advance by the amount of the whole string that fits in our chunk_t.
|
||||
* If we do divide evenly into the vector length, adv_amount = chunk_t size*/
|
||||
uint32_t adv_amount = sizeof(chunk_t) - chunk_mod;
|
||||
while (len >= sizeof(chunk_t)) {
|
||||
storechunk(out, &chunk_load);
|
||||
len -= adv_amount;
|
||||
out += adv_amount;
|
||||
}
|
||||
|
||||
if (len) {
|
||||
memcpy(out, &chunk_load, len);
|
||||
out += len;
|
||||
}
|
||||
|
||||
return out;
|
||||
}
|
||||
|
||||
/* Bounds-limited front end for CHUNKMEMSET.
 *
 * Copies up to `len` bytes from `out - dist` to `out`, never writing more than
 * `left` bytes, and returns the output pointer advanced past what was written.
 * The chunked copy is only used when at least three chunk_t widths of space
 * remain; otherwise the copy is done one byte at a time.
 */
Z_INTERNAL uint8_t* CHUNKMEMSET_SAFE(uint8_t *out, unsigned dist, unsigned len, unsigned left) {
#if !defined(UNALIGNED64_OK)
#  if !defined(UNALIGNED_OK)
    static const uint32_t align_mask = 7;
#  else
    static const uint32_t align_mask = 3;
#  endif
#endif
    uint8_t *from;

    /* Clamp the request to the space that is still available. */
    if (left < len)
        len = left;
    from = out - dist;

#if !defined(UNALIGNED64_OK)
    /* Copy single bytes until `out` satisfies align_mask (or nothing is left to copy). */
    for (; len > 0 && ((uintptr_t)out & align_mask) != 0; --len, --left)
        *out++ = *from++;
#endif

    /* Too little room left for the chunked path: finish with a plain byte loop. */
    if (left < (unsigned)(3 * sizeof(chunk_t))) {
        for (; len > 0; --len)
            *out++ = *from++;
        return out;
    }

    return len ? CHUNKMEMSET(out, dist, len) : out;
}
|
||||
111
deps/zlib-ng/cmake/detect-arch.c
vendored
Normal file
111
deps/zlib-ng/cmake/detect-arch.c
vendored
Normal file
@@ -0,0 +1,111 @@
|
||||
// detect-arch.c -- Detect compiler architecture and raise preprocessor error
//                  containing a simple arch identifier.
// Copyright (C) 2019 Hans Kristian Rosbach
// Licensed under the Zlib license, see LICENSE.md for details
//
// This file is not meant to compile: the selected branch raises
// "#error archfound <id>", and the build system extracts <id> from the
// compiler's diagnostic output.

// x86_64
#if defined(__x86_64__) || defined(_M_X64)
    #error archfound x86_64

// x86
#elif defined(__i386) || defined(_M_IX86)
    #error archfound i686

// ARM
#elif defined(__aarch64__) || defined(__arm64__) || defined(_M_ARM64)
    #error archfound aarch64
#elif defined(__arm__) || defined(__arm) || defined(_M_ARM) || defined(__TARGET_ARCH_ARM)
    #if defined(__ARM64_ARCH_8__) || defined(__ARMv8__) || defined(__ARMv8_A__)
        #error archfound armv8
    #elif defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__)
        #error archfound armv7
    #elif defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6T2__) || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6M__)
        #error archfound armv6
    #elif defined(__ARM_ARCH_5T__) || defined(__ARM_ARCH_5TE__) || defined(__ARM_ARCH_5TEJ__)
        #error archfound armv5
    #elif defined(__ARM_ARCH_4T__) || defined(__TARGET_ARCH_5E__)
        #error archfound armv4
    #elif defined(__ARM_ARCH_3__) || defined(__TARGET_ARCH_3M__)
        #error archfound armv3
    #elif defined(__ARM_ARCH_2__)
        #error archfound armv2
    #else
        // ARM target without a recognized version macro: still report an
        // identifier so that the tag is always present in the output.
        #error archfound arm
    #endif

// PowerPC
// (__ppc__ replaces the former "_ppc__", which was a typo and matched nothing.)
#elif defined(__powerpc__) || defined(__ppc__) || defined(__PPC__)
    #if defined(__64BIT__) || defined(__powerpc64__) || defined(__ppc64__)
        // Require __BYTE_ORDER__ to be defined: two undefined macros compare
        // equal (0 == 0) and would wrongly select the little-endian id.
        #if (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) || defined(__LITTLE_ENDIAN__)
            #error archfound powerpc64le
        #else
            #error archfound powerpc64
        #endif
    #else
        #error archfound powerpc
    #endif

// --------------- Less common architectures alphabetically below ---------------

// ALPHA
#elif defined(__alpha__) || defined(__alpha)
    #error archfound alpha

// Blackfin
#elif defined(__BFIN__)
    #error archfound blackfin

// Itanium
#elif defined(__ia64) || defined(_M_IA64)
    #error archfound ia64

// MIPS
#elif defined(__mips__) || defined(__mips)
    #error archfound mips

// Motorola 68000-series
#elif defined(__m68k__)
    #error archfound m68k

// SuperH
#elif defined(__sh__)
    #error archfound sh

// SPARC
#elif defined(__sparc__) || defined(__sparc)
    #if defined(__sparcv9) || defined(__sparc_v9__)
        #error archfound sparc9
    #elif defined(__sparcv8) || defined(__sparc_v8__)
        #error archfound sparc8
    #else
        // SPARC target without a recognized version macro.
        #error archfound sparc
    #endif

// SystemZ
// The 64-bit macros are tested before __s390__ so that a 64-bit target which
// defines both is reported as s390x rather than s390.
#elif defined(__370__)
    #error archfound s370
#elif defined(__s390x__) || defined(__s390x)
    #error archfound s390x
#elif defined(__s390__)
    #error archfound s390
#elif defined(__zarch__)
    #error archfound s390x

// PARISC
#elif defined(__hppa__)
    #error archfound parisc

// RS-6000
#elif defined(__THW_RS6000)
    #error archfound rs6000

// RISC-V
#elif defined(__riscv)
    #if __riscv_xlen == 64
        #error archfound riscv64
    #elif __riscv_xlen == 32
        #error archfound riscv32
    #endif

// Emscripten (WebAssembly)
#elif defined(__EMSCRIPTEN__)
    #error archfound wasm32

// return 'unrecognized' if we do not know what architecture this is
#else
    #error archfound unrecognized
#endif
|
||||
101
deps/zlib-ng/cmake/detect-arch.cmake
vendored
Normal file
101
deps/zlib-ng/cmake/detect-arch.cmake
vendored
Normal file
@@ -0,0 +1,101 @@
|
||||
# detect-arch.cmake -- Detect compiler architecture and set ARCH and BASEARCH
|
||||
# Copyright (C) 2019 Hans Kristian Rosbach
|
||||
# Licensed under the Zlib license, see LICENSE.md for details
|
||||
set(ARCHDETECT_FOUND TRUE)

# Determine ARCH from the most specific source of information available.
if(CMAKE_OSX_ARCHITECTURES)
    # If multiple architectures are requested (universal build), pick only the first
    list(GET CMAKE_OSX_ARCHITECTURES 0 ARCH)
elseif(MSVC)
    # Map the MSVC architecture id onto the identifiers used by this module.
    # An id that is not listed leaves ARCH unset and is handled by the fallback below.
    if("${MSVC_C_ARCHITECTURE_ID}" STREQUAL "X86")
        set(ARCH "i686")
    elseif("${MSVC_C_ARCHITECTURE_ID}" STREQUAL "x64")
        set(ARCH "x86_64")
    elseif("${MSVC_C_ARCHITECTURE_ID}" STREQUAL "ARM" OR "${MSVC_C_ARCHITECTURE_ID}" STREQUAL "ARMV7")
        set(ARCH "arm")
    elseif("${MSVC_C_ARCHITECTURE_ID}" STREQUAL "ARM64")
        set(ARCH "aarch64")
    endif()
elseif(EMSCRIPTEN)
    set(ARCH "wasm32")
elseif(CMAKE_CROSSCOMPILING)
    set(ARCH "${CMAKE_C_COMPILER_TARGET}")
else()
    # Let the preprocessor parse detect-arch.c and raise an error containing the arch identifier
    enable_language(C)
    try_run(
        run_result_unused
        compile_result_unused
        ${CMAKE_CURRENT_BINARY_DIR}
        ${CMAKE_CURRENT_LIST_DIR}/detect-arch.c
        COMPILE_OUTPUT_VARIABLE RAWOUTPUT
        CMAKE_FLAGS CMAKE_OSX_ARCHITECTURES=${CMAKE_OSX_ARCHITECTURES}
    )

    # Find the "archfound" tag and extract the word that follows it into ARCH.
    # A plain REGEX REPLACE would hand back the entire compiler output when the
    # tag is missing, so match explicitly and fall back to "unknown" instead.
    if("${RAWOUTPUT}" MATCHES "archfound ([a-zA-Z0-9_]+)")
        set(ARCH "${CMAKE_MATCH_1}")
    else()
        set(ARCH unknown)
    endif()
endif()

# Make sure we have ARCH set. "unrecognized" is the identifier detect-arch.c
# emits when none of its known architecture macros are defined.
if(NOT ARCH OR ARCH STREQUAL "unknown" OR ARCH STREQUAL "unrecognized")
    set(ARCH ${CMAKE_SYSTEM_PROCESSOR})
    message(STATUS "Arch not recognized, falling back to cmake arch: '${ARCH}'")
else()
    message(STATUS "Arch detected: '${ARCH}'")
endif()
|
||||
|
||||
# Base arch detection
#
# Maps ARCH onto a coarse architecture family: sets BASEARCH to the family name
# and BASEARCH_<FAMILY>_FOUND to TRUE. The two lists below are parallel; entries
# are tried in order and the first regular expression that matches ARCH wins, so
# their ordering is significant. When nothing matches, the family defaults to x86.
set(zng_basearch_names
    x86
    arm
    ppc
    alpha
    blackfin
    ia64
    mips
    m68k
    sh
    sparc
    s360
    parisc
    rs6000
    riscv
    wasm32
)
set(zng_basearch_regexes
    "(x86_64|AMD64|i[3-6]86)"
    "(arm(v[0-9])?|aarch64)"
    "ppc(64(le)?)?|powerpc(64(le)?)?"
    "alpha"
    "blackfin"
    "ia64"
    "mips"
    "m68k"
    "sh"
    "sparc[89]?"
    "s3[679]0x?"
    "parisc"
    "rs6000"
    "riscv(32|64)"
    "wasm32"
)

list(LENGTH zng_basearch_names zng_basearch_count)
math(EXPR zng_basearch_last "${zng_basearch_count} - 1")
set(zng_basearch_matched FALSE)

foreach(zng_basearch_index RANGE ${zng_basearch_last})
    list(GET zng_basearch_regexes ${zng_basearch_index} zng_basearch_regex)
    if("${ARCH}" MATCHES "${zng_basearch_regex}")
        list(GET zng_basearch_names ${zng_basearch_index} BASEARCH)
        string(TOUPPER "${BASEARCH}" zng_basearch_upper)
        set(BASEARCH_${zng_basearch_upper}_FOUND TRUE)
        set(zng_basearch_matched TRUE)
        break()
    endif()
endforeach()

if(NOT zng_basearch_matched)
    set(BASEARCH "x86")
    set(BASEARCH_X86_FOUND TRUE)
    message(STATUS "Basearch '${ARCH}' not recognized, defaulting to 'x86'.")
endif()
message(STATUS "Basearch of '${ARCH}' has been detected as: '${BASEARCH}'")

# Drop the helper variables used by the lookup above.
unset(zng_basearch_names)
unset(zng_basearch_regexes)
unset(zng_basearch_count)
unset(zng_basearch_last)
unset(zng_basearch_matched)
unset(zng_basearch_regex)
unset(zng_basearch_upper)
|
||||
46
deps/zlib-ng/cmake/detect-coverage.cmake
vendored
Normal file
46
deps/zlib-ng/cmake/detect-coverage.cmake
vendored
Normal file
@@ -0,0 +1,46 @@
|
||||
# detect-coverage.cmake -- Detect supported compiler coverage flags
|
||||
# Licensed under the Zlib license, see LICENSE.md for details
|
||||
|
||||
# add_code_coverage()
#
# Probes the C compiler for coverage instrumentation and enables it for the
# current directory scope via add_compile_options()/add_link_options().
# The -coverage shorthand is tried first, then the explicit
# -ftest-coverage/-fprofile-arcs/-fprofile-values set. If neither works a
# warning is issued and WITH_CODE_COVERAGE is set to OFF. While
# WITH_CODE_COVERAGE is still true, -O0 is prepended to the C and C++ flags.
macro(add_code_coverage)
    # Check for -coverage flag support for Clang/GCC
    if(CMAKE_VERSION VERSION_LESS 3.14)
        set(CMAKE_REQUIRED_LIBRARIES -lgcov)
    else()
        set(CMAKE_REQUIRED_LINK_OPTIONS -coverage)
    endif()
    check_c_compiler_flag(-coverage HAVE_COVERAGE)
    unset(CMAKE_REQUIRED_LIBRARIES)
    unset(CMAKE_REQUIRED_LINK_OPTIONS)

    if(NOT HAVE_COVERAGE)
        # Some versions of GCC don't support -coverage shorthand
        if(CMAKE_VERSION VERSION_LESS 3.14)
            set(CMAKE_REQUIRED_LIBRARIES -lgcov)
        else()
            set(CMAKE_REQUIRED_LINK_OPTIONS -lgcov -fprofile-arcs)
        endif()
        check_c_compiler_flag("-ftest-coverage -fprofile-arcs -fprofile-values" HAVE_TEST_COVERAGE)
        unset(CMAKE_REQUIRED_LIBRARIES)
        unset(CMAKE_REQUIRED_LINK_OPTIONS)

        if(NOT HAVE_TEST_COVERAGE)
            message(WARNING "Compiler does not support code coverage")
            set(WITH_CODE_COVERAGE OFF)
        else()
            add_compile_options(-ftest-coverage -fprofile-arcs -fprofile-values)
            add_link_options(-lgcov -fprofile-arcs)
            message(STATUS "Code coverage enabled using: -ftest-coverage")
        endif()
    else()
        add_compile_options(-coverage)
        add_link_options(-coverage)
        message(STATUS "Code coverage enabled using: -coverage")
    endif()

    # Set optimization level to zero for code coverage builds
    if(WITH_CODE_COVERAGE)
        # Use CMake compiler flag variables due to add_compile_options failure on Windows GCC
        set(CMAKE_C_FLAGS "-O0 ${CMAKE_C_FLAGS}")
        set(CMAKE_CXX_FLAGS "-O0 ${CMAKE_CXX_FLAGS}")
    endif()
endmacro()
|
||||
43
deps/zlib-ng/cmake/detect-install-dirs.cmake
vendored
Normal file
43
deps/zlib-ng/cmake/detect-install-dirs.cmake
vendored
Normal file
@@ -0,0 +1,43 @@
|
||||
# detect-install-dirs.cmake -- Detect install directory parameters
|
||||
# Copyright (C) 2021 Hans Kristian Rosbach
|
||||
# Licensed under the Zlib license, see LICENSE.md for details
|
||||
|
||||
# Executables: the deprecated BIN_INSTALL_DIR takes precedence over
# INSTALL_BIN_DIR; whichever is defined seeds CMAKE_INSTALL_BINDIR.
if(DEFINED BIN_INSTALL_DIR)
    set(BIN_INSTALL_DIR "${BIN_INSTALL_DIR}" CACHE PATH "Installation directory for executables (Deprecated)" FORCE)
    set(CMAKE_INSTALL_BINDIR "${BIN_INSTALL_DIR}")
elseif(DEFINED INSTALL_BIN_DIR)
    set(CMAKE_INSTALL_BINDIR "${INSTALL_BIN_DIR}")
endif()

# Libraries: same scheme, feeding CMAKE_INSTALL_LIBDIR.
if(DEFINED LIB_INSTALL_DIR)
    set(LIB_INSTALL_DIR "${LIB_INSTALL_DIR}" CACHE PATH "Installation directory for libraries (Deprecated)" FORCE)
    set(CMAKE_INSTALL_LIBDIR "${LIB_INSTALL_DIR}")
elseif(DEFINED INSTALL_LIB_DIR)
    set(CMAKE_INSTALL_LIBDIR "${INSTALL_LIB_DIR}")
endif()

# Headers: same scheme, feeding CMAKE_INSTALL_INCLUDEDIR.
if(DEFINED INC_INSTALL_DIR)
    set(INC_INSTALL_DIR "${INC_INSTALL_DIR}" CACHE PATH "Installation directory for headers (Deprecated)" FORCE)
    set(CMAKE_INSTALL_INCLUDEDIR "${INC_INSTALL_DIR}")
elseif(DEFINED INSTALL_INC_DIR)
    set(CMAKE_INSTALL_INCLUDEDIR "${INSTALL_INC_DIR}")
endif()

# Define GNU standard installation directories. This is included after the
# overrides above so that any directories set there are already in place.
include(GNUInstallDirs)

# pkgconfig files: use the first of the candidate variables that is defined,
# otherwise default to a "pkgconfig" directory under the library directory.
if(DEFINED PKGCONFIG_INSTALL_DIR)
    set(PKGCONFIG_INSTALL_DIR "${PKGCONFIG_INSTALL_DIR}" CACHE PATH "Installation directory for pkgconfig (.pc) files" FORCE)
elseif(DEFINED INSTALL_PKGCONFIG_DIR)
    set(PKGCONFIG_INSTALL_DIR "${INSTALL_PKGCONFIG_DIR}" CACHE PATH "Installation directory for pkgconfig (.pc) files" FORCE)
elseif(DEFINED CMAKE_INSTALL_PKGCONFIGDIR)
    set(PKGCONFIG_INSTALL_DIR "${CMAKE_INSTALL_PKGCONFIGDIR}" CACHE PATH "Installation directory for pkgconfig (.pc) files" FORCE)
elseif(DEFINED CMAKE_INSTALL_FULL_PKGCONFIGDIR)
    set(PKGCONFIG_INSTALL_DIR "${CMAKE_INSTALL_FULL_PKGCONFIGDIR}" CACHE PATH "Installation directory for pkgconfig (.pc) files" FORCE)
else()
    set(PKGCONFIG_INSTALL_DIR "${CMAKE_INSTALL_LIBDIR}/pkgconfig" CACHE PATH "Installation directory for pkgconfig (.pc) files")
endif()
|
||||
548
deps/zlib-ng/cmake/detect-intrinsics.cmake
vendored
Normal file
548
deps/zlib-ng/cmake/detect-intrinsics.cmake
vendored
Normal file
@@ -0,0 +1,548 @@
|
||||
# detect-intrinsics.cmake -- Detect compiler intrinsics support
|
||||
# Licensed under the Zlib license, see LICENSE.md for details
|
||||
|
||||
# check_acle_compiler_flag()
#
# Sets HAVE_ACLE_FLAG when the compiler can build with ARM ACLE support, and
# ACLEFLAG to the option that enables it. With MSVC only the ARM64 target is
# accepted. For other compilers "-march=armv8-a+crc" is probed first, then
# "-march=armv8-a+crc+simd" if that fails and NATIVEFLAG is not set.
macro(check_acle_compiler_flag)
    if(MSVC)
        # Both ARM and ARM64-targeting msvc support intrinsics, but
        # ARM msvc is missing some intrinsics introduced with ARMv8, e.g. crc32
        if(MSVC_C_ARCHITECTURE_ID STREQUAL "ARM64")
            set(HAVE_ACLE_FLAG TRUE)
        endif()
    else()
        if(CMAKE_C_COMPILER_ID MATCHES "GNU|Clang" AND NOT NATIVEFLAG AND NOT HAVE_ACLE_FLAG)
            set(ACLEFLAG "-march=armv8-a+crc" CACHE INTERNAL "Compiler option to enable ACLE support")
        endif()

        # First attempt: does the compiler accept the ACLE flag?
        set(CMAKE_REQUIRED_FLAGS "${ACLEFLAG} ${NATIVEFLAG}")
        check_c_source_compiles(
            "int main() { return 0; }"
            HAVE_ACLE_FLAG FAIL_REGEX "not supported")

        if(NOT NATIVEFLAG AND NOT HAVE_ACLE_FLAG)
            # Second attempt with the simd extension added to the flag
            set(ACLEFLAG "-march=armv8-a+crc+simd" CACHE INTERNAL "Compiler option to enable ACLE support" FORCE)
            set(CMAKE_REQUIRED_FLAGS "${ACLEFLAG}")
            check_c_source_compiles(
                "int main() { return 0; }"
                HAVE_ACLE_FLAG2 FAIL_REGEX "not supported")
            set(HAVE_ACLE_FLAG ${HAVE_ACLE_FLAG2} CACHE INTERNAL "Have compiler option to enable ACLE intrinsics" FORCE)
            unset(HAVE_ACLE_FLAG2 CACHE) # Don't cache this internal variable
        endif()
        unset(CMAKE_REQUIRED_FLAGS)
    endif()
endmacro()
|
||||
|
||||
# check_avx512_intrinsics()
#
# Chooses AVX512FLAG for the active compiler, then sets HAVE_AVX512_INTRIN and
# HAVE_MASK_INTRIN according to whether small programs using AVX512 vector and
# mask intrinsics pass check_c_source_compile_or_run (a helper defined
# elsewhere in the project).
macro(check_avx512_intrinsics)
    if(CMAKE_C_COMPILER_ID MATCHES "Intel")
        if(CMAKE_HOST_UNIX OR APPLE)
            set(AVX512FLAG "-mavx512f -mavx512dq -mavx512bw -mavx512vl")
        else()
            set(AVX512FLAG "/arch:AVX512")
        endif()
    elseif(CMAKE_C_COMPILER_ID MATCHES "GNU|Clang")
        if(NOT NATIVEFLAG)
            # For CPUs that can benefit from AVX512, it seems GCC generates suboptimal
            # instruction scheduling unless you specify a reasonable -mtune= target
            set(AVX512FLAG "-mavx512f -mavx512dq -mavx512bw -mavx512vl")
            if(NOT CMAKE_GENERATOR_TOOLSET MATCHES "ClangCl")
                check_c_compiler_flag("-mtune=cascadelake" HAVE_CASCADE_LAKE)
                if(HAVE_CASCADE_LAKE)
                    string(APPEND AVX512FLAG " -mtune=cascadelake")
                else()
                    string(APPEND AVX512FLAG " -mtune=skylake-avx512")
                endif()
                unset(HAVE_CASCADE_LAKE)
            endif()
        endif()
    elseif(MSVC)
        set(AVX512FLAG "/arch:AVX512")
    endif()

    # Both probes below share the same required flags
    set(CMAKE_REQUIRED_FLAGS "${AVX512FLAG} ${NATIVEFLAG}")

    # Check whether compiler supports AVX512 intrinsics
    check_c_source_compile_or_run(
        "#include <immintrin.h>
        int main(void) {
            __m512i x = _mm512_set1_epi8(2);
            const __m512i y = _mm512_set_epi32(0x1020304, 0x5060708, 0x90a0b0c, 0xd0e0f10,
                                               0x11121314, 0x15161718, 0x191a1b1c, 0x1d1e1f20,
                                               0x21222324, 0x25262728, 0x292a2b2c, 0x2d2e2f30,
                                               0x31323334, 0x35363738, 0x393a3b3c, 0x3d3e3f40);
            x = _mm512_sub_epi8(x, y);
            (void)x;
            return 0;
        }"
        HAVE_AVX512_INTRIN
    )

    # Evidently both GCC and clang were late to implementing these
    check_c_source_compile_or_run(
        "#include <immintrin.h>
        int main(void) {
            __mmask16 a = 0xFF;
            a = _knot_mask16(a);
            (void)a;
            return 0;
        }"
        HAVE_MASK_INTRIN
    )
    unset(CMAKE_REQUIRED_FLAGS)
endmacro()
|
||||
|
||||
# check_avx512vnni_intrinsics()
#
# Chooses AVX512VNNIFLAG for the active compiler and sets
# HAVE_AVX512VNNI_INTRIN according to whether a program using
# _mm512_dpbusd_epi32 passes check_c_source_compile_or_run (a helper defined
# elsewhere in the project).
macro(check_avx512vnni_intrinsics)
    if(CMAKE_C_COMPILER_ID MATCHES "Intel")
        if(CMAKE_HOST_UNIX OR APPLE)
            set(AVX512VNNIFLAG "-mavx512f -mavx512bw -mavx512dq -mavx512vl -mavx512vnni")
        else()
            set(AVX512VNNIFLAG "/arch:AVX512")
        endif()
    elseif(CMAKE_C_COMPILER_ID MATCHES "GNU|Clang")
        if(NOT NATIVEFLAG)
            set(AVX512VNNIFLAG "-mavx512f -mavx512dq -mavx512bw -mavx512vl -mavx512vnni")
            if(NOT CMAKE_GENERATOR_TOOLSET MATCHES "ClangCl")
                # Add a -mtune target, using cascadelake when the compiler accepts it
                check_c_compiler_flag("-mtune=cascadelake" HAVE_CASCADE_LAKE)
                if(HAVE_CASCADE_LAKE)
                    string(APPEND AVX512VNNIFLAG " -mtune=cascadelake")
                else()
                    string(APPEND AVX512VNNIFLAG " -mtune=skylake-avx512")
                endif()
                unset(HAVE_CASCADE_LAKE)
            endif()
        endif()
    elseif(MSVC)
        set(AVX512VNNIFLAG "/arch:AVX512")
    endif()

    # Check whether compiler supports AVX512vnni intrinsics
    set(CMAKE_REQUIRED_FLAGS "${AVX512VNNIFLAG} ${NATIVEFLAG}")
    check_c_source_compile_or_run(
        "#include <immintrin.h>
        int main(void) {
            __m512i x = _mm512_set1_epi8(2);
            const __m512i y = _mm512_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
                                              20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37,
                                              38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55,
                                              56, 57, 58, 59, 60, 61, 62, 63, 64);
            __m512i z = _mm512_setzero_epi32();
            z = _mm512_dpbusd_epi32(z, x, y);
            (void)z;
            return 0;
        }"
        HAVE_AVX512VNNI_INTRIN
    )
    unset(CMAKE_REQUIRED_FLAGS)
endmacro()
|
||||
|
||||
# check_avx2_intrinsics()
#
# Chooses AVX2FLAG for the active compiler and sets HAVE_AVX2_INTRIN according
# to whether a program using 256-bit integer intrinsics passes
# check_c_source_compile_or_run (a helper defined elsewhere in the project).
macro(check_avx2_intrinsics)
    if(CMAKE_C_COMPILER_ID MATCHES "Intel")
        if(CMAKE_HOST_UNIX OR APPLE)
            set(AVX2FLAG "-mavx2")
        else()
            set(AVX2FLAG "/arch:AVX2")
        endif()
    elseif(CMAKE_C_COMPILER_ID MATCHES "GNU|Clang")
        if(NOT NATIVEFLAG)
            set(AVX2FLAG "-mavx2")
        endif()
    elseif(MSVC)
        set(AVX2FLAG "/arch:AVX2")
    endif()

    # Check whether compiler supports AVX2 intrinsics
    set(CMAKE_REQUIRED_FLAGS "${AVX2FLAG} ${NATIVEFLAG}")
    check_c_source_compile_or_run(
        "#include <immintrin.h>
        int main(void) {
            __m256i x = _mm256_set1_epi16(2);
            const __m256i y = _mm256_set1_epi16(1);
            x = _mm256_subs_epu16(x, y);
            (void)x;
            return 0;
        }"
        HAVE_AVX2_INTRIN
    )
    unset(CMAKE_REQUIRED_FLAGS)
endmacro()
|
||||
|
||||
# check_neon_compiler_flag()
#
# Chooses NEONFLAG for GNU/Clang compilers (unless NATIVEFLAG is set) and sets
# MFPU_NEON_AVAILABLE when a program including the NEON header compiles with
# those flags.
macro(check_neon_compiler_flag)
    if(CMAKE_C_COMPILER_ID MATCHES "GNU|Clang" AND NOT NATIVEFLAG)
        if("${ARCH}" MATCHES "aarch64")
            set(NEONFLAG "-march=armv8-a+simd")
        else()
            set(NEONFLAG "-mfpu=neon")
        endif()
    endif()

    # Check whether compiler supports NEON flag
    set(CMAKE_REQUIRED_FLAGS "${NEONFLAG} ${NATIVEFLAG}")
    check_c_source_compiles(
        "#ifdef _M_ARM64
        #  include <arm64_neon.h>
        #else
        #  include <arm_neon.h>
        #endif
        int main() { return 0; }"
        MFPU_NEON_AVAILABLE FAIL_REGEX "not supported")
    unset(CMAKE_REQUIRED_FLAGS)
endmacro()
|
||||
|
||||
# check_neon_ld4_intrinsics()
#
# Chooses NEONFLAG for GNU/Clang compilers (unless NATIVEFLAG is set) and sets
# NEON_HAS_LD4 when a program using vld1q_s32_x4 compiles.
#
# NATIVEFLAG is included in the required flags, as in the other checks in this
# file. NEONFLAG is not set here when NATIVEFLAG is in use, so omitting it would
# run the probe without the native architecture flags.
macro(check_neon_ld4_intrinsics)
    if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
        if(NOT NATIVEFLAG)
            if("${ARCH}" MATCHES "aarch64")
                set(NEONFLAG "-march=armv8-a+simd")
            else()
                set(NEONFLAG "-mfpu=neon")
            endif()
        endif()
    endif()

    # Check whether compiler supports loading 4 neon vecs into a register range
    set(CMAKE_REQUIRED_FLAGS "${NEONFLAG} ${NATIVEFLAG}")
    check_c_source_compiles(
        "#ifdef _M_ARM64
        #  include <arm64_neon.h>
        #else
        #  include <arm_neon.h>
        #endif
        int main(void) {
            int stack_var[16];
            int32x4x4_t v = vld1q_s32_x4(stack_var);
            (void)v;
            return 0;
        }"
        NEON_HAS_LD4)
    set(CMAKE_REQUIRED_FLAGS)
endmacro()
|
||||
|
||||
# check_pclmulqdq_intrinsics()
#
# Chooses PCLMULFLAG for GNU/Clang compilers (unless NATIVEFLAG is set) and sets
# HAVE_PCLMULQDQ_INTRIN according to whether a program using
# _mm_clmulepi64_si128 passes check_c_source_compile_or_run (a helper defined
# elsewhere in the project). The probe is skipped, and the result forced to
# OFF, on Apple targets whose ARCH matches "i386".
macro(check_pclmulqdq_intrinsics)
    if(CMAKE_C_COMPILER_ID MATCHES "GNU|Clang" AND NOT NATIVEFLAG)
        set(PCLMULFLAG "-mpclmul")
    endif()

    if(APPLE AND "${ARCH}" MATCHES "i386")
        # The pclmul code currently crashes on Mac in 32bit mode. Avoid for now.
        set(HAVE_PCLMULQDQ_INTRIN OFF)
    else()
        # Check whether compiler supports PCLMULQDQ intrinsics
        set(CMAKE_REQUIRED_FLAGS "${PCLMULFLAG} ${NATIVEFLAG}")
        check_c_source_compile_or_run(
            "#include <immintrin.h>
            int main(void) {
                __m128i a = _mm_setzero_si128();
                __m128i b = _mm_setzero_si128();
                __m128i c = _mm_clmulepi64_si128(a, b, 0x10);
                (void)c;
                return 0;
            }"
            HAVE_PCLMULQDQ_INTRIN
        )
        unset(CMAKE_REQUIRED_FLAGS)
    endif()
endmacro()
|
||||
|
||||
# check_vpclmulqdq_intrinsics()
#
# Chooses VPCLMULFLAG for GNU/Clang compilers (unless NATIVEFLAG is set) and
# sets HAVE_VPCLMULQDQ_INTRIN according to whether a program using
# _mm512_clmulepi64_epi128 passes check_c_source_compile_or_run (a helper
# defined elsewhere in the project). The probe is skipped, and the result
# forced to OFF, on Apple targets whose ARCH matches "i386".
macro(check_vpclmulqdq_intrinsics)
    if(CMAKE_C_COMPILER_ID MATCHES "GNU|Clang" AND NOT NATIVEFLAG)
        set(VPCLMULFLAG "-mvpclmulqdq -mavx512f")
    endif()

    if(APPLE AND "${ARCH}" MATCHES "i386")
        set(HAVE_VPCLMULQDQ_INTRIN OFF)
    else()
        # Check whether compiler supports VPCLMULQDQ intrinsics
        set(CMAKE_REQUIRED_FLAGS "${VPCLMULFLAG} ${NATIVEFLAG}")
        check_c_source_compile_or_run(
            "#include <immintrin.h>
            int main(void) {
                __m512i a = _mm512_setzero_si512();
                __m512i b = _mm512_setzero_si512();
                __m512i c = _mm512_clmulepi64_epi128(a, b, 0x10);
                (void)c;
                return 0;
            }"
            HAVE_VPCLMULQDQ_INTRIN
        )
        unset(CMAKE_REQUIRED_FLAGS)
    endif()
endmacro()
|
||||
|
||||
# check_ppc_intrinsics()
#
# Runs three compile probes for PowerPC AltiVec support:
#   HAVE_ALTIVEC - an AltiVec program builds with -maltivec
#   HAVE_NOVSX   - the same program builds with -maltivec -mno-vsx
#   HAVE_VMX     - a program querying the AltiVec hardware capability bit builds
# PPCFLAGS accumulates the flags whose probes succeeded.
macro(check_ppc_intrinsics)
    # Check if compiler supports AltiVec
    set(CMAKE_REQUIRED_FLAGS "-maltivec")
    check_c_source_compiles(
        "#include <altivec.h>
        int main(void)
        {
            vector int a = vec_splats(0);
            vector int b = vec_splats(0);
            a = vec_add(a, b);
            return 0;
        }"
        HAVE_ALTIVEC
    )
    unset(CMAKE_REQUIRED_FLAGS)

    if(HAVE_ALTIVEC)
        set(PPCFLAGS "-maltivec")
    endif()

    # Check if AltiVec can be used with VSX disabled
    set(CMAKE_REQUIRED_FLAGS "-maltivec -mno-vsx")
    check_c_source_compiles(
        "#include <altivec.h>
        int main(void)
        {
            vector int a = vec_splats(0);
            vector int b = vec_splats(0);
            a = vec_add(a, b);
            return 0;
        }"
        HAVE_NOVSX
    )
    unset(CMAKE_REQUIRED_FLAGS)

    if(HAVE_NOVSX)
        string(APPEND PPCFLAGS " -mno-vsx")
    endif()

    # Check if we have what we need for AltiVec optimizations
    set(CMAKE_REQUIRED_FLAGS "${PPCFLAGS} ${NATIVEFLAG}")
    check_c_source_compiles(
        "#include <sys/auxv.h>
        #ifdef __FreeBSD__
        #include <machine/cpu.h>
        #endif
        int main() {
        #ifdef __FreeBSD__
            unsigned long hwcap;
            elf_aux_info(AT_HWCAP, &hwcap, sizeof(hwcap));
            return (hwcap & PPC_FEATURE_HAS_ALTIVEC);
        #else
            return (getauxval(AT_HWCAP) & PPC_FEATURE_HAS_ALTIVEC);
        #endif
        }"
        HAVE_VMX
    )
    unset(CMAKE_REQUIRED_FLAGS)
endmacro()
|
||||
|
||||
# check_power8_intrinsics()
#
# Chooses POWER8FLAG for GNU/Clang compilers (unless NATIVEFLAG is set) and sets
# HAVE_POWER8_INTRIN when a program that queries the PPC_FEATURE2_ARCH_2_07
# hardware capability bit compiles with those flags.
macro(check_power8_intrinsics)
    if(CMAKE_C_COMPILER_ID MATCHES "GNU|Clang" AND NOT NATIVEFLAG)
        set(POWER8FLAG "-mcpu=power8")
    endif()

    # Check if we have what we need for POWER8 optimizations
    set(CMAKE_REQUIRED_FLAGS "${POWER8FLAG} ${NATIVEFLAG}")
    check_c_source_compiles(
        "#include <sys/auxv.h>
        #ifdef __FreeBSD__
        #include <machine/cpu.h>
        #endif
        int main() {
        #ifdef __FreeBSD__
            unsigned long hwcap;
            elf_aux_info(AT_HWCAP2, &hwcap, sizeof(hwcap));
            return (hwcap & PPC_FEATURE2_ARCH_2_07);
        #else
            return (getauxval(AT_HWCAP2) & PPC_FEATURE2_ARCH_2_07);
        #endif
        }"
        HAVE_POWER8_INTRIN
    )
    unset(CMAKE_REQUIRED_FLAGS)
endmacro()
|
||||
|
||||
# check_rvv_intrinsics()
#
# Chooses RISCVFLAG for GNU/Clang compilers (unless NATIVEFLAG is set) and sets
# HAVE_RVV_INTRIN when a program including <riscv_vector.h> compiles with
# those flags.
macro(check_rvv_intrinsics)
    if(CMAKE_C_COMPILER_ID MATCHES "GNU|Clang" AND NOT NATIVEFLAG)
        set(RISCVFLAG "-march=rv64gcv")
    endif()

    # Check whether compiler supports RVV
    set(CMAKE_REQUIRED_FLAGS "${RISCVFLAG} ${NATIVEFLAG}")
    check_c_source_compiles(
        "#include <riscv_vector.h>
        int main() {
            return 0;
        }"
        HAVE_RVV_INTRIN
    )
    unset(CMAKE_REQUIRED_FLAGS)
endmacro()
|
||||
|
||||
# check_s390_intrinsics()
#
# Sets HAVE_S390_INTRIN when a program that tests the HWCAP_S390_VXRS hardware
# capability bit through getauxval() compiles. The probe itself defines
# HWCAP_S390_VXRS as HWCAP_S390_VX when the header does not provide it.
macro(check_s390_intrinsics)
    check_c_source_compiles(
        "#include <sys/auxv.h>
        #ifndef HWCAP_S390_VXRS
        #define HWCAP_S390_VXRS HWCAP_S390_VX
        #endif
        int main() {
            return (getauxval(AT_HWCAP) & HWCAP_S390_VXRS);
        }"
        HAVE_S390_INTRIN
    )
endmacro()
|
||||
|
||||
# check_power9_intrinsics()
#
# Chooses POWER9FLAG for GNU/Clang compilers (unless NATIVEFLAG is set) and sets
# HAVE_POWER9_INTRIN when a trivial program compiles with those flags, i.e.
# when the compiler accepts the flags themselves.
macro(check_power9_intrinsics)
    if(CMAKE_C_COMPILER_ID MATCHES "GNU|Clang" AND NOT NATIVEFLAG)
        set(POWER9FLAG "-mcpu=power9")
    endif()

    # Check if we have what we need for POWER9 optimizations
    set(CMAKE_REQUIRED_FLAGS "${POWER9FLAG} ${NATIVEFLAG}")
    check_c_source_compiles(
        "int main() {
            return 0;
        }"
        HAVE_POWER9_INTRIN
    )
    unset(CMAKE_REQUIRED_FLAGS)
endmacro()
|
||||
|
||||
# check_sse2_intrinsics()
#
# Chooses SSE2FLAG for the active compiler and sets HAVE_SSE2_INTRIN according
# to whether a program using _mm_setzero_si128 passes
# check_c_source_compile_or_run (a helper defined elsewhere in the project).
macro(check_sse2_intrinsics)
    if(CMAKE_C_COMPILER_ID MATCHES "Intel")
        if(CMAKE_HOST_UNIX OR APPLE)
            set(SSE2FLAG "-msse2")
        else()
            set(SSE2FLAG "/arch:SSE2")
        endif()
    elseif(MSVC)
        # With MSVC the flag is only set when ARCH does not match x86_64
        if(NOT "${ARCH}" MATCHES "x86_64")
            set(SSE2FLAG "/arch:SSE2")
        endif()
    elseif(CMAKE_C_COMPILER_ID MATCHES "GNU|Clang")
        if(NOT NATIVEFLAG)
            set(SSE2FLAG "-msse2")
        endif()
    endif()

    # Check whether compiler supports SSE2 intrinsics
    set(CMAKE_REQUIRED_FLAGS "${SSE2FLAG} ${NATIVEFLAG}")
    check_c_source_compile_or_run(
        "#include <immintrin.h>
        int main(void) {
            __m128i zero = _mm_setzero_si128();
            (void)zero;
            return 0;
        }"
        HAVE_SSE2_INTRIN
    )
    unset(CMAKE_REQUIRED_FLAGS)
endmacro()
|
||||
|
||||
# check_ssse3_intrinsics()
#
# Chooses SSSE3FLAG for the active compiler and sets HAVE_SSSE3_INTRIN according
# to whether a program using _mm_hadd_epi32 passes
# check_c_source_compile_or_run (a helper defined elsewhere in the project).
#
# CMAKE_REQUIRED_FLAGS is cleared again before returning, as the other checks in
# this file do; because this is a macro, a value left behind would stay set in
# the caller's scope and be applied to every later compile check.
macro(check_ssse3_intrinsics)
    if(CMAKE_C_COMPILER_ID MATCHES "Intel")
        if(CMAKE_HOST_UNIX OR APPLE)
            set(SSSE3FLAG "-mssse3")
        else()
            set(SSSE3FLAG "/arch:SSSE3")
        endif()
    elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
        if(NOT NATIVEFLAG)
            set(SSSE3FLAG "-mssse3")
        endif()
    endif()

    # Check whether compiler supports SSSE3 intrinsics
    set(CMAKE_REQUIRED_FLAGS "${SSSE3FLAG} ${NATIVEFLAG}")
    check_c_source_compile_or_run(
        "#include <immintrin.h>
        int main(void) {
            __m128i u, v, w;
            u = _mm_set1_epi32(1);
            v = _mm_set1_epi32(2);
            w = _mm_hadd_epi32(u, v);
            (void)w;
            return 0;
        }"
        HAVE_SSSE3_INTRIN
    )
    set(CMAKE_REQUIRED_FLAGS)
endmacro()
|
||||
|
||||
# check_sse42_intrinsics()
#
# Chooses SSE42FLAG for the active compiler and runs two probes through
# check_c_source_compile_or_run (a helper defined elsewhere in the project):
#   HAVE_SSE42CRC_INLINE_ASM - the crc32 instruction is usable from inline asm
#   HAVE_SSE42CRC_INTRIN     - the crc32 intrinsic/builtin is available
macro(check_sse42_intrinsics)
    if(CMAKE_C_COMPILER_ID MATCHES "Intel")
        if(CMAKE_HOST_UNIX OR APPLE)
            set(SSE42FLAG "-msse4.2")
        else()
            set(SSE42FLAG "/arch:SSE4.2")
        endif()
    elseif(CMAKE_C_COMPILER_ID MATCHES "GNU|Clang")
        if(NOT NATIVEFLAG)
            set(SSE42FLAG "-msse4.2")
        endif()
    endif()

    # Both probes below share the same required flags
    set(CMAKE_REQUIRED_FLAGS "${SSE42FLAG} ${NATIVEFLAG}")

    # Check whether compiler supports SSE4.2 CRC inline asm
    check_c_source_compile_or_run(
        "int main(void) {
            unsigned val = 0, h = 0;
        #if defined(_MSC_VER)
            { __asm mov edx, h __asm mov eax, val __asm crc32 eax, edx __asm mov h, eax }
        #else
            __asm__ __volatile__ ( \"crc32 %1,%0\" : \"+r\" (h) : \"r\" (val) );
        #endif
            return (int)h;
        }"
        HAVE_SSE42CRC_INLINE_ASM
    )

    # Check whether compiler supports SSE4.2 CRC intrinsics
    check_c_source_compile_or_run(
        "#include <immintrin.h>
        int main(void) {
            unsigned crc = 0;
            char c = 'c';
        #if defined(_MSC_VER)
            crc = _mm_crc32_u32(crc, c);
        #else
            crc = __builtin_ia32_crc32qi(crc, c);
        #endif
            (void)crc;
            return 0;
        }"
        HAVE_SSE42CRC_INTRIN
    )
    unset(CMAKE_REQUIRED_FLAGS)
endmacro()
|
||||
|
||||
# check_vgfma_intrinsics()
#
# Builds VGFMAFLAG (unless NATIVEFLAG is set) and sets HAVE_VGFMA_INTRIN when a
# program using vec_gfmsum_accum_128 compiles with those flags.
macro(check_vgfma_intrinsics)
    if(NOT NATIVEFLAG)
        set(VGFMAFLAG "-march=z13")
        # Compiler-specific additions on top of the base -march option
        if(CMAKE_C_COMPILER_ID MATCHES "GNU")
            string(APPEND VGFMAFLAG " -mzarch")
        endif()
        if(CMAKE_C_COMPILER_ID MATCHES "Clang")
            string(APPEND VGFMAFLAG " -fzvector")
        endif()
    endif()

    # Check whether compiler supports "VECTOR GALOIS FIELD MULTIPLY SUM AND ACCUMULATE" intrinsic
    set(CMAKE_REQUIRED_FLAGS "${VGFMAFLAG} ${NATIVEFLAG}")
    check_c_source_compiles(
        "#include <vecintrin.h>
        int main(void) {
            unsigned long long a __attribute__((vector_size(16))) = { 0 };
            unsigned long long b __attribute__((vector_size(16))) = { 0 };
            unsigned char c __attribute__((vector_size(16))) = { 0 };
            c = vec_gfmsum_accum_128(a, b, c);
            return c[0];
        }"
        HAVE_VGFMA_INTRIN FAIL_REGEX "not supported")
    unset(CMAKE_REQUIRED_FLAGS)
endmacro()
|
||||
|
||||
# check_xsave_intrinsics()
#
# Sets XSAVEFLAG to "-mxsave" (unless NATIVEFLAG is set or the compiler is MSVC)
# and sets HAVE_XSAVE_INTRIN when a program calling _xgetbv compiles with
# those flags.
macro(check_xsave_intrinsics)
    if(NOT (NATIVEFLAG OR MSVC))
        set(XSAVEFLAG "-mxsave")
    endif()

    set(CMAKE_REQUIRED_FLAGS "${XSAVEFLAG} ${NATIVEFLAG}")
    check_c_source_compiles(
        "#ifdef _WIN32
        #  include <intrin.h>
        #else
        #  include <x86gprintrin.h>
        #endif
        int main(void) {
            return _xgetbv(0);
        }"
        HAVE_XSAVE_INTRIN FAIL_REGEX "not supported")
    unset(CMAKE_REQUIRED_FLAGS)
endmacro()
|
||||
166
deps/zlib-ng/cmake/detect-sanitizer.cmake
vendored
Normal file
166
deps/zlib-ng/cmake/detect-sanitizer.cmake
vendored
Normal file
@@ -0,0 +1,166 @@
|
||||
# detect-sanitizer.cmake -- Detect supported compiler sanitizer flags
|
||||
# Licensed under the Zlib license, see LICENSE.md for details
|
||||
|
||||
# add_common_sanitizer_flags()
#
# Adds the directory-scope options shared by every sanitizer configuration:
#   * -g3 for compiler IDs matching "GNU" or "Clang";
#   * -fno-omit-frame-pointer (compile and link) when the compiler accepts it;
#   * -fno-optimize-sibling-calls (compile and link) when the compiler
#     accepts it.
#
# Result variables of the probes: HAVE_NO_OMIT_FRAME_POINTER and
# HAVE_NO_OPTIMIZE_SIBLING_CALLS.
macro(add_common_sanitizer_flags)
    if(CMAKE_C_COMPILER_ID MATCHES "GNU|Clang")
        add_compile_options(-g3)
    endif()

    check_c_compiler_flag(-fno-omit-frame-pointer HAVE_NO_OMIT_FRAME_POINTER)
    check_c_compiler_flag(-fno-optimize-sibling-calls HAVE_NO_OPTIMIZE_SIBLING_CALLS)

    if(HAVE_NO_OMIT_FRAME_POINTER)
        add_compile_options(-fno-omit-frame-pointer)
        add_link_options(-fno-omit-frame-pointer)
    endif()

    if(HAVE_NO_OPTIMIZE_SIBLING_CALLS)
        add_compile_options(-fno-optimize-sibling-calls)
        add_link_options(-fno-optimize-sibling-calls)
    endif()
endmacro()
|
||||
|
||||
# check_sanitizer_support(<known_checks> <supported_checks>)
#
#   known_checks      list of -fsanitize= check names to try, in order
#   supported_checks  name of the variable that receives the result
#
# The supported set is grown incrementally: each candidate is compiled together
# with every check accepted so far, and is kept only if that combined
# -fsanitize= flag still compiles. The result is a single comma-separated
# string suitable for "-fsanitize=<result>". When no check is accepted the
# result variable ends up unset, because set() is then called without a value.
#
# Each probe stores its outcome in HAVE_SANITIZER_<check>.
# Because this is a macro, `available_checks`, `compile_checks` and `check`
# are also visible in the caller's scope afterwards.
macro(check_sanitizer_support known_checks supported_checks)
    set(available_checks "")

    foreach(check ${known_checks})
        # Candidate flag value: accepted checks so far plus the one under test.
        if(NOT available_checks STREQUAL "")
            set(compile_checks "${available_checks},${check}")
        else()
            set(compile_checks "${check}")
        endif()

        set(CMAKE_REQUIRED_FLAGS -fsanitize=${compile_checks})
        check_c_source_compiles(
            "int main() { return 0; }"
            HAVE_SANITIZER_${check}
            FAIL_REGEX "not supported|unrecognized command|unknown option")
        set(CMAKE_REQUIRED_FLAGS)

        # Promote the candidate only if the combined flag compiled.
        if(HAVE_SANITIZER_${check})
            set(available_checks ${compile_checks})
        endif()
    endforeach()

    set(${supported_checks} ${available_checks})
endmacro()
|
||||
|
||||
# add_address_sanitizer()
#
# Enables the address sanitizer (plus pointer-compare / pointer-subtract where
# the compiler accepts them) for everything configured after this call, and
# then tries to enable the leak sanitizer as well.
#
# The leak sanitizer probe is skipped, with a warning, when
# CMAKE_CROSSCOMPILING_EMULATOR is set.
#
# Side effects: directory-scope compile/link options are added, and
# `known_checks` / `supported_checks` are overwritten in the caller's scope.
macro(add_address_sanitizer)
    set(known_checks
        address
        pointer-compare
        pointer-subtract
        )

    check_sanitizer_support("${known_checks}" supported_checks)
    # The expansion is quoted on purpose: supported_checks is unset when nothing
    # is supported, and an unquoted empty expansion would drop the left operand
    # of STREQUAL altogether.
    if(NOT "${supported_checks}" STREQUAL "")
        message(STATUS "Address sanitizer is enabled: ${supported_checks}")
        add_compile_options(-fsanitize=${supported_checks})
        add_link_options(-fsanitize=${supported_checks})
        add_common_sanitizer_flags()
    else()
        message(STATUS "Address sanitizer is not supported")
    endif()

    if(CMAKE_CROSSCOMPILING_EMULATOR)
        # Only check for leak sanitizer if not cross-compiling due to qemu crash
        message(WARNING "Leak sanitizer is not supported when cross compiling")
    else()
        # Leak sanitizer requires address sanitizer
        check_sanitizer_support("leak" supported_checks)
        if(NOT "${supported_checks}" STREQUAL "")
            message(STATUS "Leak sanitizer is enabled: ${supported_checks}")
            add_compile_options(-fsanitize=${supported_checks})
            add_link_options(-fsanitize=${supported_checks})
            add_common_sanitizer_flags()
        else()
            message(STATUS "Leak sanitizer is not supported")
        endif()
    endif()
endmacro()
|
||||
|
||||
# add_memory_sanitizer()
#
# Enables the memory sanitizer for everything configured after this call when
# the compiler accepts -fsanitize=memory, and additionally enables
# -fsanitize-memory-track-origins when that flag is accepted.
#
# Side effects: directory-scope compile/link options are added, and
# `supported_checks` is overwritten in the caller's scope.
# Result variable of the extra probe: HAVE_MEMORY_TRACK_ORIGINS.
macro(add_memory_sanitizer)
    check_sanitizer_support("memory" supported_checks)
    # The expansion is quoted on purpose: supported_checks is unset when the
    # sanitizer is unsupported, and an unquoted empty expansion would drop the
    # left operand of STREQUAL altogether.
    if(NOT "${supported_checks}" STREQUAL "")
        message(STATUS "Memory sanitizer is enabled: ${supported_checks}")
        add_compile_options(-fsanitize=${supported_checks})
        add_link_options(-fsanitize=${supported_checks})
        add_common_sanitizer_flags()

        check_c_compiler_flag(-fsanitize-memory-track-origins HAVE_MEMORY_TRACK_ORIGINS)
        if(HAVE_MEMORY_TRACK_ORIGINS)
            add_compile_options(-fsanitize-memory-track-origins)
            add_link_options(-fsanitize-memory-track-origins)
        endif()
    else()
        message(STATUS "Memory sanitizer is not supported")
    endif()
endmacro()
|
||||
|
||||
# add_thread_sanitizer()
#
# Enables the thread sanitizer for everything configured after this call when
# the compiler accepts -fsanitize=thread; otherwise only reports that it is
# not supported.
#
# Side effects: directory-scope compile/link options are added, and
# `supported_checks` is overwritten in the caller's scope.
macro(add_thread_sanitizer)
    check_sanitizer_support("thread" supported_checks)
    # The expansion is quoted on purpose: supported_checks is unset when the
    # sanitizer is unsupported, and an unquoted empty expansion would drop the
    # left operand of STREQUAL altogether.
    if(NOT "${supported_checks}" STREQUAL "")
        message(STATUS "Thread sanitizer is enabled: ${supported_checks}")
        add_compile_options(-fsanitize=${supported_checks})
        add_link_options(-fsanitize=${supported_checks})
        add_common_sanitizer_flags()
    else()
        message(STATUS "Thread sanitizer is not supported")
    endif()
endmacro()
|
||||
|
||||
# add_undefined_sanitizer()
#
# Enables every undefined-behaviour sanitizer check from the list below that
# the compiler accepts, for everything configured after this call.
#
# Inputs read:
#   WITH_UNALIGNED  when false, the `alignment` check is also probed; when
#                   true, alignment checking is explicitly disabled
#   CMAKE_C_FLAGS   when it does not contain "-O0", `object-size` is also probed
#
# Side effects: directory-scope compile/link options are added, and
# `known_checks` / `supported_checks` are overwritten in the caller's scope.
macro(add_undefined_sanitizer)
    set(known_checks
        array-bounds
        bool
        bounds
        builtin
        enum
        float-cast-overflow
        float-divide-by-zero
        function
        integer-divide-by-zero
        local-bounds
        null
        nonnull-attribute
        pointer-overflow
        return
        returns-nonnull-attribute
        shift
        shift-base
        shift-exponent
        signed-integer-overflow
        undefined
        unsigned-integer-overflow
        unsigned-shift-base
        vla-bound
        vptr
        )

    # Only check for alignment sanitizer flag if unaligned access is not supported
    if(NOT WITH_UNALIGNED)
        list(APPEND known_checks alignment)
    endif()
    # Object size sanitizer has no effect at -O0 and produces compiler warning if enabled
    if(NOT CMAKE_C_FLAGS MATCHES "-O0")
        list(APPEND known_checks object-size)
    endif()

    check_sanitizer_support("${known_checks}" supported_checks)

    # The expansion is quoted on purpose: supported_checks is unset when nothing
    # is supported, and an unquoted empty expansion would drop the left operand
    # of STREQUAL altogether.
    if(NOT "${supported_checks}" STREQUAL "")
        message(STATUS "Undefined behavior sanitizer is enabled: ${supported_checks}")
        add_compile_options(-fsanitize=${supported_checks})
        add_link_options(-fsanitize=${supported_checks})

        # Group sanitizer flag -fsanitize=undefined will automatically add alignment, even if
        # it is not in our sanitize flag list, so we need to explicitly disable alignment sanitizing.
        if(WITH_UNALIGNED)
            add_compile_options(-fno-sanitize=alignment)
        endif()

        add_common_sanitizer_flags()
    else()
        message(STATUS "Undefined behavior sanitizer is not supported")
    endif()
endmacro()
|
||||
19
deps/zlib-ng/cmake/fallback-macros.cmake
vendored
Normal file
19
deps/zlib-ng/cmake/fallback-macros.cmake
vendored
Normal file
@@ -0,0 +1,19 @@
|
||||
# fallback-macros.cmake -- CMake fallback macros
|
||||
# Copyright (C) 2022 Nathan Moinvaziri
|
||||
# Licensed under the Zlib license, see LICENSE.md for details
|
||||
|
||||
# CMake less than version 3.5.2
#
# Fallback used only when the running CMake has no add_compile_options()
# command: appends every given option to the global C and C++ flag strings.
if(NOT COMMAND add_compile_options)
    macro(add_compile_options options)
        # Iterate over ARGV so that all options are honoured, not just the
        # first, and separate each one with a space so it does not fuse with
        # whatever CMAKE_<LANG>_FLAGS already ends in.
        foreach(compile_flag ${ARGV})
            string(APPEND CMAKE_C_FLAGS " ${compile_flag}")
            string(APPEND CMAKE_CXX_FLAGS " ${compile_flag}")
        endforeach()
    endmacro()
endif()
|
||||
|
||||
# CMake less than version 3.14
#
# Fallback used only when the running CMake has no add_link_options()
# command: appends every given option to the global executable and
# shared-library linker flag strings.
if(NOT COMMAND add_link_options)
    macro(add_link_options options)
        # Iterate over ARGV so that all options are honoured, not just the
        # first, and separate each one with a space so it does not fuse with
        # whatever the linker flag strings already end in.
        foreach(link_flag ${ARGV})
            string(APPEND CMAKE_EXE_LINKER_FLAGS " ${link_flag}")
            string(APPEND CMAKE_SHARED_LINKER_FLAGS " ${link_flag}")
        endforeach()
    endmacro()
endif()
|
||||
24
deps/zlib-ng/cmake/toolchain-aarch64.cmake
vendored
Normal file
24
deps/zlib-ng/cmake/toolchain-aarch64.cmake
vendored
Normal file
@@ -0,0 +1,24 @@
|
||||
# Toolchain description: Linux / aarch64, using the aarch64-linux-gnu GCC
# cross toolchain and running target binaries through qemu-aarch64.

# --- Target system ----------------------------------------------------------
set(CMAKE_SYSTEM_NAME Linux)
set(CMAKE_SYSTEM_PROCESSOR aarch64)
set(CMAKE_SYSTEM_VERSION 1)

# --- Target triple ----------------------------------------------------------
set(CMAKE_C_COMPILER_TARGET "aarch64-linux-gnu")
set(CMAKE_CXX_COMPILER_TARGET "aarch64-linux-gnu")

# --- Cross-compilation and emulator -----------------------------------------
set(CMAKE_CROSSCOMPILING TRUE)
set(CMAKE_CROSSCOMPILING_EMULATOR qemu-aarch64 -L /usr/${CMAKE_C_COMPILER_TARGET}/)

# --- find_*() search behaviour ----------------------------------------------
# Programs are never taken from the find root path; libraries and headers are
# taken only from it.
set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)

# --- C compiler (mandatory) -------------------------------------------------
find_program(C_COMPILER_FULL_PATH
    NAMES
        ${CMAKE_C_COMPILER_TARGET}-gcc)
if(NOT C_COMPILER_FULL_PATH)
    message(FATAL_ERROR "Cross-compiler for ${CMAKE_C_COMPILER_TARGET} not found")
endif()
set(CMAKE_C_COMPILER ${C_COMPILER_FULL_PATH})

# --- C++ compiler (optional) ------------------------------------------------
find_program(CXX_COMPILER_FULL_PATH
    NAMES
        g++-${CMAKE_CXX_COMPILER_TARGET}
        ${CMAKE_CXX_COMPILER_TARGET}-g++)
if(CXX_COMPILER_FULL_PATH)
    set(CMAKE_CXX_COMPILER ${CXX_COMPILER_FULL_PATH})
endif()
|
||||
29
deps/zlib-ng/cmake/toolchain-arm.cmake
vendored
Normal file
29
deps/zlib-ng/cmake/toolchain-arm.cmake
vendored
Normal file
@@ -0,0 +1,29 @@
|
||||
# Toolchain description: Linux / arm, using a GCC cross toolchain and running
# target binaries through qemu-arm. The target triple defaults to
# arm-linux-gnueabi but may be preset by the caller.

# --- Target system ----------------------------------------------------------
set(CMAKE_SYSTEM_NAME Linux)
set(CMAKE_SYSTEM_PROCESSOR arm)
set(CMAKE_SYSTEM_VERSION 1)

# --- Target triple (only defaulted when not already defined) ----------------
if(NOT DEFINED CMAKE_C_COMPILER_TARGET)
    set(CMAKE_C_COMPILER_TARGET arm-linux-gnueabi)
endif()
if(NOT DEFINED CMAKE_CXX_COMPILER_TARGET)
    set(CMAKE_CXX_COMPILER_TARGET arm-linux-gnueabi)
endif()

# --- Cross-compilation and emulator -----------------------------------------
set(CMAKE_CROSSCOMPILING TRUE)
set(CMAKE_CROSSCOMPILING_EMULATOR qemu-arm -L /usr/${CMAKE_C_COMPILER_TARGET}/)

# --- find_*() search behaviour ----------------------------------------------
# Programs are never taken from the find root path; libraries, headers and
# packages are taken only from it.
set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY)

# --- C compiler (mandatory) -------------------------------------------------
find_program(C_COMPILER_FULL_PATH
    NAMES
        ${CMAKE_C_COMPILER_TARGET}-gcc)
if(NOT C_COMPILER_FULL_PATH)
    message(FATAL_ERROR "Cross-compiler for ${CMAKE_C_COMPILER_TARGET} not found")
endif()
set(CMAKE_C_COMPILER ${C_COMPILER_FULL_PATH})

# --- C++ compiler (optional) ------------------------------------------------
find_program(CXX_COMPILER_FULL_PATH
    NAMES
        g++-${CMAKE_CXX_COMPILER_TARGET}
        ${CMAKE_CXX_COMPILER_TARGET}-g++)
if(CXX_COMPILER_FULL_PATH)
    set(CMAKE_CXX_COMPILER ${CXX_COMPILER_FULL_PATH})
endif()
|
||||
25
deps/zlib-ng/cmake/toolchain-armhf.cmake
vendored
Normal file
25
deps/zlib-ng/cmake/toolchain-armhf.cmake
vendored
Normal file
@@ -0,0 +1,25 @@
|
||||
# Toolchain description: Linux / arm, using the arm-linux-gnueabihf GCC cross
# toolchain and running target binaries through qemu-arm.

# --- Target system ----------------------------------------------------------
set(CMAKE_SYSTEM_NAME Linux)
set(CMAKE_SYSTEM_PROCESSOR arm)
set(CMAKE_SYSTEM_VERSION 1)

# --- Target triple ----------------------------------------------------------
set(CMAKE_C_COMPILER_TARGET arm-linux-gnueabihf)
set(CMAKE_CXX_COMPILER_TARGET arm-linux-gnueabihf)

# --- Cross-compilation and emulator -----------------------------------------
set(CMAKE_CROSSCOMPILING TRUE)
set(CMAKE_CROSSCOMPILING_EMULATOR qemu-arm -L /usr/${CMAKE_C_COMPILER_TARGET}/)

# --- find_*() search behaviour ----------------------------------------------
# Programs are never taken from the find root path; libraries, headers and
# packages are taken only from it.
set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY)

# --- C compiler (mandatory) -------------------------------------------------
find_program(C_COMPILER_FULL_PATH
    NAMES
        ${CMAKE_C_COMPILER_TARGET}-gcc)
if(NOT C_COMPILER_FULL_PATH)
    message(FATAL_ERROR "Cross-compiler for ${CMAKE_C_COMPILER_TARGET} not found")
endif()
set(CMAKE_C_COMPILER ${C_COMPILER_FULL_PATH})

# --- C++ compiler (optional) ------------------------------------------------
find_program(CXX_COMPILER_FULL_PATH
    NAMES
        g++-${CMAKE_CXX_COMPILER_TARGET}
        ${CMAKE_CXX_COMPILER_TARGET}-g++)
if(CXX_COMPILER_FULL_PATH)
    set(CMAKE_CXX_COMPILER ${CXX_COMPILER_FULL_PATH})
endif()
|
||||
35
deps/zlib-ng/cmake/toolchain-mingw-i686.cmake
vendored
Normal file
35
deps/zlib-ng/cmake/toolchain-mingw-i686.cmake
vendored
Normal file
@@ -0,0 +1,35 @@
|
||||
# Toolchain description: Windows / i686, using the i686-w64-mingw32 GCC cross
# toolchain and running target binaries through wine.

# --- Target system ----------------------------------------------------------
set(CMAKE_SYSTEM_NAME Windows)

# --- Target triple for the C, C++ and resource compilers --------------------
set(CMAKE_C_COMPILER_TARGET i686-w64-mingw32)
set(CMAKE_CXX_COMPILER_TARGET i686-w64-mingw32)
set(CMAKE_RC_COMPILER_TARGET i686-w64-mingw32)

# --- Cross-compilation and emulator -----------------------------------------
set(CMAKE_CROSSCOMPILING TRUE)
set(CMAKE_CROSSCOMPILING_EMULATOR wine)

# --- find_*() search behaviour ----------------------------------------------
# Programs are never taken from the find root path; libraries and headers are
# taken only from it.
set(CMAKE_FIND_ROOT_PATH /usr/i686-w64-mingw32)
set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)

# --- C compiler (mandatory) -------------------------------------------------
# Prefer posix gcc variant for gtest pthread support
find_program(C_COMPILER_FULL_PATH
    NAMES
        ${CMAKE_C_COMPILER_TARGET}-gcc-posix
        ${CMAKE_C_COMPILER_TARGET}-gcc)
if(NOT C_COMPILER_FULL_PATH)
    message(FATAL_ERROR "Cross-compiler for ${CMAKE_C_COMPILER_TARGET} not found")
endif()
set(CMAKE_C_COMPILER ${C_COMPILER_FULL_PATH})

# --- C++ compiler (optional) ------------------------------------------------
find_program(CXX_COMPILER_FULL_PATH
    NAMES
        ${CMAKE_CXX_COMPILER_TARGET}-g++-posix
        ${CMAKE_CXX_COMPILER_TARGET}-g++)
if(CXX_COMPILER_FULL_PATH)
    set(CMAKE_CXX_COMPILER ${CXX_COMPILER_FULL_PATH})
endif()

# --- Resource compiler (optional) -------------------------------------------
find_program(RC_COMPILER_FULL_PATH
    NAMES
        ${CMAKE_RC_COMPILER_TARGET}-windres)
if(RC_COMPILER_FULL_PATH)
    set(CMAKE_RC_COMPILER ${RC_COMPILER_FULL_PATH})
endif()
|
||||
34
deps/zlib-ng/cmake/toolchain-mingw-x86_64.cmake
vendored
Normal file
34
deps/zlib-ng/cmake/toolchain-mingw-x86_64.cmake
vendored
Normal file
@@ -0,0 +1,34 @@
|
||||
# Toolchain description: Windows / x86_64, using the x86_64-w64-mingw32 GCC
# cross toolchain and running target binaries through wine.

# --- Target system ----------------------------------------------------------
set(CMAKE_SYSTEM_NAME Windows)

# --- Target triple for the C, C++ and resource compilers --------------------
set(CMAKE_C_COMPILER_TARGET x86_64-w64-mingw32)
set(CMAKE_CXX_COMPILER_TARGET x86_64-w64-mingw32)
set(CMAKE_RC_COMPILER_TARGET x86_64-w64-mingw32)

# --- Cross-compilation and emulator -----------------------------------------
set(CMAKE_CROSSCOMPILING TRUE)
set(CMAKE_CROSSCOMPILING_EMULATOR wine)

# --- find_*() search behaviour ----------------------------------------------
# Programs are never taken from the find root path; libraries and headers are
# taken only from it.
set(CMAKE_FIND_ROOT_PATH /usr/x86_64-w64-mingw32)
set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)

# --- C compiler (mandatory) -------------------------------------------------
# Prefer posix gcc variant for gtest pthread support
find_program(C_COMPILER_FULL_PATH
    NAMES
        ${CMAKE_C_COMPILER_TARGET}-gcc-posix
        ${CMAKE_C_COMPILER_TARGET}-gcc)
if(NOT C_COMPILER_FULL_PATH)
    message(FATAL_ERROR "Cross-compiler for ${CMAKE_C_COMPILER_TARGET} not found")
endif()
set(CMAKE_C_COMPILER ${C_COMPILER_FULL_PATH})

# --- C++ compiler (optional) ------------------------------------------------
find_program(CXX_COMPILER_FULL_PATH
    NAMES
        ${CMAKE_CXX_COMPILER_TARGET}-g++-posix
        ${CMAKE_CXX_COMPILER_TARGET}-g++)
if(CXX_COMPILER_FULL_PATH)
    set(CMAKE_CXX_COMPILER ${CXX_COMPILER_FULL_PATH})
endif()

# --- Resource compiler (optional) -------------------------------------------
find_program(RC_COMPILER_FULL_PATH
    NAMES
        ${CMAKE_RC_COMPILER_TARGET}-windres)
if(RC_COMPILER_FULL_PATH)
    set(CMAKE_RC_COMPILER ${RC_COMPILER_FULL_PATH})
endif()
|
||||
29
deps/zlib-ng/cmake/toolchain-mips.cmake
vendored
Normal file
29
deps/zlib-ng/cmake/toolchain-mips.cmake
vendored
Normal file
@@ -0,0 +1,29 @@
|
||||
# Toolchain description: Linux / mips, using a GCC cross toolchain and running
# target binaries through qemu-mips. The target triple defaults to
# mips-linux-gnu but may be preset by the caller.

# --- Target system ----------------------------------------------------------
set(CMAKE_SYSTEM_NAME Linux)
set(CMAKE_SYSTEM_PROCESSOR mips)
set(CMAKE_SYSTEM_VERSION 1)

# --- Target triple (only defaulted when not already defined) ----------------
if(NOT DEFINED CMAKE_C_COMPILER_TARGET)
    set(CMAKE_C_COMPILER_TARGET mips-linux-gnu)
endif()
if(NOT DEFINED CMAKE_CXX_COMPILER_TARGET)
    set(CMAKE_CXX_COMPILER_TARGET mips-linux-gnu)
endif()

# --- Cross-compilation and emulator -----------------------------------------
set(CMAKE_CROSSCOMPILING TRUE)
set(CMAKE_CROSSCOMPILING_EMULATOR qemu-mips -L /usr/${CMAKE_C_COMPILER_TARGET}/)

# --- find_*() search behaviour ----------------------------------------------
# Programs are never taken from the find root path; libraries, headers and
# packages are taken only from it.
set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY)

# --- C compiler (mandatory) -------------------------------------------------
find_program(C_COMPILER_FULL_PATH
    NAMES
        ${CMAKE_C_COMPILER_TARGET}-gcc)
if(NOT C_COMPILER_FULL_PATH)
    message(FATAL_ERROR "Cross-compiler for ${CMAKE_C_COMPILER_TARGET} not found")
endif()
set(CMAKE_C_COMPILER ${C_COMPILER_FULL_PATH})

# --- C++ compiler (optional) ------------------------------------------------
find_program(CXX_COMPILER_FULL_PATH
    NAMES
        g++-${CMAKE_CXX_COMPILER_TARGET}
        ${CMAKE_CXX_COMPILER_TARGET}-g++)
if(CXX_COMPILER_FULL_PATH)
    set(CMAKE_CXX_COMPILER ${CXX_COMPILER_FULL_PATH})
endif()
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user