Skip to content

Commit ddcd7d6

Browse files
authored
Merge branch 'develop' into Threading_Callback
2 parents 7102367 + de465ff commit ddcd7d6

File tree

2,261 files changed

+9629
-37262
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

2,261 files changed

+9629
-37262
lines changed

.cirrus.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -156,7 +156,7 @@ FreeBSD_task:
156156
image_family: freebsd-13-2
157157
install_script:
158158
- pkg update -f && pkg upgrade -y && pkg install -y gmake gcc
159-
- ln -s /usr/local/lib/gcc12/libgfortran.so.5.0.0 /usr/lib/libgfortran.so
159+
- ln -s /usr/local/lib/gcc13/libgfortran.so.5.0.0 /usr/lib/libgfortran.so
160160
compile_script:
161161
- gmake CC=clang FC=gfortran USE_OPENMP=1 CPP_THREAD_SAFETY_TEST=1
162162

.github/workflows/c910v.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,7 @@ jobs:
8484
run: |
8585
export PATH=$GITHUB_WORKSPACE/qemu-install/bin/:$PATH
8686
qemu-riscv64 ./utest/openblas_utest
87+
qemu-riscv64 ./utest/openblas_utest_ext
8788
OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xscblat1
8889
OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xdcblat1
8990
OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xccblat1

.github/workflows/docs.yml

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
name: Publish docs via GitHub Pages
2+
on:
3+
push:
4+
branches:
5+
- develop
6+
jobs:
7+
build:
8+
name: Deploy docs
9+
runs-on: ubuntu-latest
10+
steps:
11+
- uses: actions/checkout@v2
12+
- uses: actions/setup-python@v2
13+
with:
14+
python-version: "3.10"
15+
- run: pip install mkdocs mkdocs-material
16+
# mkdocs gh-deploy command only builds to the top-level, hence building then deploying ourselves
17+
- run: mkdocs build
18+
- name: Deploy docs
19+
uses: peaceiris/actions-gh-pages@v3
20+
if: ${{ github.ref == 'refs/heads/develop' }}
21+
with:
22+
github_token: ${{ secrets.GITHUB_TOKEN }}
23+
publish_dir: ./site
24+
destination_dir: docs/

.github/workflows/loongarch64.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,7 @@ jobs:
7777
- name: Test
7878
run: |
7979
qemu-loongarch64-static ./utest/openblas_utest
80+
qemu-loongarch64-static ./utest/openblas_utest_ext
8081
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xscblat1
8182
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xdcblat1
8283
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xccblat1

.github/workflows/mips64.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,7 @@ jobs:
8080
run: |
8181
export PATH=$GITHUB_WORKSPACE/qemu-install/bin/:$PATH
8282
qemu-mips64el ./utest/openblas_utest
83+
qemu-mips64el ./utest/openblas_utest_ext
8384
OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xscblat1
8485
OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xdcblat1
8586
OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xccblat1

.gitignore

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,43 +51,55 @@ utest/openblas_utest_ext
5151
ctest/xccblat1
5252
ctest/xccblat2
5353
ctest/xccblat3
54+
ctest/xccblat3_3m
5455
ctest/xdcblat1
5556
ctest/xdcblat2
5657
ctest/xdcblat3
58+
ctest/xdcblat3_3m
5759
ctest/xscblat1
5860
ctest/xscblat2
5961
ctest/xscblat3
62+
ctest/xscblat3_3m
6063
ctest/xzcblat1
6164
ctest/xzcblat2
6265
ctest/xzcblat3
66+
ctest/xzcblat3_3m
6367
exports/linktest.c
6468
exports/linux.def
6569
kernel/setparam_*.c
6670
kernel/kernel_*.h
6771
test/CBLAT2.SUMM
6872
test/CBLAT3.SUMM
73+
test/CBLAT3_3M.SUMM
6974
test/DBLAT2.SUMM
7075
test/DBLAT3.SUMM
76+
test/DBLAT3_3M.SUMM
7177
test/SBLAT2.SUMM
7278
test/SBLAT3.SUMM
79+
test/SBLAT3_3M.SUMM
7380
test/ZBLAT2.SUMM
7481
test/ZBLAT3.SUMM
82+
test/ZBLAT3_3M.SUMM
7583
test/SHBLAT3.SUMM
7684
test/SBBLAT3.SUMM
7785
test/cblat1
7886
test/cblat2
7987
test/cblat3
88+
test/cblat3_3m
8089
test/dblat1
8190
test/dblat2
8291
test/dblat3
92+
test/dblat3_3m
8393
test/sblat1
8494
test/sblat2
8595
test/sblat3
96+
test/sblat3_3m
8697
test/test_shgemm
8798
test/test_sbgemm
8899
test/zblat1
89100
test/zblat2
90101
test/zblat3
102+
test/zblat3_3m
91103
build
92104
build.*
93105
*.swp

CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ project(OpenBLAS C ASM)
88

99
set(OpenBLAS_MAJOR_VERSION 0)
1010
set(OpenBLAS_MINOR_VERSION 3)
11-
set(OpenBLAS_PATCH_VERSION 26.dev)
11+
set(OpenBLAS_PATCH_VERSION 27.dev)
1212

1313
set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
1414

Changelog.txt

Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,104 @@
11
OpenBLAS ChangeLog
2+
====================================================================
3+
Version 0.3.27
4+
4-Apr-2024
5+
6+
general:
7+
- added initial (generic) support for the CSKY architecture
8+
- capped the maximum number of threads used in GEMM, GETRF and POTRF to avoid creating
9+
underutilized or idle threads
10+
- sped up multithreaded POTRF on all platforms
11+
- added extension openblas_set_num_threads_local() that returns the previous thread count
12+
- re-evaluated the SGEMV and DGEMV load thresholds to avoid activating multithreading
13+
for too small workloads
14+
- improved the fallback code used when the precompiled number of threads is exceeded,
15+
and made it callable multiple times during the lifetime of an instance
16+
- added CBLAS interfaces for the BLAS extensions ?AMIN,?AMAX, CAXPYC and ZAXPYC
17+
- fixed a potential buffer overflow in the interface to the GEMMT kernels
18+
- fixed use of incompatible pointer types in GEMMT and C/ZAXPBY as flagged by GCC-14
19+
- fixed unwanted case sensitivity of the character parameters in ?TRTRS
20+
- sped up the OpenMP thread management code
21+
- fixed sizing of logical variables in INTERFACE64 builds of the C version of LAPACK
22+
- fixed inclusion of new LAPACK and LAPACKE functions from LAPACK 3.11 in the shared library
23+
- added a testsuite for the BLAS extensions
24+
- modified the error thresholds for SGS/DGS functions in the LAPACK testsuite to suppress
25+
spurious errors
26+
- added support for building the benchmark collection with CMAKE
27+
- added rewriting of linker options to avoid linking both libgomp and libomp in CMAKE builds
28+
with OpenMP enabled that use clang with gfortran
29+
- fixed building on systems with uClibc
30+
- added support for calling ?NRM2 with a negative increment value on all architectures
31+
- added support for the LLVM18 version of the flang-new compiler
32+
- fixed handling of the OPENBLAS_LOOPS variable in several benchmarks
33+
- Integrated fixes from the Reference-LAPACK project:
34+
- Increased accuracy in C/ZLARFGP (Reference-LAPACK PR 981)
35+
36+
x86:
37+
- fixed handling of NaN and Inf arguments in ZSCAL
38+
- fixed GEMM3M functions failing in CMAKE builds
39+
40+
x86-64:
41+
- removed all instances of sched_yield() on Linux and BSD
42+
- fixed a potential deadlock in the thread server on MSWindows (introduced in 0.3.26)
43+
- fixed GEMM3M functions failing in CMAKE builds
44+
- fixed handling of NaN and Inf arguments in ZSCAL
45+
- added compiler checks for AVX512BF16 compatibility
46+
- fixed LLVM compiler options for Sapphire Rapids
47+
- fixed cpu handling fallbacks for Sapphire Rapids with
48+
disabled AVX2 in DYNAMIC_ARCH mode
49+
- fixed extensions SCSUM and DZSUM
50+
- improved GEMM performance for ZEN targets
51+
52+
arm:
53+
- fixed handling of NaN and Inf arguments in ZSCAL
54+
55+
arm64:
56+
- added initial support for the Cortex-A76 cpu
57+
- fixed handling of NaN and Inf arguments in ZSCAL
58+
- fixed default compiler options for gcc (-march and -mtune)
59+
- added support for ArmCompilerForLinux
60+
- added support for the NeoverseV2 cpu in DYNAMIC_ARCH builds
61+
- fixed mishandling of the INTERFACE64 option in CMAKE builds
62+
- corrected SCSUM kernels (erroneously duplicating SCASUM behaviour)
63+
- added SVE-enabled kernels for CSUM/ZSUM
64+
- worked around an inaccuracy in the NRM2 kernels for NeoverseN1 and Apple M
65+
66+
power:
67+
- improved performance of SGEMM on POWER8/9/10
68+
- improved performance of DGEMM on POWER10
69+
- added support for OpenMP builds with xlc/xlf on AIX
70+
- improved cpu autodetection for DYNAMIC_ARCH builds on older AIX
71+
- fixed cpu core counting on AIX
72+
- added support for building a shared library on AIX
73+
74+
riscv64:
75+
- added support for the X280 cpu
76+
- added support for semi-generic RISCV models with vector length 128 or 256
77+
- added support for compiling with either RVV 0.7.1 or RVV 1.0 standard compilers
78+
- fixed handling of NaN and Inf arguments in ZSCAL
79+
- improved cpu model autodetection
80+
- fixed corner cases in ?AXPBY for C910V
81+
- fixed handling of zero increments in ?AXPY kernels for C910V
82+
83+
loongarch64:
84+
- added optimized kernels for ?AMIN and ?AMAX
85+
- fixed handling of NaN and Inf arguments in ZSCAL
86+
- fixed handling of corner cases in ?AXPBY
87+
- fixed computation of SAMIN and DAMIN in LSX mode
88+
- fixed computation of ?ROT
89+
- added optimized SSYMV and DSYMV kernels for LSX and LASX mode
90+
- added optimized CGEMM and ZGEMM kernels for LSX and LASX mode
91+
- added optimized CGEMV and ZGEMV kernels
92+
93+
mips:
94+
- fixed utilizing MSA on P5600 and related cpus (broken in 0.3.22)
95+
- fixed handling of NaN and Inf arguments in ZSCAL
96+
- fixed mishandling of the INTERFACE64 option in CMAKE builds
97+
98+
zarch:
99+
- fixed handling of NaN and Inf arguments in ZSCAL
100+
- fixed calculation of ?SUM on Z13
101+
2102
====================================================================
3103
Version 0.3.26
4104
2-Jan-2024

Makefile.arm64

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,13 @@ FCOMMON_OPT += -march=armv8-a -mtune=cortex-a73
5858
endif
5959
endif
6060

61+
ifeq ($(CORE), CORTEXA76)
62+
CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a76
63+
ifneq ($(F_COMPILER), NAG)
64+
FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a76
65+
endif
66+
endif
67+
6168
ifeq ($(CORE), FT2000)
6269
CCOMMON_OPT += -march=armv8-a -mtune=cortex-a72
6370
ifneq ($(F_COMPILER), NAG)
@@ -138,13 +145,13 @@ ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ11) $(ISCLANG)))
138145
ifneq ($(OSNAME), Darwin)
139146
CCOMMON_OPT += -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2
140147
else
141-
CCOMMON_OPT += -march=armv8.2-a+sve -mtune=cortex-a72
148+
CCOMMON_OPT += -march=armv8.2-a+sve+bf16 -mtune=cortex-a72
142149
endif
143150
ifneq ($(F_COMPILER), NAG)
144151
FCOMMON_OPT += -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2
145152
endif
146153
else
147-
CCOMMON_OPT += -march=armv8.5-a+sve
154+
CCOMMON_OPT += -march=armv8.5-a+sve+bf16
148155
ifneq ($(CROSS), 1)
149156
CCOMMON_OPT += -mtune=native
150157
endif
@@ -156,13 +163,13 @@ endif
156163
endif
157164
endif
158165
else
159-
CCOMMON_OPT += -march=armv8.2-a+sve -mtune=cortex-a72
166+
CCOMMON_OPT += -march=armv8.2-a+sve+bf16 -mtune=cortex-a72
160167
ifneq ($(F_COMPILER), NAG)
161168
FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72
162169
endif
163170
endif
164171
else
165-
CCOMMON_OPT += -march=armv8-a+sve -mtune=cortex-a72
172+
CCOMMON_OPT += -march=armv8-a+sve+bf16 -mtune=cortex-a72
166173
ifneq ($(F_COMPILER), NAG)
167174
FCOMMON_OPT += -march=armv8-a -mtune=cortex-a72
168175
endif

Makefile.rule

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
#
44

55
# This library's version
6-
VERSION = 0.3.26.dev
6+
VERSION = 0.3.27.dev
77

88
# If you set this prefix, the library name will be lib$(LIBNAMESUFFIX)openblas.a
99
# and lib$(LIBNAMESUFFIX)openblas.so, with a matching soname in the shared library
@@ -173,6 +173,10 @@ NO_AFFINITY = 1
173173
# If you are compiling for Linux and you have more than 16 numa nodes or more than 256 cpus
174174
# BIGNUMA = 1
175175

176+
# If you are compiling for an embedded ("bare metal") system such as the Cortex-M series
177+
# Note that you will have to provide implementations of malloc() and free() in this case
178+
# EMBEDDED = 1
179+
176180
# Don't use AVX kernel on Sandy Bridge. It is compatible with old compilers
177181
# and OS. However, the performance is low.
178182
# NO_AVX = 1

0 commit comments

Comments
 (0)