diff --git a/.github/workflows/apple_m.yml b/.github/workflows/apple_m.yml index b50f9be5b5..3024264cda 100644 --- a/.github/workflows/apple_m.yml +++ b/.github/workflows/apple_m.yml @@ -1,6 +1,14 @@ name: apple m -on: [push, pull_request] +on: + push: + paths-ignore: + - 'docs/**' + - '**/*.md' + pull_request: + paths-ignore: + - 'docs/**' + - '**/*.md' concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} diff --git a/.github/workflows/arm64_graviton.yml b/.github/workflows/arm64_graviton.yml index 498b012e61..64a082fa26 100644 --- a/.github/workflows/arm64_graviton.yml +++ b/.github/workflows/arm64_graviton.yml @@ -5,10 +5,16 @@ on: branches: - develop - release-** + paths-ignore: + - 'docs/**' + - '**/*.md' pull_request: branches: - develop - release-** + paths-ignore: + - 'docs/**' + - '**/*.md' concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} diff --git a/.github/workflows/c910v.yml b/.github/workflows/c910v.yml index 17e7741349..826f71725d 100644 --- a/.github/workflows/c910v.yml +++ b/.github/workflows/c910v.yml @@ -1,6 +1,14 @@ name: c910v qemu test -on: [push, pull_request] +on: + push: + paths-ignore: + - 'docs/**' + - '**/*.md' + pull_request: + paths-ignore: + - 'docs/**' + - '**/*.md' concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} diff --git a/.github/workflows/codspeed-bench.yml b/.github/workflows/codspeed-bench.yml index 9bc8be161c..19e3933046 100644 --- a/.github/workflows/codspeed-bench.yml +++ b/.github/workflows/codspeed-bench.yml @@ -1,6 +1,14 @@ name: Run codspeed benchmarks -on: [push, pull_request] +on: + push: + paths-ignore: + - 'docs/**' + - '**/*.md' + pull_request: + paths-ignore: + - 'docs/**' + - '**/*.md' concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index b6cae3788f..6dcccd1fed 100644 --- a/.github/workflows/docs.yml +++ 
b/.github/workflows/docs.yml @@ -4,9 +4,17 @@ on: push: branches: - develop + paths: + - 'docs/**' + - 'mkdocs.yml' + - '.github/workflows/docs.yml' pull_request: branches: - develop + paths: + - 'docs/**' + - 'mkdocs.yml' + - '.github/workflows/docs.yml' jobs: build: diff --git a/.github/workflows/dynamic_arch.yml b/.github/workflows/dynamic_arch.yml index 22f018ab2c..db077bdff3 100644 --- a/.github/workflows/dynamic_arch.yml +++ b/.github/workflows/dynamic_arch.yml @@ -1,6 +1,15 @@ name: continuous build -on: [push, pull_request, workflow_dispatch] +on: + push: + paths-ignore: + - 'docs/**' + - '**/*.md' + pull_request: + paths-ignore: + - 'docs/**' + - '**/*.md' + workflow_dispatch: concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} diff --git a/.github/workflows/harmonyos.yml b/.github/workflows/harmonyos.yml index 859f2e7873..8d6c52b201 100644 --- a/.github/workflows/harmonyos.yml +++ b/.github/workflows/harmonyos.yml @@ -1,6 +1,14 @@ name: harmonyos -on: [push, pull_request] +on: + push: + paths-ignore: + - 'docs/**' + - '**/*.md' + pull_request: + paths-ignore: + - 'docs/**' + - '**/*.md' concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} diff --git a/.github/workflows/loongarch64.yml b/.github/workflows/loongarch64.yml index 0b6f64782d..058e5c8219 100644 --- a/.github/workflows/loongarch64.yml +++ b/.github/workflows/loongarch64.yml @@ -1,6 +1,14 @@ name: loongarch64 qemu test -on: [push, pull_request] +on: + push: + paths-ignore: + - 'docs/**' + - '**/*.md' + pull_request: + paths-ignore: + - 'docs/**' + - '**/*.md' concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} diff --git a/.github/workflows/loongarch64_clang.yml b/.github/workflows/loongarch64_clang.yml index 42d22be2d2..bf4fed2048 100644 --- a/.github/workflows/loongarch64_clang.yml +++ b/.github/workflows/loongarch64_clang.yml @@ -1,6 +1,14 @@ name: loongarch64 clang qemu test -on: [push, pull_request] 
+on: + push: + paths-ignore: + - 'docs/**' + - '**/*.md' + pull_request: + paths-ignore: + - 'docs/**' + - '**/*.md' concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} diff --git a/.github/workflows/mips64.yml b/.github/workflows/mips64.yml index 1efbf2c576..6ee8d43de7 100644 --- a/.github/workflows/mips64.yml +++ b/.github/workflows/mips64.yml @@ -1,6 +1,14 @@ name: mips64 qemu test -on: [push, pull_request] +on: + push: + paths-ignore: + - 'docs/**' + - '**/*.md' + pull_request: + paths-ignore: + - 'docs/**' + - '**/*.md' concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} diff --git a/.github/workflows/riscv64_vector.yml b/.github/workflows/riscv64_vector.yml index 3b231392cb..9ebbe25465 100644 --- a/.github/workflows/riscv64_vector.yml +++ b/.github/workflows/riscv64_vector.yml @@ -1,6 +1,14 @@ name: riscv64 zvl256b qemu test -on: [push, pull_request] +on: + push: + paths-ignore: + - 'docs/**' + - '**/*.md' + pull_request: + paths-ignore: + - 'docs/**' + - '**/*.md' concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} diff --git a/.github/workflows/windows_arm64.yml b/.github/workflows/windows_arm64.yml index 5a3a8fe84e..811c1fb7d4 100644 --- a/.github/workflows/windows_arm64.yml +++ b/.github/workflows/windows_arm64.yml @@ -4,9 +4,15 @@ on: push: branches: - develop + paths-ignore: + - 'docs/**' + - '**/*.md' pull_request: branches: - develop + paths-ignore: + - 'docs/**' + - '**/*.md' concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} diff --git a/docs/build_system.md b/docs/build_system.md index d5d76cc463..9b76178418 100644 --- a/docs/build_system.md +++ b/docs/build_system.md @@ -98,7 +98,7 @@ though - please read the linked Makefiles if you want to see all variables. 
- `BUILD_DOUBLE`: build the double-precision real functions - `BUILD_COMPLEX`: build the single-precision complex functions - `BUILD_COMPLEX16`: build the double-precision complex functions -- `BUILD_BFLOAT16`: build the "half precision brainfloat" real functions +- `BUILD_BFLOAT16`: build the `bfloat16` real functions - `EXPRECISION`: (do not use, this is a work in progress) option to use `long double` functions diff --git a/docs/developers.md b/docs/developers.md index b2c62eeb68..d47fd5b606 100644 --- a/docs/developers.md +++ b/docs/developers.md @@ -80,8 +80,8 @@ According to the above `KERNEL.HASWELL`, OpenBLAS Haswell dgemm kernel file is ` ["Anatomy of High-Performance Matrix Multiplication"](http://delivery.acm.org/10.1145/1360000/1356053/a12-goto.pdf?ip=155.68.162.54&id=1356053&acc=ACTIVE%20SERVICE&key=A79D83B43E50B5B8%2EF070BBE7E45C3F17%2E4D4702B0C3E38B35%2E4D4702B0C3E38B35&__acm__=1517932837_edfe766f1e295d9a7830812371e1d173). ACM Transactions on Mathematical Software 34 (3): Article 12 - (The above link is available only to ACM members, but this and many related - papers is also available on [the pages of van de Geijn's FLAME project](http://www.cs.utexas.edu/~flame/web/FLAMEPublications.html)) + (The above link is available only to ACM members, but this paper and many + related papers are also available on [the pages of van de Geijn's FLAME project](http://www.cs.utexas.edu/~flame/web/FLAMEPublications.html)) The `driver/level3/level3.c` is the implementation of Goto's algorithm. Meanwhile, you can look at `kernel/generic/gemmkernel_2x2.c`, which is a naive diff --git a/docs/distributing.md b/docs/distributing.md index 98b390a9ff..f0d6889f59 100644 --- a/docs/distributing.md +++ b/docs/distributing.md @@ -4,13 +4,14 @@ This document contains recommendations only - packagers and other redistributors are in charge of how OpenBLAS is built and distributed in their systems, and may have good reasons to deviate from the guidance given on this - page. 
These recommendations are aimed at general packaging systems, with a user - base that typically is large, open source (or freely available at least), and - doesn't behave uniformly or that the packager is directly connected with.* + page. These recommendations are aimed at general packaging systems that are + open source (or at least freely available) and typically serve a large user + base that does not behave uniformly and is not directly connected with the + packager. OpenBLAS has a large number of build-time options which can be used to change how it behaves at runtime, how artifacts or symbols are named, etc. Variation -in build configuration can be necessary to acheive a given end goal within a +in build configuration can be necessary to achieve a given end goal within a distribution or as an end user. However, such variation can also make it more difficult to build on top of OpenBLAS and ship code or other packages in a way that works across many different distros. Here we provide guidance about the @@ -42,7 +43,7 @@ settings): while it does make up a significant part of the binary size of the installed library, that does not outweigh the regression in usability when deviating from the default here.[^1] -3. Always distribute the pkg-config (`.pc`) and CMake `.cmake`) dependency +3. Always distribute the pkg-config (`.pc`) and CMake (`.cmake`) dependency detection files. These files are used by build systems when users want to link against OpenBLAS, and there is no benefit of leaving them out. 4. Provide the LP64 interface by default, and if in addition to that you choose diff --git a/docs/extensions.md b/docs/extensions.md index ea6eff5a23..30db9895d1 100644 --- a/docs/extensions.md +++ b/docs/extensions.md @@ -13,9 +13,8 @@ This page documents those non-standard APIs. 
| ?omatcopy | s,d,c,z | out-of-place transposition/copying | | ?geadd | s,d,c,z | ATLAS-like matrix add `B = α*A+β*B` | | ?gemmt | s,d,c,z | `gemm` but only a triangular part updated | -| cblas_?gemm_batch | s,d,c,z,b | `gemm` with several groups of input data -| -| cblas_?gemm_batch_strided | s,d,c,z,b | `gemm` with groups of data stored at fixed offsets in the input arrays +| cblas_?gemm_batch | s,d,c,z,b | `gemm` with several groups of input data | +| cblas_?gemm_batch_strided | s,d,c,z,b | `gemm` with groups of data stored at fixed offsets in the input arrays | ## bfloat16 functionality @@ -48,4 +47,3 @@ BLAS-like and conversion functions for `hfloat16` (available when OpenBLAS was c * `int openblas_set_affinity(int thread_index, size_t cpusetsize, cpu_set_t *cpuset)` sets the CPU affinity mask of the given thread to the provided cpuset. Only available on Linux, with semantics identical to `pthread_setaffinity_np`. * `openblas_set_thread_callback_function` overrides the default multithreading backend with the provided argument - diff --git a/docs/faq.md b/docs/faq.md index 294204fd88..0ad2bac15c 100644 --- a/docs/faq.md +++ b/docs/faq.md @@ -110,7 +110,7 @@ Zaheer has fixed this bug. You can now use the structure instead of C99 complex ### I get a SEGFAULT with multi-threading on Linux. What's wrong? -This may be related to a bug in the Linux kernel 2.6.32 (?). Try applying the patch segaults.patch to disable mbind using +This may be related to a bug in the Linux kernel 2.6.32 (?). Try applying the patch segfaults.patch to disable mbind using patch < segfaults.patch @@ -213,7 +213,7 @@ AVX-512 (SKYLAKEX) support requires devtoolset-8-gcc-gfortran (which exceeds for ### Building OpenBLAS in QEMU/KVM/XEN -By default, QEMU reports the CPU as "QEMU Virtual CPU version 2.2.0", which shares CPUID with existing 32bit CPU even in 64bit virtual machine, and OpenBLAS recognizes it as PENTIUM2. 
Depending on the exact combination of CPU features the hypervisor choses to expose, this may not correspond to any CPU that exists, and OpenBLAS will error when trying to build. To fix this, pass `-cpu host` or `-cpu passthough` to QEMU, or another CPU model. +By default, QEMU reports the CPU as "QEMU Virtual CPU version 2.2.0", which shares its CPUID with an existing 32-bit CPU even in a 64-bit virtual machine, and OpenBLAS recognizes it as PENTIUM2. Depending on the exact combination of CPU features the hypervisor chooses to expose, this may not correspond to any CPU that exists, and OpenBLAS will error when trying to build. To fix this, pass `-cpu host` to QEMU, or choose another CPU model. Similarly, the XEN hypervisor may not pass through all features of the host cpu while reporting the cpu type itself correctly, which can lead to compiler error messages about an "ABI change" when compiling AVX512 code. Again changing the Xen configuration by running e.g. "xen-cmdline --set-xen cpuid=avx512" should get around this (as would building OpenBLAS for an older cpu lacking that particular feature, e.g. TARGET=HASWELL) @@ -290,7 +290,7 @@ There have been a few reports of wrong calculation results and build-time test f ### Program is Terminated. Because you tried to allocate too many memory regions -In OpenBLAS, we mange a pool of memory buffers and allocate the number of buffers as the following. +In OpenBLAS, we manage a pool of memory buffers and set the number of buffers as follows. ``` #define NUM_BUFFERS (MAX_CPU_NUMBER * 2) ``` @@ -301,7 +301,7 @@ In `Makefile.system`, we will set `MAX_CPU_NUMBER=NUM_THREADS`. ### How to choose TARGET manually at runtime when compiled with DYNAMIC_ARCH -The environment variable which control the kernel selection is `OPENBLAS_CORETYPE` (see `driver/others/dynamic.c`) +The environment variable that controls the kernel selection is `OPENBLAS_CORETYPE` (see `driver/others/dynamic.c`) e.g. `export OPENBLAS_CORETYPE=Haswell`.
And the function `char* openblas_get_corename()` returns the used target. ### After updating the installed OpenBLAS, a program complains about "undefined symbol gotoblas" @@ -325,7 +325,7 @@ Specifying the "correct" library location with the `-L` flag (like `-L /opt/Open ### I want to use OpenBLAS with CUDA in the HPL 2.3 benchmark code but it keeps looking for Intel MKL -You need to edit file src/cuda/cuda_dgemm.c in the NVIDIA version of HPL, change the "handle2" and "handle" dlopen calls to use libopenblas.so instead of libmkl_intel_lp64.so, and add an trailing underscore in the dlsym lines for dgemm_mkl and dtrsm_mkl (like `dgemm_mkl = (void(*)())dlsym(handle, “dgemm_”);`) +You need to edit file src/cuda/cuda_dgemm.c in the NVIDIA version of HPL, change the "handle2" and "handle" dlopen calls to use libopenblas.so instead of libmkl_intel_lp64.so, and add a trailing underscore in the dlsym lines for dgemm_mkl and dtrsm_mkl (like `dgemm_mkl = (void(*)())dlsym(handle, "dgemm_");`) ### Multithreaded OpenBLAS runs no faster or is even slower than singlethreaded on my ARMV7 board diff --git a/docs/user_manual.md b/docs/user_manual.md index 7abdcf0b85..d3b7de1050 100644 --- a/docs/user_manual.md +++ b/docs/user_manual.md @@ -107,9 +107,9 @@ OpenBLAS can be used as a shared or a static library. ### Link a shared library -The shared library is normally called `libopenblas.so`, but not that the name +The shared library is normally called `libopenblas.so`, but note that the name may be different as a result of build flags used or naming choices by a distro -packager (see [distributing.md] for details). To link a shared library named +packager (see [distributing.md](distributing.md) for details). To link a shared library named `libopenblas.so`, the flag `-lopenblas` is needed. To find the OpenBLAS headers, a `-I/path/to/includedir` is needed. And unless the library is installed in a directory that the linker searches by default, also `-L` and `-Wl,-rpath` flags