{{ message }}
PR02: System/Environment & Platform Macros#10435
Open
agibsonccc wants to merge 6 commits into
Open
Conversation
Part of the 22-PR split of ag_new_release_updates_2 branch. Merge layer: 0 (no dependencies) Files: 26 See pr-plans/00-master-plan.md for the full split plan and merge order.
This was referenced Jun 15, 2026
Contributor
There was a problem hiding this comment.
Pull request overview
This PR introduces a typed, subsystem-based Environment configuration architecture for libnd4j (Core/CUDA/DSP/Triton/Lifecycle/Print/Memory), alongside platform/macro header updates intended to reduce include fan-out and improve multi-backend isolation support.
Changes:
- Adds new `system/config/*Config` headers + `initFromEnvironment()` implementations to centralize env-var parsing and expose atomic, thread-safe tunables.
- Introduces backend namespace macro infrastructure (`BackendNamespace.h`) and a minimal export-macros header (`sd_export.h`).
- Updates several high-fan-out system headers/macros (`common.h`, `op_boilerplate.h`, `type_boilerplate.h`, `openmp_pragmas.h`) to align with the new architecture and selective rendering/type dispatch needs.
Reviewed changes
Copilot reviewed 25 out of 26 changed files in this pull request and generated 9 comments.
Show a summary per file
| File | Description |
|---|---|
| libnd4j/include/system/common.h | Adds backend namespace header include; adjusts macros/status values; introduces allocation padding macros. |
| libnd4j/include/system/BackendNamespace.h | Defines backend namespace token-paste macros and backend-identification helpers. |
| libnd4j/include/system/sd_export.h | New “minimal” export macro header intended to reduce circular dependencies. |
| libnd4j/include/system/env_functions.h | Declares lightweight env_*() accessors intended to avoid pulling in Environment.h in high-fan-out headers. |
| libnd4j/include/system/config/EnvHelper.h | Adds env-var parsing helpers (bool/int/int64/float/string). |
| libnd4j/include/system/config/CoreConfig.h / impl/CoreConfig.cpp | New typed Core environment subsystem + env initialization. |
| libnd4j/include/system/config/CudaDeviceConfig.h / impl/CudaDeviceConfig.cpp | New typed CUDA device subsystem + env initialization. |
| libnd4j/include/system/config/DspConfig.h / impl/DspConfig.cpp | New typed DSP subsystem + env initialization. |
| libnd4j/include/system/config/TritonConfig.h / impl/TritonConfig.cpp | New typed Triton subsystem + env initialization. |
| libnd4j/include/system/config/MemoryConfig.h / impl/MemoryConfig.cpp | New typed memory subsystem + env initialization. |
| libnd4j/include/system/config/LifecycleConfig.h / impl/LifecycleConfig.cpp | New typed lifecycle tracking subsystem + env initialization. |
| libnd4j/include/system/config/PrintConfig.h / impl/PrintConfig.cpp | New typed print subsystem + env initialization. |
| libnd4j/include/system/op_boilerplate.h | Switches thresholds/debug checks to env_*() accessors; modifies CUDA temp allocation macros. |
| libnd4j/include/system/type_boilerplate.h | Include-path tweaks and type-dispatch macro updates; adds selective rendering include. |
| libnd4j/include/system/type_boiler_plate_expansions.h | Adds expansion support for non-type bool template args. |
| libnd4j/include/system/openmp_pragmas.h | Removes float16/bfloat16 OpenMP reduction declarations; replaces with explanatory comment. |
| libnd4j/include/system/RequirementsHelper.h | Adds helper version/capability requirement helpers. |
| libnd4j/include/system/buffer.h | Rewrites/normalizes a small CPU/GPU buffer helper used mainly for testing. |
💡 Add Copilot custom instructions for smarter, more guided reviews. Learn how to get started.
Comment on lines
+2588
to
2592
| #include <memory/cuda/CudaMemoryPool.h> | ||
|
|
||
| // we intentionally add 8 tail bytes here to avoid problems with atomic operations | ||
| // Use CudaMemoryPool for all CUDA allocations to ensure consistent alloc/free pairing | ||
| #define ALLOCATE_SPECIAL(VARIABLE, WORKSPACE, LENGTH, TT) \ | ||
| if (WORKSPACE == nullptr) { \ |
Comment on lines
+2606
to
+2610
| #define RELEASE_SPECIAL_WITH_DEVICE(VARIABLE, DEVICE_ID, WORKSPACE) \ | ||
| if (VARIABLE != nullptr) { \ | ||
| if (WORKSPACE == nullptr) { \ | ||
| int deviceIdToUse = (DEVICE_ID >= 0) ? DEVICE_ID : 0; \ | ||
| sd::memory::CudaMemoryPool::getInstance().free(reinterpret_cast<void*>(VARIABLE), deviceIdToUse, nullptr); \ |
Comment on lines
+18
to
+27
| // | ||
| // Lightweight free-function accessors for the most commonly queried | ||
| // Environment settings. These are declared here (with no Environment.h | ||
| // dependency) so that high-fan-out headers like logger.h, Threads.h, | ||
| // and op_boilerplate.h can avoid pulling in all 6 subsystem config | ||
| // headers through Environment.h. | ||
| // | ||
| // Implementations live in helpers/impl/logger.cpp (which includes | ||
| // Environment.h for itself). | ||
| // |
Comment on lines
19
to
25
| #ifndef LIBND4J_REQUIREMENTSHELPER_H | ||
| #define LIBND4J_REQUIREMENTSHELPER_H | ||
| #include <ConstMessages.h> | ||
| #include <helpers/HelperVersionRegistry.h> | ||
| #include <helpers/ShapeUtils.h> | ||
| #include <helpers/logger.h> | ||
| #include <system/Environment.h> |
Comment on lines
+26
to
+28
| // DataTypeValidation.h is included for the BUILD_SINGLE_SELECTOR macros. | ||
| // It uses sd_export.h (not common.h) to avoid circular dependencies. | ||
| #include <array/DataTypeValidation.h> |
Comment on lines
+19
to
+23
| // | ||
| // Minimal export macros header to avoid circular dependencies | ||
| // This header should ONLY define SD_LIB_EXPORT and SD_LIB_HIDDEN | ||
| // with no other includes to prevent circular dependency issues. | ||
| // |
Comment on lines
+206
to
+209
| // OpenMP reduction declarations for float16/bfloat16 are in | ||
| // types/omp_reductions.h — included AFTER type headers are fully defined. | ||
| // Do NOT put them here: openmp_pragmas.h is included from common.h | ||
| // which is included by float16.h/bfloat16.h, creating a circular dependency. |
Contributor
Author
Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>
Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Add this suggestion to a batch that can be applied as a single commit. This suggestion is invalid because no changes were made to the code. Suggestions cannot be applied while the pull request is closed. Suggestions cannot be applied while viewing a subset of changes. Only one suggestion per line can be applied in a batch. Add this suggestion to a batch that can be applied as a single commit. Applying suggestions on deleted lines is not supported. You must change the existing code in this line in order to create a valid suggestion. Outdated suggestions cannot be applied. This suggestion has been applied or marked resolved. Suggestions cannot be applied from pending reviews. Suggestions cannot be applied on multi-line comments. Suggestions cannot be applied while the pull request is queued to merge. Suggestion cannot be applied right now. Please check back later.

Summary
PR 02 of 22 PRs in the `ag_new_release_updates_2` branch split. No dependencies — can merge immediately.
- `Environment` delegates to 7 typed config classes (`CoreConfig`, `CudaDeviceConfig`, `DspConfig`, `TritonConfig`, `LifecycleConfig`, `PrintConfig`, `MemoryConfig`)
- `SD_NS` from `SD_BACKEND_NAMESPACE`; enables multi-backend co-loading without symbol conflicts; `sd` namespace preserved as alias
- `readBoolEnvTriState()` returns -1/0/1 — distinguishes absent env var from explicitly false (critical for opt-in flags)
- `env.dsp()`, `env.triton()`, `env.cuda()`, `env.core()`, `env.lifecycle()`, `env.print()` for lock-free concurrent reads
- `common.h` adds `SD_TLS_EXPORT`, `SD_NO_INSTRUMENT`, `SD_CONCAT`; `openmp_pragmas.h` adds `OMP_IF`, `OMP_SCHEDULE`, missing MSVC fixes; `type_boilerplate.h` adds `BUILD_SINGLE_SELECTOR_HALF`
- `common.h` is included by virtually every translation unit — merging this PR causes widespread ccache invalidation
What Changed
New Headers: Backend Namespace (1 file)
- `libnd4j/include/system/BackendNamespace.h` — `SD_NS` macro via token-paste from `SD_BACKEND_NAMESPACE`; `sd` alias for backward compat; `SD_NS_NAME` string for logging
New Headers: Export / Utility (2 files)
- `libnd4j/include/system/sd_export.h` — standalone `SD_LIB_EXPORT` / `SD_LIB_HIDDEN` extracted from common.h
- `libnd4j/include/system/env_functions.h` — `readBoolEnv`, `readBoolEnvTriState`, `readIntEnv`, `readInt64Env`, `readStringEnv` utilities
New: Environment Subsystem Config Headers (7 files)
- `libnd4j/include/system/config/CoreConfig.h` — verbose/debug, TAD/element thresholds, max threads, BLAS fallback; all `std::atomic<T>`
- `libnd4j/include/system/config/CudaDeviceConfig.h` — device ID, limits, tensor core toggle, stream affinity, memory headroom
- `libnd4j/include/system/config/DspConfig.h` — batch-zero modes, batched GEMM, pool trim interval, cast elimination, cuBLAS TF32/workspace, gap capture, OOM retry
- `libnd4j/include/system/config/TritonConfig.h` — build threads, module residency (512 MB LRU), kernel tuning, section fusion scoring, op exclusion lists, CUDA graph flags
- `libnd4j/include/system/config/MemoryConfig.h` — `_poolReleaseThresholdPercent` = 75%; `_nonPeerHeadroomPercent` = 50%
- `libnd4j/include/system/config/LifecycleConfig.h` — funcTrace print options, leak detection, allocation logging
- `libnd4j/include/system/config/PrintConfig.h` — edge items, threshold, NumPy-style summarization
New: Environment Subsystem Config Implementations (7 files)
- `libnd4j/include/system/config/impl/CoreConfig.cpp` — `initFromEnvironment()` reads ~20 ND4J_* env vars
- `libnd4j/include/system/config/impl/CudaDeviceConfig.cpp` — reads ND4J_CUDA_* env vars
- `libnd4j/include/system/config/impl/DspConfig.cpp` — reads ND4J_DSP_* env vars
- `libnd4j/include/system/config/impl/TritonConfig.cpp` — reads ND4J_TRITON_* env vars
- `libnd4j/include/system/config/impl/MemoryConfig.cpp` — reads pool release threshold and non-peer headroom vars
- `libnd4j/include/system/config/impl/LifecycleConfig.cpp` — reads funcTrace and leak detection vars
- `libnd4j/include/system/config/impl/PrintConfig.cpp` — reads ND4J_PRINT_* env vars
Modified: Core Environment Header (1 file)
- `libnd4j/include/system/Environment.h` — holds typed subsystem members; exposes `core()`, `cuda()`, `triton()`, `dsp()`, `lifecycle()`, `print()` accessors; retains all flat accessors for backward compat
Modified: Platform Macro Headers (4 files)
- `libnd4j/include/system/common.h` — includes `BackendNamespace.h`; adds `SD_TLS_EXPORT`, `SD_NO_INSTRUMENT`, `SD_CONCAT` / `SD_CONCAT_IMPL`
- `libnd4j/include/system/openmp_pragmas.h` — adds `OMP_IF`, `OMP_SCHEDULE`, `PRAGMA_OMP_PARALLEL_FOR_THREADS(n)`; MSVC `PRAGMA_OMP_ATOMIC` fix
- `libnd4j/include/system/type_boilerplate.h` — adds `BUILD_SINGLE_SELECTOR_HALF`; fixes `BUILD_DOUBLE_SELECTOR` / `BUILD_TRIPLE_SELECTOR` edge cases
- `libnd4j/include/system/type_boiler_plate_expansions.h` — float8/half2 type combination entries
Modified: Other System Headers (3 files)
- `libnd4j/include/system/RequirementsHelper.h` — assertion helpers for subsystem config validation
- `libnd4j/include/system/buffer.h` — minor cleanup; include guard alignment
- `libnd4j/include/system/op_boilerplate.h` — op dispatch macros aligned with type_boilerplate additions
Dependencies
`env.dsp().xxx()` / `env.triton().xxx()`
Merge Order
These 22 PRs must merge in layer order. Each layer depends on the layers above it being merged first. PRs within the same layer are independent and can merge in parallel.