clang 20.0.0git
AMDGPU.cpp
//===- AMDGPU.cpp ---------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "ABIInfoImpl.h"
#include "TargetInfo.h"
#include "llvm/Support/AMDGPUAddrSpace.h"

using namespace clang;
using namespace clang::CodeGen;

//===----------------------------------------------------------------------===//
// AMDGPU ABI Implementation
//===----------------------------------------------------------------------===//

namespace {

class AMDGPUABIInfo final : public DefaultABIInfo {
private:
  static const unsigned MaxNumRegsForArgsRet = 16;

  unsigned numRegsForType(QualType Ty) const;

  bool isHomogeneousAggregateBaseType(QualType Ty) const override;
  bool isHomogeneousAggregateSmallEnough(const Type *Base,
                                         uint64_t Members) const override;

  // Coerce HIP scalar pointer arguments from generic pointers to global ones.
  llvm::Type *coerceKernelArgumentType(llvm::Type *Ty, unsigned FromAS,
                                       unsigned ToAS) const {
    // Single value types.
    auto *PtrTy = llvm::dyn_cast<llvm::PointerType>(Ty);
    if (PtrTy && PtrTy->getAddressSpace() == FromAS)
      return llvm::PointerType::get(Ty->getContext(), ToAS);
    return Ty;
  }
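  // For instance, under HIP LangAS::Default lowers to the generic address
  // space and LangAS::cuda_device to the global one (address space 1 on
  // amdgcn), so a scalar kernel parameter declared as `int *p` is coerced
  // from a generic `ptr` to `ptr addrspace(1)` in the kernel signature, while
  // aggregates and non-pointer types pass through unchanged.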

public:
  explicit AMDGPUABIInfo(CodeGen::CodeGenTypes &CGT) :
    DefaultABIInfo(CGT) {}

  ABIArgInfo classifyReturnType(QualType RetTy) const;
  ABIArgInfo classifyKernelArgumentType(QualType Ty) const;
  ABIArgInfo classifyArgumentType(QualType Ty, bool Variadic,
                                  unsigned &NumRegsLeft) const;

  void computeInfo(CGFunctionInfo &FI) const override;
  RValue EmitVAArg(CodeGenFunction &CGF, Address VAListAddr, QualType Ty,
                   AggValueSlot Slot) const override;

  llvm::FixedVectorType *
  getOptimalVectorMemoryType(llvm::FixedVectorType *T,
                             const LangOptions &Opt) const override {
    // We have legal instructions for 96-bit so 3x32 can be supported.
    // FIXME: This check should be a subtarget feature as technically SI doesn't
    // support it.
    if (T->getNumElements() == 3 && getDataLayout().getTypeSizeInBits(T) == 96)
      return T;
    return DefaultABIInfo::getOptimalVectorMemoryType(T, Opt);
  }
};

bool AMDGPUABIInfo::isHomogeneousAggregateBaseType(QualType Ty) const {
  return true;
}

bool AMDGPUABIInfo::isHomogeneousAggregateSmallEnough(
    const Type *Base, uint64_t Members) const {
  uint32_t NumRegs = (getContext().getTypeSize(Base) + 31) / 32;

  // Homogeneous Aggregates may occupy at most 16 registers.
  return Members * NumRegs <= MaxNumRegsForArgsRet;
}
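// For example, a homogeneous aggregate of four doubles needs 4 * 2 = 8
// 32-bit registers and is accepted, whereas one built from five 128-bit
// vector members would need 5 * 4 = 20 registers and is rejected.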

/// Estimate number of registers the type will use when passed in registers.
unsigned AMDGPUABIInfo::numRegsForType(QualType Ty) const {
  unsigned NumRegs = 0;

  if (const VectorType *VT = Ty->getAs<VectorType>()) {
    // Compute from the number of elements. The reported size is based on the
    // in-memory size, which includes the padding 4th element for 3-vectors.
    QualType EltTy = VT->getElementType();
    unsigned EltSize = getContext().getTypeSize(EltTy);

    // 16-bit element vectors should be passed as packed.
    if (EltSize == 16)
      return (VT->getNumElements() + 1) / 2;

    unsigned EltNumRegs = (EltSize + 31) / 32;
    return EltNumRegs * VT->getNumElements();
  }

  if (const RecordType *RT = Ty->getAs<RecordType>()) {
    const RecordDecl *RD = RT->getDecl();
    assert(!RD->hasFlexibleArrayMember());

    for (const FieldDecl *Field : RD->fields()) {
      QualType FieldTy = Field->getType();
      NumRegs += numRegsForType(FieldTy);
    }

    return NumRegs;
  }

  return (getContext().getTypeSize(Ty) + 31) / 32;
}
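// For instance, a float4 is counted as 4 registers, a half4 as 2 (packed
// pairs of 16-bit elements), and a struct { float x; double y; } as 1 + 2 = 3.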

void AMDGPUABIInfo::computeInfo(CGFunctionInfo &FI) const {
  llvm::CallingConv::ID CC = FI.getCallingConvention();

  if (!getCXXABI().classifyReturnType(FI))
    FI.getReturnInfo() = classifyReturnType(FI.getReturnType());

  unsigned ArgumentIndex = 0;
  const unsigned numFixedArguments = FI.getNumRequiredArgs();

  unsigned NumRegsLeft = MaxNumRegsForArgsRet;
  for (auto &Arg : FI.arguments()) {
    if (CC == llvm::CallingConv::AMDGPU_KERNEL) {
      Arg.info = classifyKernelArgumentType(Arg.type);
    } else {
      bool FixedArgument = ArgumentIndex++ < numFixedArguments;
      Arg.info = classifyArgumentType(Arg.type, !FixedArgument, NumRegsLeft);
    }
  }
}

RValue AMDGPUABIInfo::EmitVAArg(CodeGenFunction &CGF, Address VAListAddr,
                                QualType Ty, AggValueSlot Slot) const {
  const bool IsIndirect = false;
  const bool AllowHigherAlign = false;
  return emitVoidPtrVAArg(CGF, VAListAddr, Ty, IsIndirect,
                          getContext().getTypeInfoInChars(Ty),
                          CharUnits::fromQuantity(4), AllowHigherAlign, Slot);
}
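// Note that the slot size here is 4 bytes and AllowHigherAlign is false, so
// even an 8-byte argument such as a double is loaded from an address that is
// only guaranteed to be 4-byte aligned.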

ABIArgInfo AMDGPUABIInfo::classifyReturnType(QualType RetTy) const {
  if (isAggregateTypeForABI(RetTy)) {
    // Records with non-trivial destructors/copy-constructors should not be
    // returned by value.
    if (!getRecordArgABI(RetTy, getCXXABI())) {
      // Ignore empty structs/unions.
      if (isEmptyRecord(getContext(), RetTy, true))
        return ABIArgInfo::getIgnore();

      // Lower single-element structs to just return a regular value.
      if (const Type *SeltTy = isSingleElementStruct(RetTy, getContext()))
        return ABIArgInfo::getDirect(CGT.ConvertType(QualType(SeltTy, 0)));

      if (const RecordType *RT = RetTy->getAs<RecordType>()) {
        const RecordDecl *RD = RT->getDecl();
        if (RD->hasFlexibleArrayMember())
          return DefaultABIInfo::classifyReturnType(RetTy);
      }

      // Pack aggregates <= 8 bytes into a single VGPR or pair.
      uint64_t Size = getContext().getTypeSize(RetTy);
      if (Size <= 16)
        return ABIArgInfo::getDirect(llvm::Type::getInt16Ty(getVMContext()));

      if (Size <= 32)
        return ABIArgInfo::getDirect(llvm::Type::getInt32Ty(getVMContext()));

      if (Size <= 64) {
        llvm::Type *I32Ty = llvm::Type::getInt32Ty(getVMContext());
        return ABIArgInfo::getDirect(llvm::ArrayType::get(I32Ty, 2));
      }

      if (numRegsForType(RetTy) <= MaxNumRegsForArgsRet)
        return ABIArgInfo::getDirect();
    }
  }

  // Otherwise just do the default thing.
  return DefaultABIInfo::classifyReturnType(RetTy);
}
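// As a concrete example, a struct of three chars is returned directly as an
// i32, a struct of two floats as [2 x i32], and an aggregate larger than 16
// 32-bit registers falls through to DefaultABIInfo and is returned indirectly.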

/// For kernels, all parameters are really passed in a special buffer. It
/// doesn't make sense to pass anything byval, so everything must be direct.
ABIArgInfo AMDGPUABIInfo::classifyKernelArgumentType(QualType Ty) const {
  Ty = useFirstFieldIfTransparentUnion(Ty);

  // TODO: Can we omit empty structs?

  if (const Type *SeltTy = isSingleElementStruct(Ty, getContext()))
    Ty = QualType(SeltTy, 0);

  llvm::Type *OrigLTy = CGT.ConvertType(Ty);
  llvm::Type *LTy = OrigLTy;
  if (getContext().getLangOpts().HIP) {
    LTy = coerceKernelArgumentType(
        OrigLTy, /*FromAS=*/getContext().getTargetAddressSpace(LangAS::Default),
        /*ToAS=*/getContext().getTargetAddressSpace(LangAS::cuda_device));
  }

  // FIXME: Should also use this for OpenCL, but it requires addressing the
  // problem of kernels being called.
  //
  // FIXME: This doesn't apply the optimization of coercing pointers in structs
  // to global address space when using byref. This would require implementing a
  // new kind of coercion of the in-memory type for indirect arguments.
  if (!getContext().getLangOpts().OpenCL && LTy == OrigLTy &&
      isAggregateTypeForABI(Ty)) {
    return ABIArgInfo::getIndirectAliased(
        getContext().getTypeAlignInChars(Ty),
        getContext().getTargetAddressSpace(LangAS::opencl_constant),
        false /*Realign*/, nullptr /*Padding*/);
  }

  // If we set CanBeFlattened to true, CodeGen will expand the struct to its
  // individual elements, which confuses the Clover OpenCL backend; therefore we
  // have to set it to false here. Other args of getDirect() are just defaults.
  return ABIArgInfo::getDirect(LTy, 0, nullptr, false);
}
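// In practice this means a HIP kernel taking a struct by value receives it
// byref in the constant address space, while an OpenCL kernel taking the same
// struct gets it passed directly (unflattened) as the converted IR type.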

ABIArgInfo AMDGPUABIInfo::classifyArgumentType(QualType Ty, bool Variadic,
                                               unsigned &NumRegsLeft) const {
  assert(NumRegsLeft <= MaxNumRegsForArgsRet && "register estimate underflow");

  Ty = useFirstFieldIfTransparentUnion(Ty);

  if (Variadic) {
    return ABIArgInfo::getDirect(/*T=*/nullptr,
                                 /*Offset=*/0,
                                 /*Padding=*/nullptr,
                                 /*CanBeFlattened=*/false,
                                 /*Align=*/0);
  }

  if (isAggregateTypeForABI(Ty)) {
    // Records with non-trivial destructors/copy-constructors should not be
    // passed by value.
    if (auto RAA = getRecordArgABI(Ty, getCXXABI()))
      return getNaturalAlignIndirect(Ty, RAA == CGCXXABI::RAA_DirectInMemory);

    // Ignore empty structs/unions.
    if (isEmptyRecord(getContext(), Ty, true))
      return ABIArgInfo::getIgnore();

    // Lower single-element structs to just pass a regular value. TODO: We
    // could do reasonable-size multiple-element structs too, using getExpand(),
    // though watch out for things like bitfields.
    if (const Type *SeltTy = isSingleElementStruct(Ty, getContext()))
      return ABIArgInfo::getDirect(CGT.ConvertType(QualType(SeltTy, 0)));

    if (const RecordType *RT = Ty->getAs<RecordType>()) {
      const RecordDecl *RD = RT->getDecl();
      if (RD->hasFlexibleArrayMember())
        return DefaultABIInfo::classifyArgumentType(Ty);
    }

    // Pack aggregates <= 8 bytes into a single VGPR or pair.
    uint64_t Size = getContext().getTypeSize(Ty);
    if (Size <= 64) {
      unsigned NumRegs = (Size + 31) / 32;
      NumRegsLeft -= std::min(NumRegsLeft, NumRegs);

      if (Size <= 16)
        return ABIArgInfo::getDirect(llvm::Type::getInt16Ty(getVMContext()));

      if (Size <= 32)
        return ABIArgInfo::getDirect(llvm::Type::getInt32Ty(getVMContext()));

      // XXX: Should this be i64 instead, and should the limit increase?
      llvm::Type *I32Ty = llvm::Type::getInt32Ty(getVMContext());
      return ABIArgInfo::getDirect(llvm::ArrayType::get(I32Ty, 2));
    }

    if (NumRegsLeft > 0) {
      unsigned NumRegs = numRegsForType(Ty);
      if (NumRegsLeft >= NumRegs) {
        NumRegsLeft -= NumRegs;
        return ABIArgInfo::getDirect();
      }
    }

    // Use pass-by-reference instead of pass-by-value for struct arguments in
    // function ABI.
    return ABIArgInfo::getIndirectAliased(
        getContext().getTypeAlignInChars(Ty),
        getContext().getTargetAddressSpace(LangAS::opencl_private));
  }

  // Otherwise just do the default thing.
  ABIArgInfo ArgInfo = DefaultABIInfo::classifyArgumentType(Ty);
  if (!ArgInfo.isIndirect()) {
    unsigned NumRegs = numRegsForType(Ty);
    NumRegsLeft -= std::min(NumRegs, NumRegsLeft);
  }

  return ArgInfo;
}
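// Putting the pieces together: for a non-kernel function, fixed arguments
// draw from a shared budget of 16 32-bit registers; a 24-byte struct passed
// while only two registers remain is therefore handed over byref in the
// private address space instead of being passed directly.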

class AMDGPUTargetCodeGenInfo : public TargetCodeGenInfo {
public:
  AMDGPUTargetCodeGenInfo(CodeGenTypes &CGT)
      : TargetCodeGenInfo(std::make_unique<AMDGPUABIInfo>(CGT)) {}

  void setFunctionDeclAttributes(const FunctionDecl *FD, llvm::Function *F,
                                 CodeGenModule &CGM) const;

  void emitTargetGlobals(CodeGen::CodeGenModule &CGM) const override;

  void setTargetAttributes(const Decl *D, llvm::GlobalValue *GV,
                           CodeGen::CodeGenModule &M) const override;
  unsigned getOpenCLKernelCallingConv() const override;

  llvm::Constant *getNullPointer(const CodeGen::CodeGenModule &CGM,
                                 llvm::PointerType *T, QualType QT) const override;

  LangAS getASTAllocaAddressSpace() const override {
    return getLangASFromTargetAS(
        getABIInfo().getDataLayout().getAllocaAddrSpace());
  }
  LangAS getGlobalVarAddressSpace(CodeGenModule &CGM,
                                  const VarDecl *D) const override;
  llvm::SyncScope::ID getLLVMSyncScopeID(const LangOptions &LangOpts,
                                         SyncScope Scope,
                                         llvm::AtomicOrdering Ordering,
                                         llvm::LLVMContext &Ctx) const override;
  void setTargetAtomicMetadata(CodeGenFunction &CGF,
                               llvm::Instruction &AtomicInst,
                               const AtomicExpr *Expr = nullptr) const override;
  llvm::Value *createEnqueuedBlockKernel(CodeGenFunction &CGF,
                                         llvm::Function *BlockInvokeFunc,
                                         llvm::Type *BlockTy) const override;
  bool shouldEmitStaticExternCAliases() const override;
  bool shouldEmitDWARFBitFieldSeparators() const override;
  void setCUDAKernelCallingConvention(const FunctionType *&FT) const override;
};
}

static bool requiresAMDGPUProtectedVisibility(const Decl *D,
                                              llvm::GlobalValue *GV) {
  if (GV->getVisibility() != llvm::GlobalValue::HiddenVisibility)
    return false;

  return !D->hasAttr<OMPDeclareTargetDeclAttr>() &&
         (D->hasAttr<OpenCLKernelAttr>() ||
          (isa<FunctionDecl>(D) && D->hasAttr<CUDAGlobalAttr>()) ||
          (isa<VarDecl>(D) &&
           (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>() ||
            cast<VarDecl>(D)->getType()->isCUDADeviceBuiltinSurfaceType() ||
            cast<VarDecl>(D)->getType()->isCUDADeviceBuiltinTextureType())));
}

void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes(
    const FunctionDecl *FD, llvm::Function *F, CodeGenModule &M) const {
  const auto *ReqdWGS =
      M.getLangOpts().OpenCL ? FD->getAttr<ReqdWorkGroupSizeAttr>() : nullptr;
  const bool IsOpenCLKernel =
      M.getLangOpts().OpenCL && FD->hasAttr<OpenCLKernelAttr>();
  const bool IsHIPKernel = M.getLangOpts().HIP && FD->hasAttr<CUDAGlobalAttr>();

  const auto *FlatWGS = FD->getAttr<AMDGPUFlatWorkGroupSizeAttr>();
  if (ReqdWGS || FlatWGS) {
    M.handleAMDGPUFlatWorkGroupSizeAttr(F, FlatWGS, ReqdWGS);
  } else if (IsOpenCLKernel || IsHIPKernel) {
    // By default, restrict the maximum size to a value specified by
    // --gpu-max-threads-per-block=n or its default value for HIP.
    const unsigned OpenCLDefaultMaxWorkGroupSize = 256;
    const unsigned DefaultMaxWorkGroupSize =
        IsOpenCLKernel ? OpenCLDefaultMaxWorkGroupSize
                       : M.getLangOpts().GPUMaxThreadsPerBlock;
    std::string AttrVal =
        std::string("1,") + llvm::utostr(DefaultMaxWorkGroupSize);
    F->addFnAttr("amdgpu-flat-work-group-size", AttrVal);
  }
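  // For example, a plain HIP __global__ kernel with no explicit attributes
  // ends up with "amdgpu-flat-work-group-size"="1,1024" (assuming the default
  // --gpu-max-threads-per-block value has not been overridden), while an
  // OpenCL kernel gets "1,256".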

  if (const auto *Attr = FD->getAttr<AMDGPUWavesPerEUAttr>())
    M.handleAMDGPUWavesPerEUAttr(F, Attr);

  if (const auto *Attr = FD->getAttr<AMDGPUNumSGPRAttr>()) {
    unsigned NumSGPR = Attr->getNumSGPR();

    if (NumSGPR != 0)
      F->addFnAttr("amdgpu-num-sgpr", llvm::utostr(NumSGPR));
  }

  if (const auto *Attr = FD->getAttr<AMDGPUNumVGPRAttr>()) {
    uint32_t NumVGPR = Attr->getNumVGPR();

    if (NumVGPR != 0)
      F->addFnAttr("amdgpu-num-vgpr", llvm::utostr(NumVGPR));
  }
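  // So a function annotated with __attribute__((amdgpu_num_vgpr(64))), for
  // instance, is emitted with the IR function attribute
  // "amdgpu-num-vgpr"="64"; a value of 0 leaves the attribute off entirely.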

  if (const auto *Attr = FD->getAttr<AMDGPUMaxNumWorkGroupsAttr>()) {
    uint32_t X = Attr->getMaxNumWorkGroupsX()
                     ->EvaluateKnownConstInt(M.getContext())
                     .getExtValue();
    // Y and Z dimensions default to 1 if not specified
    uint32_t Y = Attr->getMaxNumWorkGroupsY()
                     ? Attr->getMaxNumWorkGroupsY()
                           ->EvaluateKnownConstInt(M.getContext())
                           .getExtValue()
                     : 1;
    uint32_t Z = Attr->getMaxNumWorkGroupsZ()
                     ? Attr->getMaxNumWorkGroupsZ()
                           ->EvaluateKnownConstInt(M.getContext())
                           .getExtValue()
                     : 1;

    llvm::SmallString<32> AttrVal;