xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp (revision c9539b89010900499a200cdd6c0265ea5d950875)
1 //===-- R600ISelLowering.cpp - R600 DAG Lowering Implementation -----------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// Custom DAG lowering for R600
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "R600ISelLowering.h"
15 #include "AMDGPU.h"
16 #include "MCTargetDesc/R600MCTargetDesc.h"
17 #include "R600Defines.h"
18 #include "R600InstrInfo.h"
19 #include "R600MachineFunctionInfo.h"
20 #include "R600Subtarget.h"
21 #include "llvm/CodeGen/MachineFunction.h"
22 #include "llvm/IR/IntrinsicsAMDGPU.h"
23 #include "llvm/IR/IntrinsicsR600.h"
24 
25 using namespace llvm;
26 
27 #include "R600GenCallingConv.inc"
28 
29 R600TargetLowering::R600TargetLowering(const TargetMachine &TM,
30                                        const R600Subtarget &STI)
31     : AMDGPUTargetLowering(TM, STI), Subtarget(&STI), Gen(STI.getGeneration()) {
32   addRegisterClass(MVT::f32, &R600::R600_Reg32RegClass);
33   addRegisterClass(MVT::i32, &R600::R600_Reg32RegClass);
34   addRegisterClass(MVT::v2f32, &R600::R600_Reg64RegClass);
35   addRegisterClass(MVT::v2i32, &R600::R600_Reg64RegClass);
36   addRegisterClass(MVT::v4f32, &R600::R600_Reg128RegClass);
37   addRegisterClass(MVT::v4i32, &R600::R600_Reg128RegClass);
38 
39   setBooleanContents(ZeroOrNegativeOneBooleanContent);
40   setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
41 
42   computeRegisterProperties(Subtarget->getRegisterInfo());
43 
44   // Legalize loads and stores to the private address space.
45   setOperationAction(ISD::LOAD, {MVT::i32, MVT::v2i32, MVT::v4i32}, Custom);
46 
47   // EXTLOAD should be the same as ZEXTLOAD. It is legal for some address
48   // spaces, so it is custom lowered to handle those where it isn't.
49   for (auto Op : {ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD})
50     for (MVT VT : MVT::integer_valuetypes()) {
51       setLoadExtAction(Op, VT, MVT::i1, Promote);
52       setLoadExtAction(Op, VT, MVT::i8, Custom);
53       setLoadExtAction(Op, VT, MVT::i16, Custom);
54     }
55 
56   // Workaround for LegalizeDAG asserting on expansion of i1 vector loads.
57   setLoadExtAction({ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD}, MVT::v2i32,
58                    MVT::v2i1, Expand);
59 
60   setLoadExtAction({ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD}, MVT::v4i32,
61                    MVT::v4i1, Expand);
62 
63   setOperationAction(ISD::STORE, {MVT::i8, MVT::i32, MVT::v2i32, MVT::v4i32},
64                      Custom);
65 
66   setTruncStoreAction(MVT::i32, MVT::i8, Custom);
67   setTruncStoreAction(MVT::i32, MVT::i16, Custom);
68   // We need to include these since trunc STORES to PRIVATE need
69   // special handling to accommodate RMW
70   setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom);
71   setTruncStoreAction(MVT::v4i32, MVT::v4i16, Custom);
72   setTruncStoreAction(MVT::v8i32, MVT::v8i16, Custom);
73   setTruncStoreAction(MVT::v16i32, MVT::v16i16, Custom);
74   setTruncStoreAction(MVT::v32i32, MVT::v32i16, Custom);
75   setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom);
76   setTruncStoreAction(MVT::v4i32, MVT::v4i8, Custom);
77   setTruncStoreAction(MVT::v8i32, MVT::v8i8, Custom);
78   setTruncStoreAction(MVT::v16i32, MVT::v16i8, Custom);
79   setTruncStoreAction(MVT::v32i32, MVT::v32i8, Custom);
80 
81   // Workaround for LegalizeDAG asserting on expansion of i1 vector stores.
82   setTruncStoreAction(MVT::v2i32, MVT::v2i1, Expand);
83   setTruncStoreAction(MVT::v4i32, MVT::v4i1, Expand);
84 
85   // Set condition code actions
86   setCondCodeAction({ISD::SETO, ISD::SETUO, ISD::SETLT, ISD::SETLE, ISD::SETOLT,
87                      ISD::SETOLE, ISD::SETONE, ISD::SETUEQ, ISD::SETUGE,
88                      ISD::SETUGT, ISD::SETULT, ISD::SETULE},
89                     MVT::f32, Expand);
90 
91   setCondCodeAction({ISD::SETLE, ISD::SETLT, ISD::SETULE, ISD::SETULT},
92                     MVT::i32, Expand);
93 
94   setOperationAction({ISD::FCOS, ISD::FSIN}, MVT::f32, Custom);
95 
96   setOperationAction(ISD::SETCC, {MVT::v4i32, MVT::v2i32}, Expand);
97 
98   setOperationAction(ISD::BR_CC, {MVT::i32, MVT::f32}, Expand);
99   setOperationAction(ISD::BRCOND, MVT::Other, Custom);
100 
101   setOperationAction(ISD::FSUB, MVT::f32, Expand);
102 
103   setOperationAction({ISD::FCEIL, ISD::FTRUNC, ISD::FRINT, ISD::FFLOOR},
104                      MVT::f64, Custom);
105 
106   setOperationAction(ISD::SELECT_CC, {MVT::f32, MVT::i32}, Custom);
107 
108   setOperationAction(ISD::SETCC, {MVT::i32, MVT::f32}, Expand);
109   setOperationAction({ISD::FP_TO_UINT, ISD::FP_TO_SINT}, {MVT::i1, MVT::i64},
110                      Custom);
111 
112   setOperationAction(ISD::SELECT, {MVT::i32, MVT::f32, MVT::v2i32, MVT::v4i32},
113                      Expand);
114 
115   // ADD, SUB overflow.
116   // TODO: turn these into Legal?
117   if (Subtarget->hasCARRY())
118     setOperationAction(ISD::UADDO, MVT::i32, Custom);
119 
120   if (Subtarget->hasBORROW())
121     setOperationAction(ISD::USUBO, MVT::i32, Custom);
122 
123   // Expand sign extension of vectors
124   if (!Subtarget->hasBFE())
125     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
126 
127   setOperationAction(ISD::SIGN_EXTEND_INREG, {MVT::v2i1, MVT::v4i1}, Expand);
128 
129   if (!Subtarget->hasBFE())
130     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand);
131   setOperationAction(ISD::SIGN_EXTEND_INREG, {MVT::v2i8, MVT::v4i8}, Expand);
132 
133   if (!Subtarget->hasBFE())
134     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
135   setOperationAction(ISD::SIGN_EXTEND_INREG, {MVT::v2i16, MVT::v4i16}, Expand);
136 
137   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
138   setOperationAction(ISD::SIGN_EXTEND_INREG, {MVT::v2i32, MVT::v4i32}, Expand);
139 
140   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Expand);
141 
142   setOperationAction(ISD::FrameIndex, MVT::i32, Custom);
143 
144   setOperationAction(ISD::EXTRACT_VECTOR_ELT,
145                      {MVT::v2i32, MVT::v2f32, MVT::v4i32, MVT::v4f32}, Custom);
146 
147   setOperationAction(ISD::INSERT_VECTOR_ELT,
148                      {MVT::v2i32, MVT::v2f32, MVT::v4i32, MVT::v4f32}, Custom);
149 
150   // We don't have 64-bit shifts. Thus we need either SHX i64 or SHX_PARTS i32
151   //  to be Legal/Custom in order to avoid library calls.
152   setOperationAction({ISD::SHL_PARTS, ISD::SRL_PARTS, ISD::SRA_PARTS}, MVT::i32,
153                      Custom);
154 
155   if (!Subtarget->hasFMA())
156     setOperationAction(ISD::FMA, {MVT::f32, MVT::f64}, Expand);
157 
158   // FIXME: May need no denormals check
159   setOperationAction(ISD::FMAD, MVT::f32, Legal);
160 
161   if (!Subtarget->hasBFI())
162     // fcopysign can be done in a single instruction with BFI.
163     setOperationAction(ISD::FCOPYSIGN, {MVT::f32, MVT::f64}, Expand);
164 
165   if (!Subtarget->hasBCNT(32))
166     setOperationAction(ISD::CTPOP, MVT::i32, Expand);
167 
168   if (!Subtarget->hasBCNT(64))
169     setOperationAction(ISD::CTPOP, MVT::i64, Expand);
170 
171   if (Subtarget->hasFFBH())
172     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom);
173 
174   if (Subtarget->hasFFBL())
175     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Custom);
176 
177   // FIXME: This was moved from AMDGPUTargetLowering, I'm not sure if we
178   // need it for R600.
179   if (Subtarget->hasBFE())
180     setHasExtractBitsInsn(true);
181 
182   setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
183 
184   const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
185   for (MVT VT : ScalarIntVTs)
186     setOperationAction({ISD::ADDC, ISD::SUBC, ISD::ADDE, ISD::SUBE}, VT,
187                        Expand);
188 
189   // LLVM will expand these to atomic_cmp_swap(0)
190   // and atomic_swap, respectively.
191   setOperationAction({ISD::ATOMIC_LOAD, ISD::ATOMIC_STORE}, MVT::i32, Expand);
192 
193   // We need to custom lower some of the intrinsics
194   setOperationAction({ISD::INTRINSIC_VOID, ISD::INTRINSIC_WO_CHAIN}, MVT::Other,
195                      Custom);
196 
197   setSchedulingPreference(Sched::Source);
198 
199   setTargetDAGCombine({ISD::FP_ROUND, ISD::FP_TO_SINT, ISD::EXTRACT_VECTOR_ELT,
200                        ISD::SELECT_CC, ISD::INSERT_VECTOR_ELT, ISD::LOAD});
201 }
202 
203 static inline bool isEOP(MachineBasicBlock::iterator I) {
204   if (std::next(I) == I->getParent()->end())
205     return false;
206   return std::next(I)->getOpcode() == R600::RETURN;
207 }
208 
209 MachineBasicBlock *
210 R600TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
211                                                 MachineBasicBlock *BB) const {
212   MachineFunction *MF = BB->getParent();
213   MachineRegisterInfo &MRI = MF->getRegInfo();
214   MachineBasicBlock::iterator I = MI;
215   const R600InstrInfo *TII = Subtarget->getInstrInfo();
216 
217   switch (MI.getOpcode()) {
218   default:
219     // Replace LDS_*_RET instruction that don't have any uses with the
220     // equivalent LDS_*_NORET instruction.
221     if (TII->isLDSRetInstr(MI.getOpcode())) {
222       int DstIdx = TII->getOperandIdx(MI.getOpcode(), R600::OpName::dst);
223       assert(DstIdx != -1);
224       MachineInstrBuilder NewMI;
225       // FIXME: getLDSNoRetOp method only handles LDS_1A1D LDS ops. Add
226       //        LDS_1A2D support and remove this special case.
227       if (!MRI.use_empty(MI.getOperand(DstIdx).getReg()) ||
228           MI.getOpcode() == R600::LDS_CMPST_RET)
229         return BB;
230 
231       NewMI = BuildMI(*BB, I, BB->findDebugLoc(I),
232                       TII->get(R600::getLDSNoRetOp(MI.getOpcode())));
233       for (const MachineOperand &MO : llvm::drop_begin(MI.operands()))
234         NewMI.add(MO);
235     } else {
236       return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
237     }
238     break;
239 
240   case R600::FABS_R600: {
241     MachineInstr *NewMI = TII->buildDefaultInstruction(
242         *BB, I, R600::MOV, MI.getOperand(0).getReg(),
243         MI.getOperand(1).getReg());
244     TII->addFlag(*NewMI, 0, MO_FLAG_ABS);
245     break;
246   }
247 
248   case R600::FNEG_R600: {
249     MachineInstr *NewMI = TII->buildDefaultInstruction(
250         *BB, I, R600::MOV, MI.getOperand(0).getReg(),
251         MI.getOperand(1).getReg());
252     TII->addFlag(*NewMI, 0, MO_FLAG_NEG);
253     break;
254   }
255 
256   case R600::MASK_WRITE: {
257     Register maskedRegister = MI.getOperand(0).getReg();
258     assert(maskedRegister.isVirtual());
259     MachineInstr * defInstr = MRI.getVRegDef(maskedRegister);
260     TII->addFlag(*defInstr, 0, MO_FLAG_MASK);
261     break;
262   }
263 
264   case R600::MOV_IMM_F32:
265     TII->buildMovImm(*BB, I, MI.getOperand(0).getReg(), MI.getOperand(1)
266                                                             .getFPImm()
267                                                             ->getValueAPF()
268                                                             .bitcastToAPInt()
269                                                             .getZExtValue());
270     break;
271 
272   case R600::MOV_IMM_I32:
273     TII->buildMovImm(*BB, I, MI.getOperand(0).getReg(),
274                      MI.getOperand(1).getImm());
275     break;
276 
277   case R600::MOV_IMM_GLOBAL_ADDR: {
278     //TODO: Perhaps combine this instruction with the next if possible
279     auto MIB = TII->buildDefaultInstruction(
280         *BB, MI, R600::MOV, MI.getOperand(0).getReg(), R600::ALU_LITERAL_X);
281     int Idx = TII->getOperandIdx(*MIB, R600::OpName::literal);
282     //TODO: Ugh this is rather ugly
283     const MachineOperand &MO = MI.getOperand(1);
284     MIB->getOperand(Idx).ChangeToGA(MO.getGlobal(), MO.getOffset(),
285                                     MO.getTargetFlags());
286     break;
287   }
288 
289   case R600::CONST_COPY: {
290     MachineInstr *NewMI = TII->buildDefaultInstruction(
291         *BB, MI, R600::MOV, MI.getOperand(0).getReg(), R600::ALU_CONST);
292     TII->setImmOperand(*NewMI, R600::OpName::src0_sel,
293                        MI.getOperand(1).getImm());
294     break;
295   }
296 
297   case R600::RAT_WRITE_CACHELESS_32_eg:
298   case R600::RAT_WRITE_CACHELESS_64_eg:
299   case R600::RAT_WRITE_CACHELESS_128_eg:
300     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI.getOpcode()))
301         .add(MI.getOperand(0))
302         .add(MI.getOperand(1))
303         .addImm(isEOP(I)); // Set End of program bit
304     break;
305 
306   case R600::RAT_STORE_TYPED_eg:
307     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI.getOpcode()))
308         .add(MI.getOperand(0))
309         .add(MI.getOperand(1))
310         .add(MI.getOperand(2))
311         .addImm(isEOP(I)); // Set End of program bit
312     break;
313 
314   case R600::BRANCH:
315     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(R600::JUMP))
316         .add(MI.getOperand(0));
317     break;
318 
319   case R600::BRANCH_COND_f32: {
320     MachineInstr *NewMI =
321         BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(R600::PRED_X),
322                 R600::PREDICATE_BIT)
323             .add(MI.getOperand(1))
324             .addImm(R600::PRED_SETNE)
325             .addImm(0); // Flags
326     TII->addFlag(*NewMI, 0, MO_FLAG_PUSH);
327     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(R600::JUMP_COND))
328         .add(MI.getOperand(0))
329         .addReg(R600::PREDICATE_BIT, RegState::Kill);
330     break;
331   }
332 
333   case R600::BRANCH_COND_i32: {
334     MachineInstr *NewMI =
335         BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(R600::PRED_X),
336                 R600::PREDICATE_BIT)
337             .add(MI.getOperand(1))
338             .addImm(R600::PRED_SETNE_INT)
339             .addImm(0); // Flags
340     TII->addFlag(*NewMI, 0, MO_FLAG_PUSH);
341     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(R600::JUMP_COND))
342         .add(MI.getOperand(0))
343         .addReg(R600::PREDICATE_BIT, RegState::Kill);
344     break;
345   }
346 
347   case R600::EG_ExportSwz:
348   case R600::R600_ExportSwz: {
349     // Instruction is left unmodified if its not the last one of its type
350     bool isLastInstructionOfItsType = true;
351     unsigned InstExportType = MI.getOperand(1).getImm();
352     for (MachineBasicBlock::iterator NextExportInst = std::next(I),
353          EndBlock = BB->end(); NextExportInst != EndBlock;
354          NextExportInst = std::next(NextExportInst)) {
355       if (NextExportInst->getOpcode() == R600::EG_ExportSwz ||
356           NextExportInst->getOpcode() == R600::R600_ExportSwz) {
357         unsigned CurrentInstExportType = NextExportInst->getOperand(1)
358             .getImm();
359         if (CurrentInstExportType == InstExportType) {
360           isLastInstructionOfItsType = false;
361           break;
362         }
363       }
364     }
365     bool EOP = isEOP(I);
366     if (!EOP && !isLastInstructionOfItsType)
367       return BB;
368     unsigned CfInst = (MI.getOpcode() == R600::EG_ExportSwz) ? 84 : 40;
369     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI.getOpcode()))
370         .add(MI.getOperand(0))
371         .add(MI.getOperand(1))
372         .add(MI.getOperand(2))
373         .add(MI.getOperand(3))
374         .add(MI.getOperand(4))
375         .add(MI.getOperand(5))
376         .add(MI.getOperand(6))
377         .addImm(CfInst)
378         .addImm(EOP);
379     break;
380   }
381   case R600::RETURN: {
382     return BB;
383   }
384   }
385 
386   MI.eraseFromParent();
387   return BB;
388 }
389 
390 //===----------------------------------------------------------------------===//
391 // Custom DAG Lowering Operations
392 //===----------------------------------------------------------------------===//
393 
394 SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
395   MachineFunction &MF = DAG.getMachineFunction();
396   R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
397   switch (Op.getOpcode()) {
398   default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
399   case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
400   case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
401   case ISD::SHL_PARTS:
402   case ISD::SRA_PARTS:
403   case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG);
404   case ISD::UADDO: return LowerUADDSUBO(Op, DAG, ISD::ADD, AMDGPUISD::CARRY);
405   case ISD::USUBO: return LowerUADDSUBO(Op, DAG, ISD::SUB, AMDGPUISD::BORROW);
406   case ISD::FCOS:
407   case ISD::FSIN: return LowerTrig(Op, DAG);
408   case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
409   case ISD::STORE: return LowerSTORE(Op, DAG);
410   case ISD::LOAD: {
411     SDValue Result = LowerLOAD(Op, DAG);
412     assert((!Result.getNode() ||
413             Result.getNode()->getNumValues() == 2) &&
414            "Load should return a value and a chain");
415     return Result;
416   }
417 
418   case ISD::BRCOND: return LowerBRCOND(Op, DAG);
419   case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG);
420   case ISD::FrameIndex: return lowerFrameIndex(Op, DAG);
421   case ISD::INTRINSIC_VOID: {
422     SDValue Chain = Op.getOperand(0);
423     unsigned IntrinsicID =
424                          cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
425     switch (IntrinsicID) {
426     case Intrinsic::r600_store_swizzle: {
427       SDLoc DL(Op);
428       const SDValue Args[8] = {
429         Chain,
430         Op.getOperand(2), // Export Value
431         Op.getOperand(3), // ArrayBase
432         Op.getOperand(4), // Type
433         DAG.getConstant(0, DL, MVT::i32), // SWZ_X
434         DAG.getConstant(1, DL, MVT::i32), // SWZ_Y
435         DAG.getConstant(2, DL, MVT::i32), // SWZ_Z
436         DAG.getConstant(3, DL, MVT::i32) // SWZ_W
437       };
438       return DAG.getNode(AMDGPUISD::R600_EXPORT, DL, Op.getValueType(), Args);
439     }
440 
441     // default for switch(IntrinsicID)
442     default: break;
443     }
444     // break out of case ISD::INTRINSIC_VOID in switch(Op.getOpcode())
445     break;
446   }
447   case ISD::INTRINSIC_WO_CHAIN: {
448     unsigned IntrinsicID =
449                          cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
450     EVT VT = Op.getValueType();
451     SDLoc DL(Op);
452     switch (IntrinsicID) {
453     case Intrinsic::r600_tex:
454     case Intrinsic::r600_texc: {
455       unsigned TextureOp;
456       switch (IntrinsicID) {
457       case Intrinsic::r600_tex:
458         TextureOp = 0;
459         break;
460       case Intrinsic::r600_texc:
461         TextureOp = 1;
462         break;
463       default:
464         llvm_unreachable("unhandled texture operation");
465       }
466 
467       SDValue TexArgs[19] = {
468         DAG.getConstant(TextureOp, DL, MVT::i32),
469         Op.getOperand(1),
470         DAG.getConstant(0, DL, MVT::i32),
471         DAG.getConstant(1, DL, MVT::i32),
472         DAG.getConstant(2, DL, MVT::i32),
473         DAG.getConstant(3, DL, MVT::i32),
474         Op.getOperand(2),
475         Op.getOperand(3),
476         Op.getOperand(4),
477         DAG.getConstant(0, DL, MVT::i32),
478         DAG.getConstant(1, DL, MVT::i32),
479         DAG.getConstant(2, DL, MVT::i32),
480         DAG.getConstant(3, DL, MVT::i32),
481         Op.getOperand(5),
482         Op.getOperand(6),
483         Op.getOperand(7),
484         Op.getOperand(8),
485         Op.getOperand(9),
486         Op.getOperand(10)
487       };
488       return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, MVT::v4f32, TexArgs);
489     }
490     case Intrinsic::r600_dot4: {
491       SDValue Args[8] = {
492       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
493           DAG.getConstant(0, DL, MVT::i32)),
494       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
495           DAG.getConstant(0, DL, MVT::i32)),
496       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
497           DAG.getConstant(1, DL, MVT::i32)),
498       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
499           DAG.getConstant(1, DL, MVT::i32)),
500       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
501           DAG.getConstant(2, DL, MVT::i32)),
502       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
503           DAG.getConstant(2, DL, MVT::i32)),
504       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
505           DAG.getConstant(3, DL, MVT::i32)),
506       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
507           DAG.getConstant(3, DL, MVT::i32))
508       };
509       return DAG.getNode(AMDGPUISD::DOT4, DL, MVT::f32, Args);
510     }
511 
512     case Intrinsic::r600_implicitarg_ptr: {
513       MVT PtrVT = getPointerTy(DAG.getDataLayout(), AMDGPUAS::PARAM_I_ADDRESS);
514       uint32_t ByteOffset = getImplicitParameterOffset(MF, FIRST_IMPLICIT);
515       return DAG.getConstant(ByteOffset, DL, PtrVT);
516     }
517     case Intrinsic::r600_read_ngroups_x:
518       return LowerImplicitParameter(DAG, VT, DL, 0);
519     case Intrinsic::r600_read_ngroups_y:
520       return LowerImplicitParameter(DAG, VT, DL, 1);
521     case Intrinsic::r600_read_ngroups_z:
522       return LowerImplicitParameter(DAG, VT, DL, 2);
523     case Intrinsic::r600_read_global_size_x:
524       return LowerImplicitParameter(DAG, VT, DL, 3);
525     case Intrinsic::r600_read_global_size_y:
526       return LowerImplicitParameter(DAG, VT, DL, 4);
527     case Intrinsic::r600_read_global_size_z:
528       return LowerImplicitParameter(DAG, VT, DL, 5);
529     case Intrinsic::r600_read_local_size_x:
530       return LowerImplicitParameter(DAG, VT, DL, 6);
531     case Intrinsic::r600_read_local_size_y:
532       return LowerImplicitParameter(DAG, VT, DL, 7);
533     case Intrinsic::r600_read_local_size_z:
534       return LowerImplicitParameter(DAG, VT, DL, 8);
535 
536     case Intrinsic::r600_read_tgid_x:
537     case Intrinsic::amdgcn_workgroup_id_x:
538       return CreateLiveInRegisterRaw(DAG, &R600::R600_TReg32RegClass,
539                                      R600::T1_X, VT);
540     case Intrinsic::r600_read_tgid_y:
541     case Intrinsic::amdgcn_workgroup_id_y:
542       return CreateLiveInRegisterRaw(DAG, &R600::R600_TReg32RegClass,
543                                      R600::T1_Y, VT);
544     case Intrinsic::r600_read_tgid_z:
545     case Intrinsic::amdgcn_workgroup_id_z:
546       return CreateLiveInRegisterRaw(DAG, &R600::R600_TReg32RegClass,
547                                      R600::T1_Z, VT);
548     case Intrinsic::r600_read_tidig_x:
549     case Intrinsic::amdgcn_workitem_id_x:
550       return CreateLiveInRegisterRaw(DAG, &R600::R600_TReg32RegClass,
551                                      R600::T0_X, VT);
552     case Intrinsic::r600_read_tidig_y:
553     case Intrinsic::amdgcn_workitem_id_y:
554       return CreateLiveInRegisterRaw(DAG, &R600::R600_TReg32RegClass,
555                                      R600::T0_Y, VT);
556     case Intrinsic::r600_read_tidig_z:
557     case Intrinsic::amdgcn_workitem_id_z:
558       return CreateLiveInRegisterRaw(DAG, &R600::R600_TReg32RegClass,
559                                      R600::T0_Z, VT);
560 
561     case Intrinsic::r600_recipsqrt_ieee:
562       return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
563 
564     case Intrinsic::r600_recipsqrt_clamped:
565       return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
566     default:
567       return Op;
568     }
569 
570     // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode())
571     break;
572   }
573   } // end switch(Op.getOpcode())
574   return SDValue();
575 }
576 
577 void R600TargetLowering::ReplaceNodeResults(SDNode *N,
578                                             SmallVectorImpl<SDValue> &Results,
579                                             SelectionDAG &DAG) const {
580   switch (N->getOpcode()) {
581   default:
582     AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG);
583     return;
584   case ISD::FP_TO_UINT:
585     if (N->getValueType(0) == MVT::i1) {
586       Results.push_back(lowerFP_TO_UINT(N->getOperand(0), DAG));
587       return;
588     }
589     // Since we don't care about out of bounds values we can use FP_TO_SINT for
590     // uints too. The DAGLegalizer code for uint considers some extra cases
591     // which are not necessary here.
592     LLVM_FALLTHROUGH;
593   case ISD::FP_TO_SINT: {
594     if (N->getValueType(0) == MVT::i1) {
595       Results.push_back(lowerFP_TO_SINT(N->getOperand(0), DAG));
596       return;
597     }
598 
599     SDValue Result;
600     if (expandFP_TO_SINT(N, Result, DAG))
601       Results.push_back(Result);
602     return;
603   }
604   case ISD::SDIVREM: {
605     SDValue Op = SDValue(N, 1);
606     SDValue RES = LowerSDIVREM(Op, DAG);
607     Results.push_back(RES);
608     Results.push_back(RES.getValue(1));
609     break;
610   }
611   case ISD::UDIVREM: {
612     SDValue Op = SDValue(N, 0);
613     LowerUDIVREM64(Op, DAG, Results);
614     break;
615   }
616   }
617 }
618 
619 SDValue R600TargetLowering::vectorToVerticalVector(SelectionDAG &DAG,
620                                                    SDValue Vector) const {
621   SDLoc DL(Vector);
622   EVT VecVT = Vector.getValueType();
623   EVT EltVT = VecVT.getVectorElementType();
624   SmallVector<SDValue, 8> Args;
625 
626   for (unsigned i = 0, e = VecVT.getVectorNumElements(); i != e; ++i) {
627     Args.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Vector,
628                                DAG.getVectorIdxConstant(i, DL)));
629   }
630 
631   return DAG.getNode(AMDGPUISD::BUILD_VERTICAL_VECTOR, DL, VecVT, Args);
632 }
633 
634 SDValue R600TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
635                                                     SelectionDAG &DAG) const {
636   SDLoc DL(Op);
637   SDValue Vector = Op.getOperand(0);
638   SDValue Index = Op.getOperand(1);
639 
640   if (isa<ConstantSDNode>(Index) ||
641       Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR)
642     return Op;
643 
644   Vector = vectorToVerticalVector(DAG, Vector);
645   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getValueType(),
646                      Vector, Index);
647 }
648 
649 SDValue R600TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
650                                                    SelectionDAG &DAG) const {
651   SDLoc DL(Op);
652   SDValue Vector = Op.getOperand(0);
653   SDValue Value = Op.getOperand(1);
654   SDValue Index = Op.getOperand(2);
655 
656   if (isa<ConstantSDNode>(Index) ||
657       Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR)
658     return Op;
659 
660   Vector = vectorToVerticalVector(DAG, Vector);
661   SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, Op.getValueType(),
662                                Vector, Value, Index);
663   return vectorToVerticalVector(DAG, Insert);
664 }
665 
666 SDValue R600TargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
667                                                SDValue Op,
668                                                SelectionDAG &DAG) const {
669   GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
670   if (GSD->getAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS)
671     return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
672 
673   const DataLayout &DL = DAG.getDataLayout();
674   const GlobalValue *GV = GSD->getGlobal();
675   MVT ConstPtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS);
676 
677   SDValue GA = DAG.getTargetGlobalAddress(GV, SDLoc(GSD), ConstPtrVT);
678   return DAG.getNode(AMDGPUISD::CONST_DATA_PTR, SDLoc(GSD), ConstPtrVT, GA);
679 }
680 
681 SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
682   // On hw >= R700, COS/SIN input must be between -1. and 1.
683   // Thus we lower them to TRIG ( FRACT ( x / 2Pi + 0.5) - 0.5)
684   EVT VT = Op.getValueType();
685   SDValue Arg = Op.getOperand(0);
686   SDLoc DL(Op);
687 
688   // TODO: Should this propagate fast-math-flags?
689   SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, DL, VT,
690       DAG.getNode(ISD::FADD, DL, VT,
691         DAG.getNode(ISD::FMUL, DL, VT, Arg,
692           DAG.getConstantFP(0.15915494309, DL, MVT::f32)),
693         DAG.getConstantFP(0.5, DL, MVT::f32)));
694   unsigned TrigNode;
695   switch (Op.getOpcode()) {
696   case ISD::FCOS:
697     TrigNode = AMDGPUISD::COS_HW;
698     break;
699   case ISD::FSIN:
700     TrigNode = AMDGPUISD::SIN_HW;
701     break;
702   default:
703     llvm_unreachable("Wrong trig opcode");
704   }
705   SDValue TrigVal = DAG.getNode(TrigNode, DL, VT,
706       DAG.getNode(ISD::FADD, DL, VT, FractPart,
707         DAG.getConstantFP(-0.5, DL, MVT::f32)));
708   if (Gen >= AMDGPUSubtarget::R700)
709     return TrigVal;
710   // On R600 hw, COS/SIN input must be between -Pi and Pi.
711   return DAG.getNode(ISD::FMUL, DL, VT, TrigVal,
712       DAG.getConstantFP(numbers::pif, DL, MVT::f32));
713 }
714 
715 SDValue R600TargetLowering::LowerShiftParts(SDValue Op,
716                                             SelectionDAG &DAG) const {
717   SDValue Lo, Hi;
718   expandShiftParts(Op.getNode(), Lo, Hi, DAG);
719   return DAG.getMergeValues({Lo, Hi}, SDLoc(Op));
720 }
721 
722 SDValue R600TargetLowering::LowerUADDSUBO(SDValue Op, SelectionDAG &DAG,
723                                           unsigned mainop, unsigned ovf) const {
724   SDLoc DL(Op);
725   EVT VT = Op.getValueType();
726 
727   SDValue Lo = Op.getOperand(0);
728   SDValue Hi = Op.getOperand(1);
729 
730   SDValue OVF = DAG.getNode(ovf, DL, VT, Lo, Hi);
731   // Extend sign.
732   OVF = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, OVF,
733                     DAG.getValueType(MVT::i1));
734 
735   SDValue Res = DAG.getNode(mainop, DL, VT, Lo, Hi);
736 
737   return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT, VT), Res, OVF);
738 }
739 
740 SDValue R600TargetLowering::lowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) const {
741   SDLoc DL(Op);
742   return DAG.getNode(
743       ISD::SETCC,
744       DL,
745       MVT::i1,
746       Op, DAG.getConstantFP(1.0f, DL, MVT::f32),
747       DAG.getCondCode(ISD::SETEQ));
748 }
749 
750 SDValue R600TargetLowering::lowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const {
751   SDLoc DL(Op);
752   return DAG.getNode(
753       ISD::SETCC,
754       DL,
755       MVT::i1,
756       Op, DAG.getConstantFP(-1.0f, DL, MVT::f32),
757       DAG.getCondCode(ISD::SETEQ));
758 }
759 
760 SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
761                                                    const SDLoc &DL,
762                                                    unsigned DwordOffset) const {
763   unsigned ByteOffset = DwordOffset * 4;
764   PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
765                                       AMDGPUAS::PARAM_I_ADDRESS);
766 
767   // We shouldn't be using an offset wider than 16-bits for implicit parameters.
768   assert(isInt<16>(ByteOffset));
769 
770   return DAG.getLoad(VT, DL, DAG.getEntryNode(),
771                      DAG.getConstant(ByteOffset, DL, MVT::i32), // PTR
772                      MachinePointerInfo(ConstantPointerNull::get(PtrType)));
773 }
774 
775 bool R600TargetLowering::isZero(SDValue Op) const {
776   if(ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
777     return Cst->isZero();
778   } else if(ConstantFPSDNode *CstFP = dyn_cast<ConstantFPSDNode>(Op)){
779     return CstFP->isZero();
780   } else {
781     return false;
782   }
783 }
784 
785 bool R600TargetLowering::isHWTrueValue(SDValue Op) const {
786   if (ConstantFPSDNode * CFP = dyn_cast<ConstantFPSDNode>(Op)) {
787     return CFP->isExactlyValue(1.0);
788   }
789   return isAllOnesConstant(Op);
790 }
791 
792 bool R600TargetLowering::isHWFalseValue(SDValue Op) const {
793   if (ConstantFPSDNode * CFP = dyn_cast<ConstantFPSDNode>(Op)) {
794     return CFP->getValueAPF().isZero();
795   }
796   return isNullConstant(Op);
797 }
798 
799 SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
800   SDLoc DL(Op);
801   EVT VT = Op.getValueType();
802 
803   SDValue LHS = Op.getOperand(0);
804   SDValue RHS = Op.getOperand(1);
805   SDValue True = Op.getOperand(2);
806   SDValue False = Op.getOperand(3);
807   SDValue CC = Op.getOperand(4);
808   SDValue Temp;
809 
810   if (VT == MVT::f32) {
811     DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
812     SDValue MinMax = combineFMinMaxLegacy(DL, VT, LHS, RHS, True, False, CC, DCI);
813     if (MinMax)
814       return MinMax;
815   }
816 
817   // LHS and RHS are guaranteed to be the same value type
818   EVT CompareVT = LHS.getValueType();
819 
820   // Check if we can lower this to a native operation.
821 
822   // Try to lower to a SET* instruction:
823   //
824   // SET* can match the following patterns:
825   //
826   // select_cc f32, f32, -1,  0, cc_supported
827   // select_cc f32, f32, 1.0f, 0.0f, cc_supported
828   // select_cc i32, i32, -1,  0, cc_supported
829   //
830 
831   // Move hardware True/False values to the correct operand.
832   if (isHWTrueValue(False) && isHWFalseValue(True)) {
833     ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
834     ISD::CondCode InverseCC = ISD::getSetCCInverse(CCOpcode, CompareVT);
835     if (isCondCodeLegal(InverseCC, CompareVT.getSimpleVT())) {
836       std::swap(False, True);
837       CC = DAG.getCondCode(InverseCC);
838     } else {
839       ISD::CondCode SwapInvCC = ISD::getSetCCSwappedOperands(InverseCC);
840       if (isCondCodeLegal(SwapInvCC, CompareVT.getSimpleVT())) {
841         std::swap(False, True);
842         std::swap(LHS, RHS);
843         CC = DAG.getCondCode(SwapInvCC);
844       }
845     }
846   }
847 
848   if (isHWTrueValue(True) && isHWFalseValue(False) &&
849       (CompareVT == VT || VT == MVT::i32)) {
850     // This can be matched by a SET* instruction.
851     return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC);
852   }
853 
854   // Try to lower to a CND* instruction:
855   //
856   // CND* can match the following patterns:
857   //
858   // select_cc f32, 0.0, f32, f32, cc_supported
859   // select_cc f32, 0.0, i32, i32, cc_supported
860   // select_cc i32, 0,   f32, f32, cc_supported
861   // select_cc i32, 0,   i32, i32, cc_supported
862   //
863 
864   // Try to move the zero value to the RHS
865   if (isZero(LHS)) {
866     ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
867     // Try swapping the operands
868     ISD::CondCode CCSwapped = ISD::getSetCCSwappedOperands(CCOpcode);
869     if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
870       std::swap(LHS, RHS);
871       CC = DAG.getCondCode(CCSwapped);
872     } else {
873       // Try inverting the condition and then swapping the operands
874       ISD::CondCode CCInv = ISD::getSetCCInverse(CCOpcode, CompareVT);
875       CCSwapped = ISD::getSetCCSwappedOperands(CCInv);
876       if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
877         std::swap(True, False);
878         std::swap(LHS, RHS);
879         CC = DAG.getCondCode(CCSwapped);
880       }
881     }
882   }
883   if (isZero(RHS)) {
884     SDValue Cond = LHS;
885     SDValue Zero = RHS;
886     ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
887     if (CompareVT != VT) {
888       // Bitcast True / False to the correct types.  This will end up being
889       // a nop, but it allows us to define only a single pattern in the
890       // .TD files for each CND* instruction rather than having to have
891       // one pattern for integer True/False and one for fp True/False
892       True = DAG.getNode(ISD::BITCAST, DL, CompareVT, True);
893       False = DAG.getNode(ISD::BITCAST, DL, CompareVT, False);
894     }
895 
896     switch (CCOpcode) {
897     case ISD::SETONE:
898     case ISD::SETUNE:
899     case ISD::SETNE:
900       CCOpcode = ISD::getSetCCInverse(CCOpcode, CompareVT);
901       Temp = True;
902       True = False;
903       False = Temp;
904       break;
905     default:
906       break;
907     }
908     SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
909         Cond, Zero,
910         True, False,
911         DAG.getCondCode(CCOpcode));
912     return DAG.getNode(ISD::BITCAST, DL, VT, SelectNode);
913   }
914 
915   // If we make it this for it means we have no native instructions to handle
916   // this SELECT_CC, so we must lower it.
917   SDValue HWTrue, HWFalse;
918 
919   if (CompareVT == MVT::f32) {
920     HWTrue = DAG.getConstantFP(1.0f, DL, CompareVT);
921     HWFalse = DAG.getConstantFP(0.0f, DL, CompareVT);
922   } else if (CompareVT == MVT::i32) {
923     HWTrue = DAG.getConstant(-1, DL, CompareVT);
924     HWFalse = DAG.getConstant(0, DL, CompareVT);
925   }
926   else {
927     llvm_unreachable("Unhandled value type in LowerSELECT_CC");
928   }
929 
930   // Lower this unsupported SELECT_CC into a combination of two supported
931   // SELECT_CC operations.
932   SDValue Cond = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, LHS, RHS, HWTrue, HWFalse, CC);
933 
934   return DAG.getNode(ISD::SELECT_CC, DL, VT,
935       Cond, HWFalse,
936       True, False,
937       DAG.getCondCode(ISD::SETNE));
938 }
939 
940 /// LLVM generates byte-addressed pointers.  For indirect addressing, we need to
941 /// convert these pointers to a register index.  Each register holds
942 /// 16 bytes, (4 x 32bit sub-register), but we need to take into account the
943 /// \p StackWidth, which tells us how many of the 4 sub-registers will be used
944 /// for indirect addressing.
945 SDValue R600TargetLowering::stackPtrToRegIndex(SDValue Ptr,
946                                                unsigned StackWidth,
947                                                SelectionDAG &DAG) const {
948   unsigned SRLPad;
949   switch(StackWidth) {
950   case 1:
951     SRLPad = 2;
952     break;
953   case 2:
954     SRLPad = 3;
955     break;
956   case 4:
957     SRLPad = 4;
958     break;
959   default: llvm_unreachable("Invalid stack width");
960   }
961 
962   SDLoc DL(Ptr);
963   return DAG.getNode(ISD::SRL, DL, Ptr.getValueType(), Ptr,
964                      DAG.getConstant(SRLPad, DL, MVT::i32));
965 }
966 
967 void R600TargetLowering::getStackAddress(unsigned StackWidth,
968                                          unsigned ElemIdx,
969                                          unsigned &Channel,
970                                          unsigned &PtrIncr) const {
971   switch (StackWidth) {
972   default:
973   case 1:
974     Channel = 0;
975     if (ElemIdx > 0) {
976       PtrIncr = 1;
977     } else {
978       PtrIncr = 0;
979     }
980     break;
981   case 2:
982     Channel = ElemIdx % 2;
983     if (ElemIdx == 2) {
984       PtrIncr = 1;
985     } else {
986       PtrIncr = 0;
987     }
988     break;
989   case 4:
990     Channel = ElemIdx;
991     PtrIncr = 0;
992     break;
993   }
994 }
995 
996 SDValue R600TargetLowering::lowerPrivateTruncStore(StoreSDNode *Store,
997                                                    SelectionDAG &DAG) const {
998   SDLoc DL(Store);
999   //TODO: Who creates the i8 stores?
1000   assert(Store->isTruncatingStore()
1001          || Store->getValue().getValueType() == MVT::i8);
1002   assert(Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS);
1003 
1004   SDValue Mask;
1005   if (Store->getMemoryVT() == MVT::i8) {
1006     assert(Store->getAlignment() >= 1);
1007     Mask = DAG.getConstant(0xff, DL, MVT::i32);
1008   } else if (Store->getMemoryVT() == MVT::i16) {
1009     assert(Store->getAlignment() >= 2);
1010     Mask = DAG.getConstant(0xffff, DL, MVT::i32);
1011   } else {
1012     llvm_unreachable("Unsupported private trunc store");
1013   }
1014 
1015   SDValue OldChain = Store->getChain();
1016   bool VectorTrunc = (OldChain.getOpcode() == AMDGPUISD::DUMMY_CHAIN);
1017   // Skip dummy
1018   SDValue Chain = VectorTrunc ? OldChain->getOperand(0) : OldChain;
1019   SDValue BasePtr = Store->getBasePtr();
1020   SDValue Offset = Store->getOffset();
1021   EVT MemVT = Store->getMemoryVT();
1022 
1023   SDValue LoadPtr = BasePtr;
1024   if (!Offset.isUndef()) {
1025     LoadPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr, Offset);
1026   }
1027 
1028   // Get dword location
1029   // TODO: this should be eliminated by the future SHR ptr, 2
1030   SDValue Ptr = DAG.getNode(ISD::AND, DL, MVT::i32, LoadPtr,
1031                             DAG.getConstant(0xfffffffc, DL, MVT::i32));
1032 
1033   // Load dword
1034   // TODO: can we be smarter about machine pointer info?
1035   MachinePointerInfo PtrInfo(AMDGPUAS::PRIVATE_ADDRESS);
1036   SDValue Dst = DAG.getLoad(MVT::i32, DL, Chain, Ptr, PtrInfo);
1037 
1038   Chain = Dst.getValue(1);
1039 
1040   // Get offset in dword
1041   SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32, LoadPtr,
1042                                 DAG.getConstant(0x3, DL, MVT::i32));
1043 
1044   // Convert byte offset to bit shift
1045   SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx,
1046                                  DAG.getConstant(3, DL, MVT::i32));
1047 
1048   // TODO: Contrary to the name of the function,
1049   // it also handles sub i32 non-truncating stores (like i1)
1050   SDValue SExtValue = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32,
1051                                   Store->getValue());
1052 
1053   // Mask the value to the right type
1054   SDValue MaskedValue = DAG.getZeroExtendInReg(SExtValue, DL, MemVT);
1055 
1056   // Shift the value in place
1057   SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, MVT::i32,
1058                                      MaskedValue, ShiftAmt);
1059 
1060   // Shift the mask in place
1061   SDValue DstMask = DAG.getNode(ISD::SHL, DL, MVT::i32, Mask, ShiftAmt);
1062 
1063   // Invert the mask. NOTE: if we had native ROL instructions we could
1064   // use inverted mask
1065   DstMask = DAG.getNOT(DL, DstMask, MVT::i32);
1066 
1067   // Cleanup the target bits
1068   Dst = DAG.getNode(ISD::AND, DL, MVT::i32, Dst, DstMask);
1069 
1070   // Add the new bits
1071   SDValue Value = DAG.getNode(ISD::OR, DL, MVT::i32, Dst, ShiftedValue);
1072 
1073   // Store dword
1074   // TODO: Can we be smarter about MachinePointerInfo?
1075   SDValue NewStore = DAG.getStore(Chain, DL, Value, Ptr, PtrInfo);
1076 
1077   // If we are part of expanded vector, make our neighbors depend on this store
1078   if (VectorTrunc) {
1079     // Make all other vector elements depend on this store
1080     Chain = DAG.getNode(AMDGPUISD::DUMMY_CHAIN, DL, MVT::Other, NewStore);
1081     DAG.ReplaceAllUsesOfValueWith(OldChain, Chain);
1082   }
1083   return NewStore;
1084 }
1085 
1086 SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
1087   StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
1088   unsigned AS = StoreNode->getAddressSpace();
1089 
1090   SDValue Chain = StoreNode->getChain();
1091   SDValue Ptr = StoreNode->getBasePtr();
1092   SDValue Value = StoreNode->getValue();
1093 
1094   EVT VT = Value.getValueType();
1095   EVT MemVT = StoreNode->getMemoryVT();
1096   EVT PtrVT = Ptr.getValueType();
1097 
1098   SDLoc DL(Op);
1099 
1100   const bool TruncatingStore = StoreNode->isTruncatingStore();
1101 
1102   // Neither LOCAL nor PRIVATE can do vectors at the moment
1103   if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS ||
1104        TruncatingStore) &&
1105       VT.isVector()) {
1106     if ((AS == AMDGPUAS::PRIVATE_ADDRESS) && TruncatingStore) {
1107       // Add an extra level of chain to isolate this vector
1108       SDValue NewChain = DAG.getNode(AMDGPUISD::DUMMY_CHAIN, DL, MVT::Other, Chain);
1109       // TODO: can the chain be replaced without creating a new store?
1110       SDValue NewStore = DAG.getTruncStore(
1111           NewChain, DL, Value, Ptr, StoreNode->getPointerInfo(), MemVT,
1112           StoreNode->getAlign(), StoreNode->getMemOperand()->getFlags(),
1113           StoreNode->getAAInfo());
1114       StoreNode = cast<StoreSDNode>(NewStore);
1115     }
1116 
1117     return scalarizeVectorStore(StoreNode, DAG);
1118   }
1119 
1120   Align Alignment = StoreNode->getAlign();
1121   if (Alignment < MemVT.getStoreSize() &&
1122       !allowsMisalignedMemoryAccesses(MemVT, AS, Alignment,
1123                                       StoreNode->getMemOperand()->getFlags(),
1124                                       nullptr)) {
1125     return expandUnalignedStore(StoreNode, DAG);
1126   }
1127 
1128   SDValue DWordAddr = DAG.getNode(ISD::SRL, DL, PtrVT, Ptr,
1129                                   DAG.getConstant(2, DL, PtrVT));
1130 
1131   if (AS == AMDGPUAS::GLOBAL_ADDRESS) {
1132     // It is beneficial to create MSKOR here instead of combiner to avoid
1133     // artificial dependencies introduced by RMW
1134     if (TruncatingStore) {
1135       assert(VT.bitsLE(MVT::i32));
1136       SDValue MaskConstant;
1137       if (MemVT == MVT::i8) {
1138         MaskConstant = DAG.getConstant(0xFF, DL, MVT::i32);
1139       } else {
1140         assert(MemVT == MVT::i16);
1141         assert(StoreNode->getAlignment() >= 2);
1142         MaskConstant = DAG.getConstant(0xFFFF, DL, MVT::i32);
1143       }
1144 
1145       SDValue ByteIndex = DAG.getNode(ISD::AND, DL, PtrVT, Ptr,
1146                                       DAG.getConstant(0x00000003, DL, PtrVT));
1147       SDValue BitShift = DAG.getNode(ISD::SHL, DL, VT, ByteIndex,
1148                                      DAG.getConstant(3, DL, VT));
1149 
1150       // Put the mask in correct place
1151       SDValue Mask = DAG.getNode(ISD::SHL, DL, VT, MaskConstant, BitShift);
1152 
1153       // Put the value bits in correct place
1154       SDValue TruncValue = DAG.getNode(ISD::AND, DL, VT, Value, MaskConstant);
1155       SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, VT, TruncValue, BitShift);
1156 
1157       // XXX: If we add a 64-bit ZW register class, then we could use a 2 x i32
1158       // vector instead.
1159       SDValue Src[4] = {
1160         ShiftedValue,
1161         DAG.getConstant(0, DL, MVT::i32),
1162         DAG.getConstant(0, DL, MVT::i32),
1163         Mask
1164       };
1165       SDValue Input = DAG.getBuildVector(MVT::v4i32, DL, Src);
1166       SDValue Args[3] = { Chain, Input, DWordAddr };
1167       return DAG.getMemIntrinsicNode(AMDGPUISD::STORE_MSKOR, DL,
1168                                      Op->getVTList(), Args, MemVT,
1169                                      StoreNode->getMemOperand());
1170     } else if (Ptr->getOpcode() != AMDGPUISD::DWORDADDR && VT.bitsGE(MVT::i32)) {
1171       // Convert pointer from byte address to dword address.
1172       Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, PtrVT, DWordAddr);
1173 
1174       if (StoreNode->isIndexed()) {
1175         llvm_unreachable("Indexed stores not supported yet");
1176       } else {
1177         Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand());
1178       }
1179       return Chain;
1180     }
1181   }
1182 
1183   // GLOBAL_ADDRESS has been handled above, LOCAL_ADDRESS allows all sizes
1184   if (AS != AMDGPUAS::PRIVATE_ADDRESS)
1185     return SDValue();
1186 
1187   if (MemVT.bitsLT(MVT::i32))
1188     return lowerPrivateTruncStore(StoreNode, DAG);
1189 
1190   // Standard i32+ store, tag it with DWORDADDR to note that the address
1191   // has been shifted
1192   if (Ptr.getOpcode() != AMDGPUISD::DWORDADDR) {
1193     Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, PtrVT, DWordAddr);
1194     return DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand());
1195   }
1196 
1197   // Tagged i32+ stores will be matched by patterns
1198   return SDValue();
1199 }
1200 
1201 // return (512 + (kc_bank << 12)
1202 static int
1203 ConstantAddressBlock(unsigned AddressSpace) {
1204   switch (AddressSpace) {
1205   case AMDGPUAS::CONSTANT_BUFFER_0:
1206     return 512;
1207   case AMDGPUAS::CONSTANT_BUFFER_1:
1208     return 512 + 4096;
1209   case AMDGPUAS::CONSTANT_BUFFER_2:
1210     return 512 + 4096 * 2;
1211   case AMDGPUAS::CONSTANT_BUFFER_3:
1212     return 512 + 4096 * 3;
1213   case AMDGPUAS::CONSTANT_BUFFER_4:
1214     return 512 + 4096 * 4;
1215   case AMDGPUAS::CONSTANT_BUFFER_5:
1216     return 512 + 4096 * 5;
1217   case AMDGPUAS::CONSTANT_BUFFER_6:
1218     return 512 + 4096 * 6;
1219   case AMDGPUAS::CONSTANT_BUFFER_7:
1220     return 512 + 4096 * 7;
1221   case AMDGPUAS::CONSTANT_BUFFER_8:
1222     return 512 + 4096 * 8;
1223   case AMDGPUAS::CONSTANT_BUFFER_9:
1224     return 512 + 4096 * 9;
1225   case AMDGPUAS::CONSTANT_BUFFER_10:
1226     return 512 + 4096 * 10;
1227   case AMDGPUAS::CONSTANT_BUFFER_11:
1228     return 512 + 4096 * 11;
1229   case AMDGPUAS::CONSTANT_BUFFER_12:
1230     return 512 + 4096 * 12;
1231   case AMDGPUAS::CONSTANT_BUFFER_13:
1232     return 512 + 4096 * 13;
1233   case AMDGPUAS::CONSTANT_BUFFER_14:
1234     return 512 + 4096 * 14;
1235   case AMDGPUAS::CONSTANT_BUFFER_15:
1236     return 512 + 4096 * 15;
1237   default:
1238     return -1;
1239   }
1240 }
1241 
1242 SDValue R600TargetLowering::lowerPrivateExtLoad(SDValue Op,
1243                                                 SelectionDAG &DAG) const {
1244   SDLoc DL(Op);
1245   LoadSDNode *Load = cast<LoadSDNode>(Op);
1246   ISD::LoadExtType ExtType = Load->getExtensionType();
1247   EVT MemVT = Load->getMemoryVT();
1248   assert(Load->getAlignment() >= MemVT.getStoreSize());
1249 
1250   SDValue BasePtr = Load->getBasePtr();
1251   SDValue Chain = Load->getChain();
1252   SDValue Offset = Load->getOffset();
1253 
1254   SDValue LoadPtr = BasePtr;
1255   if (!Offset.isUndef()) {
1256     LoadPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr, Offset);
1257   }
1258 
1259   // Get dword location
1260   // NOTE: this should be eliminated by the future SHR ptr, 2
1261   SDValue Ptr = DAG.getNode(ISD::AND, DL, MVT::i32, LoadPtr,
1262                             DAG.getConstant(0xfffffffc, DL, MVT::i32));
1263 
1264   // Load dword
1265   // TODO: can we be smarter about machine pointer info?
1266   MachinePointerInfo PtrInfo(AMDGPUAS::PRIVATE_ADDRESS);
1267   SDValue Read = DAG.getLoad(MVT::i32, DL, Chain, Ptr, PtrInfo);
1268 
1269   // Get offset within the register.
1270   SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32,
1271                                 LoadPtr, DAG.getConstant(0x3, DL, MVT::i32));
1272 
1273   // Bit offset of target byte (byteIdx * 8).
1274   SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx,
1275                                  DAG.getConstant(3, DL, MVT::i32));
1276 
1277   // Shift to the right.
1278   SDValue Ret = DAG.getNode(ISD::SRL, DL, MVT::i32, Read, ShiftAmt);
1279 
1280   // Eliminate the upper bits by setting them to ...
1281   EVT MemEltVT = MemVT.getScalarType();
1282 
1283   if (ExtType == ISD::SEXTLOAD) { // ... ones.
1284     SDValue MemEltVTNode = DAG.getValueType(MemEltVT);
1285     Ret = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, Ret, MemEltVTNode);
1286   } else { // ... or zeros.
1287     Ret = DAG.getZeroExtendInReg(Ret, DL, MemEltVT);
1288   }
1289 
1290   SDValue Ops[] = {
1291     Ret,
1292     Read.getValue(1) // This should be our output chain
1293   };
1294 
1295   return DAG.getMergeValues(Ops, DL);
1296 }
1297 
1298 SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
1299   LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
1300   unsigned AS = LoadNode->getAddressSpace();
1301   EVT MemVT = LoadNode->getMemoryVT();
1302   ISD::LoadExtType ExtType = LoadNode->getExtensionType();
1303 
1304   if (AS == AMDGPUAS::PRIVATE_ADDRESS &&
1305       ExtType != ISD::NON_EXTLOAD && MemVT.bitsLT(MVT::i32)) {
1306     return lowerPrivateExtLoad(Op, DAG);
1307   }
1308 
1309   SDLoc DL(Op);
1310   EVT VT = Op.getValueType();
1311   SDValue Chain = LoadNode->getChain();
1312   SDValue Ptr = LoadNode->getBasePtr();
1313 
1314   if ((LoadNode->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
1315       LoadNode->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) &&
1316       VT.isVector()) {
1317     SDValue Ops[2];
1318     std::tie(Ops[0], Ops[1]) = scalarizeVectorLoad(LoadNode, DAG);
1319     return DAG.getMergeValues(Ops, DL);
1320   }
1321 
1322   // This is still used for explicit load from addrspace(8)
1323   int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace());
1324   if (ConstantBlock > -1 &&
1325       ((LoadNode->getExtensionType() == ISD::NON_EXTLOAD) ||
1326        (LoadNode->getExtensionType() == ISD::ZEXTLOAD))) {
1327     SDValue Result;
1328     if (isa<Constant>(LoadNode->getMemOperand()->getValue()) ||
1329         isa<ConstantSDNode>(Ptr)) {
1330       return constBufferLoad(LoadNode, LoadNode->getAddressSpace(), DAG);
1331     } else {
1332       //TODO: Does this even work?
1333       // non-constant ptr can't be folded, keeps it as a v4f32 load
1334       Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32,
1335           DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr,
1336                       DAG.getConstant(4, DL, MVT::i32)),
1337                       DAG.getConstant(LoadNode->getAddressSpace() -
1338                                       AMDGPUAS::CONSTANT_BUFFER_0, DL, MVT::i32)
1339           );
1340     }
1341 
1342     if (!VT.isVector()) {
1343       Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result,
1344                            DAG.getConstant(0, DL, MVT::i32));
1345     }
1346 
1347     SDValue MergedValues[2] = {
1348       Result,
1349       Chain
1350     };
1351     return DAG.getMergeValues(MergedValues, DL);
1352   }
1353 
1354   // For most operations returning SDValue() will result in the node being
1355   // expanded by the DAG Legalizer. This is not the case for ISD::LOAD, so we
1356   // need to manually expand loads that may be legal in some address spaces and
1357   // illegal in others. SEXT loads from CONSTANT_BUFFER_0 are supported for
1358   // compute shaders, since the data is sign extended when it is uploaded to the
1359   // buffer. However SEXT loads from other address spaces are not supported, so
1360   // we need to expand them here.
1361   if (LoadNode->getExtensionType() == ISD::SEXTLOAD) {
1362     assert(!MemVT.isVector() && (MemVT == MVT::i16 || MemVT == MVT::i8));
1363     SDValue NewLoad = DAG.getExtLoad(
1364         ISD::EXTLOAD, DL, VT, Chain, Ptr, LoadNode->getPointerInfo(), MemVT,
1365         LoadNode->getAlign(), LoadNode->getMemOperand()->getFlags());
1366     SDValue Res = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, NewLoad,
1367                               DAG.getValueType(MemVT));
1368 
1369     SDValue MergedValues[2] = { Res, Chain };
1370     return DAG.getMergeValues(MergedValues, DL);
1371   }
1372 
1373   if (LoadNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
1374     return SDValue();
1375   }
1376 
1377   // DWORDADDR ISD marks already shifted address
1378   if (Ptr.getOpcode() != AMDGPUISD::DWORDADDR) {
1379     assert(VT == MVT::i32);
1380     Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, DAG.getConstant(2, DL, MVT::i32));
1381     Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, MVT::i32, Ptr);
1382     return DAG.getLoad(MVT::i32, DL, Chain, Ptr, LoadNode->getMemOperand());
1383   }
1384   return SDValue();
1385 }
1386 
1387 SDValue R600TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
1388   SDValue Chain = Op.getOperand(0);
1389   SDValue Cond  = Op.getOperand(1);
1390   SDValue Jump  = Op.getOperand(2);
1391 
1392   return DAG.getNode(AMDGPUISD::BRANCH_COND, SDLoc(Op), Op.getValueType(),
1393                      Chain, Jump, Cond);
1394 }
1395 
1396 SDValue R600TargetLowering::lowerFrameIndex(SDValue Op,
1397                                             SelectionDAG &DAG) const {
1398   MachineFunction &MF = DAG.getMachineFunction();
1399   const R600FrameLowering *TFL = Subtarget->getFrameLowering();
1400 
1401   FrameIndexSDNode *FIN = cast<FrameIndexSDNode>(Op);
1402 
1403   unsigned FrameIndex = FIN->getIndex();
1404   Register IgnoredFrameReg;
1405   StackOffset Offset =
1406       TFL->getFrameIndexReference(MF, FrameIndex, IgnoredFrameReg);
1407   return DAG.getConstant(Offset.getFixed() * 4 * TFL->getStackWidth(MF),
1408                          SDLoc(Op), Op.getValueType());
1409 }
1410 
1411 CCAssignFn *R600TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
1412                                                   bool IsVarArg) const {
1413   switch (CC) {
1414   case CallingConv::AMDGPU_KERNEL:
1415   case CallingConv::SPIR_KERNEL:
1416   case CallingConv::C:
1417   case CallingConv::Fast:
1418   case CallingConv::Cold:
1419     llvm_unreachable("kernels should not be handled here");
1420   case CallingConv::AMDGPU_VS:
1421   case CallingConv::AMDGPU_GS:
1422   case CallingConv::AMDGPU_PS:
1423   case CallingConv::AMDGPU_CS:
1424   case CallingConv::AMDGPU_HS:
1425   case CallingConv::AMDGPU_ES:
1426   case CallingConv::AMDGPU_LS:
1427     return CC_R600;
1428   default:
1429     report_fatal_error("Unsupported calling convention.");
1430   }
1431 }
1432 
1433 /// XXX Only kernel functions are supported, so we can assume for now that
1434 /// every function is a kernel function, but in the future we should use
1435 /// separate calling conventions for kernel and non-kernel functions.
1436 SDValue R600TargetLowering::LowerFormalArguments(
1437     SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
1438     const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
1439     SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
1440   SmallVector<CCValAssign, 16> ArgLocs;
1441   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
1442                  *DAG.getContext());
1443   MachineFunction &MF = DAG.getMachineFunction();
1444   SmallVector<ISD::InputArg, 8> LocalIns;
1445 
1446   if (AMDGPU::isShader(CallConv)) {
1447     CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForCall(CallConv, isVarArg));
1448   } else {
1449     analyzeFormalArgumentsCompute(CCInfo, Ins);
1450   }
1451 
1452   for (unsigned i = 0, e = Ins.size(); i < e; ++i) {
1453     CCValAssign &VA = ArgLocs[i];
1454     const ISD::InputArg &In = Ins[i];
1455     EVT VT = In.VT;
1456     EVT MemVT = VA.getLocVT();
1457     if (!VT.isVector() && MemVT.isVector()) {
1458       // Get load source type if scalarized.
1459       MemVT = MemVT.getVectorElementType();
1460     }
1461 
1462     if (AMDGPU::isShader(CallConv)) {
1463       Register Reg = MF.addLiveIn(VA.getLocReg(), &R600::R600_Reg128RegClass);
1464       SDValue Register = DAG.getCopyFromReg(Chain, DL, Reg, VT);
1465       InVals.push_back(Register);
1466       continue;
1467     }
1468 
1469     // i64 isn't a legal type, so the register type used ends up as i32, which
1470     // isn't expected here. It attempts to create this sextload, but it ends up
1471     // being invalid. Somehow this seems to work with i64 arguments, but breaks
1472     // for <1 x i64>.
1473 
1474     // The first 36 bytes of the input buffer contains information about
1475     // thread group and global sizes.
1476     ISD::LoadExtType Ext = ISD::NON_EXTLOAD;
1477     if (MemVT.getScalarSizeInBits() != VT.getScalarSizeInBits()) {
1478       // FIXME: This should really check the extload type, but the handling of
1479       // extload vector parameters seems to be broken.
1480 
1481       // Ext = In.Flags.isSExt() ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
1482       Ext = ISD::SEXTLOAD;
1483     }
1484 
1485     // Compute the offset from the value.
1486     // XXX - I think PartOffset should give you this, but it seems to give the
1487     // size of the register which isn't useful.
1488 
1489     unsigned PartOffset = VA.getLocMemOffset();
1490     unsigned Alignment = MinAlign(VT.getStoreSize(), PartOffset);
1491 
1492     MachinePointerInfo PtrInfo(AMDGPUAS::PARAM_I_ADDRESS);
1493     SDValue Arg = DAG.getLoad(
1494         ISD::UNINDEXED, Ext, VT, DL, Chain,
1495         DAG.getConstant(PartOffset, DL, MVT::i32), DAG.getUNDEF(MVT::i32),
1496         PtrInfo,
1497         MemVT, Alignment, MachineMemOperand::MONonTemporal |
1498                                         MachineMemOperand::MODereferenceable |
1499                                         MachineMemOperand::MOInvariant);
1500 
1501     InVals.push_back(Arg);
1502   }
1503   return Chain;
1504 }
1505 
1506 EVT R600TargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &,
1507                                            EVT VT) const {
1508    if (!VT.isVector())
1509      return MVT::i32;
1510    return VT.changeVectorElementTypeToInteger();
1511 }
1512 
1513 bool R600TargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT,
1514                                           const MachineFunction &MF) const {
1515   // Local and Private addresses do not handle vectors. Limit to i32
1516   if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS)) {
1517     return (MemVT.getSizeInBits() <= 32);
1518   }
1519   return true;
1520 }
1521 
1522 bool R600TargetLowering::allowsMisalignedMemoryAccesses(
1523     EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
1524     bool *IsFast) const {
1525   if (IsFast)
1526     *IsFast = false;
1527 
1528   if (!VT.isSimple() || VT == MVT::Other)
1529     return false;
1530 
1531   if (VT.bitsLT(MVT::i32))
1532     return false;
1533 
1534   // TODO: This is a rough estimate.
1535   if (IsFast)
1536     *IsFast = true;
1537 
1538   return VT.bitsGT(MVT::i32) && Alignment >= Align(4);
1539 }
1540 
1541 static SDValue CompactSwizzlableVector(
1542   SelectionDAG &DAG, SDValue VectorEntry,
1543   DenseMap<unsigned, unsigned> &RemapSwizzle) {
1544   assert(RemapSwizzle.empty());
1545 
1546   SDLoc DL(VectorEntry);
1547   EVT EltTy = VectorEntry.getValueType().getVectorElementType();
1548 
1549   SDValue NewBldVec[4];
1550   for (unsigned i = 0; i < 4; i++)
1551     NewBldVec[i] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltTy, VectorEntry,
1552                                DAG.getIntPtrConstant(i, DL));
1553 
1554   for (unsigned i = 0; i < 4; i++) {
1555     if (NewBldVec[i].isUndef())
1556       // We mask write here to teach later passes that the ith element of this
1557       // vector is undef. Thus we can use it to reduce 128 bits reg usage,
1558       // break false dependencies and additionally make assembly easier to read.
1559       RemapSwizzle[i] = 7; // SEL_MASK_WRITE
1560     if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(NewBldVec[i])) {
1561       if (C->isZero()) {
1562         RemapSwizzle[i] = 4; // SEL_0
1563         NewBldVec[i] = DAG.getUNDEF(MVT::f32);
1564       } else if (C->isExactlyValue(1.0)) {
1565         RemapSwizzle[i] = 5; // SEL_1
1566         NewBldVec[i] = DAG.getUNDEF(MVT::f32);
1567       }
1568     }
1569 
1570     if (NewBldVec[i].isUndef())
1571       continue;
1572 
1573     for (unsigned j = 0; j < i; j++) {
1574       if (NewBldVec[i] == NewBldVec[j]) {
1575         NewBldVec[i] = DAG.getUNDEF(NewBldVec[i].getValueType());
1576         RemapSwizzle[i] = j;
1577         break;
1578       }
1579     }
1580   }
1581 
1582   return DAG.getBuildVector(VectorEntry.getValueType(), SDLoc(VectorEntry),
1583                             NewBldVec);
1584 }
1585 
1586 static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry,
1587                                 DenseMap<unsigned, unsigned> &RemapSwizzle) {
1588   assert(RemapSwizzle.empty());
1589 
1590   SDLoc DL(VectorEntry);
1591   EVT EltTy = VectorEntry.getValueType().getVectorElementType();
1592 
1593   SDValue NewBldVec[4];
1594   bool isUnmovable[4] = {false, false, false, false};
1595   for (unsigned i = 0; i < 4; i++)
1596     NewBldVec[i] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltTy, VectorEntry,
1597                                DAG.getIntPtrConstant(i, DL));
1598 
1599   for (unsigned i = 0; i < 4; i++) {
1600     RemapSwizzle[i] = i;
1601     if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
1602       unsigned Idx = cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
1603           ->getZExtValue();
1604       if (i == Idx)
1605         isUnmovable[Idx] = true;
1606     }
1607   }
1608 
1609   for (unsigned i = 0; i < 4; i++) {
1610     if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
1611       unsigned Idx = cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
1612           ->getZExtValue();
1613       if (isUnmovable[Idx])
1614         continue;
1615       // Swap i and Idx
1616       std::swap(NewBldVec[Idx], NewBldVec[i]);
1617       std::swap(RemapSwizzle[i], RemapSwizzle[Idx]);
1618       break;
1619     }
1620   }
1621 
1622   return DAG.getBuildVector(VectorEntry.getValueType(), SDLoc(VectorEntry),
1623                             NewBldVec);
1624 }
1625 
1626 SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector, SDValue Swz[],
1627                                             SelectionDAG &DAG,
1628                                             const SDLoc &DL) const {
1629   // Old -> New swizzle values
1630   DenseMap<unsigned, unsigned> SwizzleRemap;
1631 
1632   BuildVector = CompactSwizzlableVector(DAG, BuildVector, SwizzleRemap);
1633   for (unsigned i = 0; i < 4; i++) {
1634     unsigned Idx = cast<ConstantSDNode>(Swz[i])->getZExtValue();
1635     if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
1636       Swz[i] = DAG.getConstant(SwizzleRemap[Idx], DL, MVT::i32);
1637   }
1638 
1639   SwizzleRemap.clear();
1640   BuildVector = ReorganizeVector(DAG, BuildVector, SwizzleRemap);
1641   for (unsigned i = 0; i < 4; i++) {
1642     unsigned Idx = cast<ConstantSDNode>(Swz[i])->getZExtValue();
1643     if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
1644       Swz[i] = DAG.getConstant(SwizzleRemap[Idx], DL, MVT::i32);
1645   }
1646 
1647   return BuildVector;
1648 }
1649 
1650 SDValue R600TargetLowering::constBufferLoad(LoadSDNode *LoadNode, int Block,
1651                                             SelectionDAG &DAG) const {
1652   SDLoc DL(LoadNode);
1653   EVT VT = LoadNode->getValueType(0);
1654   SDValue Chain = LoadNode->getChain();
1655   SDValue Ptr = LoadNode->getBasePtr();
1656   assert (isa<ConstantSDNode>(Ptr));
1657 
1658   //TODO: Support smaller loads
1659   if (LoadNode->getMemoryVT().getScalarType() != MVT::i32 || !ISD::isNON_EXTLoad(LoadNode))
1660     return SDValue();
1661 
1662   if (LoadNode->getAlign() < Align(4))
1663     return SDValue();
1664 
1665   int ConstantBlock = ConstantAddressBlock(Block);
1666 
1667   SDValue Slots[4];
1668   for (unsigned i = 0; i < 4; i++) {
1669     // We want Const position encoded with the following formula :
1670     // (((512 + (kc_bank << 12) + const_index) << 2) + chan)
1671     // const_index is Ptr computed by llvm using an alignment of 16.
1672     // Thus we add (((512 + (kc_bank << 12)) + chan ) * 4 here and
1673     // then div by 4 at the ISel step
1674     SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
1675         DAG.getConstant(4 * i + ConstantBlock * 16, DL, MVT::i32));
1676     Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr);
1677   }
1678   EVT NewVT = MVT::v4i32;
1679   unsigned NumElements = 4;
1680   if (VT.isVector()) {
1681     NewVT = VT;
1682     NumElements = VT.getVectorNumElements();
1683   }
1684   SDValue Result = DAG.getBuildVector(NewVT, DL, makeArrayRef(Slots, NumElements));
1685   if (!VT.isVector()) {
1686     Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result,
1687                          DAG.getConstant(0, DL, MVT::i32));
1688   }
1689   SDValue MergedValues[2] = {
1690     Result,
1691     Chain
1692   };
1693   return DAG.getMergeValues(MergedValues, DL);
1694 }
1695 
1696 //===----------------------------------------------------------------------===//
1697 // Custom DAG Optimizations
1698 //===----------------------------------------------------------------------===//
1699 
1700 SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
1701                                               DAGCombinerInfo &DCI) const {
1702   SelectionDAG &DAG = DCI.DAG;
1703   SDLoc DL(N);
1704 
1705   switch (N->getOpcode()) {
1706   // (f32 fp_round (f64 uint_to_fp a)) -> (f32 uint_to_fp a)
1707   case ISD::FP_ROUND: {
1708       SDValue Arg = N->getOperand(0);
1709       if (Arg.getOpcode() == ISD::UINT_TO_FP && Arg.getValueType() == MVT::f64) {
1710         return DAG.getNode(ISD::UINT_TO_FP, DL, N->getValueType(0),
1711                            Arg.getOperand(0));
1712       }
1713       break;
1714     }
1715 
1716   // (i32 fp_to_sint (fneg (select_cc f32, f32, 1.0, 0.0 cc))) ->
1717   // (i32 select_cc f32, f32, -1, 0 cc)
1718   //
1719   // Mesa's GLSL frontend generates the above pattern a lot and we can lower
1720   // this to one of the SET*_DX10 instructions.
1721   case ISD::FP_TO_SINT: {
1722     SDValue FNeg = N->getOperand(0);
1723     if (FNeg.getOpcode() != ISD::FNEG) {
1724       return SDValue();
1725     }
1726     SDValue SelectCC = FNeg.getOperand(0);
1727     if (SelectCC.getOpcode() != ISD::SELECT_CC ||
1728         SelectCC.getOperand(0).getValueType() != MVT::f32 || // LHS
1729         SelectCC.getOperand(2).getValueType() != MVT::f32 || // True
1730         !isHWTrueValue(SelectCC.getOperand(2)) ||
1731         !isHWFalseValue(SelectCC.getOperand(3))) {
1732       return SDValue();
1733     }
1734 
1735     return DAG.getNode(ISD::SELECT_CC, DL, N->getValueType(0),
1736                            SelectCC.getOperand(0), // LHS
1737                            SelectCC.getOperand(1), // RHS
1738                            DAG.getConstant(-1, DL, MVT::i32), // True
1739                            DAG.getConstant(0, DL, MVT::i32),  // False
1740                            SelectCC.getOperand(4)); // CC
1741   }
1742 
1743   // insert_vector_elt (build_vector elt0, ... , eltN), NewEltIdx, idx
1744   // => build_vector elt0, ... , NewEltIdx, ... , eltN
1745   case ISD::INSERT_VECTOR_ELT: {
1746     SDValue InVec = N->getOperand(0);
1747     SDValue InVal = N->getOperand(1);
1748     SDValue EltNo = N->getOperand(2);
1749 
1750     // If the inserted element is an UNDEF, just use the input vector.
1751     if (InVal.isUndef())
1752       return InVec;
1753 
1754     EVT VT = InVec.getValueType();
1755 
1756     // If we can't generate a legal BUILD_VECTOR, exit
1757     if (!isOperationLegal(ISD::BUILD_VECTOR, VT))
1758       return SDValue();
1759 
1760     // Check that we know which element is being inserted
1761     if (!isa<ConstantSDNode>(EltNo))
1762       return SDValue();
1763     unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
1764 
1765     // Check that the operand is a BUILD_VECTOR (or UNDEF, which can essentially
1766     // be converted to a BUILD_VECTOR).  Fill in the Ops vector with the
1767     // vector elements.
1768     SmallVector<SDValue, 8> Ops;
1769     if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
1770       Ops.append(InVec.getNode()->op_begin(),
1771                  InVec.getNode()->op_end());
1772     } else if (InVec.isUndef()) {
1773       unsigned NElts = VT.getVectorNumElements();
1774       Ops.append(NElts, DAG.getUNDEF(InVal.getValueType()));
1775     } else {
1776       return SDValue();
1777     }
1778 
1779     // Insert the element
1780     if (Elt < Ops.size()) {
1781       // All the operands of BUILD_VECTOR must have the same type;
1782       // we enforce that here.
1783       EVT OpVT = Ops[0].getValueType();
1784       if (InVal.getValueType() != OpVT)
1785         InVal = OpVT.bitsGT(InVal.getValueType()) ?
1786           DAG.getNode(ISD::ANY_EXTEND, DL, OpVT, InVal) :
1787           DAG.getNode(ISD::TRUNCATE, DL, OpVT, InVal);
1788       Ops[Elt] = InVal;
1789     }
1790 
1791     // Return the new vector
1792     return DAG.getBuildVector(VT, DL, Ops);
1793   }
1794 
1795   // Extract_vec (Build_vector) generated by custom lowering
1796   // also needs to be customly combined
1797   case ISD::EXTRACT_VECTOR_ELT: {
1798     SDValue Arg = N->getOperand(0);
1799     if (Arg.getOpcode() == ISD::BUILD_VECTOR) {
1800       if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
1801         unsigned Element = Const->getZExtValue();
1802         return Arg->getOperand(Element);
1803       }
1804     }
1805     if (Arg.getOpcode() == ISD::BITCAST &&
1806         Arg.getOperand(0).getOpcode() == ISD::BUILD_VECTOR &&
1807         (Arg.getOperand(0).getValueType().getVectorNumElements() ==
1808          Arg.getValueType().getVectorNumElements())) {
1809       if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
1810         unsigned Element = Const->getZExtValue();
1811         return DAG.getNode(ISD::BITCAST, DL, N->getVTList(),
1812                            Arg->getOperand(0).getOperand(Element));
1813       }
1814     }
1815     break;
1816   }
1817 
1818   case ISD::SELECT_CC: {
1819     // Try common optimizations
1820     if (SDValue Ret = AMDGPUTargetLowering::PerformDAGCombine(N, DCI))
1821       return Ret;
1822 
1823     // fold selectcc (selectcc x, y, a, b, cc), b, a, b, seteq ->
1824     //      selectcc x, y, a, b, inv(cc)
1825     //
1826     // fold selectcc (selectcc x, y, a, b, cc), b, a, b, setne ->
1827     //      selectcc x, y, a, b, cc
1828     SDValue LHS = N->getOperand(0);
1829     if (LHS.getOpcode() != ISD::SELECT_CC) {
1830       return SDValue();
1831     }
1832 
1833     SDValue RHS = N->getOperand(1);
1834     SDValue True = N->getOperand(2);
1835     SDValue False = N->getOperand(3);
1836     ISD::CondCode NCC = cast<CondCodeSDNode>(N->getOperand(4))->get();
1837 
1838     if (LHS.getOperand(2).getNode() != True.getNode() ||
1839         LHS.getOperand(3).getNode() != False.getNode() ||
1840         RHS.getNode() != False.getNode()) {
1841       return SDValue();
1842     }
1843 
1844     switch (NCC) {
1845     default: return SDValue();
1846     case ISD::SETNE: return LHS;
1847     case ISD::SETEQ: {
1848       ISD::CondCode LHSCC = cast<CondCodeSDNode>(LHS.getOperand(4))->get();
1849       LHSCC = ISD::getSetCCInverse(LHSCC, LHS.getOperand(0).getValueType());
1850       if (DCI.isBeforeLegalizeOps() ||
1851           isCondCodeLegal(LHSCC, LHS.getOperand(0).getSimpleValueType()))
1852         return DAG.getSelectCC(DL,
1853                                LHS.getOperand(0),
1854                                LHS.getOperand(1),
1855                                LHS.getOperand(2),
1856                                LHS.getOperand(3),
1857                                LHSCC);
1858       break;
1859     }
1860     }
1861     return SDValue();
1862   }
1863 
1864   case AMDGPUISD::R600_EXPORT: {
1865     SDValue Arg = N->getOperand(1);
1866     if (Arg.getOpcode() != ISD::BUILD_VECTOR)
1867       break;
1868 
1869     SDValue NewArgs[8] = {
1870       N->getOperand(0), // Chain
1871       SDValue(),
1872       N->getOperand(2), // ArrayBase
1873       N->getOperand(3), // Type
1874       N->getOperand(4), // SWZ_X
1875       N->getOperand(5), // SWZ_Y
1876       N->getOperand(6), // SWZ_Z
1877       N->getOperand(7) // SWZ_W
1878     };
1879     NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[4], DAG, DL);
1880     return DAG.getNode(AMDGPUISD::R600_EXPORT, DL, N->getVTList(), NewArgs);
1881   }
1882   case AMDGPUISD::TEXTURE_FETCH: {
1883     SDValue Arg = N->getOperand(1);
1884     if (Arg.getOpcode() != ISD::BUILD_VECTOR)
1885       break;
1886 
1887     SDValue NewArgs[19] = {
1888       N->getOperand(0),
1889       N->getOperand(1),
1890       N->getOperand(2),
1891       N->getOperand(3),
1892       N->getOperand(4),
1893       N->getOperand(5),
1894       N->getOperand(6),
1895       N->getOperand(7),
1896       N->getOperand(8),
1897       N->getOperand(9),
1898       N->getOperand(10),
1899       N->getOperand(11),
1900       N->getOperand(12),
1901       N->getOperand(13),
1902       N->getOperand(14),
1903       N->getOperand(15),
1904       N->getOperand(16),
1905       N->getOperand(17),
1906       N->getOperand(18),
1907     };
1908     NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[2], DAG, DL);
1909     return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, N->getVTList(), NewArgs);
1910   }
1911 
1912   case ISD::LOAD: {
1913     LoadSDNode *LoadNode = cast<LoadSDNode>(N);
1914     SDValue Ptr = LoadNode->getBasePtr();
1915     if (LoadNode->getAddressSpace() == AMDGPUAS::PARAM_I_ADDRESS &&
1916          isa<ConstantSDNode>(Ptr))
1917       return constBufferLoad(LoadNode, AMDGPUAS::CONSTANT_BUFFER_0, DAG);
1918     break;
1919   }
1920 
1921   default: break;
1922   }
1923 
1924   return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
1925 }
1926 
1927 bool R600TargetLowering::FoldOperand(SDNode *ParentNode, unsigned SrcIdx,
1928                                      SDValue &Src, SDValue &Neg, SDValue &Abs,
1929                                      SDValue &Sel, SDValue &Imm,
1930                                      SelectionDAG &DAG) const {
1931   const R600InstrInfo *TII = Subtarget->getInstrInfo();
1932   if (!Src.isMachineOpcode())
1933     return false;
1934 
1935   switch (Src.getMachineOpcode()) {
1936   case R600::FNEG_R600:
1937     if (!Neg.getNode())
1938       return false;
1939     Src = Src.getOperand(0);
1940     Neg = DAG.getTargetConstant(1, SDLoc(ParentNode), MVT::i32);
1941     return true;
1942   case R600::FABS_R600:
1943     if (!Abs.getNode())
1944       return false;
1945     Src = Src.getOperand(0);
1946     Abs = DAG.getTargetConstant(1, SDLoc(ParentNode), MVT::i32);
1947     return true;
1948   case R600::CONST_COPY: {
1949     unsigned Opcode = ParentNode->getMachineOpcode();
1950     bool HasDst = TII->getOperandIdx(Opcode, R600::OpName::dst) > -1;
1951 
1952     if (!Sel.getNode())
1953       return false;
1954 
1955     SDValue CstOffset = Src.getOperand(0);
1956     if (ParentNode->getValueType(0).isVector())
1957       return false;
1958 
1959     // Gather constants values
1960     int SrcIndices[] = {
1961       TII->getOperandIdx(Opcode, R600::OpName::src0),
1962       TII->getOperandIdx(Opcode, R600::OpName::src1),
1963       TII->getOperandIdx(Opcode, R600::OpName::src2),
1964       TII->getOperandIdx(Opcode, R600::OpName::src0_X),
1965       TII->getOperandIdx(Opcode, R600::OpName::src0_Y),
1966       TII->getOperandIdx(Opcode, R600::OpName::src0_Z),
1967       TII->getOperandIdx(Opcode, R600::OpName::src0_W),
1968       TII->getOperandIdx(Opcode, R600::OpName::src1_X),
1969       TII->getOperandIdx(Opcode, R600::OpName::src1_Y),
1970       TII->getOperandIdx(Opcode, R600::OpName::src1_Z),
1971       TII->getOperandIdx(Opcode, R600::OpName::src1_W)
1972     };
1973     std::vector<unsigned> Consts;
1974     for (int OtherSrcIdx : SrcIndices) {
1975       int OtherSelIdx = TII->getSelIdx(Opcode, OtherSrcIdx);
1976       if (OtherSrcIdx < 0 || OtherSelIdx < 0)
1977         continue;
1978       if (HasDst) {
1979         OtherSrcIdx--;
1980         OtherSelIdx--;
1981       }
1982       if (RegisterSDNode *Reg =
1983           dyn_cast<RegisterSDNode>(ParentNode->getOperand(OtherSrcIdx))) {
1984         if (Reg->getReg() == R600::ALU_CONST) {
1985           ConstantSDNode *Cst
1986             = cast<ConstantSDNode>(ParentNode->getOperand(OtherSelIdx));
1987           Consts.push_back(Cst->getZExtValue());
1988         }
1989       }
1990     }
1991 
1992     ConstantSDNode *Cst = cast<ConstantSDNode>(CstOffset);
1993     Consts.push_back(Cst->getZExtValue());
1994     if (!TII->fitsConstReadLimitations(Consts)) {
1995       return false;
1996     }
1997 
1998     Sel = CstOffset;
1999     Src = DAG.getRegister(R600::ALU_CONST, MVT::f32);
2000     return true;
2001   }
2002   case R600::MOV_IMM_GLOBAL_ADDR:
2003     // Check if the Imm slot is used. Taken from below.
2004     if (cast<ConstantSDNode>(Imm)->getZExtValue())
2005       return false;
2006     Imm = Src.getOperand(0);
2007     Src = DAG.getRegister(R600::ALU_LITERAL_X, MVT::i32);
2008     return true;
2009   case R600::MOV_IMM_I32:
2010   case R600::MOV_IMM_F32: {
2011     unsigned ImmReg = R600::ALU_LITERAL_X;
2012     uint64_t ImmValue = 0;
2013 
2014     if (Src.getMachineOpcode() == R600::MOV_IMM_F32) {
2015       ConstantFPSDNode *FPC = cast<ConstantFPSDNode>(Src.getOperand(0));
2016       float FloatValue = FPC->getValueAPF().convertToFloat();
2017       if (FloatValue == 0.0) {
2018         ImmReg = R600::ZERO;
2019       } else if (FloatValue == 0.5) {
2020         ImmReg = R600::HALF;
2021       } else if (FloatValue == 1.0) {
2022         ImmReg = R600::ONE;
2023       } else {
2024         ImmValue = FPC->getValueAPF().bitcastToAPInt().getZExtValue();
2025       }
2026     } else {
2027       ConstantSDNode *C = cast<ConstantSDNode>(Src.getOperand(0));
2028       uint64_t Value = C->getZExtValue();
2029       if (Value == 0) {
2030         ImmReg = R600::ZERO;
2031       } else if (Value == 1) {
2032         ImmReg = R600::ONE_INT;
2033       } else {
2034         ImmValue = Value;
2035       }
2036     }
2037 
2038     // Check that we aren't already using an immediate.
2039     // XXX: It's possible for an instruction to have more than one
2040     // immediate operand, but this is not supported yet.
2041     if (ImmReg == R600::ALU_LITERAL_X) {
2042       if (!Imm.getNode())
2043         return false;
2044       ConstantSDNode *C = cast<ConstantSDNode>(Imm);
2045       if (C->getZExtValue())
2046         return false;
2047       Imm = DAG.getTargetConstant(ImmValue, SDLoc(ParentNode), MVT::i32);
2048     }
2049     Src = DAG.getRegister(ImmReg, MVT::i32);
2050     return true;
2051   }
2052   default:
2053     return false;
2054   }
2055 }
2056 
2057 /// Fold the instructions after selecting them
2058 SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node,
2059                                             SelectionDAG &DAG) const {
2060   const R600InstrInfo *TII = Subtarget->getInstrInfo();
2061   if (!Node->isMachineOpcode())
2062     return Node;
2063 
2064   unsigned Opcode = Node->getMachineOpcode();
2065   SDValue FakeOp;
2066 
2067   std::vector<SDValue> Ops(Node->op_begin(), Node->op_end());
2068 
2069   if (Opcode == R600::DOT_4) {
2070     int OperandIdx[] = {
2071       TII->getOperandIdx(Opcode, R600::OpName::src0_X),
2072       TII->getOperandIdx(Opcode, R600::OpName::src0_Y),
2073       TII->getOperandIdx(Opcode, R600::OpName::src0_Z),
2074       TII->getOperandIdx(Opcode, R600::OpName::src0_W),
2075       TII->getOperandIdx(Opcode, R600::OpName::src1_X),
2076       TII->getOperandIdx(Opcode, R600::OpName::src1_Y),
2077       TII->getOperandIdx(Opcode, R600::OpName::src1_Z),
2078       TII->getOperandIdx(Opcode, R600::OpName::src1_W)
2079         };
2080     int NegIdx[] = {
2081       TII->getOperandIdx(Opcode, R600::OpName::src0_neg_X),
2082       TII->getOperandIdx(Opcode, R600::OpName::src0_neg_Y),
2083       TII->getOperandIdx(Opcode, R600::OpName::src0_neg_Z),
2084       TII->getOperandIdx(Opcode, R600::OpName::src0_neg_W),
2085       TII->getOperandIdx(Opcode, R600::OpName::src1_neg_X),
2086       TII->getOperandIdx(Opcode, R600::OpName::src1_neg_Y),
2087       TII->getOperandIdx(Opcode, R600::OpName::src1_neg_Z),
2088       TII->getOperandIdx(Opcode, R600::OpName::src1_neg_W)
2089     };
2090     int AbsIdx[] = {
2091       TII->getOperandIdx(Opcode, R600::OpName::src0_abs_X),
2092       TII->getOperandIdx(Opcode, R600::OpName::src0_abs_Y),
2093       TII->getOperandIdx(Opcode, R600::OpName::src0_abs_Z),
2094       TII->getOperandIdx(Opcode, R600::OpName::src0_abs_W),
2095       TII->getOperandIdx(Opcode, R600::OpName::src1_abs_X),
2096       TII->getOperandIdx(Opcode, R600::OpName::src1_abs_Y),
2097       TII->getOperandIdx(Opcode, R600::OpName::src1_abs_Z),
2098       TII->getOperandIdx(Opcode, R600::OpName::src1_abs_W)
2099     };
2100     for (unsigned i = 0; i < 8; i++) {
2101       if (OperandIdx[i] < 0)
2102         return Node;
2103       SDValue &Src = Ops[OperandIdx[i] - 1];
2104       SDValue &Neg = Ops[NegIdx[i] - 1];
2105       SDValue &Abs = Ops[AbsIdx[i] - 1];
2106       bool HasDst = TII->getOperandIdx(Opcode, R600::OpName::dst) > -1;
2107       int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
2108       if (HasDst)
2109         SelIdx--;
2110       SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
2111       if (FoldOperand(Node, i, Src, Neg, Abs, Sel, FakeOp, DAG))
2112         return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
2113     }
2114   } else if (Opcode == R600::REG_SEQUENCE) {
2115     for (unsigned i = 1, e = Node->getNumOperands(); i < e; i += 2) {
2116       SDValue &Src = Ops[i];
2117       if (FoldOperand(Node, i, Src, FakeOp, FakeOp, FakeOp, FakeOp, DAG))
2118         return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
2119     }
2120   } else {
2121     if (!TII->hasInstrModifiers(Opcode))
2122       return Node;
2123     int OperandIdx[] = {
2124       TII->getOperandIdx(Opcode, R600::OpName::src0),
2125       TII->getOperandIdx(Opcode, R600::OpName::src1),
2126       TII->getOperandIdx(Opcode, R600::OpName::src2)
2127     };
2128     int NegIdx[] = {
2129       TII->getOperandIdx(Opcode, R600::OpName::src0_neg),
2130       TII->getOperandIdx(Opcode, R600::OpName::src1_neg),
2131       TII->getOperandIdx(Opcode, R600::OpName::src2_neg)
2132     };
2133     int AbsIdx[] = {
2134       TII->getOperandIdx(Opcode, R600::OpName::src0_abs),
2135       TII->getOperandIdx(Opcode, R600::OpName::src1_abs),
2136       -1
2137     };
2138     for (unsigned i = 0; i < 3; i++) {
2139       if (OperandIdx[i] < 0)
2140         return Node;
2141       SDValue &Src = Ops[OperandIdx[i] - 1];
2142       SDValue &Neg = Ops[NegIdx[i] - 1];
2143       SDValue FakeAbs;
2144       SDValue &Abs = (AbsIdx[i] > -1) ? Ops[AbsIdx[i] - 1] : FakeAbs;
2145       bool HasDst = TII->getOperandIdx(Opcode, R600::OpName::dst) > -1;
2146       int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
2147       int ImmIdx = TII->getOperandIdx(Opcode, R600::OpName::literal);
2148       if (HasDst) {
2149         SelIdx--;
2150         ImmIdx--;
2151       }
2152       SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
2153       SDValue &Imm = Ops[ImmIdx];
2154       if (FoldOperand(Node, i, Src, Neg, Abs, Sel, Imm, DAG))
2155         return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
2156     }
2157   }
2158 
2159   return Node;
2160 }
2161