PowerPC - OpenGrok cross reference for /freebsd/contrib/llvm-project/llvm/lib/Target/PowerPC/

//===- README_P9.txt - Notes for improving Power9 code gen ----------------===//

TODO: Instructions Need Implement Instrinstics or Map to LLVM IR

Altivec:
- Vector Compare Not Equal (Zero):
  vcmpneb(.) vcmpneh(.) vcmpnew(.)
  vcmpnezb(.) vcmpnezh(.) vcmpnezw(.)
  . Same as other VCMP*, use VCMP/VCMPo form (support intrinsic)

- Vector Extract Unsigned: vextractub vextractuh vextractuw vextractd
  . Don't use llvm extractelement because they have different semantics
  . Use instrinstics:
    (set v2i64:$vD, (int_ppc_altivec_vextractub v16i8:$vA, imm:$UIMM))
    (set v2i64:$vD, (int_ppc_altivec_vextractuh v8i16:$vA, imm:$UIMM))
    (set v2i64:$vD, (int_ppc_altivec_vextractuw v4i32:$vA, imm:$UIMM))
    (set v2i64:$vD, (int_ppc_altivec_vextractd  v2i64:$vA, imm:$UIMM))

- Vector Extract Unsigned Byte Left/Right-Indexed:
  vextublx vextubrx vextuhlx vextuhrx vextuwlx vextuwrx
  . Use instrinstics:
    // Left-Indexed
    (set i64:$rD, (int_ppc_altivec_vextublx i64:$rA, v16i8:$vB))
    (set i64:$rD, (int_ppc_altivec_vextuhlx i64:$rA, v8i16:$vB))
    (set i64:$rD, (int_ppc_altivec_vextuwlx i64:$rA, v4i32:$vB))

    // Right-Indexed
    (set i64:$rD, (int_ppc_altivec_vextubrx i64:$rA, v16i8:$vB))
    (set i64:$rD, (int_ppc_altivec_vextuhrx i64:$rA, v8i16:$vB))
    (set i64:$rD, (int_ppc_altivec_vextuwrx i64:$rA, v4i32:$vB))

- Vector Insert Element Instructions: vinsertb vinsertd vinserth vinsertw
    (set v16i8:$vD, (int_ppc_altivec_vinsertb v16i8:$vA, imm:$UIMM))
    (set v8i16:$vD, (int_ppc_altivec_vinsertd v8i16:$vA, imm:$UIMM))
    (set v4i32:$vD, (int_ppc_altivec_vinserth v4i32:$vA, imm:$UIMM))
    (set v2i64:$vD, (int_ppc_altivec_vinsertw v2i64:$vA, imm:$UIMM))

- Vector Count Leading/Trailing Zero LSB. Result is placed into GPR[rD]:
  vclzlsbb vctzlsbb
  . Use intrinsic:
    (set i64:$rD, (int_ppc_altivec_vclzlsbb v16i8:$vB))
    (set i64:$rD, (int_ppc_altivec_vctzlsbb v16i8:$vB))

- Vector Count Trailing Zeros: vctzb vctzh vctzw vctzd
  . Map to llvm cttz
    (set v16i8:$vD, (cttz v16i8:$vB))     // vctzb
    (set v8i16:$vD, (cttz v8i16:$vB))     // vctzh
    (set v4i32:$vD, (cttz v4i32:$vB))     // vctzw
    (set v2i64:$vD, (cttz v2i64:$vB))     // vctzd

- Vector Extend Sign: vextsb2w vextsh2w vextsb2d vextsh2d vextsw2d
  . vextsb2w:
    (set v4i32:$vD, (sext v4i8:$vB))

    // PowerISA_V3.0:
    do i = 0 to 3
       VR[VRT].word[i] ← EXTS32(VR[VRB].word[i].byte[3])
    end

  . vextsh2w:
    (set v4i32:$vD, (sext v4i16:$vB))

    // PowerISA_V3.0:
    do i = 0 to 3
       VR[VRT].word[i] ← EXTS32(VR[VRB].word[i].hword[1])
    end

  . vextsb2d
    (set v2i64:$vD, (sext v2i8:$vB))

    // PowerISA_V3.0:
    do i = 0 to 1
       VR[VRT].dword[i] ← EXTS64(VR[VRB].dword[i].byte[7])
    end

  . vextsh2d
    (set v2i64:$vD, (sext v2i16:$vB))

    // PowerISA_V3.0:
    do i = 0 to 1
       VR[VRT].dword[i] ← EXTS64(VR[VRB].dword[i].hword[3])
    end

  . vextsw2d
    (set v2i64:$vD, (sext v2i32:$vB))

    // PowerISA_V3.0:
    do i = 0 to 1
       VR[VRT].dword[i] ← EXTS64(VR[VRB].dword[i].word[1])
    end

- Vector Integer Negate: vnegw vnegd
  . Map to llvm ineg
    (set v4i32:$rT, (ineg v4i32:$rA))       // vnegw
    (set v2i64:$rT, (ineg v2i64:$rA))       // vnegd

- Vector Parity Byte: vprtybw vprtybd vprtybq
  . Use intrinsic:
    (set v4i32:$rD, (int_ppc_altivec_vprtybw v4i32:$vB))
    (set v2i64:$rD, (int_ppc_altivec_vprtybd v2i64:$vB))
    (set v1i128:$rD, (int_ppc_altivec_vprtybq v1i128:$vB))

- Vector (Bit) Permute (Right-indexed):
  . vbpermd: Same as "vbpermq", use VX1_Int_Ty2:
    VX1_Int_Ty2<1484, "vbpermd", int_ppc_altivec_vbpermd, v2i64, v2i64>;

  . vpermr: use VA1a_Int_Ty3
    VA1a_Int_Ty3<59, "vpermr", int_ppc_altivec_vpermr, v16i8, v16i8, v16i8>;

- Vector Rotate Left Mask/Mask-Insert: vrlwnm vrlwmi vrldnm vrldmi
  . Use intrinsic:
    VX1_Int_Ty<389, "vrlwnm", int_ppc_altivec_vrlwnm, v4i32>;
    VX1_Int_Ty<133, "vrlwmi", int_ppc_altivec_vrlwmi, v4i32>;
    VX1_Int_Ty<453, "vrldnm", int_ppc_altivec_vrldnm, v2i64>;
    VX1_Int_Ty<197, "vrldmi", int_ppc_altivec_vrldmi, v2i64>;

- Vector Shift Left/Right: vslv vsrv
  . Use intrinsic, don't map to llvm shl and lshr, because they have different
    semantics, e.g. vslv:

      do i = 0 to 15
         sh ← VR[VRB].byte[i].bit[5:7]
         VR[VRT].byte[i] ← src.byte[i:i+1].bit[sh:sh+7]
      end

    VR[VRT].byte[i] is composed of 2 bytes from src.byte[i:i+1]

  . VX1_Int_Ty<1860, "vslv", int_ppc_altivec_vslv, v16i8>;
    VX1_Int_Ty<1796, "vsrv", int_ppc_altivec_vsrv, v16i8>;

- Vector Multiply-by-10 (& Write Carry) Unsigned Quadword:
  vmul10uq vmul10cuq
  . Use intrinsic:
    VX1_Int_Ty<513, "vmul10uq",   int_ppc_altivec_vmul10uq,  v1i128>;
    VX1_Int_Ty<  1, "vmul10cuq",  int_ppc_altivec_vmul10cuq, v1i128>;

- Vector Multiply-by-10 Extended (& Write Carry) Unsigned Quadword:
  vmul10euq vmul10ecuq
  . Use intrinsic:
    VX1_Int_Ty<577, "vmul10euq",  int_ppc_altivec_vmul10euq, v1i128>;
    VX1_Int_Ty< 65, "vmul10ecuq", int_ppc_altivec_vmul10ecuq, v1i128>;

- Decimal Convert From/to National/Zoned/Signed-QWord:
  bcdcfn. bcdcfz. bcdctn. bcdctz. bcdcfsq. bcdctsq.
  . Use instrinstics:
    (set v1i128:$vD, (int_ppc_altivec_bcdcfno  v1i128:$vB, i1:$PS))
    (set v1i128:$vD, (int_ppc_altivec_bcdcfzo  v1i128:$vB, i1:$PS))
    (set v1i128:$vD, (int_ppc_altivec_bcdctno  v1i128:$vB))
    (set v1i128:$vD, (int_ppc_altivec_bcdctzo  v1i128:$vB, i1:$PS))
    (set v1i128:$vD, (int_ppc_altivec_bcdcfsqo v1i128:$vB, i1:$PS))
    (set v1i128:$vD, (int_ppc_altivec_bcdctsqo v1i128:$vB))

- Decimal Copy-Sign/Set-Sign: bcdcpsgn. bcdsetsgn.
  . Use instrinstics:
    (set v1i128:$vD, (int_ppc_altivec_bcdcpsgno v1i128:$vA, v1i128:$vB))
    (set v1i128:$vD, (int_ppc_altivec_bcdsetsgno v1i128:$vB, i1:$PS))

- Decimal Shift/Unsigned-Shift/Shift-and-Round: bcds. bcdus. bcdsr.
  . Use instrinstics:
    (set v1i128:$vD, (int_ppc_altivec_bcdso  v1i128:$vA, v1i128:$vB, i1:$PS))
    (set v1i128:$vD, (int_ppc_altivec_bcduso v1i128:$vA, v1i128:$vB))
    (set v1i128:$vD, (int_ppc_altivec_bcdsro v1i128:$vA, v1i128:$vB, i1:$PS))

  . Note! Their VA is accessed only 1 byte, i.e. VA.byte[7]

- Decimal (Unsigned) Truncate: bcdtrunc. bcdutrunc.
  . Use instrinstics:
    (set v1i128:$vD, (int_ppc_altivec_bcdso  v1i128:$vA, v1i128:$vB, i1:$PS))
    (set v1i128:$vD, (int_ppc_altivec_bcduso v1i128:$vA, v1i128:$vB))

  . Note! Their VA is accessed only 2 byte, i.e. VA.hword[3] (VA.bit[48:63])

VSX:
- QP Copy Sign: xscpsgnqp
  . Similar to xscpsgndp
  . (set f128:$vT, (fcopysign f128:$vB, f128:$vA)

- QP Absolute/Negative-Absolute/Negate: xsabsqp xsnabsqp xsnegqp
  . Similar to xsabsdp/xsnabsdp/xsnegdp
  . (set f128:$vT, (fabs f128:$vB))             // xsabsqp
    (set f128:$vT, (fneg (fabs f128:$vB)))      // xsnabsqp
    (set f128:$vT, (fneg f128:$vB))             // xsnegqp

- QP Add/Divide/Multiply/Subtract/Square-Root:
  xsaddqp xsdivqp xsmulqp xssubqp xssqrtqp
  . Similar to xsadddp
  . isCommutable = 1
    (set f128:$vT, (fadd f128:$vA, f128:$vB))   // xsaddqp
    (set f128:$vT, (fmul f128:$vA, f128:$vB))   // xsmulqp

  . isCommutable = 0
    (set f128:$vT, (fdiv f128:$vA, f128:$vB))   // xsdivqp
    (set f128:$vT, (fsub f128:$vA, f128:$vB))   // xssubqp
    (set f128:$vT, (fsqrt f128:$vB)))           // xssqrtqp

- Round to Odd of QP Add/Divide/Multiply/Subtract/Square-Root:
  xsaddqpo xsdivqpo xsmulqpo xssubqpo xssqrtqpo
  . Similar to xsrsqrtedp??
      def XSRSQRTEDP : XX2Form<60, 74,
                               (outs vsfrc:$XT), (ins vsfrc:$XB),
                               "xsrsqrtedp $XT, $XB", IIC_VecFP,
                               [(set f64:$XT, (PPCfrsqrte f64:$XB))]>;

  . Define DAG Node in PPCInstrInfo.td:
    def PPCfaddrto: SDNode<"PPCISD::FADDRTO", SDTFPBinOp, []>;
    def PPCfdivrto: SDNode<"PPCISD::FDIVRTO", SDTFPBinOp, []>;
    def PPCfmulrto: SDNode<"PPCISD::FMULRTO", SDTFPBinOp, []>;
    def PPCfsubrto: SDNode<"PPCISD::FSUBRTO", SDTFPBinOp, []>;
    def PPCfsqrtrto: SDNode<"PPCISD::FSQRTRTO", SDTFPUnaryOp, []>;

    DAG patterns of each instruction (PPCInstrVSX.td):
    . isCommutable = 1
      (set f128:$vT, (PPCfaddrto f128:$vA, f128:$vB))   // xsaddqpo
      (set f128:$vT, (PPCfmulrto f128:$vA, f128:$vB))   // xsmulqpo

    . isCommutable = 0
      (set f128:$vT, (PPCfdivrto f128:$vA, f128:$vB))   // xsdivqpo
      (set f128:$vT, (PPCfsubrto f128:$vA, f128:$vB))   // xssubqpo
      (set f128:$vT, (PPCfsqrtrto f128:$vB))            // xssqrtqpo

- QP (Negative) Multiply-{Add/Subtract}: xsmaddqp xsmsubqp xsnmaddqp xsnmsubqp
  . Ref: xsmaddadp/xsmsubadp/xsnmaddadp/xsnmsubadp

  . isCommutable = 1
    // xsmaddqp
    [(set f128:$vT, (fma f128:$vA, f128:$vB, f128:$vTi))]>,
    RegConstraint<"$vTi = $vT">, NoEncode<"$vTi">,
    AltVSXFMARel;

    // xsmsubqp
    [(set f128:$vT, (fma f128:$vA, f128:$vB, (fneg f128:$vTi)))]>,
    RegConstraint<"$vTi = $vT">, NoEncode<"$vTi">,
    AltVSXFMARel;

    // xsnmaddqp
    [(set f128:$vT, (fneg (fma f128:$vA, f128:$vB, f128:$vTi)))]>,
    RegConstraint<"$vTi = $vT">, NoEncode<"$vTi">,
    AltVSXFMARel;

    // xsnmsubqp
    [(set f128:$vT, (fneg (fma f128:$vA, f128:$vB, (fneg f128:$vTi))))]>,
    RegConstraint<"$vTi = $vT">, NoEncode<"$vTi">,
    AltVSXFMARel;

- Round to Odd of QP (Negative) Multiply-{Add/Subtract}:
  xsmaddqpo xsmsubqpo xsnmaddqpo xsnmsubqpo
  . Similar to xsrsqrtedp??

  . Define DAG Node in PPCInstrInfo.td:
    def PPCfmarto: SDNode<"PPCISD::FMARTO", SDTFPTernaryOp, []>;

    It looks like we only need to define "PPCfmarto" for these instructions,
    because according to PowerISA_V3.0, these instructions perform RTO on
    fma's result:
        xsmaddqp(o)
        v      ← bfp_MULTIPLY_ADD(src1, src3, src2)
        rnd    ← bfp_ROUND_TO_BFP128(RO, FPSCR.RN, v)
        result ← bfp_CONVERT_TO_BFP128(rnd)

        xsmsubqp(o)
        v      ← bfp_MULTIPLY_ADD(src1, src3, bfp_NEGATE(src2))
        rnd    ← bfp_ROUND_TO_BFP128(RO, FPSCR.RN, v)
        result ← bfp_CONVERT_TO_BFP128(rnd)

        xsnmaddqp(o)
        v      ← bfp_MULTIPLY_ADD(src1,src3,src2)
        rnd    ← bfp_NEGATE(bfp_ROUND_TO_BFP128(RO, FPSCR.RN, v))
        result ← bfp_CONVERT_TO_BFP128(rnd)

        xsnmsubqp(o)
        v      ← bfp_MULTIPLY_ADD(src1, src3, bfp_NEGATE(src2))
        rnd    ← bfp_NEGATE(bfp_ROUND_TO_BFP128(RO, FPSCR.RN, v))
        result ← bfp_CONVERT_TO_BFP128(rnd)

    DAG patterns of each instruction (PPCInstrVSX.td):
    . isCommutable = 1
      // xsmaddqpo
      [(set f128:$vT, (PPCfmarto f128:$vA, f128:$vB, f128:$vTi))]>,
      RegConstraint<"$vTi = $vT">, NoEncode<"$vTi">,
      AltVSXFMARel;

      // xsmsubqpo
      [(set f128:$vT, (PPCfmarto f128:$vA, f128:$vB, (fneg f128:$vTi)))]>,
      RegConstraint<"$vTi = $vT">, NoEncode<"$vTi">,
      AltVSXFMARel;

      // xsnmaddqpo
      [(set f128:$vT, (fneg (PPCfmarto f128:$vA, f128:$vB, f128:$vTi)))]>,
      RegConstraint<"$vTi = $vT">, NoEncode<"$vTi">,
      AltVSXFMARel;

      // xsnmsubqpo
      [(set f128:$vT, (fneg (PPCfmarto f128:$vA, f128:$vB, (fneg f128:$vTi))))]>,
      RegConstraint<"$vTi = $vT">, NoEncode<"$vTi">,
      AltVSXFMARel;

- QP Compare Ordered/Unordered: xscmpoqp xscmpuqp
  . ref: XSCMPUDP
      def XSCMPUDP : XX3Form_1<60, 35,
                               (outs crrc:$crD), (ins vsfrc:$XA, vsfrc:$XB),
                               "xscmpudp $crD, $XA, $XB", IIC_FPCompare, []>;

  . No SDAG, intrinsic, builtin are required??
    Or llvm fcmp order/unorder compare??

- DP/QP Compare Exponents: xscmpexpdp xscmpexpqp
  . No SDAG, intrinsic, builtin are required?

- DP Compare ==, >=, >, !=: xscmpeqdp xscmpgedp xscmpgtdp xscmpnedp
  . I checked existing instruction "XSCMPUDP". They are different in target
    register. "XSCMPUDP" write to CR field, xscmp*dp write to VSX register

  . Use intrinsic:
    (set i128:$XT, (int_ppc_vsx_xscmpeqdp f64:$XA, f64:$XB))
    (set i128:$XT, (int_ppc_vsx_xscmpgedp f64:$XA, f64:$XB))
    (set i128:$XT, (int_ppc_vsx_xscmpgtdp f64:$XA, f64:$XB))
    (set i128:$XT, (int_ppc_vsx_xscmpnedp f64:$XA, f64:$XB))

- Vector Compare Not Equal: xvcmpnedp xvcmpnedp. xvcmpnesp xvcmpnesp.
  . Similar to xvcmpeqdp:
      defm XVCMPEQDP : XX3Form_Rcr<60, 99,
                                 "xvcmpeqdp", "$XT, $XA, $XB", IIC_VecFPCompare,
                                 int_ppc_vsx_xvcmpeqdp, v2i64, v2f64>;

  . So we should use "XX3Form_Rcr" to implement intrinsic

- Convert DP -> QP: xscvdpqp
  . Similar to XSCVDPSP:
      def XSCVDPSP : XX2Form<60, 265,
                          (outs vsfrc:$XT), (ins vsfrc:$XB),
                          "xscvdpsp $XT, $XB", IIC_VecFP, []>;
  . So, No SDAG, intrinsic, builtin are required??

- Round & Convert QP -> DP (dword[1] is set to zero): xscvqpdp xscvqpdpo
  . Similar to XSCVDPSP
  . No SDAG, intrinsic, builtin are required??

- Truncate & Convert QP -> (Un)Signed (D)Word (dword[1] is set to zero):
  xscvqpsdz xscvqpswz xscvqpudz xscvqpuwz
  . According to PowerISA_V3.0, these are similar to "XSCVDPSXDS", "XSCVDPSXWS",
    "XSCVDPUXDS", "XSCVDPUXWS"

  . DAG patterns:
    (set f128:$XT, (PPCfctidz f128:$XB))    // xscvqpsdz
    (set f128:$XT, (PPCfctiwz f128:$XB))    // xscvqpswz
    (set f128:$XT, (PPCfctiduz f128:$XB))   // xscvqpudz
    (set f128:$XT, (PPCfctiwuz f128:$XB))   // xscvqpuwz

- Convert (Un)Signed DWord -> QP: xscvsdqp xscvudqp
  . Similar to XSCVSXDSP
  . (set f128:$XT, (PPCfcfids f64:$XB))     // xscvsdqp
    (set f128:$XT, (PPCfcfidus f64:$XB))    // xscvudqp

- (Round &) Convert DP <-> HP: xscvdphp xscvhpdp
  . Similar to XSCVDPSP
  . No SDAG, intrinsic, builtin are required??

- Vector HP -> SP: xvcvhpsp xvcvsphp
  . Similar to XVCVDPSP:
      def XVCVDPSP : XX2Form<60, 393,
                          (outs vsrc:$XT), (ins vsrc:$XB),
                          "xvcvdpsp $XT, $XB", IIC_VecFP, []>;
  . No SDAG, intrinsic, builtin are required??

- Round to Quad-Precision Integer: xsrqpi xsrqpix
  . These are combination of "XSRDPI", "XSRDPIC", "XSRDPIM", .., because you
    need to assign rounding mode in instruction
  . Provide builtin?
    (set f128:$vT, (int_ppc_vsx_xsrqpi f128:$vB))
    (set f128:$vT, (int_ppc_vsx_xsrqpix f128:$vB))

- Round Quad-Precision to Double-Extended Precision (fp80): xsrqpxp
  . Provide builtin?
    (set f128:$vT, (int_ppc_vsx_xsrqpxp f128:$vB))

Fixed Point Facility:

- Exploit cmprb and cmpeqb (perhaps for something like
  isalpha/isdigit/isupper/islower and isspace respectivelly). This can
  perhaps be done through a builtin.

- Provide testing for cnttz[dw]
- Insert Exponent DP/QP: xsiexpdp xsiexpqp
  . Use intrinsic?
  . xsiexpdp:
    // Note: rA and rB are the unsigned integer value.
    (set f128:$XT, (int_ppc_vsx_xsiexpdp i64:$rA, i64:$rB))

  . xsiexpqp:
    (set f128:$vT, (int_ppc_vsx_xsiexpqp f128:$vA, f64:$vB))

- Extract Exponent/Significand DP/QP: xsxexpdp xsxsigdp xsxexpqp xsxsigqp
  . Use intrinsic?
  . (set i64:$rT, (int_ppc_vsx_xsxexpdp f64$XB))    // xsxexpdp
    (set i64:$rT, (int_ppc_vsx_xsxsigdp f64$XB))    // xsxsigdp
    (set f128:$vT, (int_ppc_vsx_xsxexpqp f128$vB))  // xsxexpqp
    (set f128:$vT, (int_ppc_vsx_xsxsigqp f128$vB))  // xsxsigqp

- Vector Insert Word: xxinsertw
  - Useful for inserting f32/i32 elements into vectors (the element to be
    inserted needs to be prepared)
  . Note: llvm has insertelem in "Vector Operations"
    ; yields <n x <ty>>
    <result> = insertelement <n x <ty>> <val>, <ty> <elt>, <ty2> <idx>

    But how to map to it??
    [(set v1f128:$XT, (insertelement v1f128:$XTi, f128:$XB, i4:$UIMM))]>,
    RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,

  . Or use intrinsic?
    (set v1f128:$XT, (int_ppc_vsx_xxinsertw v1f128:$XTi, f128:$XB, i4:$UIMM))

- Vector Extract Unsigned Word: xxextractuw
  - Not useful for extraction of f32 from v4f32 (the current pattern is better -
    shift->convert)
  - It is useful for (uint_to_fp (vector_extract v4i32, N))
  - Unfortunately, it can't be used for (sint_to_fp (vector_extract v4i32, N))
  . Note: llvm has extractelement in "Vector Operations"
    ; yields <ty>
    <result> = extractelement <n x <ty>> <val>, <ty2> <idx>

    How to map to it??
    [(set f128:$XT, (extractelement v1f128:$XB, i4:$UIMM))]

  . Or use intrinsic?
    (set f128:$XT, (int_ppc_vsx_xxextractuw v1f128:$XB, i4:$UIMM))

- Vector Insert Exponent DP/SP: xviexpdp xviexpsp
  . Use intrinsic
    (set v2f64:$XT, (int_ppc_vsx_xviexpdp v2f64:$XA, v2f64:$XB))
    (set v4f32:$XT, (int_ppc_vsx_xviexpsp v4f32:$XA, v4f32:$XB))

- Vector Extract Exponent/Significand DP/SP: xvxexpdp xvxexpsp xvxsigdp xvxsigsp
  . Use intrinsic
    (set v2f64:$XT, (int_ppc_vsx_xvxexpdp v2f64:$XB))
    (set v4f32:$XT, (int_ppc_vsx_xvxexpsp v4f32:$XB))
    (set v2f64:$XT, (int_ppc_vsx_xvxsigdp v2f64:$XB))
    (set v4f32:$XT, (int_ppc_vsx_xvxsigsp v4f32:$XB))

- Test Data Class SP/DP/QP: xststdcsp xststdcdp xststdcqp
  . No SDAG, intrinsic, builtin are required?
    Because it seems that we have no way to map BF field?

    Instruction Form: [PO T XO B XO BX TX]
    Asm: xststd* BF,XB,DCMX

    BF is an index to CR register field.

- Vector Test Data Class SP/DP: xvtstdcsp xvtstdcdp
  . Use intrinsic
    (set v4f32:$XT, (int_ppc_vsx_xvtstdcsp v4f32:$XB, i7:$DCMX))
    (set v2f64:$XT, (int_ppc_vsx_xvtstdcdp v2f64:$XB, i7:$DCMX))

- Maximum/Minimum Type-C/Type-J DP: xsmaxcdp xsmaxjdp xsmincdp xsminjdp
  . PowerISA_V3.0:
    "xsmaxcdp can be used to implement the C/C++/Java conditional operation
     (x>y)?x:y for single-precision and double-precision arguments."

    Note! c type and j type have different behavior when:
    1. Either input is NaN
    2. Both input are +-Infinity, +-Zero

  . dtype map to llvm fmaxnum/fminnum
    jtype use intrinsic

  . xsmaxcdp xsmincdp
    (set f64:$XT, (fmaxnum f64:$XA, f64:$XB))
    (set f64:$XT, (fminnum f64:$XA, f64:$XB))

  . xsmaxjdp xsminjdp
    (set f64:$XT, (int_ppc_vsx_xsmaxjdp f64:$XA, f64:$XB))
    (set f64:$XT, (int_ppc_vsx_xsminjdp f64:$XA, f64:$XB))

- Vector Byte-Reverse H/W/D/Q Word: xxbrh xxbrw xxbrd xxbrq
  . Use intrinsic
    (set v8i16:$XT, (int_ppc_vsx_xxbrh v8i16:$XB))
    (set v4i32:$XT, (int_ppc_vsx_xxbrw v4i32:$XB))
    (set v2i64:$XT, (int_ppc_vsx_xxbrd v2i64:$XB))
    (set v1i128:$XT, (int_ppc_vsx_xxbrq v1i128:$XB))

- Vector Permute: xxperm xxpermr
  . I have checked "PPCxxswapd" in PPCInstrVSX.td, but they are different
  . Use intrinsic
    (set v16i8:$XT, (int_ppc_vsx_xxperm v16i8:$XA, v16i8:$XB))
    (set v16i8:$XT, (int_ppc_vsx_xxpermr v16i8:$XA, v16i8:$XB))

- Vector Splat Immediate Byte: xxspltib
  . Similar to XXSPLTW:
      def XXSPLTW : XX2Form_2<60, 164,
                           (outs vsrc:$XT), (ins vsrc:$XB, u2imm:$UIM),
                           "xxspltw $XT, $XB, $UIM", IIC_VecPerm, []>;

  . No SDAG, intrinsic, builtin are required?

- Load/Store Vector: lxv stxv
  . Has likely SDAG match:
    (set v?:$XT, (load ix16addr:$src))
    (set v?:$XT, (store ix16addr:$dst))

  . Need define ix16addr in PPCInstrInfo.td
    ix16addr: 16-byte aligned, see "def memrix16" in PPCInstrInfo.td

- Load/Store Vector Indexed: lxvx stxvx
  . Has likely SDAG match:
    (set v?:$XT, (load xoaddr:$src))
    (set v?:$XT, (store xoaddr:$dst))

- Load/Store DWord: lxsd stxsd
  . Similar to lxsdx/stxsdx:
    def LXSDX : XX1Form<31, 588,
                        (outs vsfrc:$XT), (ins memrr:$src),
                        "lxsdx $XT, $src", IIC_LdStLFD,
                        [(set f64:$XT, (load xoaddr:$src))]>;

  . (set f64:$XT, (load iaddrX4:$src))
    (set f64:$XT, (store iaddrX4:$dst))

- Load/Store SP, with conversion from/to DP: lxssp stxssp
  . Similar to lxsspx/stxsspx:
    def LXSSPX : XX1Form<31, 524, (outs vssrc:$XT), (ins memrr:$src),
                         "lxsspx $XT, $src", IIC_LdStLFD,
                         [(set f32:$XT, (load xoaddr:$src))]>;

  . (set f32:$XT, (load iaddrX4:$src))
    (set f32:$XT, (store iaddrX4:$dst))

- Load as Integer Byte/Halfword & Zero Indexed: lxsibzx lxsihzx
  . Similar to lxsiwzx:
    def LXSIWZX : XX1Form<31, 12, (outs vsfrc:$XT), (ins memrr:$src),
                          "lxsiwzx $XT, $src", IIC_LdStLFD,
                          [(set f64:$XT, (PPClfiwzx xoaddr:$src))]>;

  . (set f64:$XT, (PPClfiwzx xoaddr:$src))

- Store as Integer Byte/Halfword Indexed: stxsibx stxsihx
  . Similar to stxsiwx:
    def STXSIWX : XX1Form<31, 140, (outs), (ins vsfrc:$XT, memrr:$dst),
                          "stxsiwx $XT, $dst", IIC_LdStSTFD,
                          [(PPCstfiwx f64:$XT, xoaddr:$dst)]>;

  . (PPCstfiwx f64:$XT, xoaddr:$dst)

- Load Vector Halfword*8/Byte*16 Indexed: lxvh8x lxvb16x
  . Similar to lxvd2x/lxvw4x:
    def LXVD2X : XX1Form<31, 844,
                         (outs vsrc:$XT), (ins memrr:$src),
                         "lxvd2x $XT, $src", IIC_LdStLFD,
                         [(set v2f64:$XT, (int_ppc_vsx_lxvd2x xoaddr:$src))]>;

  . (set v8i16:$XT, (int_ppc_vsx_lxvh8x xoaddr:$src))
    (set v16i8:$XT, (int_ppc_vsx_lxvb16x xoaddr:$src))

- Store Vector Halfword*8/Byte*16 Indexed: stxvh8x stxvb16x
  . Similar to stxvd2x/stxvw4x:
    def STXVD2X : XX1Form<31, 972,
                         (outs), (ins vsrc:$XT, memrr:$dst),
                         "stxvd2x $XT, $dst", IIC_LdStSTFD,
                         [(store v2f64:$XT, xoaddr:$dst)]>;

  . (store v8i16:$XT, xoaddr:$dst)
    (store v16i8:$XT, xoaddr:$dst)

- Load/Store Vector (Left-justified) with Length: lxvl lxvll stxvl stxvll
  . Likely needs an intrinsic
  . (set v?:$XT, (int_ppc_vsx_lxvl xoaddr:$src))
    (set v?:$XT, (int_ppc_vsx_lxvll xoaddr:$src))

  . (int_ppc_vsx_stxvl xoaddr:$dst))
    (int_ppc_vsx_stxvll xoaddr:$dst))

- Load Vector Word & Splat Indexed: lxvwsx
  . Likely needs an intrinsic
  . (set v?:$XT, (int_ppc_vsx_lxvwsx xoaddr:$src))

Atomic operations (l[dw]at, st[dw]at):
- Provide custom lowering for common atomic operations to use these
  instructions with the correct Function Code
- Ensure the operands are in the correct register (i.e. RT+1, RT+2)
- Provide builtins since not all FC's necessarily have an existing LLVM
  atomic operation

Move to CR from XER Extended (mcrxrx):
- Is there a use for this in LLVM?

Fixed Point Facility:

- Copy-Paste Facility: copy copy_first cp_abort paste paste. paste_last
  . Use instrinstics:
    (int_ppc_copy_first i32:$rA, i32:$rB)
    (int_ppc_copy i32:$rA, i32:$rB)

    (int_ppc_paste i32:$rA, i32:$rB)
    (int_ppc_paste_last i32:$rA, i32:$rB)

    (int_cp_abort)

- Message Synchronize: msgsync
- SLB*: slbieg slbsync
- stop
  . No instrinstics
Name		Date	Size	#Lines	LOC
..		-	-
AsmParser/	H	-	-	1,944	1,627
Disassembler/	H	-	-	407	305
GISel/	H	-	-	1,621	1,138
MCTargetDesc/	H	-	-	4,075	3,015
TargetInfo/	H	-	-	67	39
P10InstrResources.td	H A D	28-Jul-2024	44.3 KiB	2,057	1,923
P9InstrResources.td	H A D	02-Sep-2023	37.6 KiB	1,440	1,353
PPC.h	H A D	28-Jul-2024	7.7 KiB	210	106
PPC.td	H A D	30-Jul-2024	38.6 KiB	750	702
PPCAsmPrinter.cpp	H A D	28-Jul-2024	128.5 KiB	3,340	2,400
PPCBack2BackFusion.def	H A D	28-Jul-2024	13.7 KiB	1,054	1,053
PPCBoolRetToInt.cpp	H A D	28-Jul-2024	10 KiB	291	194
PPCBranchCoalescing.cpp	H A D	28-Jul-2024	30.2 KiB	791	397
PPCBranchSelector.cpp	H A D	02-Dec-2021	15.8 KiB	414	228
PPCCCState.cpp	H A D	13-Jun-2021	1.1 KiB	36	23
PPCCCState.h	H A D	22-Aug-2021	2.2 KiB	74	46
PPCCTRLoops.cpp	H A D	28-Jul-2024	12.2 KiB	362	238
PPCCTRLoopsVerify.cpp	H A D	28-Jul-2024	5.6 KiB	184	136
PPCCallingConv.cpp	H A D	02-Sep-2023	7.9 KiB	199	129
PPCCallingConv.h	H A D	02-Sep-2023	2.1 KiB	51	31
PPCCallingConv.td	H A D	02-Sep-2023	18.6 KiB	431	348
PPCEarlyReturn.cpp	H A D	28-Jul-2024	7.1 KiB	211	148
PPCExpandAtomicPseudoInsts.cpp	H A D	28-Jul-2024	11 KiB	303	241
PPCExpandISEL.cpp	H A D	27-Aug-2020	17.9 KiB	492	308
PPCFastISel.cpp	H A D	28-Jul-2024	85.4 KiB	2,477	1,698
PPCFrameLowering.cpp	H A D	25-Aug-2024	107.7 KiB	2,812	1,953
PPCFrameLowering.h	H A D	28-Jul-2024	7.8 KiB	183	75
PPCGenRegisterBankInfo.def	H A D	18-Dec-2023	4.6 KiB	113	101
PPCGenScalarMASSEntries.cpp	H A D	04-Jul-2022	4.5 KiB	150	88
PPCHazardRecognizers.cpp	H A D	28-Jul-2024	14.1 KiB	436	281
PPCHazardRecognizers.h	H A D	20-Dec-2019	3.8 KiB	102	52
PPCISelDAGToDAG.cpp	H A D	28-Jul-2024	307.4 KiB	7,939	5,708
PPCISelLowering.cpp	H A D	22-Sep-2024	740.1 KiB	18,862	13,542
PPCISelLowering.h	H A D	28-Jul-2024	65.3 KiB	1,513	737
PPCInstr64Bit.td	H A D	04-Sep-2024	95.8 KiB	2,025	1,796
PPCInstrAltivec.td	H A D	02-Sep-2023	80.1 KiB	1,654	1,460
PPCInstrBuilder.h	H A D	20-Dec-2019	1.5 KiB	43	14
PPCInstrDFP.td	H A D	02-Sep-2023	10.1 KiB	194	165
PPCInstrFormats.td	H A D	18-Dec-2023	61.4 KiB	2,350	1,939
PPCInstrFuture.td	H A D	02-Sep-2023	3.3 KiB	89	74
PPCInstrFutureMMA.td	H A D	14-Apr-2023	3.8 KiB	117	98
PPCInstrHTM.td	H A D	02-Sep-2023	5.4 KiB	176	128
PPCInstrInfo.cpp	H A D	30-Jul-2024	204.7 KiB	5,581	4,110
PPCInstrInfo.h	H A D	28-Jul-2024	31.2 KiB	681	447
PPCInstrInfo.td	H A D	04-Sep-2024	243.1 KiB	5,336	4,711
PPCInstrMMA.td	H A D	02-Sep-2023	55.9 KiB	1,107	1,048
PPCInstrP10.td	H A D	28-Jul-2024	109.9 KiB	2,525	2,285
PPCInstrSPE.td	H A D	02-Sep-2023	49.7 KiB	882	775
PPCInstrVSX.td	H A D	28-Jul-2024	248.8 KiB	5,153	4,722
PPCLoopInstrFormPrep.cpp	H A D	28-Jul-2024	55.4 KiB	1,499	936
PPCLowerMASSVEntries.cpp	H A D	28-Jul-2024	6.5 KiB	200	123
PPCMCInstLower.cpp	H A D	28-Jul-2024	8.6 KiB	231	185
PPCMIPeephole.cpp	H A D	28-Jul-2024	81.2 KiB	2,048	1,475
PPCMachineFunctionInfo.cpp	H A D	14-Apr-2023	6.6 KiB	196	160
PPCMachineFunctionInfo.h	H A D	28-Jul-2024	10.8 KiB	292	134
PPCMachineScheduler.cpp	H A D	22-Aug-2021	9.5 KiB	252	153
PPCMachineScheduler.h	H A D	22-Aug-2021	1.8 KiB	54	30
PPCMacroFusion.cpp	H A D	18-Dec-2023	9.9 KiB	294	191
PPCMacroFusion.def	H A D	28-Jul-2024	6.6 KiB	160	128
PPCMacroFusion.h	H A D	22-Aug-2021	1 KiB	28	7
PPCMergeStringPool.cpp	H A D	28-Jul-2024	12.3 KiB	334	199
PPCPerfectShuffle.h	H A D	20-Dec-2019	397.6 KiB	6,591	6,567
PPCPfmCounters.td	H A D	20-Dec-2019	705	19	16
PPCPreEmitPeephole.cpp	H A D	28-Jul-2024	24.3 KiB	609	448
PPCReduceCRLogicals.cpp	H A D	28-Jul-2024	28.7 KiB	740	573
PPCRegisterInfo.cpp	H A D	28-Jul-2024	77.6 KiB	1,966	1,396
PPCRegisterInfo.h	H A D	18-Dec-2023	7.5 KiB	235	170
PPCRegisterInfo.td	H A D	06-Aug-2024	39.8 KiB	1,145	1,040
PPCRegisterInfoDMR.td	H A D	14-Apr-2023	5.5 KiB	165	145
PPCRegisterInfoMMA.td	H A D	14-Apr-2023	4.8 KiB	111	98
PPCSchedPredicates.td	H A D	14-Apr-2023	8.9 KiB	296	292
PPCSchedule.td	H A D	20-Mar-2022	5.4 KiB	147	143
PPCSchedule440.td	H A D	20-Dec-2019	34.6 KiB	601	586
PPCScheduleA2.td	H A D	20-Dec-2019	7.9 KiB	170	159
PPCScheduleE500.td	H A D	20-Dec-2019	16.6 KiB	280	272
PPCScheduleE500mc.td	H A D	20-Dec-2019	20.9 KiB	335	327
PPCScheduleE5500.td	H A D	20-Dec-2019	23.6 KiB	379	369
PPCScheduleG3.td	H A D	20-Dec-2019	4.5 KiB	81	78
PPCScheduleG4.td	H A D	20-Dec-2019	5.4 KiB	97	94
PPCScheduleG4Plus.td	H A D	20-Dec-2019	6.4 KiB	111	108
PPCScheduleG5.td	H A D	20-Dec-2019	7.1 KiB	129	121
PPCScheduleP10.td	H A D	18-Dec-2023	12.8 KiB	411	328
PPCScheduleP7.td	H A D	18-Dec-2023	14.8 KiB	405	388
PPCScheduleP8.td	H A D	18-Dec-2023	15.9 KiB	414	397
PPCScheduleP9.td	H A D	18-Dec-2023	12.3 KiB	429	356
PPCSubtarget.cpp	H A D	28-Jul-2024	9.2 KiB	269	179
PPCSubtarget.h	H A D	30-Jul-2024	9.9 KiB	319	204
PPCTLSDynamicCall.cpp	H A D	28-Jul-2024	14.6 KiB	345	229
PPCTOCRegDeps.cpp	H A D	28-Jul-2024	5.2 KiB	153	69
PPCTargetMachine.cpp	H A D	28-Jul-2024	22.2 KiB	647	459
PPCTargetMachine.h	H A D	28-Jul-2024	2.9 KiB	87	50
PPCTargetObjectFile.cpp	H A D	22-Aug-2021	2.5 KiB	60	30
PPCTargetObjectFile.h	H A D	20-Dec-2019	1.2 KiB	34	14
PPCTargetStreamer.h	H A D	14-Apr-2023	1.1 KiB	37	21
PPCTargetTransformInfo.cpp	H A D	30-Jul-2024	41.1 KiB	1,100	770
PPCTargetTransformInfo.h	H A D	28-Jul-2024	7.6 KiB	162	121
PPCVSXCopy.cpp	H A D	20-Mar-2022	5.6 KiB	170	116
PPCVSXFMAMutate.cpp	H A D	28-Jul-2024	14.9 KiB	392	223
PPCVSXSwapRemoval.cpp	H A D	02-Sep-2023	38 KiB	1,075	679
README_P9.txt	H A D	04-Jul-2022	22 KiB	601	475