/*
 * Copyright (C) 2015 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "intrinsics_arm64.h"

#include "arch/arm64/callee_save_frame_arm64.h"
#include "arch/arm64/instruction_set_features_arm64.h"
#include "art_method.h"
#include "base/bit_utils.h"
#include "code_generator_arm64.h"
#include "common_arm64.h"
#include "data_type-inl.h"
#include "entrypoints/quick/quick_entrypoints.h"
#include "heap_poisoning.h"
#include "intrinsic_objects.h"
#include "intrinsics.h"
#include "intrinsics_utils.h"
#include "lock_word.h"
#include "mirror/array-inl.h"
#include "mirror/method_handle_impl.h"
#include "mirror/object_array-inl.h"
#include "mirror/reference.h"
#include "mirror/string-inl.h"
#include "mirror/var_handle.h"
#include "optimizing/data_type.h"
#include "scoped_thread_state_change-inl.h"
#include "thread-current-inl.h"
#include "utils/arm64/assembler_arm64.h"
#include "well_known_classes.h"

using namespace vixl::aarch64;  // NOLINT(build/namespaces)

// TODO(VIXL): Make VIXL compile cleanly with -Wshadow, -Wdeprecated-declarations.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wshadow"
#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
#include "aarch64/disasm-aarch64.h"
#include "aarch64/macro-assembler-aarch64.h"
#pragma GCC diagnostic pop

namespace art HIDDEN {

namespace arm64 {

using helpers::CPURegisterFrom;
using helpers::DRegisterFrom;
using helpers::HeapOperand;
using helpers::LocationFrom;
using helpers::Int64FromLocation;
using helpers::InputCPURegisterAt;
using helpers::InputCPURegisterOrZeroRegAt;
using helpers::OperandFrom;
using helpers::RegisterFrom;
using helpers::SRegisterFrom;
using helpers::WRegisterFrom;
using helpers::XRegisterFrom;
using helpers::HRegisterFrom;
using helpers::InputRegisterAt;
using helpers::OutputRegister;

namespace {

ALWAYS_INLINE inline MemOperand AbsoluteHeapOperandFrom(Location location, size_t offset = 0) {
  return MemOperand(XRegisterFrom(location), offset);
}

}  // namespace

MacroAssembler* IntrinsicCodeGeneratorARM64::GetVIXLAssembler() {
  return codegen_->GetVIXLAssembler();
}

ArenaAllocator* IntrinsicCodeGeneratorARM64::GetAllocator() {
  return codegen_->GetGraph()->GetAllocator();
}

using IntrinsicSlowPathARM64 = IntrinsicSlowPath<InvokeDexCallingConventionVisitorARM64,
                                                 SlowPathCodeARM64,
                                                 Arm64Assembler>;

#define __ codegen->GetVIXLAssembler()->

// Slow path implementing the SystemArrayCopy intrinsic copy loop with read barriers.
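// Each iteration loads a reference from the source, marks it through the
// ReadBarrierMarkRegX entrypoint so that the to-space reference is the one
// that gets written, stores it to the destination, and repeats until the
// source cursor reaches the pre-computed stop address.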
class ReadBarrierSystemArrayCopySlowPathARM64 : public SlowPathCodeARM64 {
 public:
  ReadBarrierSystemArrayCopySlowPathARM64(HInstruction* instruction, Location tmp)
      : SlowPathCodeARM64(instruction), tmp_(tmp) {
  }

  void EmitNativeCode(CodeGenerator* codegen_in) override {
    DCHECK(codegen_in->EmitBakerReadBarrier());
    CodeGeneratorARM64* codegen = down_cast<CodeGeneratorARM64*>(codegen_in);
    LocationSummary* locations = instruction_->GetLocations();
    DCHECK(locations->CanCall());
    DCHECK(instruction_->IsInvokeStaticOrDirect())
        << "Unexpected instruction in read barrier arraycopy slow path: "
        << instruction_->DebugName();
    DCHECK(instruction_->GetLocations()->Intrinsified());
    DCHECK_EQ(instruction_->AsInvoke()->GetIntrinsic(), Intrinsics::kSystemArrayCopy);

    const int32_t element_size = DataType::Size(DataType::Type::kReference);

    Register src_curr_addr = XRegisterFrom(locations->GetTemp(0));
    Register dst_curr_addr = XRegisterFrom(locations->GetTemp(1));
    Register src_stop_addr = XRegisterFrom(locations->GetTemp(2));
    Register tmp_reg = WRegisterFrom(tmp_);

    __ Bind(GetEntryLabel());
    // The source range and destination pointer were initialized before entering the slow-path.
    vixl::aarch64::Label slow_copy_loop;
    __ Bind(&slow_copy_loop);
    __ Ldr(tmp_reg, MemOperand(src_curr_addr, element_size, PostIndex));
    codegen->GetAssembler()->MaybeUnpoisonHeapReference(tmp_reg);
    // TODO: Inline the mark bit check before calling the runtime?
    // tmp_reg = ReadBarrier::Mark(tmp_reg);
    // No need to save live registers; it's taken care of by the
    // entrypoint. Also, there is no need to update the stack mask,
    // as this runtime call will not trigger a garbage collection.
    // (See ReadBarrierMarkSlowPathARM64::EmitNativeCode for more
    // explanations.)
    DCHECK_NE(tmp_.reg(), LR);
    DCHECK_NE(tmp_.reg(), WSP);
    DCHECK_NE(tmp_.reg(), WZR);
    // IP0 is used internally by the ReadBarrierMarkRegX entry point
    // as a temporary (and not preserved). It thus cannot be used by
    // any live register in this slow path.
    DCHECK_NE(LocationFrom(src_curr_addr).reg(), IP0);
    DCHECK_NE(LocationFrom(dst_curr_addr).reg(), IP0);
    DCHECK_NE(LocationFrom(src_stop_addr).reg(), IP0);
    DCHECK_NE(tmp_.reg(), IP0);
    DCHECK(0 <= tmp_.reg() && tmp_.reg() < kNumberOfWRegisters) << tmp_.reg();
    // TODO: Load the entrypoint once before the loop, instead of
    // loading it at every iteration.
    int32_t entry_point_offset =
        Thread::ReadBarrierMarkEntryPointsOffset<kArm64PointerSize>(tmp_.reg());
    // This runtime call does not require a stack map.
    codegen->InvokeRuntimeWithoutRecordingPcInfo(entry_point_offset, instruction_, this);
    codegen->GetAssembler()->MaybePoisonHeapReference(tmp_reg);
    __ Str(tmp_reg, MemOperand(dst_curr_addr, element_size, PostIndex));
    __ Cmp(src_curr_addr, src_stop_addr);
    __ B(&slow_copy_loop, ne);
    __ B(GetExitLabel());
  }

  const char* GetDescription() const override { return "ReadBarrierSystemArrayCopySlowPathARM64"; }

 private:
  Location tmp_;

  DISALLOW_COPY_AND_ASSIGN(ReadBarrierSystemArrayCopySlowPathARM64);
};

// The MethodHandle.invokeExact intrinsic sets up arguments to match the target method call. If we
// need to go to the slow path, we call art_quick_invoke_polymorphic_with_hidden_receiver, which
// expects the MethodHandle object in w0 (in place of the actual ArtMethod).
class InvokePolymorphicSlowPathARM64 : public SlowPathCodeARM64 {
 public:
  InvokePolymorphicSlowPathARM64(HInstruction* instruction, Register method_handle)
      : SlowPathCodeARM64(instruction), method_handle_(method_handle) {
    DCHECK(instruction->IsInvokePolymorphic());
  }

  void EmitNativeCode(CodeGenerator* codegen_in) override {
    CodeGeneratorARM64* codegen = down_cast<CodeGeneratorARM64*>(codegen_in);
    __ Bind(GetEntryLabel());

    SaveLiveRegisters(codegen, instruction_->GetLocations());
    // Passing `MethodHandle` object as hidden argument.
    __ Mov(w0, method_handle_.W());
    codegen->InvokeRuntime(QuickEntrypointEnum::kQuickInvokePolymorphicWithHiddenReceiver,
                           instruction_,
                           instruction_->GetDexPc());

    RestoreLiveRegisters(codegen, instruction_->GetLocations());
    __ B(GetExitLabel());
  }

  const char* GetDescription() const override { return "InvokePolymorphicSlowPathARM64"; }

 private:
  const Register method_handle_;
  DISALLOW_COPY_AND_ASSIGN(InvokePolymorphicSlowPathARM64);
};

#undef __

bool IntrinsicLocationsBuilderARM64::TryDispatch(HInvoke* invoke) {
  Dispatch(invoke);
  LocationSummary* res = invoke->GetLocations();
  if (res == nullptr) {
    return false;
  }
  return res->Intrinsified();
}

#define __ masm->

static void CreateFPToIntLocations(ArenaAllocator* allocator, HInvoke* invoke) {
  LocationSummary* locations =
      new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
  locations->SetInAt(0, Location::RequiresFpuRegister());
  locations->SetOut(Location::RequiresRegister());
}

static void CreateIntToFPLocations(ArenaAllocator* allocator, HInvoke* invoke) {
  LocationSummary* locations =
      new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
  locations->SetInAt(0, Location::RequiresRegister());
  locations->SetOut(Location::RequiresFpuRegister());
}

static void MoveFPToInt(LocationSummary* locations, bool is64bit, MacroAssembler* masm) {
  Location input = locations->InAt(0);
  Location output = locations->Out();
  __ Fmov(is64bit ? XRegisterFrom(output) : WRegisterFrom(output),
          is64bit ? DRegisterFrom(input) : SRegisterFrom(input));
}

static void MoveIntToFP(LocationSummary* locations, bool is64bit, MacroAssembler* masm) {
  Location input = locations->InAt(0);
  Location output = locations->Out();
  __ Fmov(is64bit ? DRegisterFrom(output) : SRegisterFrom(output),
          is64bit ? XRegisterFrom(input) : WRegisterFrom(input));
}
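
// Note (illustrative, not generated code): Fmov transfers the raw bit pattern
// between core and FP registers without numeric conversion, which matches the
// Java semantics of these intrinsics. For example,
// Double.doubleToRawLongBits(1.0) == 0x3ff0000000000000L and
// Float.floatToRawIntBits(1.0f) == 0x3f800000.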

void IntrinsicLocationsBuilderARM64::VisitDoubleDoubleToRawLongBits(HInvoke* invoke) {
  CreateFPToIntLocations(allocator_, invoke);
}
void IntrinsicLocationsBuilderARM64::VisitDoubleLongBitsToDouble(HInvoke* invoke) {
  CreateIntToFPLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorARM64::VisitDoubleDoubleToRawLongBits(HInvoke* invoke) {
  MoveFPToInt(invoke->GetLocations(), /* is64bit= */ true, GetVIXLAssembler());
}
void IntrinsicCodeGeneratorARM64::VisitDoubleLongBitsToDouble(HInvoke* invoke) {
  MoveIntToFP(invoke->GetLocations(), /* is64bit= */ true, GetVIXLAssembler());
}

void IntrinsicLocationsBuilderARM64::VisitFloatFloatToRawIntBits(HInvoke* invoke) {
  CreateFPToIntLocations(allocator_, invoke);
}
void IntrinsicLocationsBuilderARM64::VisitFloatIntBitsToFloat(HInvoke* invoke) {
  CreateIntToFPLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorARM64::VisitFloatFloatToRawIntBits(HInvoke* invoke) {
  MoveFPToInt(invoke->GetLocations(), /* is64bit= */ false, GetVIXLAssembler());
}
void IntrinsicCodeGeneratorARM64::VisitFloatIntBitsToFloat(HInvoke* invoke) {
  MoveIntToFP(invoke->GetLocations(), /* is64bit= */ false, GetVIXLAssembler());
}

static void CreateIntToIntLocations(ArenaAllocator* allocator, HInvoke* invoke) {
  LocationSummary* locations =
      new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
  locations->SetInAt(0, Location::RequiresRegister());
  locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap);
}

static void CreateIntIntToIntLocations(ArenaAllocator* allocator, HInvoke* invoke) {
  LocationSummary* locations =
      new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
  locations->SetInAt(0, Location::RequiresRegister());
  locations->SetInAt(1, Location::RequiresRegister());
  locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap);
}

static void CreateIntIntToIntSlowPathCallLocations(ArenaAllocator* allocator, HInvoke* invoke) {
  LocationSummary* locations =
      new (allocator) LocationSummary(invoke, LocationSummary::kCallOnSlowPath, kIntrinsified);
  locations->SetInAt(0, Location::RequiresRegister());
  locations->SetInAt(1, Location::RequiresRegister());
  // Force kOutputOverlap; see comments in IntrinsicSlowPath::EmitNativeCode.
  locations->SetOut(Location::RequiresRegister(), Location::kOutputOverlap);
}

static void GenerateReverseBytes(MacroAssembler* masm,
                                 DataType::Type type,
                                 CPURegister in,
                                 CPURegister out) {
  switch (type) {
    case DataType::Type::kUint16:
      __ Rev16(out.W(), in.W());
      break;
    case DataType::Type::kInt16:
      __ Rev16(out.W(), in.W());
      __ Sxth(out.W(), out.W());
      break;
    case DataType::Type::kInt32:
      __ Rev(out.W(), in.W());
      break;
    case DataType::Type::kInt64:
      __ Rev(out.X(), in.X());
      break;
    case DataType::Type::kFloat32:
      __ Rev(in.W(), in.W());  // Note: Clobbers `in`.
      __ Fmov(out.S(), in.W());
      break;
    case DataType::Type::kFloat64:
      __ Rev(in.X(), in.X());  // Note: Clobbers `in`.
      __ Fmov(out.D(), in.X());
      break;
    default:
      LOG(FATAL) << "Unexpected type for reverse-bytes: " << type;
      UNREACHABLE();
  }
}
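
// Worked example (illustrative only): Integer.reverseBytes(0x12345678) is
// 0x78563412, computed with a single REV. REV16 swaps the bytes within each
// 16-bit halfword; assuming the 16-bit input was zero-extended by the caller,
// kUint16 needs no further extension, while kInt16 is sign-extended with SXTH
// to match Java's short semantics.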

static void GenReverseBytes(LocationSummary* locations,
                            DataType::Type type,
                            MacroAssembler* masm) {
  Location in = locations->InAt(0);
  Location out = locations->Out();
  GenerateReverseBytes(masm, type, CPURegisterFrom(in, type), CPURegisterFrom(out, type));
}

void IntrinsicLocationsBuilderARM64::VisitIntegerReverseBytes(HInvoke* invoke) {
  CreateIntToIntLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorARM64::VisitIntegerReverseBytes(HInvoke* invoke) {
  GenReverseBytes(invoke->GetLocations(), DataType::Type::kInt32, GetVIXLAssembler());
}

void IntrinsicLocationsBuilderARM64::VisitLongReverseBytes(HInvoke* invoke) {
  CreateIntToIntLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorARM64::VisitLongReverseBytes(HInvoke* invoke) {
  GenReverseBytes(invoke->GetLocations(), DataType::Type::kInt64, GetVIXLAssembler());
}

void IntrinsicLocationsBuilderARM64::VisitShortReverseBytes(HInvoke* invoke) {
  CreateIntToIntLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorARM64::VisitShortReverseBytes(HInvoke* invoke) {
  GenReverseBytes(invoke->GetLocations(), DataType::Type::kInt16, GetVIXLAssembler());
}

static void GenNumberOfLeadingZeros(LocationSummary* locations,
                                    DataType::Type type,
                                    MacroAssembler* masm) {
  DCHECK(type == DataType::Type::kInt32 || type == DataType::Type::kInt64);

  Location in = locations->InAt(0);
  Location out = locations->Out();

  __ Clz(RegisterFrom(out, type), RegisterFrom(in, type));
}

void IntrinsicLocationsBuilderARM64::VisitIntegerNumberOfLeadingZeros(HInvoke* invoke) {
  CreateIntToIntLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorARM64::VisitIntegerNumberOfLeadingZeros(HInvoke* invoke) {
  GenNumberOfLeadingZeros(invoke->GetLocations(), DataType::Type::kInt32, GetVIXLAssembler());
}

void IntrinsicLocationsBuilderARM64::VisitLongNumberOfLeadingZeros(HInvoke* invoke) {
  CreateIntToIntLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorARM64::VisitLongNumberOfLeadingZeros(HInvoke* invoke) {
  GenNumberOfLeadingZeros(invoke->GetLocations(), DataType::Type::kInt64, GetVIXLAssembler());
}

static void GenNumberOfTrailingZeros(LocationSummary* locations,
                                     DataType::Type type,
                                     MacroAssembler* masm) {
  DCHECK(type == DataType::Type::kInt32 || type == DataType::Type::kInt64);

  Location in = locations->InAt(0);
  Location out = locations->Out();

  __ Rbit(RegisterFrom(out, type), RegisterFrom(in, type));
  __ Clz(RegisterFrom(out, type), RegisterFrom(out, type));
}
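
// Note (illustrative only): AArch64 has no count-trailing-zeros instruction,
// so RBIT reverses the bits and the trailing-zero count becomes a
// leading-zero count. E.g. Integer.numberOfTrailingZeros(0b1000) == 3:
// RBIT maps bit 3 to bit 28 and CLZ of the result is 3.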

void IntrinsicLocationsBuilderARM64::VisitIntegerNumberOfTrailingZeros(HInvoke* invoke) {
  CreateIntToIntLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorARM64::VisitIntegerNumberOfTrailingZeros(HInvoke* invoke) {
  GenNumberOfTrailingZeros(invoke->GetLocations(), DataType::Type::kInt32, GetVIXLAssembler());
}

void IntrinsicLocationsBuilderARM64::VisitLongNumberOfTrailingZeros(HInvoke* invoke) {
  CreateIntToIntLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorARM64::VisitLongNumberOfTrailingZeros(HInvoke* invoke) {
  GenNumberOfTrailingZeros(invoke->GetLocations(), DataType::Type::kInt64, GetVIXLAssembler());
}

static void GenReverse(LocationSummary* locations,
                       DataType::Type type,
                       MacroAssembler* masm) {
  DCHECK(type == DataType::Type::kInt32 || type == DataType::Type::kInt64);

  Location in = locations->InAt(0);
  Location out = locations->Out();

  __ Rbit(RegisterFrom(out, type), RegisterFrom(in, type));
}

void IntrinsicLocationsBuilderARM64::VisitIntegerReverse(HInvoke* invoke) {
  CreateIntToIntLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorARM64::VisitIntegerReverse(HInvoke* invoke) {
  GenReverse(invoke->GetLocations(), DataType::Type::kInt32, GetVIXLAssembler());
}

void IntrinsicLocationsBuilderARM64::VisitLongReverse(HInvoke* invoke) {
  CreateIntToIntLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorARM64::VisitLongReverse(HInvoke* invoke) {
  GenReverse(invoke->GetLocations(), DataType::Type::kInt64, GetVIXLAssembler());
}

static void GenBitCount(HInvoke* instr, DataType::Type type, MacroAssembler* masm) {
  DCHECK(DataType::IsIntOrLongType(type)) << type;
  DCHECK_EQ(instr->GetType(), DataType::Type::kInt32);
  DCHECK_EQ(DataType::Kind(instr->InputAt(0)->GetType()), type);

  UseScratchRegisterScope temps(masm);

  Register src = InputRegisterAt(instr, 0);
  Register dst = RegisterFrom(instr->GetLocations()->Out(), type);
  VRegister fpr = (type == DataType::Type::kInt64) ? temps.AcquireD() : temps.AcquireS();

  __ Fmov(fpr, src);
  __ Cnt(fpr.V8B(), fpr.V8B());
  __ Addv(fpr.B(), fpr.V8B());
  __ Fmov(dst, fpr);
}
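
// Note (illustrative only): there is no scalar popcount instruction on
// AArch64, so the value is moved to a NEON register, CNT computes a per-byte
// bit count, and ADDV sums the byte counts into a single lane. For example,
// Long.bitCount(0x00ff00ff00ff00ffL) == 8 + 8 + 8 + 8 == 32.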

void IntrinsicLocationsBuilderARM64::VisitLongBitCount(HInvoke* invoke) {
  CreateIntToIntLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorARM64::VisitLongBitCount(HInvoke* invoke) {
  GenBitCount(invoke, DataType::Type::kInt64, GetVIXLAssembler());
}

void IntrinsicLocationsBuilderARM64::VisitIntegerBitCount(HInvoke* invoke) {
  CreateIntToIntLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorARM64::VisitIntegerBitCount(HInvoke* invoke) {
  GenBitCount(invoke, DataType::Type::kInt32, GetVIXLAssembler());
}

static void GenHighestOneBit(HInvoke* invoke, DataType::Type type, MacroAssembler* masm) {
  DCHECK(type == DataType::Type::kInt32 || type == DataType::Type::kInt64);

  UseScratchRegisterScope temps(masm);

  Register src = InputRegisterAt(invoke, 0);
  Register dst = RegisterFrom(invoke->GetLocations()->Out(), type);
  Register temp = (type == DataType::Type::kInt64) ? temps.AcquireX() : temps.AcquireW();
  size_t high_bit = (type == DataType::Type::kInt64) ? 63u : 31u;
  size_t clz_high_bit = (type == DataType::Type::kInt64) ? 6u : 5u;

  __ Clz(temp, src);
  __ Mov(dst, UINT64_C(1) << high_bit);  // MOV (bitmask immediate)
  __ Bic(dst, dst, Operand(temp, LSL, high_bit - clz_high_bit));  // Clear dst if src was 0.
  __ Lsr(dst, dst, temp);
}
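
// Worked example (32-bit case, illustrative only): for src == 0x00000f00, CLZ
// returns 20, dst starts as 0x80000000 and is left untouched by the BIC (bit
// 31 of temp << 26 is only set when CLZ returned 32, i.e. when src == 0), and
// the final LSR by 20 yields 0x00000800 == Integer.highestOneBit(0x0f00).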

void IntrinsicLocationsBuilderARM64::VisitIntegerHighestOneBit(HInvoke* invoke) {
  CreateIntToIntLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorARM64::VisitIntegerHighestOneBit(HInvoke* invoke) {
  GenHighestOneBit(invoke, DataType::Type::kInt32, GetVIXLAssembler());
}

void IntrinsicLocationsBuilderARM64::VisitLongHighestOneBit(HInvoke* invoke) {
  CreateIntToIntLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorARM64::VisitLongHighestOneBit(HInvoke* invoke) {
  GenHighestOneBit(invoke, DataType::Type::kInt64, GetVIXLAssembler());
}

static void GenLowestOneBit(HInvoke* invoke, DataType::Type type, MacroAssembler* masm) {
  DCHECK(type == DataType::Type::kInt32 || type == DataType::Type::kInt64);

  UseScratchRegisterScope temps(masm);

  Register src = InputRegisterAt(invoke, 0);
  Register dst = RegisterFrom(invoke->GetLocations()->Out(), type);
  Register temp = (type == DataType::Type::kInt64) ? temps.AcquireX() : temps.AcquireW();

  __ Neg(temp, src);
  __ And(dst, temp, src);
}
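
// Note (illustrative only): this is the two's-complement identity x & -x,
// which isolates the lowest set bit. E.g. for src == 0b10110, -src ends in
// ...01010, and the AND leaves 0b00010 == Integer.lowestOneBit(0b10110).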

void IntrinsicLocationsBuilderARM64::VisitIntegerLowestOneBit(HInvoke* invoke) {
  CreateIntToIntLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorARM64::VisitIntegerLowestOneBit(HInvoke* invoke) {
  GenLowestOneBit(invoke, DataType::Type::kInt32, GetVIXLAssembler());
}

void IntrinsicLocationsBuilderARM64::VisitLongLowestOneBit(HInvoke* invoke) {
  CreateIntToIntLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorARM64::VisitLongLowestOneBit(HInvoke* invoke) {
  GenLowestOneBit(invoke, DataType::Type::kInt64, GetVIXLAssembler());
}

static void CreateFPToFPLocations(ArenaAllocator* allocator, HInvoke* invoke) {
  LocationSummary* locations =
      new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
  locations->SetInAt(0, Location::RequiresFpuRegister());
  locations->SetOut(Location::RequiresFpuRegister(), Location::kNoOutputOverlap);
}

void IntrinsicLocationsBuilderARM64::VisitMathSqrt(HInvoke* invoke) {
  CreateFPToFPLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorARM64::VisitMathSqrt(HInvoke* invoke) {
  LocationSummary* locations = invoke->GetLocations();
  MacroAssembler* masm = GetVIXLAssembler();
  __ Fsqrt(DRegisterFrom(locations->Out()), DRegisterFrom(locations->InAt(0)));
}

void IntrinsicLocationsBuilderARM64::VisitMathCeil(HInvoke* invoke) {
  CreateFPToFPLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorARM64::VisitMathCeil(HInvoke* invoke) {
  LocationSummary* locations = invoke->GetLocations();
  MacroAssembler* masm = GetVIXLAssembler();
  __ Frintp(DRegisterFrom(locations->Out()), DRegisterFrom(locations->InAt(0)));
}

void IntrinsicLocationsBuilderARM64::VisitMathFloor(HInvoke* invoke) {
  CreateFPToFPLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorARM64::VisitMathFloor(HInvoke* invoke) {
  LocationSummary* locations = invoke->GetLocations();
  MacroAssembler* masm = GetVIXLAssembler();
  __ Frintm(DRegisterFrom(locations->Out()), DRegisterFrom(locations->InAt(0)));
}

void IntrinsicLocationsBuilderARM64::VisitMathRint(HInvoke* invoke) {
  CreateFPToFPLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorARM64::VisitMathRint(HInvoke* invoke) {
  LocationSummary* locations = invoke->GetLocations();
  MacroAssembler* masm = GetVIXLAssembler();
  __ Frintn(DRegisterFrom(locations->Out()), DRegisterFrom(locations->InAt(0)));
}

static void CreateFPToIntPlusFPTempLocations(ArenaAllocator* allocator, HInvoke* invoke) {
  LocationSummary* locations =
      new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
  locations->SetInAt(0, Location::RequiresFpuRegister());
  locations->SetOut(Location::RequiresRegister());
  locations->AddTemp(Location::RequiresFpuRegister());
}

static void GenMathRound(HInvoke* invoke, bool is_double, vixl::aarch64::MacroAssembler* masm) {
  // Java 8 API definition for Math.round():
  // Return the closest long or int to the argument, with ties rounding to positive infinity.
  //
  // There is no single instruction in ARMv8 that implements the above definition.
  // We use FCVTAS here because it has the closest semantics.
  // FCVTAS performs rounding to nearest integer, ties away from zero.
  // For most inputs (positive values, zero or NaN), this instruction is enough.
  // We only need a little extra handling after FCVTAS if the input is a negative half value.
  //
  // The reason we did not choose FCVTPS is that although it rounds toward positive infinity,
  // it does not round to nearest.
  // For example, FCVTPS(-1.9) = -1 and FCVTPS(1.1) = 2.
  // If we used that instruction, more handling code would be needed for most inputs.
  LocationSummary* l = invoke->GetLocations();
  VRegister in_reg = is_double ? DRegisterFrom(l->InAt(0)) : SRegisterFrom(l->InAt(0));
  VRegister tmp_fp = is_double ? DRegisterFrom(l->GetTemp(0)) : SRegisterFrom(l->GetTemp(0));
  Register out_reg = is_double ? XRegisterFrom(l->Out()) : WRegisterFrom(l->Out());
  vixl::aarch64::Label done;

  // Round to nearest integer, ties away from zero.
  __ Fcvtas(out_reg, in_reg);

  // For positive values, zero or NaN inputs, rounding is done.
  __ Tbz(out_reg, out_reg.GetSizeInBits() - 1, &done);

  // Handle input < 0 cases.
  // If input is negative but not a tie, previous result (round to nearest) is valid.
  // If input is a negative tie, out_reg += 1.
  __ Frinta(tmp_fp, in_reg);
  __ Fsub(tmp_fp, in_reg, tmp_fp);
  __ Fcmp(tmp_fp, 0.5);
  __ Cinc(out_reg, out_reg, eq);

  __ Bind(&done);
}
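
// Worked example (illustrative only): for the negative tie -2.5, FCVTAS gives
// -3 (ties away from zero) while Math.round(-2.5) must return -2 (ties toward
// positive infinity). The fix-up computes in - Frinta(in) = -2.5 - (-3.0) =
// 0.5, the FCMP against 0.5 sets the eq condition, and CINC bumps the result
// from -3 to -2. For a non-tie such as -2.4 the difference is -0.4, the
// comparison fails, and the FCVTAS result is already correct.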

void IntrinsicLocationsBuilderARM64::VisitMathRoundDouble(HInvoke* invoke) {
  CreateFPToIntPlusFPTempLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorARM64::VisitMathRoundDouble(HInvoke* invoke) {
  GenMathRound(invoke, /* is_double= */ true, GetVIXLAssembler());
}

void IntrinsicLocationsBuilderARM64::VisitMathRoundFloat(HInvoke* invoke) {
  CreateFPToIntPlusFPTempLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorARM64::VisitMathRoundFloat(HInvoke* invoke) {
  GenMathRound(invoke, /* is_double= */ false, GetVIXLAssembler());
}

void IntrinsicLocationsBuilderARM64::VisitMemoryPeekByte(HInvoke* invoke) {
  CreateIntToIntLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorARM64::VisitMemoryPeekByte(HInvoke* invoke) {
  MacroAssembler* masm = GetVIXLAssembler();
  __ Ldrsb(WRegisterFrom(invoke->GetLocations()->Out()),
           AbsoluteHeapOperandFrom(invoke->GetLocations()->InAt(0), 0));
}

void IntrinsicLocationsBuilderARM64::VisitMemoryPeekIntNative(HInvoke* invoke) {
  CreateIntToIntLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorARM64::VisitMemoryPeekIntNative(HInvoke* invoke) {
  MacroAssembler* masm = GetVIXLAssembler();
  __ Ldr(WRegisterFrom(invoke->GetLocations()->Out()),
         AbsoluteHeapOperandFrom(invoke->GetLocations()->InAt(0), 0));
}

void IntrinsicLocationsBuilderARM64::VisitMemoryPeekLongNative(HInvoke* invoke) {
  CreateIntToIntLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorARM64::VisitMemoryPeekLongNative(HInvoke* invoke) {
  MacroAssembler* masm = GetVIXLAssembler();
  __ Ldr(XRegisterFrom(invoke->GetLocations()->Out()),
         AbsoluteHeapOperandFrom(invoke->GetLocations()->InAt(0), 0));
}

void IntrinsicLocationsBuilderARM64::VisitMemoryPeekShortNative(HInvoke* invoke) {
  CreateIntToIntLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorARM64::VisitMemoryPeekShortNative(HInvoke* invoke) {
  MacroAssembler* masm = GetVIXLAssembler();
  __ Ldrsh(WRegisterFrom(invoke->GetLocations()->Out()),
           AbsoluteHeapOperandFrom(invoke->GetLocations()->InAt(0), 0));
}

static void CreateIntIntToVoidLocations(ArenaAllocator* allocator, HInvoke* invoke) {
  LocationSummary* locations =
      new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
  locations->SetInAt(0, Location::RequiresRegister());
  locations->SetInAt(1, Location::RequiresRegister());
}

void IntrinsicLocationsBuilderARM64::VisitMemoryPokeByte(HInvoke* invoke) {
  CreateIntIntToVoidLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorARM64::VisitMemoryPokeByte(HInvoke* invoke) {
  MacroAssembler* masm = GetVIXLAssembler();
  __ Strb(WRegisterFrom(invoke->GetLocations()->InAt(1)),
          AbsoluteHeapOperandFrom(invoke->GetLocations()->InAt(0), 0));
}

void IntrinsicLocationsBuilderARM64::VisitMemoryPokeIntNative(HInvoke* invoke) {
  CreateIntIntToVoidLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorARM64::VisitMemoryPokeIntNative(HInvoke* invoke) {
  MacroAssembler* masm = GetVIXLAssembler();
  __ Str(WRegisterFrom(invoke->GetLocations()->InAt(1)),
         AbsoluteHeapOperandFrom(invoke->GetLocations()->InAt(0), 0));
}

void IntrinsicLocationsBuilderARM64::VisitMemoryPokeLongNative(HInvoke* invoke) {
  CreateIntIntToVoidLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorARM64::VisitMemoryPokeLongNative(HInvoke* invoke) {
  MacroAssembler* masm = GetVIXLAssembler();
  __ Str(XRegisterFrom(invoke->GetLocations()->InAt(1)),
         AbsoluteHeapOperandFrom(invoke->GetLocations()->InAt(0), 0));
}

void IntrinsicLocationsBuilderARM64::VisitMemoryPokeShortNative(HInvoke* invoke) {
  CreateIntIntToVoidLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorARM64::VisitMemoryPokeShortNative(HInvoke* invoke) {
  MacroAssembler* masm = GetVIXLAssembler();
  __ Strh(WRegisterFrom(invoke->GetLocations()->InAt(1)),
          AbsoluteHeapOperandFrom(invoke->GetLocations()->InAt(0), 0));
}

void IntrinsicLocationsBuilderARM64::VisitThreadCurrentThread(HInvoke* invoke) {
  LocationSummary* locations =
      new (allocator_) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
  locations->SetOut(Location::RequiresRegister());
}

void IntrinsicCodeGeneratorARM64::VisitThreadCurrentThread(HInvoke* invoke) {
  codegen_->Load(DataType::Type::kReference, WRegisterFrom(invoke->GetLocations()->Out()),
                 MemOperand(tr, Thread::PeerOffset<kArm64PointerSize>().Int32Value()));
}

static bool ReadBarrierNeedsTemp(bool is_volatile, HInvoke* invoke) {
  return is_volatile ||
         !invoke->InputAt(2)->IsLongConstant() ||
         invoke->InputAt(2)->AsLongConstant()->GetValue() >= kReferenceLoadMinFarOffset;
}
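
// Note (an interpretation of the uses below, not authoritative): the Baker
// read barrier fast path can fold a small constant offset directly into the
// field load, so a scratch register is only needed when the access is
// volatile, the offset is not a constant, or the constant offset is too large
// (>= kReferenceLoadMinFarOffset) to be encoded that way.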

static void GenUnsafeGet(HInvoke* invoke,
                         DataType::Type type,
                         bool is_volatile,
                         CodeGeneratorARM64* codegen) {
  LocationSummary* locations = invoke->GetLocations();
  DCHECK((type == DataType::Type::kInt8) ||
         (type == DataType::Type::kInt32) ||
         (type == DataType::Type::kInt64) ||
         (type == DataType::Type::kReference));
  Location base_loc = locations->InAt(1);
  Register base = WRegisterFrom(base_loc);  // Object pointer.
  Location offset_loc = locations->InAt(2);
  Location trg_loc = locations->Out();
  Register trg = RegisterFrom(trg_loc, type);

  if (type == DataType::Type::kReference && codegen->EmitBakerReadBarrier()) {
    // UnsafeGetObject/UnsafeGetObjectVolatile with Baker's read barrier case.
    Register temp = WRegisterFrom(locations->GetTemp(0));
    MacroAssembler* masm = codegen->GetVIXLAssembler();
    // Piggy-back on the field load path using introspection for the Baker read barrier.
    if (offset_loc.IsConstant()) {
      uint32_t offset = Int64FromLocation(offset_loc);
      Location maybe_temp = ReadBarrierNeedsTemp(is_volatile, invoke)
          ? locations->GetTemp(0) : Location::NoLocation();
      DCHECK_EQ(locations->GetTempCount(), ReadBarrierNeedsTemp(is_volatile, invoke));
      codegen->GenerateFieldLoadWithBakerReadBarrier(invoke,
                                                     trg_loc,
                                                     base.W(),
                                                     offset,
                                                     maybe_temp,
                                                     /* needs_null_check= */ false,
                                                     is_volatile);
    } else {
      __ Add(temp, base, WRegisterFrom(offset_loc));  // Offset should not exceed 32 bits.
      codegen->GenerateFieldLoadWithBakerReadBarrier(invoke,
                                                     trg_loc,
                                                     base,
                                                     MemOperand(temp.X()),
                                                     /* needs_null_check= */ false,
                                                     is_volatile);
    }
  } else {
    // Other cases.
    MemOperand mem_op;
    if (offset_loc.IsConstant()) {
      mem_op = MemOperand(base.X(), Int64FromLocation(offset_loc));
    } else {
      mem_op = MemOperand(base.X(), XRegisterFrom(offset_loc));
    }
    if (is_volatile) {
      codegen->LoadAcquire(invoke, type, trg, mem_op, /* needs_null_check= */ true);
    } else {
      codegen->Load(type, trg, mem_op);
    }

    if (type == DataType::Type::kReference) {
      DCHECK(trg.IsW());
      codegen->MaybeGenerateReadBarrierSlow(invoke, trg_loc, trg_loc, base_loc, 0u, offset_loc);
    }
  }
}
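
// Summary of the two paths above: with Baker read barriers the load goes
// through GenerateFieldLoadWithBakerReadBarrier(), which lets the runtime fix
// up the reference if needed; otherwise a plain (or acquire) load is emitted
// and, for references, a slow-path read barrier is attached afterwards via
// MaybeGenerateReadBarrierSlow().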

static void GenUnsafeGetAbsolute(HInvoke* invoke,
                                 DataType::Type type,
                                 bool is_volatile,
                                 CodeGeneratorARM64* codegen) {
  LocationSummary* locations = invoke->GetLocations();
  DCHECK((type == DataType::Type::kInt8) ||
         (type == DataType::Type::kInt32) ||
         (type == DataType::Type::kInt64));
  Location address_loc = locations->InAt(1);
  MemOperand mem_op = MemOperand(XRegisterFrom(address_loc));
  Location trg_loc = locations->Out();
  Register trg = RegisterFrom(trg_loc, type);

  if (is_volatile) {
    codegen->LoadAcquire(invoke, type, trg, mem_op, /* needs_null_check= */ true);
  } else {
    codegen->Load(type, trg, mem_op);
  }
}

static void CreateUnsafeGetLocations(ArenaAllocator* allocator,
                                     HInvoke* invoke,
                                     CodeGeneratorARM64* codegen,
                                     bool is_volatile = false) {
  bool can_call = codegen->EmitReadBarrier() && IsUnsafeGetReference(invoke);
  LocationSummary* locations =
      new (allocator) LocationSummary(invoke,
                                      can_call
                                          ? LocationSummary::kCallOnSlowPath
                                          : LocationSummary::kNoCall,
                                      kIntrinsified);
  if (can_call && kUseBakerReadBarrier) {
    locations->SetCustomSlowPathCallerSaves(RegisterSet::Empty());  // No caller-save registers.
    if (ReadBarrierNeedsTemp(is_volatile, invoke)) {
      // We need a temporary register for the read barrier load in order to use
      // CodeGeneratorARM64::GenerateFieldLoadWithBakerReadBarrier().
      locations->AddTemp(FixedTempLocation());
    }
  }
  locations->SetInAt(0, Location::NoLocation());  // Unused receiver.
  locations->SetInAt(1, Location::RequiresRegister());
  locations->SetInAt(2, Location::RegisterOrConstant(invoke->InputAt(2)));
  locations->SetOut(Location::RequiresRegister(),
                    (can_call ? Location::kOutputOverlap : Location::kNoOutputOverlap));
}

static void CreateUnsafeGetAbsoluteLocations(ArenaAllocator* allocator,
                                             HInvoke* invoke) {
  LocationSummary* locations =
      new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
  locations->SetInAt(0, Location::NoLocation());  // Unused receiver.
  locations->SetInAt(1, Location::RequiresRegister());
  locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap);
}

void IntrinsicLocationsBuilderARM64::VisitUnsafeGet(HInvoke* invoke) {
  VisitJdkUnsafeGet(invoke);
}
void IntrinsicLocationsBuilderARM64::VisitUnsafeGetAbsolute(HInvoke* invoke) {
  VisitJdkUnsafeGetAbsolute(invoke);
}
void IntrinsicLocationsBuilderARM64::VisitUnsafeGetVolatile(HInvoke* invoke) {
  VisitJdkUnsafeGetVolatile(invoke);
}
void IntrinsicLocationsBuilderARM64::VisitUnsafeGetLong(HInvoke* invoke) {
  VisitJdkUnsafeGetLong(invoke);
}
void IntrinsicLocationsBuilderARM64::VisitUnsafeGetLongVolatile(HInvoke* invoke) {
  VisitJdkUnsafeGetLongVolatile(invoke);
}
void IntrinsicLocationsBuilderARM64::VisitUnsafeGetObject(HInvoke* invoke) {
  VisitJdkUnsafeGetReference(invoke);
}
void IntrinsicLocationsBuilderARM64::VisitUnsafeGetObjectVolatile(HInvoke* invoke) {
  VisitJdkUnsafeGetReferenceVolatile(invoke);
}
void IntrinsicLocationsBuilderARM64::VisitUnsafeGetByte(HInvoke* invoke) {
  VisitJdkUnsafeGetByte(invoke);
}
void IntrinsicLocationsBuilderARM64::VisitJdkUnsafeGet(HInvoke* invoke) {
  CreateUnsafeGetLocations(allocator_, invoke, codegen_);
}
void IntrinsicLocationsBuilderARM64::VisitJdkUnsafeGetAbsolute(HInvoke* invoke) {
  CreateUnsafeGetAbsoluteLocations(allocator_, invoke);
}
void IntrinsicLocationsBuilderARM64::VisitJdkUnsafeGetVolatile(HInvoke* invoke) {
  CreateUnsafeGetLocations(allocator_, invoke, codegen_, /* is_volatile= */ true);
}
void IntrinsicLocationsBuilderARM64::VisitJdkUnsafeGetAcquire(HInvoke* invoke) {
  CreateUnsafeGetLocations(allocator_, invoke, codegen_, /* is_volatile= */ true);
}
void IntrinsicLocationsBuilderARM64::VisitJdkUnsafeGetLong(HInvoke* invoke) {
  CreateUnsafeGetLocations(allocator_, invoke, codegen_);
}
void IntrinsicLocationsBuilderARM64::VisitJdkUnsafeGetLongVolatile(HInvoke* invoke) {
  CreateUnsafeGetLocations(allocator_, invoke, codegen_, /* is_volatile= */ true);
}
void IntrinsicLocationsBuilderARM64::VisitJdkUnsafeGetLongAcquire(HInvoke* invoke) {
  CreateUnsafeGetLocations(allocator_, invoke, codegen_, /* is_volatile= */ true);
}
void IntrinsicLocationsBuilderARM64::VisitJdkUnsafeGetReference(HInvoke* invoke) {
  CreateUnsafeGetLocations(allocator_, invoke, codegen_);
}
void IntrinsicLocationsBuilderARM64::VisitJdkUnsafeGetReferenceVolatile(HInvoke* invoke) {
  CreateUnsafeGetLocations(allocator_, invoke, codegen_, /* is_volatile= */ true);
}
void IntrinsicLocationsBuilderARM64::VisitJdkUnsafeGetReferenceAcquire(HInvoke* invoke) {
  CreateUnsafeGetLocations(allocator_, invoke, codegen_, /* is_volatile= */ true);
}
void IntrinsicLocationsBuilderARM64::VisitJdkUnsafeGetByte(HInvoke* invoke) {
  CreateUnsafeGetLocations(allocator_, invoke, codegen_);
}

void IntrinsicCodeGeneratorARM64::VisitUnsafeGet(HInvoke* invoke) {
  VisitJdkUnsafeGet(invoke);
}
void IntrinsicCodeGeneratorARM64::VisitUnsafeGetAbsolute(HInvoke* invoke) {
  VisitJdkUnsafeGetAbsolute(invoke);
}
void IntrinsicCodeGeneratorARM64::VisitUnsafeGetVolatile(HInvoke* invoke) {
  VisitJdkUnsafeGetVolatile(invoke);
}
void IntrinsicCodeGeneratorARM64::VisitUnsafeGetLong(HInvoke* invoke) {
  VisitJdkUnsafeGetLong(invoke);
}
void IntrinsicCodeGeneratorARM64::VisitUnsafeGetLongVolatile(HInvoke* invoke) {
  VisitJdkUnsafeGetLongVolatile(invoke);
}
void IntrinsicCodeGeneratorARM64::VisitUnsafeGetObject(HInvoke* invoke) {
  VisitJdkUnsafeGetReference(invoke);
}
void IntrinsicCodeGeneratorARM64::VisitUnsafeGetObjectVolatile(HInvoke* invoke) {
  VisitJdkUnsafeGetReferenceVolatile(invoke);
}
void IntrinsicCodeGeneratorARM64::VisitUnsafeGetByte(HInvoke* invoke) {
  VisitJdkUnsafeGetByte(invoke);
}

void IntrinsicCodeGeneratorARM64::VisitJdkUnsafeGet(HInvoke* invoke) {
  GenUnsafeGet(invoke, DataType::Type::kInt32, /*is_volatile=*/ false, codegen_);
}
void IntrinsicCodeGeneratorARM64::VisitJdkUnsafeGetAbsolute(HInvoke* invoke) {
  GenUnsafeGetAbsolute(invoke, DataType::Type::kInt32, /*is_volatile=*/ false, codegen_);
}
void IntrinsicCodeGeneratorARM64::VisitJdkUnsafeGetVolatile(HInvoke* invoke) {
  GenUnsafeGet(invoke, DataType::Type::kInt32, /*is_volatile=*/ true, codegen_);
}
void IntrinsicCodeGeneratorARM64::VisitJdkUnsafeGetAcquire(HInvoke* invoke) {
  GenUnsafeGet(invoke, DataType::Type::kInt32, /*is_volatile=*/ true, codegen_);
}
void IntrinsicCodeGeneratorARM64::VisitJdkUnsafeGetLong(HInvoke* invoke) {
  GenUnsafeGet(invoke, DataType::Type::kInt64, /*is_volatile=*/ false, codegen_);
}
void IntrinsicCodeGeneratorARM64::VisitJdkUnsafeGetLongVolatile(HInvoke* invoke) {
  GenUnsafeGet(invoke, DataType::Type::kInt64, /*is_volatile=*/ true, codegen_);
}
void IntrinsicCodeGeneratorARM64::VisitJdkUnsafeGetLongAcquire(HInvoke* invoke) {
  GenUnsafeGet(invoke, DataType::Type::kInt64, /*is_volatile=*/ true, codegen_);
}
void IntrinsicCodeGeneratorARM64::VisitJdkUnsafeGetReference(HInvoke* invoke) {
  GenUnsafeGet(invoke, DataType::Type::kReference, /*is_volatile=*/ false, codegen_);
}
void IntrinsicCodeGeneratorARM64::VisitJdkUnsafeGetReferenceVolatile(HInvoke* invoke) {
  GenUnsafeGet(invoke, DataType::Type::kReference, /*is_volatile=*/ true, codegen_);
}
void IntrinsicCodeGeneratorARM64::VisitJdkUnsafeGetReferenceAcquire(HInvoke* invoke) {
  GenUnsafeGet(invoke, DataType::Type::kReference, /*is_volatile=*/ true, codegen_);
}
void IntrinsicCodeGeneratorARM64::VisitJdkUnsafeGetByte(HInvoke* invoke) {
  GenUnsafeGet(invoke, DataType::Type::kInt8, /*is_volatile=*/ false, codegen_);
}

static void CreateUnsafePutLocations(ArenaAllocator* allocator, HInvoke* invoke) {
  LocationSummary* locations =
      new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
  static constexpr int kOffsetIndex = 2;
  static constexpr int kValueIndex = 3;
  // Unused receiver.
  locations->SetInAt(0, Location::NoLocation());
  // The object.
  locations->SetInAt(1, Location::RequiresRegister());
  // The offset.
  locations->SetInAt(
      kOffsetIndex, Location::RegisterOrConstant(invoke->InputAt(kOffsetIndex)));
  // The value.
  if (IsZeroBitPattern(invoke->InputAt(kValueIndex))) {
    locations->SetInAt(kValueIndex, Location::ConstantLocation(invoke->InputAt(kValueIndex)));
  } else {
    locations->SetInAt(kValueIndex, Location::RequiresRegister());
  }
}

static void CreateUnsafePutAbsoluteLocations(ArenaAllocator* allocator, HInvoke* invoke) {
  LocationSummary* locations =
      new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
  static constexpr int kAddressIndex = 1;
  static constexpr int kValueIndex = 2;
  // Unused receiver.
  locations->SetInAt(0, Location::NoLocation());
  // The address.
  locations->SetInAt(kAddressIndex, Location::RequiresRegister());
  // The value.
  if (IsZeroBitPattern(invoke->InputAt(kValueIndex))) {
    locations->SetInAt(kValueIndex, Location::ConstantLocation(invoke->InputAt(kValueIndex)));
  } else {
    locations->SetInAt(kValueIndex, Location::RequiresRegister());
  }
}

void IntrinsicLocationsBuilderARM64::VisitUnsafePut(HInvoke* invoke) {
  VisitJdkUnsafePut(invoke);
}
void IntrinsicLocationsBuilderARM64::VisitUnsafePutAbsolute(HInvoke* invoke) {
  VisitJdkUnsafePutAbsolute(invoke);
}
void IntrinsicLocationsBuilderARM64::VisitUnsafePutOrdered(HInvoke* invoke) {
  VisitJdkUnsafePutOrdered(invoke);
}
void IntrinsicLocationsBuilderARM64::VisitUnsafePutVolatile(HInvoke* invoke) {
  VisitJdkUnsafePutVolatile(invoke);
}
void IntrinsicLocationsBuilderARM64::VisitUnsafePutObject(HInvoke* invoke) {
  VisitJdkUnsafePutReference(invoke);
}
void IntrinsicLocationsBuilderARM64::VisitUnsafePutObjectOrdered(HInvoke* invoke) {
  VisitJdkUnsafePutObjectOrdered(invoke);
}
void IntrinsicLocationsBuilderARM64::VisitUnsafePutObjectVolatile(HInvoke* invoke) {
  VisitJdkUnsafePutReferenceVolatile(invoke);
}
void IntrinsicLocationsBuilderARM64::VisitUnsafePutLong(HInvoke* invoke) {
  VisitJdkUnsafePutLong(invoke);
}
void IntrinsicLocationsBuilderARM64::VisitUnsafePutLongOrdered(HInvoke* invoke) {
  VisitJdkUnsafePutLongOrdered(invoke);
}
void IntrinsicLocationsBuilderARM64::VisitUnsafePutLongVolatile(HInvoke* invoke) {
  VisitJdkUnsafePutLongVolatile(invoke);
}
void IntrinsicLocationsBuilderARM64::VisitUnsafePutByte(HInvoke* invoke) {
  VisitJdkUnsafePutByte(invoke);
}

void IntrinsicLocationsBuilderARM64::VisitJdkUnsafePut(HInvoke* invoke) {
  CreateUnsafePutLocations(allocator_, invoke);
}
void IntrinsicLocationsBuilderARM64::VisitJdkUnsafePutAbsolute(HInvoke* invoke) {
  CreateUnsafePutAbsoluteLocations(allocator_, invoke);
}
void IntrinsicLocationsBuilderARM64::VisitJdkUnsafePutOrdered(HInvoke* invoke) {
  CreateUnsafePutLocations(allocator_, invoke);
}
void IntrinsicLocationsBuilderARM64::VisitJdkUnsafePutVolatile(HInvoke* invoke) {
  CreateUnsafePutLocations(allocator_, invoke);
}
void IntrinsicLocationsBuilderARM64::VisitJdkUnsafePutRelease(HInvoke* invoke) {
  CreateUnsafePutLocations(allocator_, invoke);
}
void IntrinsicLocationsBuilderARM64::VisitJdkUnsafePutReference(HInvoke* invoke) {
  CreateUnsafePutLocations(allocator_, invoke);
}
void IntrinsicLocationsBuilderARM64::VisitJdkUnsafePutObjectOrdered(HInvoke* invoke) {
  CreateUnsafePutLocations(allocator_, invoke);
}
void IntrinsicLocationsBuilderARM64::VisitJdkUnsafePutReferenceVolatile(HInvoke* invoke) {
  CreateUnsafePutLocations(allocator_, invoke);
}
void IntrinsicLocationsBuilderARM64::VisitJdkUnsafePutReferenceRelease(HInvoke* invoke) {
  CreateUnsafePutLocations(allocator_, invoke);
}
void IntrinsicLocationsBuilderARM64::VisitJdkUnsafePutLong(HInvoke* invoke) {
  CreateUnsafePutLocations(allocator_, invoke);
}
void IntrinsicLocationsBuilderARM64::VisitJdkUnsafePutLongOrdered(HInvoke* invoke) {
  CreateUnsafePutLocations(allocator_, invoke);
}
void IntrinsicLocationsBuilderARM64::VisitJdkUnsafePutLongVolatile(HInvoke* invoke) {
  CreateUnsafePutLocations(allocator_, invoke);
}
void IntrinsicLocationsBuilderARM64::VisitJdkUnsafePutLongRelease(HInvoke* invoke) {
  CreateUnsafePutLocations(allocator_, invoke);
}
void IntrinsicLocationsBuilderARM64::VisitJdkUnsafePutByte(HInvoke* invoke) {
  CreateUnsafePutLocations(allocator_, invoke);
}

static void GenUnsafePut(HInvoke* invoke,
                         DataType::Type type,
                         bool is_volatile,
                         bool is_ordered,
                         CodeGeneratorARM64* codegen) {
  LocationSummary* locations = invoke->GetLocations();
  MacroAssembler* masm = codegen->GetVIXLAssembler();

  static constexpr int kOffsetIndex = 2;
  static constexpr int kValueIndex = 3;
  Register base = WRegisterFrom(locations->InAt(1));  // Object pointer.
  Location offset = locations->InAt(kOffsetIndex);    // Long offset.
  CPURegister value = InputCPURegisterOrZeroRegAt(invoke, kValueIndex);
  CPURegister source = value;
  MemOperand mem_op;
  if (offset.IsConstant()) {
    mem_op = MemOperand(base.X(), Int64FromLocation(offset));
  } else {
    mem_op = MemOperand(base.X(), XRegisterFrom(offset));
  }

  {
    // We use a block to end the scratch scope before the write barrier, thus
    // freeing the temporary registers so they can be used in `MarkGCCard`.
    UseScratchRegisterScope temps(masm);

    if (kPoisonHeapReferences &&
        type == DataType::Type::kReference &&
        !IsZeroBitPattern(invoke->InputAt(kValueIndex))) {
      DCHECK(value.IsW());
      Register temp = temps.AcquireW();
      __ Mov(temp.W(), value.W());
      codegen->GetAssembler()->PoisonHeapReference(temp.W());
      source = temp;
    }

    if (is_volatile || is_ordered) {
      codegen->StoreRelease(invoke, type, source, mem_op, /* needs_null_check= */ false);
    } else {
      codegen->Store(type, source, mem_op);
    }
  }

  if (type == DataType::Type::kReference && !IsZeroBitPattern(invoke->InputAt(kValueIndex))) {
    bool value_can_be_null = true;  // TODO: Worth finding out this information?
    codegen->MaybeMarkGCCard(base, Register(source), value_can_be_null);
  }
}
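
// Note (rationale as understood from the code above): when heap reference
// poisoning is enabled, the reference is first copied into a scratch register
// and poisoned there, so the original value register remains valid for the
// card marking below. The card mark is skipped for null constants since a
// null store never introduces a reference the GC needs to track.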
1140
GenUnsafePutAbsolute(HInvoke * invoke,DataType::Type type,bool is_volatile,bool is_ordered,CodeGeneratorARM64 * codegen)1141 static void GenUnsafePutAbsolute(HInvoke* invoke,
1142 DataType::Type type,
1143 bool is_volatile,
1144 bool is_ordered,
1145 CodeGeneratorARM64* codegen) {
1146 LocationSummary* locations = invoke->GetLocations();
1147
1148 static constexpr int kAddressIndex = 1;
1149 static constexpr int kValueIndex = 2;
1150 Location address_loc = locations->InAt(kAddressIndex);
1151 MemOperand mem_op = MemOperand(WRegisterFrom(address_loc).X());
1152 CPURegister value = InputCPURegisterOrZeroRegAt(invoke, kValueIndex);
1153
1154 if (is_volatile || is_ordered) {
1155 codegen->StoreRelease(invoke, type, value, mem_op, /* needs_null_check= */ false);
1156 } else {
1157 codegen->Store(type, value, mem_op);
1158 }
1159 }
1160
VisitUnsafePut(HInvoke * invoke)1161 void IntrinsicCodeGeneratorARM64::VisitUnsafePut(HInvoke* invoke) {
1162 VisitJdkUnsafePut(invoke);
1163 }
VisitUnsafePutAbsolute(HInvoke * invoke)1164 void IntrinsicCodeGeneratorARM64::VisitUnsafePutAbsolute(HInvoke* invoke) {
1165 VisitJdkUnsafePutAbsolute(invoke);
1166 }
VisitUnsafePutOrdered(HInvoke * invoke)1167 void IntrinsicCodeGeneratorARM64::VisitUnsafePutOrdered(HInvoke* invoke) {
1168 VisitJdkUnsafePutOrdered(invoke);
1169 }
VisitUnsafePutVolatile(HInvoke * invoke)1170 void IntrinsicCodeGeneratorARM64::VisitUnsafePutVolatile(HInvoke* invoke) {
1171 VisitJdkUnsafePutVolatile(invoke);
1172 }
1173 void IntrinsicCodeGeneratorARM64::VisitUnsafePutObject(HInvoke* invoke) {
1174 VisitJdkUnsafePutReference(invoke);
1175 }
1176 void IntrinsicCodeGeneratorARM64::VisitUnsafePutObjectOrdered(HInvoke* invoke) {
1177 VisitJdkUnsafePutObjectOrdered(invoke);
1178 }
1179 void IntrinsicCodeGeneratorARM64::VisitUnsafePutObjectVolatile(HInvoke* invoke) {
1180 VisitJdkUnsafePutReferenceVolatile(invoke);
1181 }
1182 void IntrinsicCodeGeneratorARM64::VisitUnsafePutLong(HInvoke* invoke) {
1183 VisitJdkUnsafePutLong(invoke);
1184 }
1185 void IntrinsicCodeGeneratorARM64::VisitUnsafePutLongOrdered(HInvoke* invoke) {
1186 VisitJdkUnsafePutLongOrdered(invoke);
1187 }
1188 void IntrinsicCodeGeneratorARM64::VisitUnsafePutLongVolatile(HInvoke* invoke) {
1189 VisitJdkUnsafePutLongVolatile(invoke);
1190 }
1191 void IntrinsicCodeGeneratorARM64::VisitUnsafePutByte(HInvoke* invoke) {
1192 VisitJdkUnsafePutByte(invoke);
1193 }
1194
1195 void IntrinsicCodeGeneratorARM64::VisitJdkUnsafePut(HInvoke* invoke) {
1196 GenUnsafePut(invoke,
1197 DataType::Type::kInt32,
1198 /*is_volatile=*/ false,
1199 /*is_ordered=*/ false,
1200 codegen_);
1201 }
1202 void IntrinsicCodeGeneratorARM64::VisitJdkUnsafePutAbsolute(HInvoke* invoke) {
1203 GenUnsafePutAbsolute(invoke,
1204 DataType::Type::kInt32,
1205 /*is_volatile=*/ false,
1206 /*is_ordered=*/ false,
1207 codegen_);
1208 }
1209 void IntrinsicCodeGeneratorARM64::VisitJdkUnsafePutOrdered(HInvoke* invoke) {
1210 GenUnsafePut(invoke,
1211 DataType::Type::kInt32,
1212 /*is_volatile=*/ false,
1213 /*is_ordered=*/ true,
1214 codegen_);
1215 }
1216 void IntrinsicCodeGeneratorARM64::VisitJdkUnsafePutVolatile(HInvoke* invoke) {
1217 GenUnsafePut(invoke,
1218 DataType::Type::kInt32,
1219 /*is_volatile=*/ true,
1220 /*is_ordered=*/ false,
1221 codegen_);
1222 }
1223 void IntrinsicCodeGeneratorARM64::VisitJdkUnsafePutRelease(HInvoke* invoke) {
1224 GenUnsafePut(invoke,
1225 DataType::Type::kInt32,
1226 /*is_volatile=*/ true,
1227 /*is_ordered=*/ false,
1228 codegen_);
1229 }
1230 void IntrinsicCodeGeneratorARM64::VisitJdkUnsafePutReference(HInvoke* invoke) {
1231 GenUnsafePut(invoke,
1232 DataType::Type::kReference,
1233 /*is_volatile=*/ false,
1234 /*is_ordered=*/ false,
1235 codegen_);
1236 }
1237 void IntrinsicCodeGeneratorARM64::VisitJdkUnsafePutObjectOrdered(HInvoke* invoke) {
1238 GenUnsafePut(invoke,
1239 DataType::Type::kReference,
1240 /*is_volatile=*/ false,
1241 /*is_ordered=*/ true,
1242 codegen_);
1243 }
1244 void IntrinsicCodeGeneratorARM64::VisitJdkUnsafePutReferenceVolatile(HInvoke* invoke) {
1245 GenUnsafePut(invoke,
1246 DataType::Type::kReference,
1247 /*is_volatile=*/ true,
1248 /*is_ordered=*/ false,
1249 codegen_);
1250 }
1251 void IntrinsicCodeGeneratorARM64::VisitJdkUnsafePutReferenceRelease(HInvoke* invoke) {
1252 GenUnsafePut(invoke,
1253 DataType::Type::kReference,
1254 /*is_volatile=*/ true,
1255 /*is_ordered=*/ false,
1256 codegen_);
1257 }
1258 void IntrinsicCodeGeneratorARM64::VisitJdkUnsafePutLong(HInvoke* invoke) {
1259 GenUnsafePut(invoke,
1260 DataType::Type::kInt64,
1261 /*is_volatile=*/ false,
1262 /*is_ordered=*/ false,
1263 codegen_);
1264 }
1265 void IntrinsicCodeGeneratorARM64::VisitJdkUnsafePutLongOrdered(HInvoke* invoke) {
1266 GenUnsafePut(invoke,
1267 DataType::Type::kInt64,
1268 /*is_volatile=*/ false,
1269 /*is_ordered=*/ true,
1270 codegen_);
1271 }
1272 void IntrinsicCodeGeneratorARM64::VisitJdkUnsafePutLongVolatile(HInvoke* invoke) {
1273 GenUnsafePut(invoke,
1274 DataType::Type::kInt64,
1275 /*is_volatile=*/ true,
1276 /*is_ordered=*/ false,
1277 codegen_);
1278 }
1279 void IntrinsicCodeGeneratorARM64::VisitJdkUnsafePutLongRelease(HInvoke* invoke) {
1280 GenUnsafePut(invoke,
1281 DataType::Type::kInt64,
1282 /*is_volatile=*/ true,
1283 /*is_ordered=*/ false,
1284 codegen_);
1285 }
1286 void IntrinsicCodeGeneratorARM64::VisitJdkUnsafePutByte(HInvoke* invoke) {
1287 GenUnsafePut(invoke,
1288 DataType::Type::kInt8,
1289 /*is_volatile=*/ false,
1290 /*is_ordered=*/ false,
1291 codegen_);
1292 }
1293
1294 static void CreateUnsafeCASLocations(ArenaAllocator* allocator,
1295 HInvoke* invoke,
1296 CodeGeneratorARM64* codegen) {
1297 const bool can_call = codegen->EmitReadBarrier() && IsUnsafeCASReference(invoke);
1298 LocationSummary* locations =
1299 new (allocator) LocationSummary(invoke,
1300 can_call
1301 ? LocationSummary::kCallOnSlowPath
1302 : LocationSummary::kNoCall,
1303 kIntrinsified);
1304 if (can_call && kUseBakerReadBarrier) {
1305 locations->SetCustomSlowPathCallerSaves(RegisterSet::Empty()); // No caller-save registers.
1306 }
1307 locations->SetInAt(0, Location::NoLocation()); // Unused receiver.
1308 locations->SetInAt(1, Location::RequiresRegister());
1309 locations->SetInAt(2, Location::RequiresRegister());
1310 locations->SetInAt(3, Location::RequiresRegister());
1311 locations->SetInAt(4, Location::RequiresRegister());
1312
1313 locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap);
1314 }
1315
1316 static void EmitLoadExclusive(CodeGeneratorARM64* codegen,
1317 DataType::Type type,
1318 Register ptr,
1319 Register old_value,
1320 bool use_load_acquire) {
1321 Arm64Assembler* assembler = codegen->GetAssembler();
1322 MacroAssembler* masm = assembler->GetVIXLAssembler();
1323 switch (type) {
1324 case DataType::Type::kBool:
1325 case DataType::Type::kUint8:
1326 case DataType::Type::kInt8:
1327 if (use_load_acquire) {
1328 __ Ldaxrb(old_value, MemOperand(ptr));
1329 } else {
1330 __ Ldxrb(old_value, MemOperand(ptr));
1331 }
1332 break;
1333 case DataType::Type::kUint16:
1334 case DataType::Type::kInt16:
1335 if (use_load_acquire) {
1336 __ Ldaxrh(old_value, MemOperand(ptr));
1337 } else {
1338 __ Ldxrh(old_value, MemOperand(ptr));
1339 }
1340 break;
1341 case DataType::Type::kInt32:
1342 case DataType::Type::kInt64:
1343 case DataType::Type::kReference:
1344 if (use_load_acquire) {
1345 __ Ldaxr(old_value, MemOperand(ptr));
1346 } else {
1347 __ Ldxr(old_value, MemOperand(ptr));
1348 }
1349 break;
1350 default:
1351 LOG(FATAL) << "Unexpected type: " << type;
1352 UNREACHABLE();
1353 }
1354 switch (type) {
1355 case DataType::Type::kInt8:
1356 __ Sxtb(old_value, old_value);
1357 break;
1358 case DataType::Type::kInt16:
1359 __ Sxth(old_value, old_value);
1360 break;
1361 case DataType::Type::kReference:
1362 assembler->MaybeUnpoisonHeapReference(old_value);
1363 break;
1364 default:
1365 break;
1366 }
1367 }
1368
1369 static void EmitStoreExclusive(CodeGeneratorARM64* codegen,
1370 DataType::Type type,
1371 Register ptr,
1372 Register store_result,
1373 Register new_value,
1374 bool use_store_release) {
1375 Arm64Assembler* assembler = codegen->GetAssembler();
1376 MacroAssembler* masm = assembler->GetVIXLAssembler();
1377 if (type == DataType::Type::kReference) {
1378 assembler->MaybePoisonHeapReference(new_value);
1379 }
1380 switch (type) {
1381 case DataType::Type::kBool:
1382 case DataType::Type::kUint8:
1383 case DataType::Type::kInt8:
1384 if (use_store_release) {
1385 __ Stlxrb(store_result, new_value, MemOperand(ptr));
1386 } else {
1387 __ Stxrb(store_result, new_value, MemOperand(ptr));
1388 }
1389 break;
1390 case DataType::Type::kUint16:
1391 case DataType::Type::kInt16:
1392 if (use_store_release) {
1393 __ Stlxrh(store_result, new_value, MemOperand(ptr));
1394 } else {
1395 __ Stxrh(store_result, new_value, MemOperand(ptr));
1396 }
1397 break;
1398 case DataType::Type::kInt32:
1399 case DataType::Type::kInt64:
1400 case DataType::Type::kReference:
1401 if (use_store_release) {
1402 __ Stlxr(store_result, new_value, MemOperand(ptr));
1403 } else {
1404 __ Stxr(store_result, new_value, MemOperand(ptr));
1405 }
1406 break;
1407 default:
1408 LOG(FATAL) << "Unexpected type: " << type;
1409 UNREACHABLE();
1410 }
1411 if (type == DataType::Type::kReference) {
1412 assembler->MaybeUnpoisonHeapReference(new_value);
1413 }
1414 }
1415
1416 static void GenerateCompareAndSet(CodeGeneratorARM64* codegen,
1417 DataType::Type type,
1418 std::memory_order order,
1419 bool strong,
1420 vixl::aarch64::Label* cmp_failure,
1421 Register ptr,
1422 Register new_value,
1423 Register old_value,
1424 Register store_result,
1425 Register expected,
1426 Register expected2 = Register()) {
1427 // `expected2` is valid only for the reference slow path and represents the unmarked old value
1428 // from the main path's CAS attempt, taken when the marked old value matched `expected`.
1429 DCHECK_IMPLIES(expected2.IsValid(), type == DataType::Type::kReference);
1430
1431 DCHECK(ptr.IsX());
1432 DCHECK_EQ(new_value.IsX(), type == DataType::Type::kInt64);
1433 DCHECK_EQ(old_value.IsX(), type == DataType::Type::kInt64);
1434 DCHECK(store_result.IsW());
1435 DCHECK_EQ(expected.IsX(), type == DataType::Type::kInt64);
1436 DCHECK_IMPLIES(expected2.IsValid(), expected2.IsW());
1437
1438 Arm64Assembler* assembler = codegen->GetAssembler();
1439 MacroAssembler* masm = assembler->GetVIXLAssembler();
1440
1441 bool use_load_acquire =
1442 (order == std::memory_order_acquire) || (order == std::memory_order_seq_cst);
1443 bool use_store_release =
1444 (order == std::memory_order_release) || (order == std::memory_order_seq_cst);
1445 DCHECK(use_load_acquire || use_store_release || order == std::memory_order_relaxed);
1446
1447 // repeat: {
1448 // old_value = [ptr]; // Load exclusive.
1449 // if (old_value != expected && old_value != expected2) goto cmp_failure;
1450 // store_result = failed([ptr] <- new_value); // Store exclusive.
1451 // }
1452 // if (strong) {
1453 // if (store_result) goto repeat; // Repeat until compare fails or store exclusive succeeds.
1454 // } else {
1455 // store_result = store_result ^ 1; // Report success as 1, failure as 0.
1456 // }
1457 //
1458 // Flag Z indicates whether `old_value == expected || old_value == expected2`.
1459 // (If `expected2` is not valid, the `old_value == expected2` part is not emitted.)
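// For example, a strong, std::memory_order_seq_cst, Int32 CAS without `expected2` expands
// roughly to the following (a sketch; register names are placeholders):
//   repeat:
//     ldaxr w_old_value, [x_ptr]
//     cmp   w_old_value, w_expected
//     b.ne  cmp_failure                       // Z flag is clear on this path.
//     stlxr w_store_result, w_new_value, [x_ptr]
//     cbnz  w_store_result, repeat
//   // Falls through with the Z flag set. A weak CAS instead ends with
//   // `eor w_store_result, w_store_result, #1` and has no retry loop.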
1460
1461 vixl::aarch64::Label loop_head;
1462 if (strong) {
1463 __ Bind(&loop_head);
1464 }
1465 EmitLoadExclusive(codegen, type, ptr, old_value, use_load_acquire);
1466 __ Cmp(old_value, expected);
1467 if (expected2.IsValid()) {
1468 __ Ccmp(old_value, expected2, ZFlag, ne);
1469 }
1470 // If the comparison failed, the Z flag is cleared as we branch to the `cmp_failure` label.
1471 // If the comparison succeeded, the Z flag is set and remains set after the end of the
1472 // code emitted here, unless we retry the whole operation.
1473 __ B(cmp_failure, ne);
1474 EmitStoreExclusive(codegen, type, ptr, store_result, new_value, use_store_release);
1475 if (strong) {
1476 __ Cbnz(store_result, &loop_head);
1477 } else {
1478 // Flip the `store_result` register to indicate success by 1 and failure by 0.
1479 __ Eor(store_result, store_result, 1);
1480 }
1481 }
1482
1483 class ReadBarrierCasSlowPathARM64 : public SlowPathCodeARM64 {
1484 public:
1485 ReadBarrierCasSlowPathARM64(HInvoke* invoke,
1486 std::memory_order order,
1487 bool strong,
1488 Register base,
1489 Register offset,
1490 Register expected,
1491 Register new_value,
1492 Register old_value,
1493 Register old_value_temp,
1494 Register store_result,
1495 bool update_old_value,
1496 CodeGeneratorARM64* arm64_codegen)
1497 : SlowPathCodeARM64(invoke),
1498 order_(order),
1499 strong_(strong),
1500 base_(base),
1501 offset_(offset),
1502 expected_(expected),
1503 new_value_(new_value),
1504 old_value_(old_value),
1505 old_value_temp_(old_value_temp),
1506 store_result_(store_result),
1507 update_old_value_(update_old_value),
1508 mark_old_value_slow_path_(nullptr),
1509 update_old_value_slow_path_(nullptr) {
1510 if (!kUseBakerReadBarrier) {
1511 // We need to add the slow path now, it is too late when emitting slow path code.
1512 mark_old_value_slow_path_ = arm64_codegen->AddReadBarrierSlowPath(
1513 invoke,
1514 Location::RegisterLocation(old_value_temp.GetCode()),
1515 Location::RegisterLocation(old_value.GetCode()),
1516 Location::RegisterLocation(base.GetCode()),
1517 /*offset=*/ 0u,
1518 /*index=*/ Location::RegisterLocation(offset.GetCode()));
1519 if (update_old_value_) {
1520 update_old_value_slow_path_ = arm64_codegen->AddReadBarrierSlowPath(
1521 invoke,
1522 Location::RegisterLocation(old_value.GetCode()),
1523 Location::RegisterLocation(old_value_temp.GetCode()),
1524 Location::RegisterLocation(base.GetCode()),
1525 /*offset=*/ 0u,
1526 /*index=*/ Location::RegisterLocation(offset.GetCode()));
1527 }
1528 }
1529 }
1530
1531 const char* GetDescription() const override { return "ReadBarrierCasSlowPathARM64"; }
1532
1533 void EmitNativeCode(CodeGenerator* codegen) override {
1534 CodeGeneratorARM64* arm64_codegen = down_cast<CodeGeneratorARM64*>(codegen);
1535 Arm64Assembler* assembler = arm64_codegen->GetAssembler();
1536 MacroAssembler* masm = assembler->GetVIXLAssembler();
1537 __ Bind(GetEntryLabel());
1538
1539 // Mark the `old_value_` from the main path and compare with `expected_`.
1540 if (kUseBakerReadBarrier) {
1541 DCHECK(mark_old_value_slow_path_ == nullptr);
1542 arm64_codegen->GenerateIntrinsicMoveWithBakerReadBarrier(old_value_temp_, old_value_);
1543 } else {
1544 DCHECK(mark_old_value_slow_path_ != nullptr);
1545 __ B(mark_old_value_slow_path_->GetEntryLabel());
1546 __ Bind(mark_old_value_slow_path_->GetExitLabel());
1547 }
1548 __ Cmp(old_value_temp_, expected_);
1549 if (update_old_value_) {
1550 // Update the old value if we're going to return from the slow path.
1551 __ Csel(old_value_, old_value_temp_, old_value_, ne);
1552 }
1553 __ B(GetExitLabel(), ne); // If taken, Z=false indicates failure.
1554
1555 // The `old_value` we have read did not match `expected` (which is always a to-space
1556 // reference) but after the read barrier the marked to-space value matched, so the
1557 // `old_value` must be a from-space reference to the same object. Do the same CAS loop
1558 // as the main path but check for both `expected` and the unmarked old value
1559 // representing the to-space and from-space references for the same object.
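// Concretely (an illustrative scenario): if the field still holds a from-space reference F
// while `expected_` is the to-space reference T of the same object, the main path reads F,
// fails the comparison and branches here; marking F yields T, the comparison above succeeds,
// and the loop below retries the CAS accepting either T (`expected_`) or F (the unmarked
// `old_value_`) as the current field contents.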
1560
1561 UseScratchRegisterScope temps(masm);
1562 DCHECK_IMPLIES(store_result_.IsValid(), !temps.IsAvailable(store_result_));
1563 Register tmp_ptr = temps.AcquireX();
1564 Register store_result = store_result_.IsValid() ? store_result_ : temps.AcquireW();
1565
1566 // Recalculate the `tmp_ptr` from the main path, which was clobbered by the read barrier above.
1567 __ Add(tmp_ptr, base_.X(), Operand(offset_));
1568
1569 vixl::aarch64::Label mark_old_value;
1570 GenerateCompareAndSet(arm64_codegen,
1571 DataType::Type::kReference,
1572 order_,
1573 strong_,
1574 /*cmp_failure=*/ update_old_value_ ? &mark_old_value : GetExitLabel(),
1575 tmp_ptr,
1576 new_value_,
1577 /*old_value=*/ old_value_temp_,
1578 store_result,
1579 expected_,
1580 /*expected2=*/ old_value_);
1581 if (update_old_value_) {
1582 // To reach this point, the `old_value_temp_` must be either a from-space or a to-space
1583 // reference of the `expected_` object. Update the `old_value_` to the to-space reference.
1584 __ Mov(old_value_, expected_);
1585 }
1586
1587 // Z=true from the CMP+CCMP in GenerateCompareAndSet() above indicates comparison success.
1588 // For strong CAS, that's the overall success. For weak CAS, the code also needs
1589 // to check the `store_result` after returning from the slow path.
1590 __ B(GetExitLabel());
1591
1592 if (update_old_value_) {
1593 __ Bind(&mark_old_value);
1594 if (kUseBakerReadBarrier) {
1595 DCHECK(update_old_value_slow_path_ == nullptr);
1596 arm64_codegen->GenerateIntrinsicMoveWithBakerReadBarrier(old_value_, old_value_temp_);
1597 } else {
1598 // Note: We could redirect the `failure` above directly to the entry label and bind
1599 // the exit label in the main path, but the main path would need to access the
1600 // `update_old_value_slow_path_`. To keep the code simple, keep the extra jumps.
1601 DCHECK(update_old_value_slow_path_ != nullptr);
1602 __ B(update_old_value_slow_path_->GetEntryLabel());
1603 __ Bind(update_old_value_slow_path_->GetExitLabel());
1604 }
1605 __ B(GetExitLabel());
1606 }
1607 }
1608
1609 private:
1610 std::memory_order order_;
1611 bool strong_;
1612 Register base_;
1613 Register offset_;
1614 Register expected_;
1615 Register new_value_;
1616 Register old_value_;
1617 Register old_value_temp_;
1618 Register store_result_;
1619 bool update_old_value_;
1620 SlowPathCodeARM64* mark_old_value_slow_path_;
1621 SlowPathCodeARM64* update_old_value_slow_path_;
1622 };
1623
1624 static void GenUnsafeCas(HInvoke* invoke, DataType::Type type, CodeGeneratorARM64* codegen) {
1625 MacroAssembler* masm = codegen->GetVIXLAssembler();
1626 LocationSummary* locations = invoke->GetLocations();
1627
1628 Register out = WRegisterFrom(locations->Out()); // Boolean result.
1629 Register base = WRegisterFrom(locations->InAt(1)); // Object pointer.
1630 Register offset = XRegisterFrom(locations->InAt(2)); // Long offset.
1631 Register expected = RegisterFrom(locations->InAt(3), type); // Expected.
1632 Register new_value = RegisterFrom(locations->InAt(4), type); // New value.
1633
1634 // This needs to be before the temp registers, as MarkGCCard also uses VIXL temps.
1635 if (type == DataType::Type::kReference) {
1636 // Mark card for object assuming new value is stored.
1637 bool new_value_can_be_null = true; // TODO: Worth finding out this information?
1638 codegen->MaybeMarkGCCard(base, new_value, new_value_can_be_null);
1639 }
1640
1641 UseScratchRegisterScope temps(masm);
1642 Register tmp_ptr = temps.AcquireX(); // Pointer to actual memory.
1643 Register old_value; // Value in memory.
1644
1645 vixl::aarch64::Label exit_loop_label;
1646 vixl::aarch64::Label* exit_loop = &exit_loop_label;
1647 vixl::aarch64::Label* cmp_failure = &exit_loop_label;
1648
1649 if (type == DataType::Type::kReference && codegen->EmitReadBarrier()) {
1650 // We need to store the `old_value` in a non-scratch register to make sure
1651 // the read barrier in the slow path does not clobber it.
1652 old_value = WRegisterFrom(locations->GetTemp(0)); // The old value from main path.
1653 // The `old_value_temp` is used first for the marked `old_value` and then for the unmarked
1654 // reloaded old value for subsequent CAS in the slow path. It cannot be a scratch register.
1655 Register old_value_temp = WRegisterFrom(locations->GetTemp(1));
1656 ReadBarrierCasSlowPathARM64* slow_path =
1657 new (codegen->GetScopedAllocator()) ReadBarrierCasSlowPathARM64(
1658 invoke,
1659 std::memory_order_seq_cst,
1660 /*strong=*/ true,
1661 base,
1662 offset,
1663 expected,
1664 new_value,
1665 old_value,
1666 old_value_temp,
1667 /*store_result=*/ Register(), // Use a scratch register.
1668 /*update_old_value=*/ false,
1669 codegen);
1670 codegen->AddSlowPath(slow_path);
1671 exit_loop = slow_path->GetExitLabel();
1672 cmp_failure = slow_path->GetEntryLabel();
1673 } else {
1674 old_value = temps.AcquireSameSizeAs(new_value);
1675 }
1676
1677 __ Add(tmp_ptr, base.X(), Operand(offset));
1678
1679 GenerateCompareAndSet(codegen,
1680 type,
1681 std::memory_order_seq_cst,
1682 /*strong=*/ true,
1683 cmp_failure,
1684 tmp_ptr,
1685 new_value,
1686 old_value,
1687 /*store_result=*/ old_value.W(), // Reuse `old_value` for ST*XR* result.
1688 expected);
1689 __ Bind(exit_loop);
1690 __ Cset(out, eq);
1691 }
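// Taken together, for e.g. `jdk.internal.misc.Unsafe.compareAndSetInt(obj, offset, expected, x)`
// without read barriers, the intrinsic emits roughly (an illustrative sketch):
//
//   add  x_tmp_ptr, x_base, x_offset
//   <strong seq_cst CAS loop from GenerateCompareAndSet(), branching to exit_loop on mismatch>
//   exit_loop:
//   cset w_out, eq                    // The Z flag from the loop encodes success.
//
// For the reference variant the GC card is marked up front (before scratch registers are
// taken), on the assumption that the new value will be stored.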
1692
1693 void IntrinsicLocationsBuilderARM64::VisitUnsafeCASInt(HInvoke* invoke) {
1694 VisitJdkUnsafeCASInt(invoke);
1695 }
1696 void IntrinsicLocationsBuilderARM64::VisitUnsafeCASLong(HInvoke* invoke) {
1697 VisitJdkUnsafeCASLong(invoke);
1698 }
1699 void IntrinsicLocationsBuilderARM64::VisitUnsafeCASObject(HInvoke* invoke) {
1700 VisitJdkUnsafeCASObject(invoke);
1701 }
1702
1703 void IntrinsicLocationsBuilderARM64::VisitJdkUnsafeCASInt(HInvoke* invoke) {
1704 // `jdk.internal.misc.Unsafe.compareAndSwapInt` has compare-and-set semantics (see javadoc).
1705 VisitJdkUnsafeCompareAndSetInt(invoke);
1706 }
1707 void IntrinsicLocationsBuilderARM64::VisitJdkUnsafeCASLong(HInvoke* invoke) {
1708 // `jdk.internal.misc.Unsafe.compareAndSwapLong` has compare-and-set semantics (see javadoc).
1709 VisitJdkUnsafeCompareAndSetLong(invoke);
1710 }
1711 void IntrinsicLocationsBuilderARM64::VisitJdkUnsafeCASObject(HInvoke* invoke) {
1712 // `jdk.internal.misc.Unsafe.compareAndSwapObject` has compare-and-set semantics (see javadoc).
1713 VisitJdkUnsafeCompareAndSetReference(invoke);
1714 }
1715
1716 void IntrinsicLocationsBuilderARM64::VisitJdkUnsafeCompareAndSetInt(HInvoke* invoke) {
1717 CreateUnsafeCASLocations(allocator_, invoke, codegen_);
1718 }
1719 void IntrinsicLocationsBuilderARM64::VisitJdkUnsafeCompareAndSetLong(HInvoke* invoke) {
1720 CreateUnsafeCASLocations(allocator_, invoke, codegen_);
1721 }
1722 void IntrinsicLocationsBuilderARM64::VisitJdkUnsafeCompareAndSetReference(HInvoke* invoke) {
1723 // The only supported read barrier implementation is the Baker-style read barriers.
1724 if (codegen_->EmitNonBakerReadBarrier()) {
1725 return;
1726 }
1727
1728 CreateUnsafeCASLocations(allocator_, invoke, codegen_);
1729 if (codegen_->EmitReadBarrier()) {
1730 // We need two non-scratch temporary registers for read barrier.
1731 LocationSummary* locations = invoke->GetLocations();
1732 if (kUseBakerReadBarrier) {
1733 locations->AddRegisterTemps(2);
1734 } else {
1735 // To preserve the old value across the non-Baker read barrier
1736 // slow path, use a fixed callee-save register.
1737 constexpr int first_callee_save = CTZ(kArm64CalleeSaveRefSpills);
1738 locations->AddTemp(Location::RegisterLocation(first_callee_save));
1739 // To reduce the number of moves, request x0 as the second temporary.
1740 DCHECK(InvokeRuntimeCallingConvention().GetReturnLocation(DataType::Type::kReference).Equals(
1741 Location::RegisterLocation(x0.GetCode())));
1742 locations->AddTemp(Location::RegisterLocation(x0.GetCode()));
1743 }
1744 }
1745 }
1746
1747 void IntrinsicCodeGeneratorARM64::VisitUnsafeCASInt(HInvoke* invoke) {
1748 VisitJdkUnsafeCASInt(invoke);
1749 }
1750 void IntrinsicCodeGeneratorARM64::VisitUnsafeCASLong(HInvoke* invoke) {
1751 VisitJdkUnsafeCASLong(invoke);
1752 }
1753 void IntrinsicCodeGeneratorARM64::VisitUnsafeCASObject(HInvoke* invoke) {
1754 VisitJdkUnsafeCASObject(invoke);
1755 }
1756
1757 void IntrinsicCodeGeneratorARM64::VisitJdkUnsafeCASInt(HInvoke* invoke) {
1758 // `jdk.internal.misc.Unsafe.compareAndSwapInt` has compare-and-set semantics (see javadoc).
1759 VisitJdkUnsafeCompareAndSetInt(invoke);
1760 }
1761 void IntrinsicCodeGeneratorARM64::VisitJdkUnsafeCASLong(HInvoke* invoke) {
1762 // `jdk.internal.misc.Unsafe.compareAndSwapLong` has compare-and-set semantics (see javadoc).
1763 VisitJdkUnsafeCompareAndSetLong(invoke);
1764 }
1765 void IntrinsicCodeGeneratorARM64::VisitJdkUnsafeCASObject(HInvoke* invoke) {
1766 // `jdk.internal.misc.Unsafe.compareAndSwapObject` has compare-and-set semantics (see javadoc).
1767 VisitJdkUnsafeCompareAndSetReference(invoke);
1768 }
1769
1770 void IntrinsicCodeGeneratorARM64::VisitJdkUnsafeCompareAndSetInt(HInvoke* invoke) {
1771 GenUnsafeCas(invoke, DataType::Type::kInt32, codegen_);
1772 }
1773 void IntrinsicCodeGeneratorARM64::VisitJdkUnsafeCompareAndSetLong(HInvoke* invoke) {
1774 GenUnsafeCas(invoke, DataType::Type::kInt64, codegen_);
1775 }
1776 void IntrinsicCodeGeneratorARM64::VisitJdkUnsafeCompareAndSetReference(HInvoke* invoke) {
1777 // The only supported read barrier implementation is the Baker-style read barriers.
1778 DCHECK_IMPLIES(codegen_->EmitReadBarrier(), kUseBakerReadBarrier);
1779
1780 GenUnsafeCas(invoke, DataType::Type::kReference, codegen_);
1781 }
1782
1783 enum class GetAndUpdateOp {
1784 kSet,
1785 kAdd,
1786 kAddWithByteSwap,
1787 kAnd,
1788 kOr,
1789 kXor
1790 };
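// Note (orientation only, not exhaustive): kSet and kAdd back the Unsafe getAndSet/getAndAdd
// intrinsics below; kAnd/kOr/kXor and kAddWithByteSwap are presumably only reached from the
// VarHandle get-and-update intrinsics handled elsewhere in this file.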
1791
1792 static void GenerateGetAndUpdate(CodeGeneratorARM64* codegen,
1793 GetAndUpdateOp get_and_update_op,
1794 DataType::Type load_store_type,
1795 std::memory_order order,
1796 Register ptr,
1797 CPURegister arg,
1798 CPURegister old_value) {
1799 MacroAssembler* masm = codegen->GetVIXLAssembler();
1800 UseScratchRegisterScope temps(masm);
1801 Register store_result = temps.AcquireW();
1802
1803 DCHECK_EQ(old_value.GetSizeInBits(), arg.GetSizeInBits());
1804 Register old_value_reg;
1805 Register new_value;
1806 switch (get_and_update_op) {
1807 case GetAndUpdateOp::kSet:
1808 old_value_reg = old_value.IsX() ? old_value.X() : old_value.W();
1809 new_value = arg.IsX() ? arg.X() : arg.W();
1810 break;
1811 case GetAndUpdateOp::kAddWithByteSwap:
1812 case GetAndUpdateOp::kAdd:
1813 if (arg.IsVRegister()) {
1814 old_value_reg = arg.IsD() ? temps.AcquireX() : temps.AcquireW();
1815 new_value = old_value_reg; // Use the same temporary.
1816 break;
1817 }
1818 FALLTHROUGH_INTENDED;
1819 case GetAndUpdateOp::kAnd:
1820 case GetAndUpdateOp::kOr:
1821 case GetAndUpdateOp::kXor:
1822 old_value_reg = old_value.IsX() ? old_value.X() : old_value.W();
1823 new_value = old_value.IsX() ? temps.AcquireX() : temps.AcquireW();
1824 break;
1825 }
1826
1827 bool use_load_acquire =
1828 (order == std::memory_order_acquire) || (order == std::memory_order_seq_cst);
1829 bool use_store_release =
1830 (order == std::memory_order_release) || (order == std::memory_order_seq_cst);
1831 DCHECK(use_load_acquire || use_store_release);
1832
1833 vixl::aarch64::Label loop_label;
1834 __ Bind(&loop_label);
1835 EmitLoadExclusive(codegen, load_store_type, ptr, old_value_reg, use_load_acquire);
1836 switch (get_and_update_op) {
1837 case GetAndUpdateOp::kSet:
1838 break;
1839 case GetAndUpdateOp::kAddWithByteSwap:
1840 // To avoid unnecessary sign extension before REV16, the caller must specify `kUint16`
1841 // instead of `kInt16` and do the sign-extension explicitly afterwards.
1842 DCHECK_NE(load_store_type, DataType::Type::kInt16);
1843 GenerateReverseBytes(masm, load_store_type, old_value_reg, old_value_reg);
1844 FALLTHROUGH_INTENDED;
1845 case GetAndUpdateOp::kAdd:
1846 if (arg.IsVRegister()) {
1847 VRegister old_value_vreg = old_value.IsD() ? old_value.D() : old_value.S();
1848 VRegister sum = temps.AcquireSameSizeAs(old_value_vreg);
1849 __ Fmov(old_value_vreg, old_value_reg);
1850 __ Fadd(sum, old_value_vreg, arg.IsD() ? arg.D() : arg.S());
1851 __ Fmov(new_value, sum);
1852 } else {
1853 __ Add(new_value, old_value_reg, arg.IsX() ? arg.X() : arg.W());
1854 }
1855 if (get_and_update_op == GetAndUpdateOp::kAddWithByteSwap) {
1856 GenerateReverseBytes(masm, load_store_type, new_value, new_value);
1857 }
1858 break;
1859 case GetAndUpdateOp::kAnd:
1860 __ And(new_value, old_value_reg, arg.IsX() ? arg.X() : arg.W());
1861 break;
1862 case GetAndUpdateOp::kOr:
1863 __ Orr(new_value, old_value_reg, arg.IsX() ? arg.X() : arg.W());
1864 break;
1865 case GetAndUpdateOp::kXor:
1866 __ Eor(new_value, old_value_reg, arg.IsX() ? arg.X() : arg.W());
1867 break;
1868 }
1869 EmitStoreExclusive(codegen, load_store_type, ptr, store_result, new_value, use_store_release);
1870 __ Cbnz(store_result, &loop_label);
1871 }
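// For instance, a seq_cst integer kAdd (get-and-add) expands roughly to (a sketch; register
// names are placeholders for allocator/scratch registers):
//
//   loop:
//     ldaxr w_old_value, [x_ptr]
//     add   w_new_value, w_old_value, w_arg
//     stlxr w_store_result, w_new_value, [x_ptr]
//     cbnz  w_store_result, loop
//
// The floating-point kAdd variant round-trips through FMOV/FADD, and kAddWithByteSwap
// byte-reverses the loaded value before the add and the sum again before the store.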
1872
1873 static void CreateUnsafeGetAndUpdateLocations(ArenaAllocator* allocator,
1874 HInvoke* invoke,
1875 CodeGeneratorARM64* codegen) {
1876 const bool can_call = codegen->EmitReadBarrier() && IsUnsafeGetAndSetReference(invoke);
1877 LocationSummary* locations =
1878 new (allocator) LocationSummary(invoke,
1879 can_call
1880 ? LocationSummary::kCallOnSlowPath
1881 : LocationSummary::kNoCall,
1882 kIntrinsified);
1883 if (can_call && kUseBakerReadBarrier) {
1884 locations->SetCustomSlowPathCallerSaves(RegisterSet::Empty()); // No caller-save registers.
1885 }
1886 locations->SetInAt(0, Location::NoLocation()); // Unused receiver.
1887 locations->SetInAt(1, Location::RequiresRegister());
1888 locations->SetInAt(2, Location::RequiresRegister());
1889 locations->SetInAt(3, Location::RequiresRegister());
1890 locations->AddTemp(Location::RequiresRegister());
1891
1892 // Request another temporary register for methods that don't return a value.
1893 DataType::Type return_type = invoke->GetType();
1894 const bool is_void = return_type == DataType::Type::kVoid;
1895 if (is_void) {
1896 locations->AddTemp(Location::RequiresRegister());
1897 } else {
1898 locations->SetOut(Location::RequiresRegister(), Location::kOutputOverlap);
1899 }
1900 }
1901
1902 static void GenUnsafeGetAndUpdate(HInvoke* invoke,
1903 DataType::Type type,
1904 CodeGeneratorARM64* codegen,
1905 GetAndUpdateOp get_and_update_op) {
1906 // Currently only used for these GetAndUpdateOp. Might be fine for other ops but double check
1907 // before using.
1908 DCHECK(get_and_update_op == GetAndUpdateOp::kAdd || get_and_update_op == GetAndUpdateOp::kSet);
1909
1910 MacroAssembler* masm = codegen->GetVIXLAssembler();
1911 LocationSummary* locations = invoke->GetLocations();
1912
1913 DataType::Type return_type = invoke->GetType();
1914 const bool is_void = return_type == DataType::Type::kVoid;
1915 // We use a temporary for void methods, as we don't return the value.
1916 Location out_or_temp_loc =
1917 is_void ? locations->GetTemp(locations->GetTempCount() - 1u) : locations->Out();
1918 Register out_or_temp = RegisterFrom(out_or_temp_loc, type); // Result.
1919 Register base = WRegisterFrom(locations->InAt(1)); // Object pointer.
1920 Register offset = XRegisterFrom(locations->InAt(2)); // Long offset.
1921 Register arg = RegisterFrom(locations->InAt(3), type); // New value or addend.
1922 Register tmp_ptr = XRegisterFrom(locations->GetTemp(0)); // Pointer to actual memory.
1923
1924 // This needs to be before the temp registers, as MarkGCCard also uses VIXL temps.
1925 if (type == DataType::Type::kReference) {
1926 DCHECK(get_and_update_op == GetAndUpdateOp::kSet);
1927 // Mark card for object as a new value shall be stored.
1928 bool new_value_can_be_null = true; // TODO: Worth finding out this information?
1929 codegen->MaybeMarkGCCard(base, /*value=*/arg, new_value_can_be_null);
1930 }
1931
1932 __ Add(tmp_ptr, base.X(), Operand(offset));
1933 GenerateGetAndUpdate(codegen,
1934 get_and_update_op,
1935 type,
1936 std::memory_order_seq_cst,
1937 tmp_ptr,
1938 arg,
1939 /*old_value=*/ out_or_temp);
1940
1941 if (!is_void && type == DataType::Type::kReference && codegen->EmitReadBarrier()) {
1942 DCHECK(get_and_update_op == GetAndUpdateOp::kSet);
1943 if (kUseBakerReadBarrier) {
1944 codegen->GenerateIntrinsicMoveWithBakerReadBarrier(out_or_temp.W(), out_or_temp.W());
1945 } else {
1946 codegen->GenerateReadBarrierSlow(invoke,
1947 Location::RegisterLocation(out_or_temp.GetCode()),
1948 Location::RegisterLocation(out_or_temp.GetCode()),
1949 Location::RegisterLocation(base.GetCode()),
1950 /*offset=*/ 0u,
1951 /*index=*/ Location::RegisterLocation(offset.GetCode()));
1952 }
1953 }
1954 }
1955
1956 void IntrinsicLocationsBuilderARM64::VisitUnsafeGetAndAddInt(HInvoke* invoke) {
1957 VisitJdkUnsafeGetAndAddInt(invoke);
1958 }
1959 void IntrinsicLocationsBuilderARM64::VisitUnsafeGetAndAddLong(HInvoke* invoke) {
1960 VisitJdkUnsafeGetAndAddLong(invoke);
1961 }
1962 void IntrinsicLocationsBuilderARM64::VisitUnsafeGetAndSetInt(HInvoke* invoke) {
1963 VisitJdkUnsafeGetAndSetInt(invoke);
1964 }
1965 void IntrinsicLocationsBuilderARM64::VisitUnsafeGetAndSetLong(HInvoke* invoke) {
1966 VisitJdkUnsafeGetAndSetLong(invoke);
1967 }
1968 void IntrinsicLocationsBuilderARM64::VisitUnsafeGetAndSetObject(HInvoke* invoke) {
1969 VisitJdkUnsafeGetAndSetReference(invoke);
1970 }
1971
1972 void IntrinsicLocationsBuilderARM64::VisitJdkUnsafeGetAndAddInt(HInvoke* invoke) {
1973 CreateUnsafeGetAndUpdateLocations(allocator_, invoke, codegen_);
1974 }
1975 void IntrinsicLocationsBuilderARM64::VisitJdkUnsafeGetAndAddLong(HInvoke* invoke) {
1976 CreateUnsafeGetAndUpdateLocations(allocator_, invoke, codegen_);
1977 }
1978 void IntrinsicLocationsBuilderARM64::VisitJdkUnsafeGetAndSetInt(HInvoke* invoke) {
1979 CreateUnsafeGetAndUpdateLocations(allocator_, invoke, codegen_);
1980 }
1981 void IntrinsicLocationsBuilderARM64::VisitJdkUnsafeGetAndSetLong(HInvoke* invoke) {
1982 CreateUnsafeGetAndUpdateLocations(allocator_, invoke, codegen_);
1983 }
1984 void IntrinsicLocationsBuilderARM64::VisitJdkUnsafeGetAndSetReference(HInvoke* invoke) {
1985 CreateUnsafeGetAndUpdateLocations(allocator_, invoke, codegen_);
1986 }
1987
1988 void IntrinsicCodeGeneratorARM64::VisitUnsafeGetAndAddInt(HInvoke* invoke) {
1989 VisitJdkUnsafeGetAndAddInt(invoke);
1990 }
1991 void IntrinsicCodeGeneratorARM64::VisitUnsafeGetAndAddLong(HInvoke* invoke) {
1992 VisitJdkUnsafeGetAndAddLong(invoke);
1993 }
1994 void IntrinsicCodeGeneratorARM64::VisitUnsafeGetAndSetInt(HInvoke* invoke) {
1995 VisitJdkUnsafeGetAndSetInt(invoke);
1996 }
1997 void IntrinsicCodeGeneratorARM64::VisitUnsafeGetAndSetLong(HInvoke* invoke) {
1998 VisitJdkUnsafeGetAndSetLong(invoke);
1999 }
2000 void IntrinsicCodeGeneratorARM64::VisitUnsafeGetAndSetObject(HInvoke* invoke) {
2001 VisitJdkUnsafeGetAndSetReference(invoke);
2002 }
2003
2004 void IntrinsicCodeGeneratorARM64::VisitJdkUnsafeGetAndAddInt(HInvoke* invoke) {
2005 GenUnsafeGetAndUpdate(invoke, DataType::Type::kInt32, codegen_, GetAndUpdateOp::kAdd);
2006 }
2007 void IntrinsicCodeGeneratorARM64::VisitJdkUnsafeGetAndAddLong(HInvoke* invoke) {
2008 GenUnsafeGetAndUpdate(invoke, DataType::Type::kInt64, codegen_, GetAndUpdateOp::kAdd);
2009 }
2010 void IntrinsicCodeGeneratorARM64::VisitJdkUnsafeGetAndSetInt(HInvoke* invoke) {
2011 GenUnsafeGetAndUpdate(invoke, DataType::Type::kInt32, codegen_, GetAndUpdateOp::kSet);
2012 }
2013 void IntrinsicCodeGeneratorARM64::VisitJdkUnsafeGetAndSetLong(HInvoke* invoke) {
2014 GenUnsafeGetAndUpdate(invoke, DataType::Type::kInt64, codegen_, GetAndUpdateOp::kSet);
2015 }
2016 void IntrinsicCodeGeneratorARM64::VisitJdkUnsafeGetAndSetReference(HInvoke* invoke) {
2017 GenUnsafeGetAndUpdate(invoke, DataType::Type::kReference, codegen_, GetAndUpdateOp::kSet);
2018 }
2019
2020 void IntrinsicLocationsBuilderARM64::VisitStringCompareTo(HInvoke* invoke) {
2021 LocationSummary* locations =
2022 new (allocator_) LocationSummary(invoke,
2023 invoke->InputAt(1)->CanBeNull()
2024 ? LocationSummary::kCallOnSlowPath
2025 : LocationSummary::kNoCall,
2026 kIntrinsified);
2027 locations->SetInAt(0, Location::RequiresRegister());
2028 locations->SetInAt(1, Location::RequiresRegister());
2029 locations->AddRegisterTemps(3);
2030 // Need an extra temporary register for the String compression feature.
2031 if (mirror::kUseStringCompression) {
2032 locations->AddTemp(Location::RequiresRegister());
2033 }
2034 locations->SetOut(Location::RequiresRegister(), Location::kOutputOverlap);
2035 }
2036
2037 void IntrinsicCodeGeneratorARM64::VisitStringCompareTo(HInvoke* invoke) {
2038 MacroAssembler* masm = GetVIXLAssembler();
2039 LocationSummary* locations = invoke->GetLocations();
2040
2041 Register str = InputRegisterAt(invoke, 0);
2042 Register arg = InputRegisterAt(invoke, 1);
2043 DCHECK(str.IsW());
2044 DCHECK(arg.IsW());
2045 Register out = OutputRegister(invoke);
2046
2047 Register temp0 = WRegisterFrom(locations->GetTemp(0));
2048 Register temp1 = WRegisterFrom(locations->GetTemp(1));
2049 Register temp2 = WRegisterFrom(locations->GetTemp(2));
2050 Register temp3;
2051 if (mirror::kUseStringCompression) {
2052 temp3 = WRegisterFrom(locations->GetTemp(3));
2053 }
2054
2055 vixl::aarch64::Label loop;
2056 vixl::aarch64::Label find_char_diff;
2057 vixl::aarch64::Label end;
2058 vixl::aarch64::Label different_compression;
2059
2060 // Get offsets of count and value fields within a string object.
2061 const int32_t count_offset = mirror::String::CountOffset().Int32Value();
2062 const int32_t value_offset = mirror::String::ValueOffset().Int32Value();
2063
2064 // Note that the null check must have been done earlier.
2065 DCHECK(!invoke->CanDoImplicitNullCheckOn(invoke->InputAt(0)));
2066
2067 // Take slow path and throw if input can be and is null.
2068 SlowPathCodeARM64* slow_path = nullptr;
2069 const bool can_slow_path = invoke->InputAt(1)->CanBeNull();
2070 if (can_slow_path) {
2071 slow_path = new (codegen_->GetScopedAllocator()) IntrinsicSlowPathARM64(invoke);
2072 codegen_->AddSlowPath(slow_path);
2073 __ Cbz(arg, slow_path->GetEntryLabel());
2074 }
2075
2076 // Reference equality check, return 0 if same reference.
2077 __ Subs(out, str, arg);
2078 __ B(&end, eq);
2079
2080 if (mirror::kUseStringCompression) {
2081 // Load `count` fields of this and argument strings.
2082 __ Ldr(temp3, HeapOperand(str, count_offset));
2083 __ Ldr(temp2, HeapOperand(arg, count_offset));
2084 // Clean out compression flag from lengths.
2085 __ Lsr(temp0, temp3, 1u);
2086 __ Lsr(temp1, temp2, 1u);
2087 } else {
2088 // Load lengths of this and argument strings.
2089 __ Ldr(temp0, HeapOperand(str, count_offset));
2090 __ Ldr(temp1, HeapOperand(arg, count_offset));
2091 }
2092 // out = length diff.
2093 __ Subs(out, temp0, temp1);
2094 // temp0 = min(len(str), len(arg)).
2095 __ Csel(temp0, temp1, temp0, ge);
2096 // Shorter string is empty?
2097 __ Cbz(temp0, &end);
2098
2099 if (mirror::kUseStringCompression) {
2100 // Check that both strings use the same compression style; only then can this comparison loop be used.
2101 __ Eor(temp2, temp2, Operand(temp3));
2102 // Interleave with compression flag extraction which is needed for both paths
2103 // and also set flags which is needed only for the different compressions path.
2104 __ Ands(temp3.W(), temp3.W(), Operand(1));
2105 __ Tbnz(temp2, 0, &different_compression); // Does not use flags.
2106 }
2107 // Store offset of string value in preparation for comparison loop.
2108 __ Mov(temp1, value_offset);
2109 if (mirror::kUseStringCompression) {
2110 // For string compression, calculate the number of bytes to compare (not chars).
2111 // This could in theory exceed INT32_MAX, so treat temp0 as unsigned.
2112 __ Lsl(temp0, temp0, temp3);
2113 }
2114
2115 UseScratchRegisterScope scratch_scope(masm);
2116 Register temp4 = scratch_scope.AcquireX();
2117
2118 // Assertions that must hold in order to compare strings 8 bytes at a time.
2119 DCHECK_ALIGNED(value_offset, 8);
2120 static_assert(IsAligned<8>(kObjectAlignment), "String of odd length is not zero padded");
2121
2122 const size_t char_size = DataType::Size(DataType::Type::kUint16);
2123 DCHECK_EQ(char_size, 2u);
2124
2125 // Promote temp2 to an X reg, ready for LDR.
2126 temp2 = temp2.X();
2127
2128 // Loop to compare 4x16-bit characters at a time (ok because of string data alignment).
2129 __ Bind(&loop);
2130 __ Ldr(temp4, MemOperand(str.X(), temp1.X()));
2131 __ Ldr(temp2, MemOperand(arg.X(), temp1.X()));
2132 __ Cmp(temp4, temp2);
2133 __ B(ne, &find_char_diff);
2134 __ Add(temp1, temp1, char_size * 4);
2135 // With string compression, we have compared 8 bytes, otherwise 4 chars.
2136 __ Subs(temp0, temp0, (mirror::kUseStringCompression) ? 8 : 4);
2137 __ B(&loop, hi);
2138 __ B(&end);
2139
2140 // Promote temp1 to an X reg, ready for EOR.
2141 temp1 = temp1.X();
2142
2143 // Find the single character difference.
2144 __ Bind(&find_char_diff);
2145 // Get the bit position of the first character that differs.
2146 __ Eor(temp1, temp2, temp4);
2147 __ Rbit(temp1, temp1);
2148 __ Clz(temp1, temp1);
2149
2150 // If the number of chars remaining <= the index where the difference occurs (0-3), then
2151 // the difference occurs outside the remaining string data, so just return length diff (out).
2152 // Unlike ARM, we're doing the comparison in one go here, without the subtraction at the
2153 // find_char_diff_2nd_cmp path, so it doesn't matter whether the comparison is signed or
2154 // unsigned when string compression is disabled.
2155 // When it's enabled, the comparison must be unsigned.
2156 __ Cmp(temp0, Operand(temp1.W(), LSR, (mirror::kUseStringCompression) ? 3 : 4));
2157 __ B(ls, &end);
2158
2159 // Extract the characters and calculate the difference.
2160 if (mirror::kUseStringCompression) {
2161 __ Bic(temp1, temp1, 0x7);
2162 __ Bic(temp1, temp1, Operand(temp3.X(), LSL, 3u));
2163 } else {
2164 __ Bic(temp1, temp1, 0xf);
2165 }
2166 __ Lsr(temp2, temp2, temp1);
2167 __ Lsr(temp4, temp4, temp1);
2168 if (mirror::kUseStringCompression) {
2169 // Prioritize the case of compressed strings and calculate such result first.
2170 __ Uxtb(temp1, temp4);
2171 __ Sub(out, temp1.W(), Operand(temp2.W(), UXTB));
2172 __ Tbz(temp3, 0u, &end); // If actually compressed, we're done.
2173 }
2174 __ Uxth(temp4, temp4);
2175 __ Sub(out, temp4.W(), Operand(temp2.W(), UXTH));
2176
2177 if (mirror::kUseStringCompression) {
2178 __ B(&end);
2179 __ Bind(&different_compression);
2180
2181 // Comparison for different compression style.
2182 const size_t c_char_size = DataType::Size(DataType::Type::kInt8);
2183 DCHECK_EQ(c_char_size, 1u);
2184 temp1 = temp1.W();
2185 temp2 = temp2.W();
2186 temp4 = temp4.W();
2187
2188 // `temp1` will hold the compressed data pointer, `temp2` the uncompressed data pointer.
2189 // Note that flags have been set by the `str` compression flag extraction to `temp3`
2190 // before branching to the `different_compression` label.
2191 __ Csel(temp1, str, arg, eq); // Pointer to the compressed string.
2192 __ Csel(temp2, str, arg, ne); // Pointer to the uncompressed string.
2193
2194 // We want to free up the temp3, currently holding `str` compression flag, for comparison.
2195 // So, we move it to the bottom bit of the iteration count `temp0` which we then need to treat
2196 // as unsigned. Start by freeing the bit with a LSL and continue further down by a SUB which
2197 // will allow `subs temp0, #2; bhi different_compression_loop` to serve as the loop condition.
2198 __ Lsl(temp0, temp0, 1u);
2199
2200 // Adjust temp1 and temp2 from string pointers to data pointers.
2201 __ Add(temp1, temp1, Operand(value_offset));
2202 __ Add(temp2, temp2, Operand(value_offset));
2203
2204 // Complete the move of the compression flag.
2205 __ Sub(temp0, temp0, Operand(temp3));
2206
2207 vixl::aarch64::Label different_compression_loop;
2208 vixl::aarch64::Label different_compression_diff;
2209
2210 __ Bind(&different_compression_loop);
2211 __ Ldrb(temp4, MemOperand(temp1.X(), c_char_size, PostIndex));
2212 __ Ldrh(temp3, MemOperand(temp2.X(), char_size, PostIndex));
2213 __ Subs(temp4, temp4, Operand(temp3));
2214 __ B(&different_compression_diff, ne);
2215 __ Subs(temp0, temp0, 2);
2216 __ B(&different_compression_loop, hi);
2217 __ B(&end);
2218
2219 // Calculate the difference.
2220 __ Bind(&different_compression_diff);
2221 __ Tst(temp0, Operand(1));
2222 static_assert(static_cast<uint32_t>(mirror::StringCompressionFlag::kCompressed) == 0u,
2223 "Expecting 0=compressed, 1=uncompressed");
2224 __ Cneg(out, temp4, ne);
2225 }
2226
2227 __ Bind(&end);
2228
2229 if (can_slow_path) {
2230 __ Bind(slow_path->GetExitLabel());
2231 }
2232 }
2233
2234 // The cut off for unrolling the loop in String.equals() intrinsic for const strings.
2235 // The normal loop plus the pre-header is 9 instructions without string compression and 12
2236 // instructions with string compression. We can compare up to 8 bytes in 4 instructions
2237 // (LDR+LDR+CMP+BNE) and up to 16 bytes in 5 instructions (LDP+LDP+CMP+CCMP+BNE). Allow up
2238 // to 10 instructions for the unrolled loop.
2239 constexpr size_t kShortConstStringEqualsCutoffInBytes = 32;
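// Worked example of the cutoff: 32 bytes require two of the 16-byte LDP+LDP+CMP+CCMP+BNE
// groups, i.e. 10 instructions, which is exactly the unrolling budget above. This covers
// compressed const strings of up to 32 characters and uncompressed ones of up to 16 characters.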
2240
2241 static const char* GetConstString(HInstruction* candidate, uint32_t* utf16_length) {
2242 if (candidate->IsLoadString()) {
2243 HLoadString* load_string = candidate->AsLoadString();
2244 const DexFile& dex_file = load_string->GetDexFile();
2245 return dex_file.GetStringDataAndUtf16Length(load_string->GetStringIndex(), utf16_length);
2246 }
2247 return nullptr;
2248 }
2249
2250 void IntrinsicLocationsBuilderARM64::VisitStringEquals(HInvoke* invoke) {
2251 LocationSummary* locations =
2252 new (allocator_) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
2253 locations->SetInAt(0, Location::RequiresRegister());
2254 locations->SetInAt(1, Location::RequiresRegister());
2255
2256 // For the generic implementation and for long const strings we need a temporary.
2257 // We do not need it for short const strings, up to 8 bytes, see code generation below.
2258 uint32_t const_string_length = 0u;
2259 const char* const_string = GetConstString(invoke->InputAt(0), &const_string_length);
2260 if (const_string == nullptr) {
2261 const_string = GetConstString(invoke->InputAt(1), &const_string_length);
2262 }
2263 bool is_compressed =
2264 mirror::kUseStringCompression &&
2265 const_string != nullptr &&
2266 mirror::String::DexFileStringAllASCII(const_string, const_string_length);
2267 if (const_string == nullptr || const_string_length > (is_compressed ? 8u : 4u)) {
2268 locations->AddTemp(Location::RequiresRegister());
2269 }
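// For example (illustrative): an all-ASCII (compressed) const string of up to 8 characters,
// such as "hello", is compared with a single 8-byte load in the code generator below and needs
// no temp, while a longer one like "hello world", or any uncompressed const string of more
// than 4 characters, falls back to adding the temporary.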
2270
2271 // TODO: If the String.equals() is used only for an immediately following HIf, we can
2272 // mark it as emitted-at-use-site and emit branches directly to the appropriate blocks.
2273 // Then we shall need an extra temporary register instead of the output register.
2274 locations->SetOut(Location::RequiresRegister(), Location::kOutputOverlap);
2275 }
2276
2277 void IntrinsicCodeGeneratorARM64::VisitStringEquals(HInvoke* invoke) {
2278 MacroAssembler* masm = GetVIXLAssembler();
2279 LocationSummary* locations = invoke->GetLocations();
2280
2281 Register str = WRegisterFrom(locations->InAt(0));
2282 Register arg = WRegisterFrom(locations->InAt(1));
2283 Register out = XRegisterFrom(locations->Out());
2284
2285 UseScratchRegisterScope scratch_scope(masm);
2286 Register temp = scratch_scope.AcquireW();
2287 Register temp1 = scratch_scope.AcquireW();
2288
2289 vixl::aarch64::Label loop;
2290 vixl::aarch64::Label end;
2291 vixl::aarch64::Label return_true;
2292 vixl::aarch64::Label return_false;
2293
2294 // Get offsets of count, value, and class fields within a string object.
2295 const int32_t count_offset = mirror::String::CountOffset().Int32Value();
2296 const int32_t value_offset = mirror::String::ValueOffset().Int32Value();
2297 const int32_t class_offset = mirror::Object::ClassOffset().Int32Value();
2298
2299 // Note that the null check must have been done earlier.
2300 DCHECK(!invoke->CanDoImplicitNullCheckOn(invoke->InputAt(0)));
2301
2302 StringEqualsOptimizations optimizations(invoke);
2303 if (!optimizations.GetArgumentNotNull()) {
2304 // Check if input is null, return false if it is.
2305 __ Cbz(arg, &return_false);
2306 }
2307
2308 // Reference equality check, return true if same reference.
2309 __ Cmp(str, arg);
2310 __ B(&return_true, eq);
2311
2312 if (!optimizations.GetArgumentIsString()) {
2313 // Instanceof check for the argument by comparing class fields.
2314 // All string objects must have the same type since String cannot be subclassed.
2315 // Receiver must be a string object, so its class field is equal to all strings' class fields.
2316 // If the argument is a string object, its class field must be equal to receiver's class field.
2317 //
2318 // As the String class is expected to be non-movable, we can read the class
2319 // field from String.equals' arguments without read barriers.
2320 AssertNonMovableStringClass();
2321 // /* HeapReference<Class> */ temp = str->klass_
2322 __ Ldr(temp, MemOperand(str.X(), class_offset));
2323 // /* HeapReference<Class> */ temp1 = arg->klass_
2324 __ Ldr(temp1, MemOperand(arg.X(), class_offset));
2325 // Also, because we use the previously loaded class references only in the
2326 // following comparison, we don't need to unpoison them.
2327 __ Cmp(temp, temp1);
2328 __ B(&return_false, ne);
2329 }
2330
2331 // Check if one of the inputs is a const string. Do not special-case both strings
2332 // being const, such cases should be handled by constant folding if needed.
2333 uint32_t const_string_length = 0u;
2334 const char* const_string = GetConstString(invoke->InputAt(0), &const_string_length);
2335 if (const_string == nullptr) {
2336 const_string = GetConstString(invoke->InputAt(1), &const_string_length);
2337 if (const_string != nullptr) {
2338 std::swap(str, arg); // Make sure the const string is in `str`.
2339 }
2340 }
2341 bool is_compressed =
2342 mirror::kUseStringCompression &&
2343 const_string != nullptr &&
2344 mirror::String::DexFileStringAllASCII(const_string, const_string_length);
2345
2346 if (const_string != nullptr) {
2347 // Load `count` field of the argument string and check if it matches the const string.
2348 // This also compares the compression style; if it differs, return false.
2349 __ Ldr(temp, MemOperand(arg.X(), count_offset));
2350 // Temporarily release temp1 as we may not be able to embed the flagged count in CMP immediate.
2351 scratch_scope.Release(temp1);
2352 __ Cmp(temp, Operand(mirror::String::GetFlaggedCount(const_string_length, is_compressed)));
2353 temp1 = scratch_scope.AcquireW();
2354 __ B(&return_false, ne);
2355 } else {
2356 // Load `count` fields of this and argument strings.
2357 __ Ldr(temp, MemOperand(str.X(), count_offset));
2358 __ Ldr(temp1, MemOperand(arg.X(), count_offset));
2359 // Check whether the `count` fields are equal; return false if they're not.
2360 // This also compares the compression style; if it differs, return false.
2361 __ Cmp(temp, temp1);
2362 __ B(&return_false, ne);
2363 }
2364
2365 // Assertions that must hold in order to compare strings 8 bytes at a time.
2366 // Ok to do this because strings are zero-padded to kObjectAlignment.
2367 DCHECK_ALIGNED(value_offset, 8);
2368 static_assert(IsAligned<8>(kObjectAlignment), "String of odd length is not zero padded");
2369
2370 if (const_string != nullptr &&
2371 const_string_length <= (is_compressed ? kShortConstStringEqualsCutoffInBytes
2372 : kShortConstStringEqualsCutoffInBytes / 2u)) {
2373 // Load and compare the contents. Though we know the contents of the short const string
2374 // at compile time, materializing constants may be more code than loading from memory.
2375 int32_t offset = value_offset;
2376 size_t remaining_bytes =
2377 RoundUp(is_compressed ? const_string_length : const_string_length * 2u, 8u);
2378 temp = temp.X();
2379 temp1 = temp1.X();
2380 while (remaining_bytes > sizeof(uint64_t)) {
2381 Register temp2 = XRegisterFrom(locations->GetTemp(0));
2382 __ Ldp(temp, temp1, MemOperand(str.X(), offset));
2383 __ Ldp(temp2, out, MemOperand(arg.X(), offset));
2384 __ Cmp(temp, temp2);
2385 __ Ccmp(temp1, out, NoFlag, eq);
2386 __ B(&return_false, ne);
2387 offset += 2u * sizeof(uint64_t);
2388 remaining_bytes -= 2u * sizeof(uint64_t);
2389 }
2390 if (remaining_bytes != 0u) {
2391 __ Ldr(temp, MemOperand(str.X(), offset));
2392 __ Ldr(temp1, MemOperand(arg.X(), offset));
2393 __ Cmp(temp, temp1);
2394 __ B(&return_false, ne);
2395 }
2396 } else {
2397 // Return true if both strings are empty. Even with string compression `count == 0` means empty.
2398 static_assert(static_cast<uint32_t>(mirror::StringCompressionFlag::kCompressed) == 0u,
2399 "Expecting 0=compressed, 1=uncompressed");
2400 __ Cbz(temp, &return_true);
2401
2402 if (mirror::kUseStringCompression) {
2403 // For string compression, calculate the number of bytes to compare (not chars).
2404 // This could in theory exceed INT32_MAX, so treat temp as unsigned.
2405 __ And(temp1, temp, Operand(1)); // Extract compression flag.
2406 __ Lsr(temp, temp, 1u); // Extract length.
2407 __ Lsl(temp, temp, temp1); // Calculate number of bytes to compare.
2408 }
2409
2410 // Store offset of string value in preparation for comparison loop
2411 __ Mov(temp1, value_offset);
2412
2413 temp1 = temp1.X();
2414 Register temp2 = XRegisterFrom(locations->GetTemp(0));
2415 // Loop to compare strings 8 bytes at a time starting at the front of the string.
2416 __ Bind(&loop);
2417 __ Ldr(out, MemOperand(str.X(), temp1));
2418 __ Ldr(temp2, MemOperand(arg.X(), temp1));
2419 __ Add(temp1, temp1, Operand(sizeof(uint64_t)));
2420 __ Cmp(out, temp2);
2421 __ B(&return_false, ne);
2422 // With string compression, we have compared 8 bytes, otherwise 4 chars.
2423 __ Sub(temp, temp, Operand(mirror::kUseStringCompression ? 8 : 4), SetFlags);
2424 __ B(&loop, hi);
2425 }
2426
2427 // Return true and exit the function.
2428 // If loop does not result in returning false, we return true.
2429 __ Bind(&return_true);
2430 __ Mov(out, 1);
2431 __ B(&end);
2432
2433 // Return false and exit the function.
2434 __ Bind(&return_false);
2435 __ Mov(out, 0);
2436 __ Bind(&end);
2437 }
2438
2439 static void GenerateVisitStringIndexOf(HInvoke* invoke,
2440 MacroAssembler* masm,
2441 CodeGeneratorARM64* codegen,
2442 bool start_at_zero) {
2443 LocationSummary* locations = invoke->GetLocations();
2444
2445 // Note that the null check must have been done earlier.
2446 DCHECK(!invoke->CanDoImplicitNullCheckOn(invoke->InputAt(0)));
2447
2448 // Check for code points > 0xFFFF. Either a slow-path check when we don't know statically,
2449 // or directly dispatch for a large constant, or omit slow-path for a small constant or a char.
2450 SlowPathCodeARM64* slow_path = nullptr;
2451 HInstruction* code_point = invoke->InputAt(1);
2452 if (code_point->IsIntConstant()) {
2453 if (static_cast<uint32_t>(code_point->AsIntConstant()->GetValue()) > 0xFFFFU) {
2454 // Always needs the slow-path. We could directly dispatch to it, but this case should be
2455 // rare, so for simplicity just put the full slow-path down and branch unconditionally.
2456 slow_path = new (codegen->GetScopedAllocator()) IntrinsicSlowPathARM64(invoke);
2457 codegen->AddSlowPath(slow_path);
2458 __ B(slow_path->GetEntryLabel());
2459 __ Bind(slow_path->GetExitLabel());
2460 return;
2461 }
2462 } else if (code_point->GetType() != DataType::Type::kUint16) {
2463 Register char_reg = WRegisterFrom(locations->InAt(1));
2464 __ Tst(char_reg, 0xFFFF0000);
2465 slow_path = new (codegen->GetScopedAllocator()) IntrinsicSlowPathARM64(invoke);
2466 codegen->AddSlowPath(slow_path);
2467 __ B(ne, slow_path->GetEntryLabel());
2468 }
2469
2470 if (start_at_zero) {
2471 // Start-index = 0.
2472 Register tmp_reg = WRegisterFrom(locations->GetTemp(0));
2473 __ Mov(tmp_reg, 0);
2474 }
2475
2476 codegen->InvokeRuntime(kQuickIndexOf, invoke, invoke->GetDexPc(), slow_path);
2477 CheckEntrypointTypes<kQuickIndexOf, int32_t, void*, uint32_t, uint32_t>();
2478
2479 if (slow_path != nullptr) {
2480 __ Bind(slow_path->GetExitLabel());
2481 }
2482 }
2483
2484 void IntrinsicLocationsBuilderARM64::VisitStringIndexOf(HInvoke* invoke) {
2485 LocationSummary* locations = new (allocator_) LocationSummary(
2486 invoke, LocationSummary::kCallOnMainAndSlowPath, kIntrinsified);
2487 // We have a hand-crafted assembly stub that follows the runtime calling convention. So it's
2488 // best to align the inputs accordingly.
2489 InvokeRuntimeCallingConvention calling_convention;
2490 locations->SetInAt(0, LocationFrom(calling_convention.GetRegisterAt(0)));
2491 locations->SetInAt(1, LocationFrom(calling_convention.GetRegisterAt(1)));
2492 locations->SetOut(calling_convention.GetReturnLocation(DataType::Type::kInt32));
2493
2494 // Need to send start_index=0.
2495 locations->AddTemp(LocationFrom(calling_convention.GetRegisterAt(2)));
2496 }
2497
2498 void IntrinsicCodeGeneratorARM64::VisitStringIndexOf(HInvoke* invoke) {
2499 GenerateVisitStringIndexOf(invoke, GetVIXLAssembler(), codegen_, /* start_at_zero= */ true);
2500 }
2501
2502 void IntrinsicLocationsBuilderARM64::VisitStringIndexOfAfter(HInvoke* invoke) {
2503 LocationSummary* locations = new (allocator_) LocationSummary(
2504 invoke, LocationSummary::kCallOnMainAndSlowPath, kIntrinsified);
2505 // We have a hand-crafted assembly stub that follows the runtime calling convention. So it's
2506 // best to align the inputs accordingly.
2507 InvokeRuntimeCallingConvention calling_convention;
2508 locations->SetInAt(0, LocationFrom(calling_convention.GetRegisterAt(0)));
2509 locations->SetInAt(1, LocationFrom(calling_convention.GetRegisterAt(1)));
2510 locations->SetInAt(2, LocationFrom(calling_convention.GetRegisterAt(2)));
2511 locations->SetOut(calling_convention.GetReturnLocation(DataType::Type::kInt32));
2512 }
2513
2514 void IntrinsicCodeGeneratorARM64::VisitStringIndexOfAfter(HInvoke* invoke) {
2515 GenerateVisitStringIndexOf(invoke, GetVIXLAssembler(), codegen_, /* start_at_zero= */ false);
2516 }
2517
2518 void IntrinsicLocationsBuilderARM64::VisitStringNewStringFromBytes(HInvoke* invoke) {
2519 LocationSummary* locations = new (allocator_) LocationSummary(
2520 invoke, LocationSummary::kCallOnMainAndSlowPath, kIntrinsified);
2521 InvokeRuntimeCallingConvention calling_convention;
2522 locations->SetInAt(0, LocationFrom(calling_convention.GetRegisterAt(0)));
2523 locations->SetInAt(1, LocationFrom(calling_convention.GetRegisterAt(1)));
2524 locations->SetInAt(2, LocationFrom(calling_convention.GetRegisterAt(2)));
2525 locations->SetInAt(3, LocationFrom(calling_convention.GetRegisterAt(3)));
2526 locations->SetOut(calling_convention.GetReturnLocation(DataType::Type::kReference));
2527 }
2528
2529 void IntrinsicCodeGeneratorARM64::VisitStringNewStringFromBytes(HInvoke* invoke) {
2530 MacroAssembler* masm = GetVIXLAssembler();
2531 LocationSummary* locations = invoke->GetLocations();
2532
2533 Register byte_array = WRegisterFrom(locations->InAt(0));
2534 __ Cmp(byte_array, 0);
2535 SlowPathCodeARM64* slow_path =
2536 new (codegen_->GetScopedAllocator()) IntrinsicSlowPathARM64(invoke);
2537 codegen_->AddSlowPath(slow_path);
2538 __ B(eq, slow_path->GetEntryLabel());
2539
2540 codegen_->InvokeRuntime(kQuickAllocStringFromBytes, invoke, invoke->GetDexPc(), slow_path);
2541 CheckEntrypointTypes<kQuickAllocStringFromBytes, void*, void*, int32_t, int32_t, int32_t>();
2542 __ Bind(slow_path->GetExitLabel());
2543 }
2544
2545 void IntrinsicLocationsBuilderARM64::VisitStringNewStringFromChars(HInvoke* invoke) {
2546 LocationSummary* locations =
2547 new (allocator_) LocationSummary(invoke, LocationSummary::kCallOnMainOnly, kIntrinsified);
2548 InvokeRuntimeCallingConvention calling_convention;
2549 locations->SetInAt(0, LocationFrom(calling_convention.GetRegisterAt(0)));
2550 locations->SetInAt(1, LocationFrom(calling_convention.GetRegisterAt(1)));
2551 locations->SetInAt(2, LocationFrom(calling_convention.GetRegisterAt(2)));
2552 locations->SetOut(calling_convention.GetReturnLocation(DataType::Type::kReference));
2553 }
2554
2555 void IntrinsicCodeGeneratorARM64::VisitStringNewStringFromChars(HInvoke* invoke) {
2556 // No need to emit code checking whether `locations->InAt(2)` is a null
2557 // pointer, as callers of the native method
2558 //
2559 // java.lang.StringFactory.newStringFromChars(int offset, int charCount, char[] data)
2560 //
2561 // all include a null check on `data` before calling that method.
2562 codegen_->InvokeRuntime(kQuickAllocStringFromChars, invoke, invoke->GetDexPc());
2563 CheckEntrypointTypes<kQuickAllocStringFromChars, void*, int32_t, int32_t, void*>();
2564 }
2565
2566 void IntrinsicLocationsBuilderARM64::VisitStringNewStringFromString(HInvoke* invoke) {
2567 LocationSummary* locations = new (allocator_) LocationSummary(
2568 invoke, LocationSummary::kCallOnMainAndSlowPath, kIntrinsified);
2569 InvokeRuntimeCallingConvention calling_convention;
2570 locations->SetInAt(0, LocationFrom(calling_convention.GetRegisterAt(0)));
2571 locations->SetOut(calling_convention.GetReturnLocation(DataType::Type::kReference));
2572 }
2573
2574 void IntrinsicCodeGeneratorARM64::VisitStringNewStringFromString(HInvoke* invoke) {
2575 MacroAssembler* masm = GetVIXLAssembler();
2576 LocationSummary* locations = invoke->GetLocations();
2577
2578 Register string_to_copy = WRegisterFrom(locations->InAt(0));
2579 __ Cmp(string_to_copy, 0);
2580 SlowPathCodeARM64* slow_path =
2581 new (codegen_->GetScopedAllocator()) IntrinsicSlowPathARM64(invoke);
2582 codegen_->AddSlowPath(slow_path);
2583 __ B(eq, slow_path->GetEntryLabel());
2584
2585 codegen_->InvokeRuntime(kQuickAllocStringFromString, invoke, invoke->GetDexPc(), slow_path);
2586 CheckEntrypointTypes<kQuickAllocStringFromString, void*, void*>();
2587 __ Bind(slow_path->GetExitLabel());
2588 }
2589
2590 static void CreateFPToFPCallLocations(ArenaAllocator* allocator, HInvoke* invoke) {
2591 DCHECK_EQ(invoke->GetNumberOfArguments(), 1U);
2592 DCHECK(DataType::IsFloatingPointType(invoke->InputAt(0)->GetType()));
2593 DCHECK(DataType::IsFloatingPointType(invoke->GetType()));
2594
2595 LocationSummary* const locations =
2596 new (allocator) LocationSummary(invoke, LocationSummary::kCallOnMainOnly, kIntrinsified);
2597 InvokeRuntimeCallingConvention calling_convention;
2598
2599 locations->SetInAt(0, LocationFrom(calling_convention.GetFpuRegisterAt(0)));
2600 locations->SetOut(calling_convention.GetReturnLocation(invoke->GetType()));
2601 }
2602
2603 static void CreateFPFPToFPCallLocations(ArenaAllocator* allocator, HInvoke* invoke) {
2604 DCHECK_EQ(invoke->GetNumberOfArguments(), 2U);
2605 DCHECK(DataType::IsFloatingPointType(invoke->InputAt(0)->GetType()));
2606 DCHECK(DataType::IsFloatingPointType(invoke->InputAt(1)->GetType()));
2607 DCHECK(DataType::IsFloatingPointType(invoke->GetType()));
2608
2609 LocationSummary* const locations =
2610 new (allocator) LocationSummary(invoke, LocationSummary::kCallOnMainOnly, kIntrinsified);
2611 InvokeRuntimeCallingConvention calling_convention;
2612
2613 locations->SetInAt(0, LocationFrom(calling_convention.GetFpuRegisterAt(0)));
2614 locations->SetInAt(1, LocationFrom(calling_convention.GetFpuRegisterAt(1)));
2615 locations->SetOut(calling_convention.GetReturnLocation(invoke->GetType()));
2616 }
2617
2618 static void CreateFPFPFPToFPLocations(ArenaAllocator* allocator, HInvoke* invoke) {
2619 DCHECK_EQ(invoke->GetNumberOfArguments(), 3U);
2620 DCHECK(DataType::IsFloatingPointType(invoke->InputAt(0)->GetType()));
2621 DCHECK(DataType::IsFloatingPointType(invoke->InputAt(1)->GetType()));
2622 DCHECK(DataType::IsFloatingPointType(invoke->InputAt(2)->GetType()));
2623 DCHECK(DataType::IsFloatingPointType(invoke->GetType()));
2624
2625 LocationSummary* const locations =
2626 new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
2627
2628 locations->SetInAt(0, Location::RequiresFpuRegister());
2629 locations->SetInAt(1, Location::RequiresFpuRegister());
2630 locations->SetInAt(2, Location::RequiresFpuRegister());
2631 locations->SetOut(Location::RequiresFpuRegister(), Location::kNoOutputOverlap);
2632 }
2633
2634 static void GenFPToFPCall(HInvoke* invoke,
2635 CodeGeneratorARM64* codegen,
2636 QuickEntrypointEnum entry) {
2637 codegen->InvokeRuntime(entry, invoke, invoke->GetDexPc());
2638 }
2639
2640 void IntrinsicLocationsBuilderARM64::VisitMathCos(HInvoke* invoke) {
2641 CreateFPToFPCallLocations(allocator_, invoke);
2642 }
2643
2644 void IntrinsicCodeGeneratorARM64::VisitMathCos(HInvoke* invoke) {
2645 GenFPToFPCall(invoke, codegen_, kQuickCos);
2646 }
2647
2648 void IntrinsicLocationsBuilderARM64::VisitMathSin(HInvoke* invoke) {
2649 CreateFPToFPCallLocations(allocator_, invoke);
2650 }
2651
2652 void IntrinsicCodeGeneratorARM64::VisitMathSin(HInvoke* invoke) {
2653 GenFPToFPCall(invoke, codegen_, kQuickSin);
2654 }
2655
2656 void IntrinsicLocationsBuilderARM64::VisitMathAcos(HInvoke* invoke) {
2657 CreateFPToFPCallLocations(allocator_, invoke);
2658 }
2659
2660 void IntrinsicCodeGeneratorARM64::VisitMathAcos(HInvoke* invoke) {
2661 GenFPToFPCall(invoke, codegen_, kQuickAcos);
2662 }
2663
2664 void IntrinsicLocationsBuilderARM64::VisitMathAsin(HInvoke* invoke) {
2665 CreateFPToFPCallLocations(allocator_, invoke);
2666 }
2667
2668 void IntrinsicCodeGeneratorARM64::VisitMathAsin(HInvoke* invoke) {
2669 GenFPToFPCall(invoke, codegen_, kQuickAsin);
2670 }
2671
2672 void IntrinsicLocationsBuilderARM64::VisitMathAtan(HInvoke* invoke) {
2673 CreateFPToFPCallLocations(allocator_, invoke);
2674 }
2675
2676 void IntrinsicCodeGeneratorARM64::VisitMathAtan(HInvoke* invoke) {
2677 GenFPToFPCall(invoke, codegen_, kQuickAtan);
2678 }
2679
2680 void IntrinsicLocationsBuilderARM64::VisitMathCbrt(HInvoke* invoke) {
2681 CreateFPToFPCallLocations(allocator_, invoke);
2682 }
2683
2684 void IntrinsicCodeGeneratorARM64::VisitMathCbrt(HInvoke* invoke) {
2685 GenFPToFPCall(invoke, codegen_, kQuickCbrt);
2686 }
2687
2688 void IntrinsicLocationsBuilderARM64::VisitMathCosh(HInvoke* invoke) {
2689 CreateFPToFPCallLocations(allocator_, invoke);
2690 }
2691
2692 void IntrinsicCodeGeneratorARM64::VisitMathCosh(HInvoke* invoke) {
2693 GenFPToFPCall(invoke, codegen_, kQuickCosh);
2694 }
2695
2696 void IntrinsicLocationsBuilderARM64::VisitMathExp(HInvoke* invoke) {
2697 CreateFPToFPCallLocations(allocator_, invoke);
2698 }
2699
2700 void IntrinsicCodeGeneratorARM64::VisitMathExp(HInvoke* invoke) {
2701 GenFPToFPCall(invoke, codegen_, kQuickExp);
2702 }
2703
2704 void IntrinsicLocationsBuilderARM64::VisitMathExpm1(HInvoke* invoke) {
2705 CreateFPToFPCallLocations(allocator_, invoke);
2706 }
2707
2708 void IntrinsicCodeGeneratorARM64::VisitMathExpm1(HInvoke* invoke) {
2709 GenFPToFPCall(invoke, codegen_, kQuickExpm1);
2710 }
2711
2712 void IntrinsicLocationsBuilderARM64::VisitMathLog(HInvoke* invoke) {
2713 CreateFPToFPCallLocations(allocator_, invoke);
2714 }
2715
2716 void IntrinsicCodeGeneratorARM64::VisitMathLog(HInvoke* invoke) {
2717 GenFPToFPCall(invoke, codegen_, kQuickLog);
2718 }
2719
2720 void IntrinsicLocationsBuilderARM64::VisitMathLog10(HInvoke* invoke) {
2721 CreateFPToFPCallLocations(allocator_, invoke);
2722 }
2723
2724 void IntrinsicCodeGeneratorARM64::VisitMathLog10(HInvoke* invoke) {
2725 GenFPToFPCall(invoke, codegen_, kQuickLog10);
2726 }
2727
2728 void IntrinsicLocationsBuilderARM64::VisitMathSinh(HInvoke* invoke) {
2729 CreateFPToFPCallLocations(allocator_, invoke);
2730 }
2731
2732 void IntrinsicCodeGeneratorARM64::VisitMathSinh(HInvoke* invoke) {
2733 GenFPToFPCall(invoke, codegen_, kQuickSinh);
2734 }
2735
2736 void IntrinsicLocationsBuilderARM64::VisitMathTan(HInvoke* invoke) {
2737 CreateFPToFPCallLocations(allocator_, invoke);
2738 }
2739
2740 void IntrinsicCodeGeneratorARM64::VisitMathTan(HInvoke* invoke) {
2741 GenFPToFPCall(invoke, codegen_, kQuickTan);
2742 }
2743
2744 void IntrinsicLocationsBuilderARM64::VisitMathTanh(HInvoke* invoke) {
2745 CreateFPToFPCallLocations(allocator_, invoke);
2746 }
2747
2748 void IntrinsicCodeGeneratorARM64::VisitMathTanh(HInvoke* invoke) {
2749 GenFPToFPCall(invoke, codegen_, kQuickTanh);
2750 }
2751
2752 void IntrinsicLocationsBuilderARM64::VisitMathAtan2(HInvoke* invoke) {
2753 CreateFPFPToFPCallLocations(allocator_, invoke);
2754 }
2755
2756 void IntrinsicCodeGeneratorARM64::VisitMathAtan2(HInvoke* invoke) {
2757 GenFPToFPCall(invoke, codegen_, kQuickAtan2);
2758 }
2759
2760 void IntrinsicLocationsBuilderARM64::VisitMathPow(HInvoke* invoke) {
2761 CreateFPFPToFPCallLocations(allocator_, invoke);
2762 }
2763
2764 void IntrinsicCodeGeneratorARM64::VisitMathPow(HInvoke* invoke) {
2765 GenFPToFPCall(invoke, codegen_, kQuickPow);
2766 }
2767
2768 void IntrinsicLocationsBuilderARM64::VisitMathHypot(HInvoke* invoke) {
2769 CreateFPFPToFPCallLocations(allocator_, invoke);
2770 }
2771
2772 void IntrinsicCodeGeneratorARM64::VisitMathHypot(HInvoke* invoke) {
2773 GenFPToFPCall(invoke, codegen_, kQuickHypot);
2774 }
2775
2776 void IntrinsicLocationsBuilderARM64::VisitMathNextAfter(HInvoke* invoke) {
2777 CreateFPFPToFPCallLocations(allocator_, invoke);
2778 }
2779
2780 void IntrinsicCodeGeneratorARM64::VisitMathNextAfter(HInvoke* invoke) {
2781 GenFPToFPCall(invoke, codegen_, kQuickNextAfter);
2782 }
2783
2784 void IntrinsicLocationsBuilderARM64::VisitStringGetCharsNoCheck(HInvoke* invoke) {
2785 LocationSummary* locations =
2786 new (allocator_) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
2787 locations->SetInAt(0, Location::RequiresRegister());
2788 locations->SetInAt(1, Location::RequiresRegister());
2789 locations->SetInAt(2, Location::RequiresRegister());
2790 locations->SetInAt(3, Location::RequiresRegister());
2791 locations->SetInAt(4, Location::RequiresRegister());
2792
2793 locations->AddRegisterTemps(3);
2794 }
2795
2796 void IntrinsicCodeGeneratorARM64::VisitStringGetCharsNoCheck(HInvoke* invoke) {
2797 MacroAssembler* masm = GetVIXLAssembler();
2798 LocationSummary* locations = invoke->GetLocations();
2799
2800 // Check assumption that sizeof(Char) is 2 (used in scaling below).
2801 const size_t char_size = DataType::Size(DataType::Type::kUint16);
2802 DCHECK_EQ(char_size, 2u);
2803
2804 // Location of data in char array buffer.
2805 const uint32_t data_offset = mirror::Array::DataOffset(char_size).Uint32Value();
2806
2807 // Location of char array data in string.
2808 const uint32_t value_offset = mirror::String::ValueOffset().Uint32Value();
2809
2810 // void getCharsNoCheck(int srcBegin, int srcEnd, char[] dst, int dstBegin);
2811 // Since getChars() calls getCharsNoCheck(), we use registers rather than constants.
2812 Register srcObj = XRegisterFrom(locations->InAt(0));
2813 Register srcBegin = XRegisterFrom(locations->InAt(1));
2814 Register srcEnd = XRegisterFrom(locations->InAt(2));
2815 Register dstObj = XRegisterFrom(locations->InAt(3));
2816 Register dstBegin = XRegisterFrom(locations->InAt(4));
2817
2818 Register src_ptr = XRegisterFrom(locations->GetTemp(0));
2819 Register num_chr = XRegisterFrom(locations->GetTemp(1));
2820 Register tmp1 = XRegisterFrom(locations->GetTemp(2));
2821
2822 UseScratchRegisterScope temps(masm);
2823 Register dst_ptr = temps.AcquireX();
2824 Register tmp2 = temps.AcquireX();
2825
2826 vixl::aarch64::Label done;
2827 vixl::aarch64::Label compressed_string_vector_loop;
2828 vixl::aarch64::Label compressed_string_remainder;
2829 __ Sub(num_chr, srcEnd, srcBegin);
2830 // Early out for valid zero-length retrievals.
2831 __ Cbz(num_chr, &done);
2832
2833 // dst address start to copy to.
2834 __ Add(dst_ptr, dstObj, Operand(data_offset));
2835 __ Add(dst_ptr, dst_ptr, Operand(dstBegin, LSL, 1));
2836
2837 // src address to copy from.
2838 __ Add(src_ptr, srcObj, Operand(value_offset));
2839 vixl::aarch64::Label compressed_string_preloop;
2840 if (mirror::kUseStringCompression) {
2841 // Location of count in string.
2842 const uint32_t count_offset = mirror::String::CountOffset().Uint32Value();
2843 // String's length.
2844 __ Ldr(tmp2, MemOperand(srcObj, count_offset));
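    // Bit 0 of `count` is the compression flag (0 = compressed), so the Tbz below takes the
    // compressed-string path when the source string is compressed.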
2845 __ Tbz(tmp2, 0, &compressed_string_preloop);
2846 }
2847 __ Add(src_ptr, src_ptr, Operand(srcBegin, LSL, 1));
2848
2849 // Do the copy.
2850 vixl::aarch64::Label loop;
2851 vixl::aarch64::Label remainder;
2852
2853 // Save repairing the value of num_chr on the < 8 character path.
2854 __ Subs(tmp1, num_chr, 8);
2855 __ B(lt, &remainder);
2856
2857 // Keep the result of the earlier subs; we are going to fetch at least 8 characters.
2858 __ Mov(num_chr, tmp1);
2859
2860 // Main loop used for longer fetches loads and stores 8x16-bit characters at a time.
2861 // (Unaligned addresses are acceptable here and not worth inlining extra code to rectify.)
2862 __ Bind(&loop);
2863 __ Ldp(tmp1, tmp2, MemOperand(src_ptr, char_size * 8, PostIndex));
2864 __ Subs(num_chr, num_chr, 8);
2865 __ Stp(tmp1, tmp2, MemOperand(dst_ptr, char_size * 8, PostIndex));
2866 __ B(ge, &loop);
2867
2868 __ Adds(num_chr, num_chr, 8);
2869 __ B(eq, &done);
2870
2871 // Main loop for < 8 character case and remainder handling. Loads and stores one
2872 // 16-bit Java character at a time.
2873 __ Bind(&remainder);
2874 __ Ldrh(tmp1, MemOperand(src_ptr, char_size, PostIndex));
2875 __ Subs(num_chr, num_chr, 1);
2876 __ Strh(tmp1, MemOperand(dst_ptr, char_size, PostIndex));
2877 __ B(gt, &remainder);
2878 __ B(&done);
2879
2880 if (mirror::kUseStringCompression) {
2881 // For compressed strings, acquire a SIMD temporary register.
2882 VRegister vtmp1 = temps.AcquireVRegisterOfSize(kQRegSize);
2883 const size_t c_char_size = DataType::Size(DataType::Type::kInt8);
2884 DCHECK_EQ(c_char_size, 1u);
2885 __ Bind(&compressed_string_preloop);
2886 __ Add(src_ptr, src_ptr, Operand(srcBegin));
2887
2888 // Save repairing the value of num_chr on the < 8 character path.
2889 __ Subs(tmp1, num_chr, 8);
2890 __ B(lt, &compressed_string_remainder);
2891
2892 // Keep the result of the earlier subs; we are going to fetch at least 8 characters.
2893 __ Mov(num_chr, tmp1);
2894
2895 // Main loop for compressed src, copying 8 characters (8-bit) to (16-bit) at a time.
2896 // Uses SIMD instructions.
2897 __ Bind(&compressed_string_vector_loop);
2898 __ Ld1(vtmp1.V8B(), MemOperand(src_ptr, c_char_size * 8, PostIndex));
2899 __ Subs(num_chr, num_chr, 8);
2900 __ Uxtl(vtmp1.V8H(), vtmp1.V8B());
2901 __ St1(vtmp1.V8H(), MemOperand(dst_ptr, char_size * 8, PostIndex));
2902 __ B(ge, &compressed_string_vector_loop);
2903
2904 __ Adds(num_chr, num_chr, 8);
2905 __ B(eq, &done);
2906
2907 // Loop for < 8 character case and remainder handling with a compressed src.
2908 // Copies 1 character (8-bit) to (16-bit) at a time.
2909 __ Bind(&compressed_string_remainder);
2910 __ Ldrb(tmp1, MemOperand(src_ptr, c_char_size, PostIndex));
2911 __ Strh(tmp1, MemOperand(dst_ptr, char_size, PostIndex));
2912 __ Subs(num_chr, num_chr, Operand(1));
2913 __ B(gt, &compressed_string_remainder);
2914 }
2915
2916 __ Bind(&done);
2917 }
2918
2919 // This value is greater than ARRAYCOPY_SHORT_CHAR_ARRAY_THRESHOLD in libcore,
2920 // so if we choose to jump to the slow path we will end up in the native implementation.
2921 static constexpr int32_t kSystemArrayCopyCharThreshold = 192;
2922
2923 static Location LocationForSystemArrayCopyInput(HInstruction* input) {
2924 HIntConstant* const_input = input->AsIntConstantOrNull();
2925 if (const_input != nullptr && vixl::aarch64::Assembler::IsImmAddSub(const_input->GetValue())) {
2926 return Location::ConstantLocation(const_input);
2927 } else {
2928 return Location::RequiresRegister();
2929 }
2930 }
2931
2932 void IntrinsicLocationsBuilderARM64::VisitSystemArrayCopyChar(HInvoke* invoke) {
2933 // Check to see if we have known failures that will cause us to have to bail out
2934 // to the runtime, and just generate the runtime call directly.
2935 HIntConstant* src_pos = invoke->InputAt(1)->AsIntConstantOrNull();
2936 HIntConstant* dst_pos = invoke->InputAt(3)->AsIntConstantOrNull();
2937
2938 // The positions must be non-negative.
2939 if ((src_pos != nullptr && src_pos->GetValue() < 0) ||
2940 (dst_pos != nullptr && dst_pos->GetValue() < 0)) {
2941 // We will have to fail anyway.
2942 return;
2943 }
2944
2945 // The length must be >= 0 and not so long that we would (currently) prefer libcore's
2946 // native implementation.
2947 HIntConstant* length = invoke->InputAt(4)->AsIntConstantOrNull();
2948 if (length != nullptr) {
2949 int32_t len = length->GetValue();
2950 if (len < 0 || len > kSystemArrayCopyCharThreshold) {
2951 // Just call as normal.
2952 return;
2953 }
2954 }
2955
2956 ArenaAllocator* allocator = invoke->GetBlock()->GetGraph()->GetAllocator();
2957 LocationSummary* locations =
2958 new (allocator) LocationSummary(invoke, LocationSummary::kCallOnSlowPath, kIntrinsified);
2959 // arraycopy(char[] src, int src_pos, char[] dst, int dst_pos, int length).
2960 locations->SetInAt(0, Location::RequiresRegister());
2961 locations->SetInAt(1, LocationForSystemArrayCopyInput(invoke->InputAt(1)));
2962 locations->SetInAt(2, Location::RequiresRegister());
2963 locations->SetInAt(3, LocationForSystemArrayCopyInput(invoke->InputAt(3)));
2964 locations->SetInAt(4, LocationForSystemArrayCopyInput(invoke->InputAt(4)));
2965
2966 locations->AddRegisterTemps(3);
2967 }
2968
2969 static void CheckSystemArrayCopyPosition(MacroAssembler* masm,
2970 Register array,
2971 Location pos,
2972 Location length,
2973 SlowPathCodeARM64* slow_path,
2974 Register temp,
2975 bool length_is_array_length,
2976 bool position_sign_checked) {
2977 const int32_t length_offset = mirror::Array::LengthOffset().Int32Value();
2978 if (pos.IsConstant()) {
2979 int32_t pos_const = pos.GetConstant()->AsIntConstant()->GetValue();
2980 if (pos_const == 0) {
2981 if (!length_is_array_length) {
2982 // Check that length(array) >= length.
2983 __ Ldr(temp, MemOperand(array, length_offset));
2984 __ Cmp(temp, OperandFrom(length, DataType::Type::kInt32));
2985 __ B(slow_path->GetEntryLabel(), lt);
2986 }
2987 } else {
2988 // Calculate length(array) - pos.
2989 // Both operands are known to be non-negative `int32_t`, so the difference cannot underflow
2990 // as `int32_t`. If the result is negative, the B.LT below shall go to the slow path.
2991 __ Ldr(temp, MemOperand(array, length_offset));
2992 __ Sub(temp, temp, pos_const);
2993
2994 // Check that (length(array) - pos) >= length.
2995 __ Cmp(temp, OperandFrom(length, DataType::Type::kInt32));
2996 __ B(slow_path->GetEntryLabel(), lt);
2997 }
2998 } else if (length_is_array_length) {
2999 // The only way the copy can succeed is if pos is zero.
3000 __ Cbnz(WRegisterFrom(pos), slow_path->GetEntryLabel());
3001 } else {
3002 // Check that pos >= 0.
3003 Register pos_reg = WRegisterFrom(pos);
3004 if (!position_sign_checked) {
3005 __ Tbnz(pos_reg, pos_reg.GetSizeInBits() - 1, slow_path->GetEntryLabel());
3006 }
3007
3008 // Calculate length(array) - pos.
3009 // Both operands are known to be non-negative `int32_t`, so the difference cannot underflow
3010 // as `int32_t`. If the result is negative, the B.LT below shall go to the slow path.
3011 __ Ldr(temp, MemOperand(array, length_offset));
3012 __ Sub(temp, temp, pos_reg);
3013
3014 // Check that (length(array) - pos) >= length.
3015 __ Cmp(temp, OperandFrom(length, DataType::Type::kInt32));
3016 __ B(slow_path->GetEntryLabel(), lt);
3017 }
3018 }
3019
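// For example, for kUint16 with a constant `pos` of 3 this helper emits a single
// Add(dest, base, data_offset + 6); with `pos` in a register it first adds `data_offset`
// (if non-zero) and then adds `pos << 1`.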
3020 static void GenArrayAddress(MacroAssembler* masm,
3021 Register dest,
3022 Register base,
3023 Location pos,
3024 DataType::Type type,
3025 int32_t data_offset) {
3026 if (pos.IsConstant()) {
3027 int32_t constant = pos.GetConstant()->AsIntConstant()->GetValue();
3028 __ Add(dest, base, DataType::Size(type) * constant + data_offset);
3029 } else {
3030 if (data_offset != 0) {
3031 __ Add(dest, base, data_offset);
3032 base = dest;
3033 }
3034 __ Add(dest, base, Operand(XRegisterFrom(pos), LSL, DataType::SizeShift(type)));
3035 }
3036 }
3037
3038 // Compute base source address, base destination address, and end
3039 // source address for System.arraycopy* intrinsics in `src_base`,
3040 // `dst_base` and `src_end` respectively.
3041 static void GenSystemArrayCopyAddresses(MacroAssembler* masm,
3042 DataType::Type type,
3043 Register src,
3044 Location src_pos,
3045 Register dst,
3046 Location dst_pos,
3047 Location copy_length,
3048 Register src_base,
3049 Register dst_base,
3050 Register src_end) {
3051 // This routine is used by the SystemArrayCopy and the SystemArrayCopyChar intrinsics.
3052 DCHECK(type == DataType::Type::kReference || type == DataType::Type::kUint16)
3053 << "Unexpected element type: " << type;
3054 const int32_t element_size = DataType::Size(type);
3055 const uint32_t data_offset = mirror::Array::DataOffset(element_size).Uint32Value();
3056
3057 GenArrayAddress(masm, src_base, src, src_pos, type, data_offset);
3058 GenArrayAddress(masm, dst_base, dst, dst_pos, type, data_offset);
3059 if (src_end.IsValid()) {
3060 GenArrayAddress(masm, src_end, src_base, copy_length, type, /*data_offset=*/ 0);
3061 }
3062 }
3063
3064 void IntrinsicCodeGeneratorARM64::VisitSystemArrayCopyChar(HInvoke* invoke) {
3065 MacroAssembler* masm = GetVIXLAssembler();
3066 LocationSummary* locations = invoke->GetLocations();
3067 Register src = XRegisterFrom(locations->InAt(0));
3068 Location src_pos = locations->InAt(1);
3069 Register dst = XRegisterFrom(locations->InAt(2));
3070 Location dst_pos = locations->InAt(3);
3071 Location length = locations->InAt(4);
3072
3073 SlowPathCodeARM64* slow_path =
3074 new (codegen_->GetScopedAllocator()) IntrinsicSlowPathARM64(invoke);
3075 codegen_->AddSlowPath(slow_path);
3076
3077 // If source and destination are the same, take the slow path. Overlapping copy regions must be
3078 // copied in reverse and we can't know in all cases if it's needed.
3079 __ Cmp(src, dst);
3080 __ B(slow_path->GetEntryLabel(), eq);
3081
3082 // Bail out if the source is null.
3083 __ Cbz(src, slow_path->GetEntryLabel());
3084
3085 // Bail out if the destination is null.
3086 __ Cbz(dst, slow_path->GetEntryLabel());
3087
3088 if (!length.IsConstant()) {
3089 // Merge the following two comparisons into one:
3090 // If the length is negative, bail out (delegate to libcore's native implementation).
3091 // If the length > kSystemArrayCopyCharThreshold then (currently) prefer libcore's
3092 // native implementation.
3093 __ Cmp(WRegisterFrom(length), kSystemArrayCopyCharThreshold);
3094 __ B(slow_path->GetEntryLabel(), hi);
3095 } else {
3096 // We have already checked in the LocationsBuilder for the constant case.
3097 DCHECK_GE(length.GetConstant()->AsIntConstant()->GetValue(), 0);
3098 DCHECK_LE(length.GetConstant()->AsIntConstant()->GetValue(), kSystemArrayCopyCharThreshold);
3099 }
3100
3101 Register src_curr_addr = WRegisterFrom(locations->GetTemp(0));
3102 Register dst_curr_addr = WRegisterFrom(locations->GetTemp(1));
3103 Register src_stop_addr = WRegisterFrom(locations->GetTemp(2));
3104
3105 CheckSystemArrayCopyPosition(masm,
3106 src,
3107 src_pos,
3108 length,
3109 slow_path,
3110 src_curr_addr,
3111 /*length_is_array_length=*/ false,
3112 /*position_sign_checked=*/ false);
3113
3114 CheckSystemArrayCopyPosition(masm,
3115 dst,
3116 dst_pos,
3117 length,
3118 slow_path,
3119 src_curr_addr,
3120 /*length_is_array_length=*/ false,
3121 /*position_sign_checked=*/ false);
3122
3123 src_curr_addr = src_curr_addr.X();
3124 dst_curr_addr = dst_curr_addr.X();
3125 src_stop_addr = src_stop_addr.X();
3126
3127 GenSystemArrayCopyAddresses(masm,
3128 DataType::Type::kUint16,
3129 src,
3130 src_pos,
3131 dst,
3132 dst_pos,
3133 length,
3134 src_curr_addr,
3135 dst_curr_addr,
3136 Register());
3137
3138 // Iterate over the arrays and do a raw copy of the chars.
3139 const int32_t char_size = DataType::Size(DataType::Type::kUint16);
3140 UseScratchRegisterScope temps(masm);
3141
3142 // We split processing of the array in two parts: head and tail.
3143 // A first loop handles the head by copying a block of characters per
3144 // iteration (see: chars_per_block).
3145 // A second loop handles the tail by copying the remaining characters.
3146 // If the copy length is not constant, we copy them one-by-one.
3147 // If the copy length is constant, we optimize by always unrolling the tail
3148 // loop, and also unrolling the head loop when the copy length is small (see:
3149 // unroll_threshold).
3150 //
3151 // Both loops are inverted for better performance, meaning they are
3152 // implemented as conditional do-while loops.
3153 // Here, the loop condition is first checked to determine if there are
3154 // sufficient chars to run an iteration, then we enter the do-while: an
3155 // iteration is performed followed by a conditional branch only if another
3156 // iteration is necessary. As opposed to a standard while-loop, this inversion
3157 // can save some branching (e.g. we don't branch back to the initial condition
3158 // at the end of every iteration only to potentially immediately branch
3159 // again).
3160 //
3161 // A full block of chars is subtracted and added before and after the head
3162 // loop, respectively. This ensures that any remaining length after each
3163 // head loop iteration means there is a full block remaining, reducing the
3164 // number of conditional checks required on every iteration.
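  //
  // Illustrative sketch (not emitted code) of the non-constant-length shape described above:
  //   remaining = length - chars_per_block;   // subtract one block up front
  //   if (remaining >= 0) {
  //     do {                                  // head: one block per iteration
  //       copy chars_per_block chars; remaining -= chars_per_block;
  //     } while (remaining >= 0);
  //   }
  //   remaining += chars_per_block;           // add the block back
  //   if (remaining != 0) {
  //     do {                                  // tail: one char per iteration
  //       copy 1 char; remaining -= 1;
  //     } while (remaining > 0);
  //   }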
3165 constexpr int32_t chars_per_block = 4;
3166 constexpr int32_t unroll_threshold = 2 * chars_per_block;
3167 vixl::aarch64::Label loop1, loop2, pre_loop2, done;
3168
3169 Register length_tmp = src_stop_addr.W();
3170 Register tmp = temps.AcquireRegisterOfSize(char_size * chars_per_block * kBitsPerByte);
3171
3172 auto emitHeadLoop = [&]() {
3173 __ Bind(&loop1);
3174 __ Ldr(tmp, MemOperand(src_curr_addr, char_size * chars_per_block, PostIndex));
3175 __ Subs(length_tmp, length_tmp, chars_per_block);
3176 __ Str(tmp, MemOperand(dst_curr_addr, char_size * chars_per_block, PostIndex));
3177 __ B(&loop1, ge);
3178 };
3179
3180 auto emitTailLoop = [&]() {
3181 __ Bind(&loop2);
3182 __ Ldrh(tmp, MemOperand(src_curr_addr, char_size, PostIndex));
3183 __ Subs(length_tmp, length_tmp, 1);
3184 __ Strh(tmp, MemOperand(dst_curr_addr, char_size, PostIndex));
3185 __ B(&loop2, gt);
3186 };
3187
3188 auto emitUnrolledTailLoop = [&](const int32_t tail_length) {
3189 DCHECK_LT(tail_length, 4);
3190
3191 // Don't use post-index addressing, and instead add a constant offset later.
3192 if ((tail_length & 2) != 0) {
3193 __ Ldr(tmp.W(), MemOperand(src_curr_addr));
3194 __ Str(tmp.W(), MemOperand(dst_curr_addr));
3195 }
3196 if ((tail_length & 1) != 0) {
3197 const int32_t offset = (tail_length & ~1) * char_size;
3198 __ Ldrh(tmp, MemOperand(src_curr_addr, offset));
3199 __ Strh(tmp, MemOperand(dst_curr_addr, offset));
3200 }
3201 };
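  // For example, with a tail of 3 chars, emitUnrolledTailLoop copies two chars with one
  // 32-bit Ldr/Str and the remaining char with Ldrh/Strh at byte offset 4.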
3202
3203 if (length.IsConstant()) {
3204 const int32_t constant_length = length.GetConstant()->AsIntConstant()->GetValue();
3205 if (constant_length >= unroll_threshold) {
3206 __ Mov(length_tmp, constant_length - chars_per_block);
3207 emitHeadLoop();
3208 } else {
3209 static_assert(unroll_threshold == 8, "The unroll_threshold must be 8.");
3210 // Fully unroll both the head and tail loops.
3211 if ((constant_length & 4) != 0) {
3212 __ Ldr(tmp, MemOperand(src_curr_addr, 4 * char_size, PostIndex));
3213 __ Str(tmp, MemOperand(dst_curr_addr, 4 * char_size, PostIndex));
3214 }
3215 }
3216 emitUnrolledTailLoop(constant_length % chars_per_block);
3217 } else {
3218 Register length_reg = WRegisterFrom(length);
3219 __ Subs(length_tmp, length_reg, chars_per_block);
3220 __ B(&pre_loop2, lt);
3221
3222 emitHeadLoop();
3223
3224 __ Bind(&pre_loop2);
3225 __ Adds(length_tmp, length_tmp, chars_per_block);
3226 __ B(&done, eq);
3227
3228 emitTailLoop();
3229 }
3230
3231 __ Bind(&done);
3232 __ Bind(slow_path->GetExitLabel());
3233 }
3234
3235 // We choose to use the native implementation for longer copy lengths.
3236 static constexpr int32_t kSystemArrayCopyThreshold = 128;
3237
3238 void IntrinsicLocationsBuilderARM64::VisitSystemArrayCopy(HInvoke* invoke) {
3239 // The only read barrier implementation supporting the
3240 // SystemArrayCopy intrinsic is the Baker-style read barriers.
3241 if (codegen_->EmitNonBakerReadBarrier()) {
3242 return;
3243 }
3244
3245 constexpr size_t kInitialNumTemps = 2u; // We need at least two temps.
3246 LocationSummary* locations = CodeGenerator::CreateSystemArrayCopyLocationSummary(
3247 invoke, kSystemArrayCopyThreshold, kInitialNumTemps);
3248 if (locations != nullptr) {
3249 locations->SetInAt(1, LocationForSystemArrayCopyInput(invoke->InputAt(1)));
3250 locations->SetInAt(3, LocationForSystemArrayCopyInput(invoke->InputAt(3)));
3251 locations->SetInAt(4, LocationForSystemArrayCopyInput(invoke->InputAt(4)));
3252 if (codegen_->EmitBakerReadBarrier()) {
3253 // Temporary register IP0, obtained from the VIXL scratch register
3254 // pool, cannot be used in ReadBarrierSystemArrayCopySlowPathARM64
3255 // (because that register is clobbered by ReadBarrierMarkRegX
3256 // entry points). It cannot be used in calls to
3257 // CodeGeneratorARM64::GenerateFieldLoadWithBakerReadBarrier
3258 // either. For these reasons, get a third extra temporary register
3259 // from the register allocator.
3260 locations->AddTemp(Location::RequiresRegister());
3261 } else {
3262 // Cases other than Baker read barriers: the third temporary will
3263 // be acquired from the VIXL scratch register pool.
3264 }
3265 }
3266 }
3267
3268 void IntrinsicCodeGeneratorARM64::VisitSystemArrayCopy(HInvoke* invoke) {
3269 // The only read barrier implementation supporting the
3270 // SystemArrayCopy intrinsic is the Baker-style read barriers.
3271 DCHECK_IMPLIES(codegen_->EmitReadBarrier(), kUseBakerReadBarrier);
3272
3273 MacroAssembler* masm = GetVIXLAssembler();
3274 LocationSummary* locations = invoke->GetLocations();
3275
3276 uint32_t class_offset = mirror::Object::ClassOffset().Int32Value();
3277 uint32_t super_offset = mirror::Class::SuperClassOffset().Int32Value();
3278 uint32_t component_offset = mirror::Class::ComponentTypeOffset().Int32Value();
3279 uint32_t primitive_offset = mirror::Class::PrimitiveTypeOffset().Int32Value();
3280 uint32_t monitor_offset = mirror::Object::MonitorOffset().Int32Value();
3281
3282 Register src = XRegisterFrom(locations->InAt(0));
3283 Location src_pos = locations->InAt(1);
3284 Register dest = XRegisterFrom(locations->InAt(2));
3285 Location dest_pos = locations->InAt(3);
3286 Location length = locations->InAt(4);
3287 Register temp1 = WRegisterFrom(locations->GetTemp(0));
3288 Location temp1_loc = LocationFrom(temp1);
3289 Register temp2 = WRegisterFrom(locations->GetTemp(1));
3290 Location temp2_loc = LocationFrom(temp2);
3291
3292 SlowPathCodeARM64* intrinsic_slow_path =
3293 new (codegen_->GetScopedAllocator()) IntrinsicSlowPathARM64(invoke);
3294 codegen_->AddSlowPath(intrinsic_slow_path);
3295
3296 vixl::aarch64::Label conditions_on_positions_validated;
3297 SystemArrayCopyOptimizations optimizations(invoke);
3298
3299 // If source and destination are the same, we go to slow path if we need to do forward copying.
3300 // We do not need to do this check if the source and destination positions are the same.
3301 if (!optimizations.GetSourcePositionIsDestinationPosition()) {
3302 if (src_pos.IsConstant()) {
3303 int32_t src_pos_constant = src_pos.GetConstant()->AsIntConstant()->GetValue();
3304 if (dest_pos.IsConstant()) {
3305 int32_t dest_pos_constant = dest_pos.GetConstant()->AsIntConstant()->GetValue();
3306 if (optimizations.GetDestinationIsSource()) {
3307 // Checked when building locations.
3308 DCHECK_GE(src_pos_constant, dest_pos_constant);
3309 } else if (src_pos_constant < dest_pos_constant) {
3310 __ Cmp(src, dest);
3311 __ B(intrinsic_slow_path->GetEntryLabel(), eq);
3312 }
3313 } else {
3314 if (!optimizations.GetDestinationIsSource()) {
3315 __ Cmp(src, dest);
3316 __ B(&conditions_on_positions_validated, ne);
3317 }
3318 __ Cmp(WRegisterFrom(dest_pos), src_pos_constant);
3319 __ B(intrinsic_slow_path->GetEntryLabel(), gt);
3320 }
3321 } else {
3322 if (!optimizations.GetDestinationIsSource()) {
3323 __ Cmp(src, dest);
3324 __ B(&conditions_on_positions_validated, ne);
3325 }
3326 __ Cmp(RegisterFrom(src_pos, invoke->InputAt(1)->GetType()),
3327 OperandFrom(dest_pos, invoke->InputAt(3)->GetType()));
3328 __ B(intrinsic_slow_path->GetEntryLabel(), lt);
3329 }
3330 }
3331
3332 __ Bind(&conditions_on_positions_validated);
3333
3334 if (!optimizations.GetSourceIsNotNull()) {
3335 // Bail out if the source is null.
3336 __ Cbz(src, intrinsic_slow_path->GetEntryLabel());
3337 }
3338
3339 if (!optimizations.GetDestinationIsNotNull() && !optimizations.GetDestinationIsSource()) {
3340 // Bail out if the destination is null.
3341 __ Cbz(dest, intrinsic_slow_path->GetEntryLabel());
3342 }
3343
3344 // We have already checked in the LocationsBuilder for the constant case.
3345 if (!length.IsConstant()) {
3346 // Merge the following two comparisons into one:
3347 // If the length is negative, bail out (delegate to libcore's native implementation).
3348 // If the length >= 128 then (currently) prefer native implementation.
3349 __ Cmp(WRegisterFrom(length), kSystemArrayCopyThreshold);
3350 __ B(intrinsic_slow_path->GetEntryLabel(), hs);
3351 }
3352 // Validity checks: source.
3353 CheckSystemArrayCopyPosition(masm,
3354 src,
3355 src_pos,
3356 length,
3357 intrinsic_slow_path,
3358 temp1,
3359 optimizations.GetCountIsSourceLength(),
3360 /*position_sign_checked=*/ false);
3361
3362 // Validity checks: dest.
3363 bool dest_position_sign_checked = optimizations.GetSourcePositionIsDestinationPosition();
3364 CheckSystemArrayCopyPosition(masm,
3365 dest,
3366 dest_pos,
3367 length,
3368 intrinsic_slow_path,
3369 temp1,
3370 optimizations.GetCountIsDestinationLength(),
3371 dest_position_sign_checked);
3372
3373 auto check_non_primitive_array_class = [&](Register klass, Register temp) {
3374 // No read barrier is needed for reading a chain of constant references for comparing
3375 // with null, or for reading a constant primitive value, see `ReadBarrierOption`.
3376 // /* HeapReference<Class> */ temp = klass->component_type_
3377 __ Ldr(temp, HeapOperand(klass, component_offset));
3378 codegen_->GetAssembler()->MaybeUnpoisonHeapReference(temp);
3379 // Check that the component type is not null.
3380 __ Cbz(temp, intrinsic_slow_path->GetEntryLabel());
3381 // Check that the component type is not a primitive.
3382 // /* uint16_t */ temp = static_cast<uint16>(klass->primitive_type_);
3383 __ Ldrh(temp, HeapOperand(temp, primitive_offset));
3384 static_assert(Primitive::kPrimNot == 0, "Expected 0 for kPrimNot");
3385 __ Cbnz(temp, intrinsic_slow_path->GetEntryLabel());
3386 };
3387
3388 if (!optimizations.GetDoesNotNeedTypeCheck()) {
3389 // Check whether all elements of the source array are assignable to the component
3390 // type of the destination array. We do two checks: the classes are the same,
3391 // or the destination is Object[]. If none of these checks succeed, we go to the
3392 // slow path.
3393
3394 if (codegen_->EmitBakerReadBarrier()) {
3395 Location temp3_loc = locations->GetTemp(2);
3396 // /* HeapReference<Class> */ temp1 = dest->klass_
3397 codegen_->GenerateFieldLoadWithBakerReadBarrier(invoke,
3398 temp1_loc,
3399 dest.W(),
3400 class_offset,
3401 temp3_loc,
3402 /* needs_null_check= */ false,
3403 /* use_load_acquire= */ false);
3404 // Register `temp1` is not trashed by the read barrier emitted
3405 // by GenerateFieldLoadWithBakerReadBarrier below, as that
3406 // method produces a call to a ReadBarrierMarkRegX entry point,
3407 // which saves all potentially live registers, including
3408 // temporaries such as `temp1`.
3409 // /* HeapReference<Class> */ temp2 = src->klass_
3410 codegen_->GenerateFieldLoadWithBakerReadBarrier(invoke,
3411 temp2_loc,
3412 src.W(),
3413 class_offset,
3414 temp3_loc,
3415 /* needs_null_check= */ false,
3416 /* use_load_acquire= */ false);
3417 } else {
3418 // /* HeapReference<Class> */ temp1 = dest->klass_
3419 __ Ldr(temp1, MemOperand(dest, class_offset));
3420 codegen_->GetAssembler()->MaybeUnpoisonHeapReference(temp1);
3421 // /* HeapReference<Class> */ temp2 = src->klass_
3422 __ Ldr(temp2, MemOperand(src, class_offset));
3423 codegen_->GetAssembler()->MaybeUnpoisonHeapReference(temp2);
3424 }
3425
3426 __ Cmp(temp1, temp2);
3427 if (optimizations.GetDestinationIsTypedObjectArray()) {
3428 DCHECK(optimizations.GetDestinationIsNonPrimitiveArray());
3429 vixl::aarch64::Label do_copy;
3430 // For class match, we can skip the source type check regardless of the optimization flag.
3431 __ B(&do_copy, eq);
3432 // No read barrier is needed for reading a chain of constant references
3433 // for comparing with null, see `ReadBarrierOption`.
3434 // /* HeapReference<Class> */ temp1 = temp1->component_type_
3435 __ Ldr(temp1, HeapOperand(temp1, component_offset));
3436 codegen_->GetAssembler()->MaybeUnpoisonHeapReference(temp1);
3437 // /* HeapReference<Class> */ temp1 = temp1->super_class_
3438 __ Ldr(temp1, HeapOperand(temp1, super_offset));
3439 // No need to unpoison the result, we're comparing against null.
3440 __ Cbnz(temp1, intrinsic_slow_path->GetEntryLabel());
3441 // Bail out if the source is not a non primitive array.
3442 if (!optimizations.GetSourceIsNonPrimitiveArray()) {
3443 check_non_primitive_array_class(temp2, temp2);
3444 }
3445 __ Bind(&do_copy);
3446 } else {
3447 DCHECK(!optimizations.GetDestinationIsTypedObjectArray());
3448 // For class match, we can skip the array type check completely if at least one of source
3449 // and destination is known to be a non primitive array, otherwise one check is enough.
3450 __ B(intrinsic_slow_path->GetEntryLabel(), ne);
3451 if (!optimizations.GetDestinationIsNonPrimitiveArray() &&
3452 !optimizations.GetSourceIsNonPrimitiveArray()) {
3453 check_non_primitive_array_class(temp2, temp2);
3454 }
3455 }
3456 } else if (!optimizations.GetSourceIsNonPrimitiveArray()) {
3457 DCHECK(optimizations.GetDestinationIsNonPrimitiveArray());
3458 // Bail out if the source is not a non primitive array.
3459 // No read barrier is needed for reading a chain of constant references for comparing
3460 // with null, or for reading a constant primitive value, see `ReadBarrierOption`.
3461 // /* HeapReference<Class> */ temp2 = src->klass_
3462 __ Ldr(temp2, MemOperand(src, class_offset));
3463 codegen_->GetAssembler()->MaybeUnpoisonHeapReference(temp2);
3464 check_non_primitive_array_class(temp2, temp2);
3465 }
3466
3467 if (length.IsConstant() && length.GetConstant()->AsIntConstant()->GetValue() == 0) {
3468 // Null constant length: no need to emit the loop code at all.
3469 } else {
3470 vixl::aarch64::Label skip_copy_and_write_barrier;
3471 if (length.IsRegister()) {
3472 // Don't enter the copy loop if the length is null.
3473 __ Cbz(WRegisterFrom(length), &skip_copy_and_write_barrier);
3474 }
3475
3476 {
3477 // We use a block to end the scratch scope before the write barrier, thus
3478 // freeing the temporary registers so they can be used in `MarkGCCard`.
3479 UseScratchRegisterScope temps(masm);
3480 bool emit_rb = codegen_->EmitBakerReadBarrier();
3481 Register temp3;
3482 Register tmp;
3483 if (emit_rb) {
3484 temp3 = WRegisterFrom(locations->GetTemp(2));
3485 // Make sure `tmp` is not IP0, as it is clobbered by ReadBarrierMarkRegX entry points
3486 // in ReadBarrierSystemArrayCopySlowPathARM64. Explicitly allocate the register IP1.
3487 DCHECK(temps.IsAvailable(ip1));
3488 temps.Exclude(ip1);
3489 tmp = ip1.W();
3490 } else {
3491 temp3 = temps.AcquireW();
3492 tmp = temps.AcquireW();
3493 }
3494
3495 Register src_curr_addr = temp1.X();
3496 Register dst_curr_addr = temp2.X();
3497 Register src_stop_addr = temp3.X();
3498 const DataType::Type type = DataType::Type::kReference;
3499 const int32_t element_size = DataType::Size(type);
3500
3501 SlowPathCodeARM64* read_barrier_slow_path = nullptr;
3502 if (emit_rb) {
3503 // TODO: Also convert this intrinsic to the IsGcMarking strategy?
3504
3505 // SystemArrayCopy implementation for Baker read barriers (see
3506 // also CodeGeneratorARM64::GenerateReferenceLoadWithBakerReadBarrier):
3507 //
3508 // uint32_t rb_state = Lockword(src->monitor_).ReadBarrierState();
3509 // lfence; // Load fence or artificial data dependency to prevent load-load reordering
3510 // bool is_gray = (rb_state == ReadBarrier::GrayState());
3511 // if (is_gray) {
3512 // // Slow-path copy.
3513 // do {
3514 // *dest_ptr++ = MaybePoison(ReadBarrier::Mark(MaybeUnpoison(*src_ptr++)));
3515 // } while (src_ptr != end_ptr)
3516 // } else {
3517 // // Fast-path copy.
3518 // do {
3519 // *dest_ptr++ = *src_ptr++;
3520 // } while (src_ptr != end_ptr)
3521 // }
3522
3523 // /* int32_t */ monitor = src->monitor_
3524 __ Ldr(tmp, HeapOperand(src.W(), monitor_offset));
3525 // /* LockWord */ lock_word = LockWord(monitor)
3526 static_assert(sizeof(LockWord) == sizeof(int32_t),
3527 "art::LockWord and int32_t have different sizes.");
3528
3529 // Introduce a dependency on the lock_word including rb_state,
3530 // to prevent load-load reordering, and without using
3531 // a memory barrier (which would be more expensive).
3532 // `src` is unchanged by this operation, but its value now depends
3533 // on `tmp`.
3534 __ Add(src.X(), src.X(), Operand(tmp.X(), LSR, 32));
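      // The monitor word was loaded into the 32-bit `tmp`, so the upper half of `tmp.X()`
      // is zero and the shifted operand is 0: `src` keeps its value but now carries a
      // register dependency on the loaded lock word.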
3535
3536 // Slow path used to copy array when `src` is gray.
3537 read_barrier_slow_path =
3538 new (codegen_->GetScopedAllocator()) ReadBarrierSystemArrayCopySlowPathARM64(
3539 invoke, LocationFrom(tmp));
3540 codegen_->AddSlowPath(read_barrier_slow_path);
3541 }
3542
3543 // Compute base source address, base destination address, and end
3544 // source address for System.arraycopy* intrinsics in `src_base`,
3545 // `dst_base` and `src_end` respectively.
3546 // Note that `src_curr_addr` is computed from `src` (and
3547 // `src_pos`) here, and thus honors the artificial dependency
3548 // of `src` on `tmp`.
3549 GenSystemArrayCopyAddresses(masm,
3550 type,
3551 src,
3552 src_pos,
3553 dest,
3554 dest_pos,
3555 length,
3556 src_curr_addr,
3557 dst_curr_addr,
3558 src_stop_addr);
3559
3560 if (emit_rb) {
3561 // Given the numeric representation, it's enough to check the low bit of the rb_state.
3562 static_assert(ReadBarrier::NonGrayState() == 0, "Expecting non-gray to have value 0");
3563 static_assert(ReadBarrier::GrayState() == 1, "Expecting gray to have value 1");
3564 __ Tbnz(tmp, LockWord::kReadBarrierStateShift, read_barrier_slow_path->GetEntryLabel());
3565 }
3566
3567 // Iterate over the arrays and do a raw copy of the objects. We don't need to
3568 // poison/unpoison.
3569 vixl::aarch64::Label loop;
3570 __ Bind(&loop);
3571 __ Ldr(tmp, MemOperand(src_curr_addr, element_size, PostIndex));
3572 __ Str(tmp, MemOperand(dst_curr_addr, element_size, PostIndex));
3573 __ Cmp(src_curr_addr, src_stop_addr);
3574 __ B(&loop, ne);
3575
3576 if (emit_rb) {
3577 DCHECK(read_barrier_slow_path != nullptr);
3578 __ Bind(read_barrier_slow_path->GetExitLabel());
3579 }
3580 }
3581
3582 // We only need one card marking on the destination array.
3583 codegen_->MarkGCCard(dest.W());
3584
3585 __ Bind(&skip_copy_and_write_barrier);
3586 }
3587
3588 __ Bind(intrinsic_slow_path->GetExitLabel());
3589 }
3590
3591 static void GenIsInfinite(LocationSummary* locations,
3592 bool is64bit,
3593 MacroAssembler* masm) {
3594 Operand infinity(0);
3595 Operand tst_mask(0);
3596 Register out;
3597
3598 if (is64bit) {
3599 infinity = Operand(kPositiveInfinityDouble);
3600 tst_mask = MaskLeastSignificant<uint64_t>(63);
3601 out = XRegisterFrom(locations->Out());
3602 } else {
3603 infinity = Operand(kPositiveInfinityFloat);
3604 tst_mask = MaskLeastSignificant<uint32_t>(31);
3605 out = WRegisterFrom(locations->Out());
3606 }
3607
3608 MoveFPToInt(locations, is64bit, masm);
3609 // Checks whether exponent bits are all 1 and fraction bits are all 0.
3610 __ Eor(out, out, infinity);
3611 // TST bitmask is used to mask out the sign bit: either 0x7fffffff or 0x7fffffffffffffff
3612 // depending on is64bit.
3613 __ Tst(out, tst_mask);
3614 __ Cset(out, eq);
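  // For example, with a float input: +inf (0x7f800000) gives 0 after the Eor; -inf
  // (0xff800000) gives 0x80000000, which the Tst masks away with 0x7fffffff. The Z flag is
  // therefore set, and `out` becomes 1, exactly for the two infinities.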
3615 }
3616
3617 void IntrinsicLocationsBuilderARM64::VisitFloatIsInfinite(HInvoke* invoke) {
3618 CreateFPToIntLocations(allocator_, invoke);
3619 }
3620
3621 void IntrinsicCodeGeneratorARM64::VisitFloatIsInfinite(HInvoke* invoke) {
3622 GenIsInfinite(invoke->GetLocations(), /* is64bit= */ false, GetVIXLAssembler());
3623 }
3624
3625 void IntrinsicLocationsBuilderARM64::VisitDoubleIsInfinite(HInvoke* invoke) {
3626 CreateFPToIntLocations(allocator_, invoke);
3627 }
3628
3629 void IntrinsicCodeGeneratorARM64::VisitDoubleIsInfinite(HInvoke* invoke) {
3630 GenIsInfinite(invoke->GetLocations(), /* is64bit= */ true, GetVIXLAssembler());
3631 }
3632
3633 #define VISIT_INTRINSIC(name, low, high, type, start_index) \
3634 void IntrinsicLocationsBuilderARM64::Visit##name##ValueOf(HInvoke* invoke) { \
3635 InvokeRuntimeCallingConvention calling_convention; \
3636 IntrinsicVisitor::ComputeValueOfLocations( \
3637 invoke, \
3638 codegen_, \
3639 low, \
3640 (high) - (low) + 1, \
3641 calling_convention.GetReturnLocation(DataType::Type::kReference), \
3642 Location::RegisterLocation(calling_convention.GetRegisterAt(0).GetCode())); \
3643 } \
3644 void IntrinsicCodeGeneratorARM64::Visit##name##ValueOf(HInvoke* invoke) { \
3645 IntrinsicVisitor::ValueOfInfo info = \
3646 IntrinsicVisitor::ComputeValueOfInfo(invoke, \
3647 codegen_->GetCompilerOptions(), \
3648 WellKnownClasses::java_lang_##name##_value, \
3649 low, \
3650 (high) - (low) + 1, \
3651 start_index); \
3652 HandleValueOf(invoke, info, type); \
3653 }
3654 BOXED_TYPES(VISIT_INTRINSIC)
3655 #undef VISIT_INTRINSIC
3656
3657 void IntrinsicCodeGeneratorARM64::HandleValueOf(HInvoke* invoke,
3658 const IntrinsicVisitor::ValueOfInfo& info,
3659 DataType::Type type) {
3660 LocationSummary* locations = invoke->GetLocations();
3661 MacroAssembler* masm = GetVIXLAssembler();
3662
3663 Register out = RegisterFrom(locations->Out(), DataType::Type::kReference);
3664 UseScratchRegisterScope temps(masm);
3665 Register temp = temps.AcquireW();
3666 auto allocate_instance = [&]() {
3667 DCHECK(out.X().Is(InvokeRuntimeCallingConvention().GetRegisterAt(0)));
3668 codegen_->LoadIntrinsicDeclaringClass(out, invoke);
3669 codegen_->InvokeRuntime(kQuickAllocObjectInitialized, invoke, invoke->GetDexPc());
3670 CheckEntrypointTypes<kQuickAllocObjectWithChecks, void*, mirror::Class*>();
3671 };
3672 if (invoke->InputAt(0)->IsIntConstant()) {
3673 int32_t value = invoke->InputAt(0)->AsIntConstant()->GetValue();
3674 if (static_cast<uint32_t>(value - info.low) < info.length) {
3675 // Just embed the object in the code.
3676 DCHECK_NE(info.value_boot_image_reference, ValueOfInfo::kInvalidReference);
3677 codegen_->LoadBootImageAddress(out, info.value_boot_image_reference);
3678 } else {
3679 DCHECK(locations->CanCall());
3680 // Allocate and initialize a new object.
3681 // TODO: If we JIT, we could allocate the object now, and store it in the
3682 // JIT object table.
3683 allocate_instance();
3684 __ Mov(temp.W(), value);
3685 codegen_->Store(type, temp.W(), HeapOperand(out.W(), info.value_offset));
3686 // Class pointer and `value` final field stores require a barrier before publication.
3687 codegen_->GenerateMemoryBarrier(MemBarrierKind::kStoreStore);
3688 }
3689 } else {
3690 DCHECK(locations->CanCall());
3691 Register in = RegisterFrom(locations->InAt(0), DataType::Type::kInt32);
3692 // Check bounds of our cache.
3693 __ Add(out.W(), in.W(), -info.low);
3694 __ Cmp(out.W(), info.length);
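    // Subtracting `info.low` and comparing unsigned folds both bounds checks into one:
    // inputs below `info.low` wrap to large unsigned values and take the `hs` branch below.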
3695 vixl::aarch64::Label allocate, done;
3696 __ B(&allocate, hs);
3697 // If the value is within the bounds, load the object directly from the array.
3698 codegen_->LoadBootImageAddress(temp, info.array_data_boot_image_reference);
3699 MemOperand source = HeapOperand(
3700 temp, out.X(), LSL, DataType::SizeShift(DataType::Type::kReference));
3701 codegen_->Load(DataType::Type::kReference, out, source);
3702 codegen_->GetAssembler()->MaybeUnpoisonHeapReference(out);
3703 __ B(&done);
3704 __ Bind(&allocate);
3705 // Otherwise allocate and initialize a new object.
3706 allocate_instance();
3707 codegen_->Store(type, in.W(), HeapOperand(out.W(), info.value_offset));
3708 // Class pointer and `value` final field stores require a barrier before publication.
3709 codegen_->GenerateMemoryBarrier(MemBarrierKind::kStoreStore);
3710 __ Bind(&done);
3711 }
3712 }
3713
3714 void IntrinsicLocationsBuilderARM64::VisitReferenceGetReferent(HInvoke* invoke) {
3715 IntrinsicVisitor::CreateReferenceGetReferentLocations(invoke, codegen_);
3716
3717 if (codegen_->EmitBakerReadBarrier() && invoke->GetLocations() != nullptr) {
3718 invoke->GetLocations()->AddTemp(Location::RequiresRegister());
3719 }
3720 }
3721
3722 void IntrinsicCodeGeneratorARM64::VisitReferenceGetReferent(HInvoke* invoke) {
3723 MacroAssembler* masm = GetVIXLAssembler();
3724 LocationSummary* locations = invoke->GetLocations();
3725
3726 Location obj = locations->InAt(0);
3727 Location out = locations->Out();
3728
3729 SlowPathCodeARM64* slow_path = new (GetAllocator()) IntrinsicSlowPathARM64(invoke);
3730 codegen_->AddSlowPath(slow_path);
3731
3732 if (codegen_->EmitReadBarrier()) {
3733 // Check self->GetWeakRefAccessEnabled().
3734 UseScratchRegisterScope temps(masm);
3735 Register temp = temps.AcquireW();
3736 __ Ldr(temp,
3737 MemOperand(tr, Thread::WeakRefAccessEnabledOffset<kArm64PointerSize>().Uint32Value()));
3738 static_assert(enum_cast<int32_t>(WeakRefAccessState::kVisiblyEnabled) == 0);
3739 __ Cbnz(temp, slow_path->GetEntryLabel());
3740 }
3741
3742 {
3743 // Load the java.lang.ref.Reference class.
3744 UseScratchRegisterScope temps(masm);
3745 Register temp = temps.AcquireW();
3746 codegen_->LoadIntrinsicDeclaringClass(temp, invoke);
3747
3748 // Check static fields java.lang.ref.Reference.{disableIntrinsic,slowPathEnabled} together.
3749 MemberOffset disable_intrinsic_offset = IntrinsicVisitor::GetReferenceDisableIntrinsicOffset();
3750 DCHECK_ALIGNED(disable_intrinsic_offset.Uint32Value(), 2u);
3751 DCHECK_EQ(disable_intrinsic_offset.Uint32Value() + 1u,
3752 IntrinsicVisitor::GetReferenceSlowPathEnabledOffset().Uint32Value());
3753 __ Ldrh(temp, HeapOperand(temp, disable_intrinsic_offset.Uint32Value()));
3754 __ Cbnz(temp, slow_path->GetEntryLabel());
3755 }
3756
3757 // Load the value from the field.
3758 uint32_t referent_offset = mirror::Reference::ReferentOffset().Uint32Value();
3759 if (codegen_->EmitBakerReadBarrier()) {
3760 codegen_->GenerateFieldLoadWithBakerReadBarrier(invoke,
3761 out,
3762 WRegisterFrom(obj),
3763 referent_offset,
3764 /*maybe_temp=*/ locations->GetTemp(0),
3765 /*needs_null_check=*/ true,
3766 /*use_load_acquire=*/ true);
3767 } else {
3768 MemOperand field = HeapOperand(WRegisterFrom(obj), referent_offset);
3769 codegen_->LoadAcquire(
3770 invoke, DataType::Type::kReference, WRegisterFrom(out), field, /*needs_null_check=*/ true);
3771 codegen_->MaybeGenerateReadBarrierSlow(invoke, out, out, obj, referent_offset);
3772 }
3773 __ Bind(slow_path->GetExitLabel());
3774 }
3775
3776 void IntrinsicLocationsBuilderARM64::VisitReferenceRefersTo(HInvoke* invoke) {
3777 IntrinsicVisitor::CreateReferenceRefersToLocations(invoke, codegen_);
3778 }
3779
3780 void IntrinsicCodeGeneratorARM64::VisitReferenceRefersTo(HInvoke* invoke) {
3781 LocationSummary* locations = invoke->GetLocations();
3782 MacroAssembler* masm = codegen_->GetVIXLAssembler();
3783 UseScratchRegisterScope temps(masm);
3784
3785 Register obj = WRegisterFrom(locations->InAt(0));
3786 Register other = WRegisterFrom(locations->InAt(1));
3787 Register out = WRegisterFrom(locations->Out());
3788 Register tmp = temps.AcquireW();
3789
3790 uint32_t referent_offset = mirror::Reference::ReferentOffset().Uint32Value();
3791 uint32_t monitor_offset = mirror::Object::MonitorOffset().Int32Value();
3792
3793 MemOperand field = HeapOperand(obj, referent_offset);
3794 codegen_->LoadAcquire(invoke, DataType::Type::kReference, tmp, field, /*needs_null_check=*/ true);
3795 codegen_->GetAssembler()->MaybeUnpoisonHeapReference(tmp);
3796
3797 __ Cmp(tmp, other);
3798
3799 if (codegen_->EmitReadBarrier()) {
3800 DCHECK(kUseBakerReadBarrier);
3801
3802 vixl::aarch64::Label calculate_result;
3803
3804 // If the GC is not marking, the comparison result is final.
3805 __ Cbz(mr, &calculate_result);
3806
3807 __ B(&calculate_result, eq); // ZF set if taken.
3808
3809 // Check if the loaded reference is null.
3810 __ Cbz(tmp, &calculate_result); // ZF clear if taken.
3811
3812 // For correct memory visibility, we need a barrier before loading the lock word.
3813 codegen_->GenerateMemoryBarrier(MemBarrierKind::kLoadAny);
3814
3815 // Load the lockword and check if it is a forwarding address.
3816 static_assert(LockWord::kStateShift == 30u);
3817 static_assert(LockWord::kStateForwardingAddress == 3u);
3818 __ Ldr(tmp, HeapOperand(tmp, monitor_offset));
3819 __ Cmp(tmp, Operand(0xc0000000));
3820 __ B(&calculate_result, lo); // ZF clear if taken.
3821
3822 // Extract the forwarding address and compare with `other`.
3823 __ Cmp(other, Operand(tmp, LSL, LockWord::kForwardingAddressShift));
3824
3825 __ Bind(&calculate_result);
3826 }
3827
3828 // Convert ZF into the Boolean result.
3829 __ Cset(out, eq);
3830 }
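
// Illustrative sketch of the read-barrier special case above (not generated
// code; `IsGcMarking()` and the accessors are simplified stand-ins). With a
// concurrent copying GC, the referent loaded without a read barrier may be a
// from-space copy whose lock word holds a forwarding address to the to-space
// copy, so that address must also be compared with `other`:
//
//   bool RefersTo(mirror::Object* referent, mirror::Object* other) {
//     if (referent == other) return true;
//     if (!IsGcMarking() || referent == nullptr) return false;
//     uint32_t lock_word = GetLockWord(referent);
//     if ((lock_word >> LockWord::kStateShift) != LockWord::kStateForwardingAddress) {
//       return false;
//     }
//     uint32_t forwarding_address = lock_word << LockWord::kForwardingAddressShift;
//     return forwarding_address == LowMemPointerBits(other);
//   }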
3831
3832 void IntrinsicLocationsBuilderARM64::VisitThreadInterrupted(HInvoke* invoke) {
3833 LocationSummary* locations =
3834 new (allocator_) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
3835 locations->SetOut(Location::RequiresRegister());
3836 }
3837
3838 void IntrinsicCodeGeneratorARM64::VisitThreadInterrupted(HInvoke* invoke) {
3839 MacroAssembler* masm = GetVIXLAssembler();
3840 Register out = RegisterFrom(invoke->GetLocations()->Out(), DataType::Type::kInt32);
3841 UseScratchRegisterScope temps(masm);
3842 Register temp = temps.AcquireX();
3843
3844 __ Add(temp, tr, Thread::InterruptedOffset<kArm64PointerSize>().Int32Value());
3845 __ Ldar(out.W(), MemOperand(temp));
3846
3847 vixl::aarch64::Label done;
3848 __ Cbz(out.W(), &done);
3849 __ Stlr(wzr, MemOperand(temp));
3850 __ Bind(&done);
3851 }
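
// The sequence above implements Thread.interrupted(): read the flag with
// acquire semantics and clear it only if it was set. A C++ sketch (illustration
// only; `interrupted` stands for the field at Thread::InterruptedOffset()):
//
//   int32_t Interrupted(std::atomic<int32_t>* interrupted) {
//     int32_t value = interrupted->load(std::memory_order_acquire);   // LDAR
//     if (value != 0) {
//       interrupted->store(0, std::memory_order_release);             // STLR
//     }
//     return value;
//   }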
3852
3853 void IntrinsicLocationsBuilderARM64::VisitReachabilityFence(HInvoke* invoke) {
3854 LocationSummary* locations =
3855 new (allocator_) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
3856 locations->SetInAt(0, Location::Any());
3857 }
3858
3859 void IntrinsicCodeGeneratorARM64::VisitReachabilityFence([[maybe_unused]] HInvoke* invoke) {}
3860
3861 void IntrinsicLocationsBuilderARM64::VisitCRC32Update(HInvoke* invoke) {
3862 if (!codegen_->GetInstructionSetFeatures().HasCRC()) {
3863 return;
3864 }
3865
3866 LocationSummary* locations = new (allocator_) LocationSummary(invoke,
3867 LocationSummary::kNoCall,
3868 kIntrinsified);
3869
3870 locations->SetInAt(0, Location::RequiresRegister());
3871 locations->SetInAt(1, Location::RequiresRegister());
3872 locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap);
3873 }
3874
3875 // Lower the invoke of CRC32.update(int crc, int b).
3876 void IntrinsicCodeGeneratorARM64::VisitCRC32Update(HInvoke* invoke) {
3877 DCHECK(codegen_->GetInstructionSetFeatures().HasCRC());
3878
3879 MacroAssembler* masm = GetVIXLAssembler();
3880
3881 Register crc = InputRegisterAt(invoke, 0);
3882 Register val = InputRegisterAt(invoke, 1);
3883 Register out = OutputRegister(invoke);
3884
3885 // The general algorithm of the CRC32 calculation is:
3886 // crc = ~crc
3887 // result = crc32_for_byte(crc, b)
3888 // crc = ~result
3889 // It is directly lowered to three instructions.
3890
3891 UseScratchRegisterScope temps(masm);
3892 Register tmp = temps.AcquireSameSizeAs(out);
3893
3894 __ Mvn(tmp, crc);
3895 __ Crc32b(tmp, tmp, val);
3896 __ Mvn(out, tmp);
3897 }
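
// For reference, the same computation written with the ACLE intrinsic from
// <arm_acle.h> (a sketch only, assuming __ARM_FEATURE_CRC32; not part of this
// file):
//
//   #include <arm_acle.h>
//   uint32_t Crc32Update(uint32_t crc, int b) {
//     return ~__crc32b(~crc, static_cast<uint8_t>(b));
//   }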
3898
3899 // Generate code that uses CRC32 instructions to calculate
3900 // the CRC32 value of an array of bytes.
3901 //
3902 // Parameters:
3903 // masm - VIXL macro assembler
3904 // crc - a register holding an initial CRC value
3905 // ptr - a register holding a memory address of bytes
3906 // length - a register holding a number of bytes to process
3907 // out - a register to put a result of calculation
3908 static void GenerateCodeForCalculationCRC32ValueOfBytes(MacroAssembler* masm,
3909 const Register& crc,
3910 const Register& ptr,
3911 const Register& length,
3912 const Register& out) {
3913 // The algorithm of CRC32 of bytes is:
3914 // crc = ~crc
3915 // process the first few bytes to make the array pointer 8-byte aligned
3916 // while array has 8 bytes do:
3917 // crc = crc32_of_8bytes(crc, 8_bytes(array))
3918 // if array has 4 bytes:
3919 // crc = crc32_of_4bytes(crc, 4_bytes(array))
3920 // if array has 2 bytes:
3921 // crc = crc32_of_2bytes(crc, 2_bytes(array))
3922 // if array has a byte:
3923 // crc = crc32_of_byte(crc, 1_byte(array))
3924 // crc = ~crc
3925
3926 vixl::aarch64::Label loop, done;
3927 vixl::aarch64::Label process_4bytes, process_2bytes, process_1byte;
3928 vixl::aarch64::Label aligned2, aligned4, aligned8;
3929
3930 // Use VIXL scratch registers as the VIXL macro assembler won't use them in
3931 // instructions below.
3932 UseScratchRegisterScope temps(masm);
3933 Register len = temps.AcquireW();
3934 Register array_elem = temps.AcquireW();
3935
3936 __ Mvn(out, crc);
3937 __ Mov(len, length);
3938
3939 __ Tbz(ptr, 0, &aligned2);
3940 __ Subs(len, len, 1);
3941 __ B(&done, lo);
3942 __ Ldrb(array_elem, MemOperand(ptr, 1, PostIndex));
3943 __ Crc32b(out, out, array_elem);
3944
3945 __ Bind(&aligned2);
3946 __ Tbz(ptr, 1, &aligned4);
3947 __ Subs(len, len, 2);
3948 __ B(&process_1byte, lo);
3949 __ Ldrh(array_elem, MemOperand(ptr, 2, PostIndex));
3950 __ Crc32h(out, out, array_elem);
3951
3952 __ Bind(&aligned4);
3953 __ Tbz(ptr, 2, &aligned8);
3954 __ Subs(len, len, 4);
3955 __ B(&process_2bytes, lo);
3956 __ Ldr(array_elem, MemOperand(ptr, 4, PostIndex));
3957 __ Crc32w(out, out, array_elem);
3958
3959 __ Bind(&aligned8);
3960 __ Subs(len, len, 8);
3961 // If len < 8, branch to processing the remaining data by 4 bytes, 2 bytes and a byte.
3962 __ B(&process_4bytes, lo);
3963
3964 // The main loop processing data by 8 bytes.
3965 __ Bind(&loop);
3966 __ Ldr(array_elem.X(), MemOperand(ptr, 8, PostIndex));
3967 __ Subs(len, len, 8);
3968 __ Crc32x(out, out, array_elem.X());
3969 // if len >= 8, process the next 8 bytes.
3970 __ B(&loop, hs);
3971
3972 // Process the data which is less than 8 bytes.
3973 // The code generated below works with values of len
3974 // which come in the range [-8, 0].
3975 // The first three bits are used to detect whether 4 bytes or 2 bytes or
3976 // a byte can be processed.
3977 // The checking order is from bit 2 to bit 0:
3978 // bit 2 is set: at least 4 bytes available
3979 // bit 1 is set: at least 2 bytes available
3980 // bit 0 is set: at least a byte available
3981 __ Bind(&process_4bytes);
3982 // Go to process_2bytes if fewer than four bytes are available
3983 __ Tbz(len, 2, &process_2bytes);
3984 __ Ldr(array_elem, MemOperand(ptr, 4, PostIndex));
3985 __ Crc32w(out, out, array_elem);
3986
3987 __ Bind(&process_2bytes);
3988 // Go to process_1byte if fewer than two bytes are available
3989 __ Tbz(len, 1, &process_1byte);
3990 __ Ldrh(array_elem, MemOperand(ptr, 2, PostIndex));
3991 __ Crc32h(out, out, array_elem);
3992
3993 __ Bind(&process_1byte);
3994 // Go to done if no bytes are available
3995 __ Tbz(len, 0, &done);
3996 __ Ldrb(array_elem, MemOperand(ptr));
3997 __ Crc32b(out, out, array_elem);
3998
3999 __ Bind(&done);
4000 __ Mvn(out, out);
4001 }
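
// A C++ sketch of the same algorithm using ACLE intrinsics (illustration only;
// it drops the alignment prologue and simply consumes 8/4/2/1-byte chunks):
//
//   #include <arm_acle.h>
//   #include <cstring>
//   uint32_t Crc32Bytes(uint32_t crc, const uint8_t* ptr, size_t length) {
//     crc = ~crc;
//     for (; length >= 8; length -= 8, ptr += 8) {
//       uint64_t chunk;
//       std::memcpy(&chunk, ptr, sizeof(chunk));
//       crc = __crc32d(crc, chunk);
//     }
//     if (length & 4u) { uint32_t c; std::memcpy(&c, ptr, 4); crc = __crc32w(crc, c); ptr += 4; }
//     if (length & 2u) { uint16_t c; std::memcpy(&c, ptr, 2); crc = __crc32h(crc, c); ptr += 2; }
//     if (length & 1u) { crc = __crc32b(crc, *ptr); }
//     return ~crc;
//   }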
4002
4003 // The array-length threshold above which the library-provided implementation
4004 // of CRC32.updateBytes is used instead of the intrinsic.
4005 static constexpr int32_t kCRC32UpdateBytesThreshold = 64 * 1024;
4006
4007 void IntrinsicLocationsBuilderARM64::VisitCRC32UpdateBytes(HInvoke* invoke) {
4008 if (!codegen_->GetInstructionSetFeatures().HasCRC()) {
4009 return;
4010 }
4011
4012 LocationSummary* locations =
4013 new (allocator_) LocationSummary(invoke,
4014 LocationSummary::kCallOnSlowPath,
4015 kIntrinsified);
4016
4017 locations->SetInAt(0, Location::RequiresRegister());
4018 locations->SetInAt(1, Location::RequiresRegister());
4019 locations->SetInAt(2, Location::RegisterOrConstant(invoke->InputAt(2)));
4020 locations->SetInAt(3, Location::RequiresRegister());
4021 locations->AddTemp(Location::RequiresRegister());
4022 locations->SetOut(Location::RequiresRegister());
4023 }
4024
4025 // Lower the invoke of CRC32.updateBytes(int crc, byte[] b, int off, int len)
4026 //
4027 // Note: The intrinsic is not used if len exceeds a threshold.
4028 void IntrinsicCodeGeneratorARM64::VisitCRC32UpdateBytes(HInvoke* invoke) {
4029 DCHECK(codegen_->GetInstructionSetFeatures().HasCRC());
4030
4031 MacroAssembler* masm = GetVIXLAssembler();
4032 LocationSummary* locations = invoke->GetLocations();
4033
4034 SlowPathCodeARM64* slow_path =
4035 new (codegen_->GetScopedAllocator()) IntrinsicSlowPathARM64(invoke);
4036 codegen_->AddSlowPath(slow_path);
4037
4038 Register length = WRegisterFrom(locations->InAt(3));
4039 __ Cmp(length, kCRC32UpdateBytesThreshold);
4040 __ B(slow_path->GetEntryLabel(), hi);
4041
4042 const uint32_t array_data_offset =
4043 mirror::Array::DataOffset(Primitive::kPrimByte).Uint32Value();
4044 Register ptr = XRegisterFrom(locations->GetTemp(0));
4045 Register array = XRegisterFrom(locations->InAt(1));
4046 Location offset = locations->InAt(2);
4047 if (offset.IsConstant()) {
4048 int32_t offset_value = offset.GetConstant()->AsIntConstant()->GetValue();
4049 __ Add(ptr, array, array_data_offset + offset_value);
4050 } else {
4051 __ Add(ptr, array, array_data_offset);
4052 __ Add(ptr, ptr, XRegisterFrom(offset));
4053 }
4054
4055 Register crc = WRegisterFrom(locations->InAt(0));
4056 Register out = WRegisterFrom(locations->Out());
4057
4058 GenerateCodeForCalculationCRC32ValueOfBytes(masm, crc, ptr, length, out);
4059
4060 __ Bind(slow_path->GetExitLabel());
4061 }
4062
4063 void IntrinsicLocationsBuilderARM64::VisitCRC32UpdateByteBuffer(HInvoke* invoke) {
4064 if (!codegen_->GetInstructionSetFeatures().HasCRC()) {
4065 return;
4066 }
4067
4068 LocationSummary* locations =
4069 new (allocator_) LocationSummary(invoke,
4070 LocationSummary::kNoCall,
4071 kIntrinsified);
4072
4073 locations->SetInAt(0, Location::RequiresRegister());
4074 locations->SetInAt(1, Location::RequiresRegister());
4075 locations->SetInAt(2, Location::RequiresRegister());
4076 locations->SetInAt(3, Location::RequiresRegister());
4077 locations->AddTemp(Location::RequiresRegister());
4078 locations->SetOut(Location::RequiresRegister());
4079 }
4080
4081 // Lower the invoke of CRC32.updateByteBuffer(int crc, long addr, int off, int len)
4082 //
4083 // There is no need to generate code checking whether addr is 0.
4084 // The method updateByteBuffer is a private method of java.util.zip.CRC32,
4085 // so it is never called from outside the CRC32 class, and the address passed
4086 // to it is always the address of a DirectBuffer. An empty DirectBuffer
4087 // implementation may use a zero address, but then its length must also be
4088 // zero, and the generated code handles a zero length correctly.
4090 void IntrinsicCodeGeneratorARM64::VisitCRC32UpdateByteBuffer(HInvoke* invoke) {
4091 DCHECK(codegen_->GetInstructionSetFeatures().HasCRC());
4092
4093 MacroAssembler* masm = GetVIXLAssembler();
4094 LocationSummary* locations = invoke->GetLocations();
4095
4096 Register addr = XRegisterFrom(locations->InAt(1));
4097 Register ptr = XRegisterFrom(locations->GetTemp(0));
4098 __ Add(ptr, addr, XRegisterFrom(locations->InAt(2)));
4099
4100 Register crc = WRegisterFrom(locations->InAt(0));
4101 Register length = WRegisterFrom(locations->InAt(3));
4102 Register out = WRegisterFrom(locations->Out());
4103 GenerateCodeForCalculationCRC32ValueOfBytes(masm, crc, ptr, length, out);
4104 }
4105
4106 void IntrinsicLocationsBuilderARM64::VisitFP16ToFloat(HInvoke* invoke) {
4107 if (!codegen_->GetInstructionSetFeatures().HasFP16()) {
4108 return;
4109 }
4110
4111 LocationSummary* locations = new (allocator_) LocationSummary(invoke,
4112 LocationSummary::kNoCall,
4113 kIntrinsified);
4114 locations->SetInAt(0, Location::RequiresRegister());
4115 locations->SetOut(Location::RequiresFpuRegister());
4116 }
4117
4118 void IntrinsicCodeGeneratorARM64::VisitFP16ToFloat(HInvoke* invoke) {
4119 DCHECK(codegen_->GetInstructionSetFeatures().HasFP16());
4120 MacroAssembler* masm = GetVIXLAssembler();
4121 UseScratchRegisterScope scratch_scope(masm);
4122 Register bits = InputRegisterAt(invoke, 0);
4123 VRegister out = SRegisterFrom(invoke->GetLocations()->Out());
4124 VRegister half = scratch_scope.AcquireH();
4125 __ Fmov(half, bits); // ARMv8.2
4126 __ Fcvt(out, half);
4127 }
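
// Equivalent C++ sketch (illustration only, assuming the compiler provides the
// __fp16 storage type): reinterpret the low 16 bits as a half-precision value
// and widen it, which is what the FMOV Hd, Wn + FCVT Sd, Hn pair above does.
//
//   #include <cstring>
//   float Fp16ToFloat(uint16_t bits) {
//     __fp16 h;
//     std::memcpy(&h, &bits, sizeof(h));
//     return static_cast<float>(h);
//   }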
4128
4129 void IntrinsicLocationsBuilderARM64::VisitFP16ToHalf(HInvoke* invoke) {
4130 if (!codegen_->GetInstructionSetFeatures().HasFP16()) {
4131 return;
4132 }
4133
4134 LocationSummary* locations = new (allocator_) LocationSummary(invoke,
4135 LocationSummary::kNoCall,
4136 kIntrinsified);
4137 locations->SetInAt(0, Location::RequiresFpuRegister());
4138 locations->SetOut(Location::RequiresRegister());
4139 }
4140
4141 void IntrinsicCodeGeneratorARM64::VisitFP16ToHalf(HInvoke* invoke) {
4142 DCHECK(codegen_->GetInstructionSetFeatures().HasFP16());
4143 MacroAssembler* masm = GetVIXLAssembler();
4144 UseScratchRegisterScope scratch_scope(masm);
4145 VRegister in = SRegisterFrom(invoke->GetLocations()->InAt(0));
4146 VRegister half = scratch_scope.AcquireH();
4147 Register out = WRegisterFrom(invoke->GetLocations()->Out());
4148 __ Fcvt(half, in);
4149 __ Fmov(out, half);
4150 __ Sxth(out, out); // Sign-extend because the result is returned as a short.
4151 }
4152
4153 template<typename OP>
4154 void GenerateFP16Round(HInvoke* invoke,
4155 CodeGeneratorARM64* const codegen_,
4156 MacroAssembler* masm,
4157 OP&& roundOp) {
4158 DCHECK(codegen_->GetInstructionSetFeatures().HasFP16());
4159 LocationSummary* locations = invoke->GetLocations();
4160 UseScratchRegisterScope scratch_scope(masm);
4161 Register out = WRegisterFrom(locations->Out());
4162 VRegister half = scratch_scope.AcquireH();
4163 __ Fmov(half, WRegisterFrom(locations->InAt(0)));
4164 roundOp(half, half);
4165 __ Fmov(out, half);
4166 __ Sxth(out, out);
4167 }
4168
4169 void IntrinsicLocationsBuilderARM64::VisitFP16Floor(HInvoke* invoke) {
4170 if (!codegen_->GetInstructionSetFeatures().HasFP16()) {
4171 return;
4172 }
4173
4174 CreateIntToIntLocations(allocator_, invoke);
4175 }
4176
4177 void IntrinsicCodeGeneratorARM64::VisitFP16Floor(HInvoke* invoke) {
4178 MacroAssembler* masm = GetVIXLAssembler();
4179 auto roundOp = [masm](const VRegister& out, const VRegister& in) {
4180 __ Frintm(out, in); // Round towards Minus infinity
4181 };
4182 GenerateFP16Round(invoke, codegen_, masm, roundOp);
4183 }
4184
4185 void IntrinsicLocationsBuilderARM64::VisitFP16Ceil(HInvoke* invoke) {
4186 if (!codegen_->GetInstructionSetFeatures().HasFP16()) {
4187 return;
4188 }
4189
4190 CreateIntToIntLocations(allocator_, invoke);
4191 }
4192
4193 void IntrinsicCodeGeneratorARM64::VisitFP16Ceil(HInvoke* invoke) {
4194 MacroAssembler* masm = GetVIXLAssembler();
4195 auto roundOp = [masm](const VRegister& out, const VRegister& in) {
4196 __ Frintp(out, in); // Round towards Plus infinity
4197 };
4198 GenerateFP16Round(invoke, codegen_, masm, roundOp);
4199 }
4200
4201 void IntrinsicLocationsBuilderARM64::VisitFP16Rint(HInvoke* invoke) {
4202 if (!codegen_->GetInstructionSetFeatures().HasFP16()) {
4203 return;
4204 }
4205
4206 CreateIntToIntLocations(allocator_, invoke);
4207 }
4208
4209 void IntrinsicCodeGeneratorARM64::VisitFP16Rint(HInvoke* invoke) {
4210 MacroAssembler* masm = GetVIXLAssembler();
4211 auto roundOp = [masm](const VRegister& out, const VRegister& in) {
4212 __ Frintn(out, in); // Round to nearest, with ties to even
4213 };
4214 GenerateFP16Round(invoke, codegen_, masm, roundOp);
4215 }
4216
4217 void FP16ComparisonLocations(HInvoke* invoke,
4218 ArenaAllocator* allocator_,
4219 CodeGeneratorARM64* codegen_,
4220 int requiredTemps) {
4221 if (!codegen_->GetInstructionSetFeatures().HasFP16()) {
4222 return;
4223 }
4224
4225 CreateIntIntToIntLocations(allocator_, invoke);
4226 for (int i = 0; i < requiredTemps; i++) {
4227 invoke->GetLocations()->AddTemp(Location::RequiresFpuRegister());
4228 }
4229 }
4230
4231 template<typename OP>
4232 void GenerateFP16Compare(HInvoke* invoke,
4233 CodeGeneratorARM64* codegen,
4234 MacroAssembler* masm,
4235 const OP compareOp) {
4236 DCHECK(codegen->GetInstructionSetFeatures().HasFP16());
4237 LocationSummary* locations = invoke->GetLocations();
4238 Register out = WRegisterFrom(locations->Out());
4239 VRegister half0 = HRegisterFrom(locations->GetTemp(0));
4240 VRegister half1 = HRegisterFrom(locations->GetTemp(1));
4241 __ Fmov(half0, WRegisterFrom(locations->InAt(0)));
4242 __ Fmov(half1, WRegisterFrom(locations->InAt(1)));
4243 compareOp(out, half0, half1);
4244 }
4245
4246 static inline void GenerateFP16Compare(HInvoke* invoke,
4247 CodeGeneratorARM64* codegen,
4248 MacroAssembler* masm,
4249 vixl::aarch64::Condition cond) {
4250 auto compareOp = [masm, cond](const Register out, const VRegister& in0, const VRegister& in1) {
4251 __ Fcmp(in0, in1);
4252 __ Cset(out, cond);
4253 };
4254 GenerateFP16Compare(invoke, codegen, masm, compareOp);
4255 }
4256
4257 void IntrinsicLocationsBuilderARM64::VisitFP16Greater(HInvoke* invoke) {
4258 FP16ComparisonLocations(invoke, allocator_, codegen_, 2);
4259 }
4260
4261 void IntrinsicCodeGeneratorARM64::VisitFP16Greater(HInvoke* invoke) {
4262 MacroAssembler* masm = GetVIXLAssembler();
4263 GenerateFP16Compare(invoke, codegen_, masm, gt);
4264 }
4265
4266 void IntrinsicLocationsBuilderARM64::VisitFP16GreaterEquals(HInvoke* invoke) {
4267 FP16ComparisonLocations(invoke, allocator_, codegen_, 2);
4268 }
4269
4270 void IntrinsicCodeGeneratorARM64::VisitFP16GreaterEquals(HInvoke* invoke) {
4271 MacroAssembler* masm = GetVIXLAssembler();
4272 GenerateFP16Compare(invoke, codegen_, masm, ge);
4273 }
4274
4275 void IntrinsicLocationsBuilderARM64::VisitFP16Less(HInvoke* invoke) {
4276 FP16ComparisonLocations(invoke, allocator_, codegen_, 2);
4277 }
4278
4279 void IntrinsicCodeGeneratorARM64::VisitFP16Less(HInvoke* invoke) {
4280 MacroAssembler* masm = GetVIXLAssembler();
4281 GenerateFP16Compare(invoke, codegen_, masm, mi);
4282 }
4283
4284 void IntrinsicLocationsBuilderARM64::VisitFP16LessEquals(HInvoke* invoke) {
4285 FP16ComparisonLocations(invoke, allocator_, codegen_, 2);
4286 }
4287
4288 void IntrinsicCodeGeneratorARM64::VisitFP16LessEquals(HInvoke* invoke) {
4289 MacroAssembler* masm = GetVIXLAssembler();
4290 GenerateFP16Compare(invoke, codegen_, masm, ls);
4291 }
4292
4293 void IntrinsicLocationsBuilderARM64::VisitFP16Compare(HInvoke* invoke) {
4294 FP16ComparisonLocations(invoke, allocator_, codegen_, 2);
4295 }
4296
4297 void IntrinsicCodeGeneratorARM64::VisitFP16Compare(HInvoke* invoke) {
4298 MacroAssembler* masm = GetVIXLAssembler();
4299 auto compareOp = [masm](const Register out,
4300 const VRegister& in0,
4301 const VRegister& in1) {
4302 vixl::aarch64::Label end;
4303 vixl::aarch64::Label equal;
4304 vixl::aarch64::Label normal;
4305
4306 // The normal cases for this method are:
4307 // - in0 > in1 => out = 1
4308 // - in0 < in1 => out = -1
4309 // - in0 == in1 => out = 0
4310 // +/-Infinity are ordered by default so are handled by the normal case.
4311 // There are two special cases that Fcmp is insufficient for distinguishing:
4312 // - in0 and in1 are +0 and -0 => +0 > -0 so compare encoding instead of value
4313 // - in0 or in1 is NaN => manually compare with in0 and in1 separately
4314 __ Fcmp(in0, in1);
4315 __ B(eq, &equal); // in0==in1 or +0 -0 case.
4316 __ B(vc, &normal); // in0 and in1 are ordered (not NaN).
4317
4318 // Either of the inputs is NaN.
4319 // NaN is equal to itself and greater than any other number so:
4320 // - if only in0 is NaN => return 1
4321 // - if only in1 is NaN => return -1
4322 // - if both in0 and in1 are NaN => return 0
4323 __ Fcmp(in0, 0.0);
4324 __ Mov(out, -1);
4325 __ B(vc, &end); // in0 != NaN => out = -1.
4326 __ Fcmp(in1, 0.0);
4327 __ Cset(out, vc); // if in1 != NaN => out = 1, otherwise both are NaNs => out = 0.
4328 __ B(&end);
4329
4330 // in0 == in1 or if one of the inputs is +0 and the other is -0.
4331 __ Bind(&equal);
4332 // Compare encoding of in0 and in1 as the denormal fraction of single precision float.
4333 // Reverse operand order because -0 > +0 when compared as S registers.
4334 // The instruction Fmov(Hregister, Wregister) zero extends the Hregister.
4335 // Therefore the value of bits[127:16] will not matter when doing the
4336 // below Fcmp as they are set to 0.
4337 __ Fcmp(in1.S(), in0.S());
4338
4339 __ Bind(&normal);
4340 __ Cset(out, gt); // if in0 > in1 => out = 1, otherwise out = 0.
4341 // Note: the flags may come from the `equal` path or from the original comparison.
4342 __ Csinv(out, out, wzr, pl); // if in0 >= in1 out=out, otherwise out=-1.
4343
4344 __ Bind(&end);
4345 };
4346
4347 GenerateFP16Compare(invoke, codegen_, masm, compareOp);
4348 }
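
// The lambda above implements Half.compare() semantics. A scalar C++ sketch for
// reference (illustration only; `IsNaNFp16` and `Fp16ToFloat` are assumed
// helpers operating on raw FP16 encodings):
//
//   int CompareFp16(uint16_t a_bits, uint16_t b_bits) {
//     if (IsNaNFp16(a_bits)) return IsNaNFp16(b_bits) ? 0 : 1;  // NaN sorts above everything.
//     if (IsNaNFp16(b_bits)) return -1;
//     float a = Fp16ToFloat(a_bits);
//     float b = Fp16ToFloat(b_bits);
//     if (a < b) return -1;
//     if (a > b) return 1;
//     // Numerically equal: only +0.0/-0.0 differ in encoding; -0.0 sorts lower.
//     return (a_bits == b_bits) ? 0 : (a_bits > b_bits ? -1 : 1);
//   }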
4349
4350 const int kFP16NaN = 0x7e00;
4351
4352 static inline void GenerateFP16MinMax(HInvoke* invoke,
4353 CodeGeneratorARM64* codegen,
4354 MacroAssembler* masm,
4355 vixl::aarch64::Condition cond) {
4356 DCHECK(codegen->GetInstructionSetFeatures().HasFP16());
4357 LocationSummary* locations = invoke->GetLocations();
4358
4359 vixl::aarch64::Label equal;
4360 vixl::aarch64::Label end;
4361
4362 UseScratchRegisterScope temps(masm);
4363
4364 Register out = WRegisterFrom(locations->Out());
4365 Register in0 = WRegisterFrom(locations->InAt(0));
4366 Register in1 = WRegisterFrom(locations->InAt(1));
4367 VRegister half0 = HRegisterFrom(locations->GetTemp(0));
4368 VRegister half1 = temps.AcquireH();
4369
4370 // The normal cases for this method are:
4371 // - in0.h == in1.h => out = in0 or in1
4372 // - in0.h <cond> in1.h => out = in0
4373 // - in0.h <!cond> in1.h => out = in1
4374 // +/-Infinity are ordered by default so are handled by the normal case.
4375 // There are two special cases that Fcmp is insufficient for distinguishing:
4376 // - in0 and in1 are +0 and -0 => +0 > -0 so compare encoding instead of value
4377 // - in0 or in1 is NaN => out = NaN
4378 __ Fmov(half0, in0);
4379 __ Fmov(half1, in1);
4380 __ Fcmp(half0, half1);
4381 __ B(eq, &equal); // half0 = half1 or +0/-0 case.
4382 __ Csel(out, in0, in1, cond); // if half0 <cond> half1 => out = in0, otherwise out = in1.
4383 __ B(vc, &end); // None of the inputs were NaN.
4384
4385 // At least one input was NaN.
4386 __ Mov(out, kFP16NaN); // out=NaN.
4387 __ B(&end);
4388
4389 // in0 == in1 or if one of the inputs is +0 and the other is -0.
4390 __ Bind(&equal);
4391 // Fcmp cannot normally distinguish +0 and -0 so compare encoding.
4392 // Encoding is compared as the denormal fraction of a Single.
4393 // Note: encoding of -0 > encoding of +0 despite +0 > -0 so in0 and in1 are swapped.
4394 // Note: The instruction Fmov(Hregister, Wregister) zero extends the Hregister.
4395 __ Fcmp(half1.S(), half0.S());
4396
4397 __ Csel(out, in0, in1, cond); // if half0 <cond> half1 => out = in0, otherwise out = in1.
4398
4399 __ Bind(&end);
4400 }
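
// Sketch of the Half.min()/Half.max() semantics implemented above (illustration
// only; the helper names are assumptions): any NaN input produces the canonical
// NaN, and -0.0 is treated as smaller than +0.0 even though FCMP reports them
// equal.
//
//   uint16_t MinMaxFp16(uint16_t a_bits, uint16_t b_bits, bool is_min) {
//     if (IsNaNFp16(a_bits) || IsNaNFp16(b_bits)) return 0x7e00;  // kFP16NaN
//     float a = Fp16ToFloat(a_bits);
//     float b = Fp16ToFloat(b_bits);
//     if (a != b) return (is_min ? (a < b) : (a > b)) ? a_bits : b_bits;
//     // Equal values: pick by encoding so that -0.0 wins for min and loses for max.
//     bool pick_a = is_min ? (a_bits >= b_bits) : (a_bits <= b_bits);
//     return pick_a ? a_bits : b_bits;
//   }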
4401
4402 void IntrinsicLocationsBuilderARM64::VisitFP16Min(HInvoke* invoke) {
4403 FP16ComparisonLocations(invoke, allocator_, codegen_, 1);
4404 }
4405
4406 void IntrinsicCodeGeneratorARM64::VisitFP16Min(HInvoke* invoke) {
4407 DCHECK(codegen_->GetInstructionSetFeatures().HasFP16());
4408 MacroAssembler* masm = GetVIXLAssembler();
4409 GenerateFP16MinMax(invoke, codegen_, masm, mi);
4410 }
4411
4412 void IntrinsicLocationsBuilderARM64::VisitFP16Max(HInvoke* invoke) {
4413 FP16ComparisonLocations(invoke, allocator_, codegen_, 1);
4414 }
4415
4416 void IntrinsicCodeGeneratorARM64::VisitFP16Max(HInvoke* invoke) {
4417 DCHECK(codegen_->GetInstructionSetFeatures().HasFP16());
4418 MacroAssembler* masm = GetVIXLAssembler();
4419 GenerateFP16MinMax(invoke, codegen_, masm, gt);
4420 }
4421
4422 static void GenerateDivideUnsigned(HInvoke* invoke, CodeGeneratorARM64* codegen) {
4423 LocationSummary* locations = invoke->GetLocations();
4424 MacroAssembler* masm = codegen->GetVIXLAssembler();
4425 DataType::Type type = invoke->GetType();
4426 DCHECK(type == DataType::Type::kInt32 || type == DataType::Type::kInt64);
4427
4428 Register dividend = RegisterFrom(locations->InAt(0), type);
4429 Register divisor = RegisterFrom(locations->InAt(1), type);
4430 Register out = RegisterFrom(locations->Out(), type);
4431
4432 // If the divisor is zero, bail out to the managed implementation to handle it.
4433 SlowPathCodeARM64* slow_path =
4434 new (codegen->GetScopedAllocator()) IntrinsicSlowPathARM64(invoke);
4435 codegen->AddSlowPath(slow_path);
4436 __ Cbz(divisor, slow_path->GetEntryLabel());
4437
4438 __ Udiv(out, dividend, divisor);
4439
4440 __ Bind(slow_path->GetExitLabel());
4441 }
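
// Java-level semantics sketched in C++ (illustration only): UDIV itself would
// return 0 for a zero divisor, so that case is diverted to the slow path, which
// calls the managed implementation and ends up throwing ArithmeticException.
// `ThrowArithmeticException` below is an assumed placeholder, not a real call.
//
//   uint32_t DivideUnsigned(uint32_t dividend, uint32_t divisor) {
//     if (divisor == 0u) ThrowArithmeticException();  // Slow path.
//     return dividend / divisor;                      // UDIV.
//   }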
4442
4443 void IntrinsicLocationsBuilderARM64::VisitIntegerDivideUnsigned(HInvoke* invoke) {
4444 CreateIntIntToIntSlowPathCallLocations(allocator_, invoke);
4445 }
4446
4447 void IntrinsicCodeGeneratorARM64::VisitIntegerDivideUnsigned(HInvoke* invoke) {
4448 GenerateDivideUnsigned(invoke, codegen_);
4449 }
4450
4451 void IntrinsicLocationsBuilderARM64::VisitLongDivideUnsigned(HInvoke* invoke) {
4452 CreateIntIntToIntSlowPathCallLocations(allocator_, invoke);
4453 }
4454
4455 void IntrinsicCodeGeneratorARM64::VisitLongDivideUnsigned(HInvoke* invoke) {
4456 GenerateDivideUnsigned(invoke, codegen_);
4457 }
4458
4459 void IntrinsicLocationsBuilderARM64::VisitMathMultiplyHigh(HInvoke* invoke) {
4460 CreateIntIntToIntLocations(allocator_, invoke);
4461 }
4462
4463 void IntrinsicCodeGeneratorARM64::VisitMathMultiplyHigh(HInvoke* invoke) {
4464 LocationSummary* locations = invoke->GetLocations();
4465 MacroAssembler* masm = codegen_->GetVIXLAssembler();
4466 DataType::Type type = invoke->GetType();
4467 DCHECK(type == DataType::Type::kInt64);
4468
4469 Register x = RegisterFrom(locations->InAt(0), type);
4470 Register y = RegisterFrom(locations->InAt(1), type);
4471 Register out = RegisterFrom(locations->Out(), type);
4472
4473 __ Smulh(out, x, y);
4474 }
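
// Math.multiplyHigh(long, long) returns the high 64 bits of the full signed
// 128-bit product, which is exactly what SMULH computes. C++ sketch
// (illustration only, assuming compiler support for __int128):
//
//   int64_t MultiplyHigh(int64_t x, int64_t y) {
//     return static_cast<int64_t>((static_cast<__int128>(x) * y) >> 64);
//   }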
4475
4476 static void GenerateMathFma(HInvoke* invoke, CodeGeneratorARM64* codegen) {
4477 MacroAssembler* masm = codegen->GetVIXLAssembler();
4478
4479 VRegister n = helpers::InputFPRegisterAt(invoke, 0);
4480 VRegister m = helpers::InputFPRegisterAt(invoke, 1);
4481 VRegister a = helpers::InputFPRegisterAt(invoke, 2);
4482 VRegister out = helpers::OutputFPRegister(invoke);
4483
4484 __ Fmadd(out, n, m, a);
4485 }
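
// FMADD computes n * m + a fused with a single rounding, matching the contract
// of Math.fma(). Equivalent C++ sketch (illustration only):
//
//   #include <cmath>
//   double Fma(double n, double m, double a) { return std::fma(n, m, a); }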
4486
4487 void IntrinsicLocationsBuilderARM64::VisitMathFmaDouble(HInvoke* invoke) {
4488 CreateFPFPFPToFPLocations(allocator_, invoke);
4489 }
4490
4491 void IntrinsicCodeGeneratorARM64::VisitMathFmaDouble(HInvoke* invoke) {
4492 GenerateMathFma(invoke, codegen_);
4493 }
4494
4495 void IntrinsicLocationsBuilderARM64::VisitMathFmaFloat(HInvoke* invoke) {
4496 CreateFPFPFPToFPLocations(allocator_, invoke);
4497 }
4498
4499 void IntrinsicCodeGeneratorARM64::VisitMathFmaFloat(HInvoke* invoke) {
4500 GenerateMathFma(invoke, codegen_);
4501 }
4502
4503 class VarHandleSlowPathARM64 : public IntrinsicSlowPathARM64 {
4504 public:
4505 VarHandleSlowPathARM64(HInvoke* invoke, std::memory_order order)
4506 : IntrinsicSlowPathARM64(invoke),
4507 order_(order),
4508 return_success_(false),
4509 strong_(false),
4510 get_and_update_op_(GetAndUpdateOp::kAdd) {
4511 }
4512
4513 vixl::aarch64::Label* GetByteArrayViewCheckLabel() {
4514 return &byte_array_view_check_label_;
4515 }
4516
4517 vixl::aarch64::Label* GetNativeByteOrderLabel() {
4518 return &native_byte_order_label_;
4519 }
4520
4521 void SetCompareAndSetOrExchangeArgs(bool return_success, bool strong) {
4522 if (return_success) {
4523 DCHECK(GetAccessModeTemplate() == mirror::VarHandle::AccessModeTemplate::kCompareAndSet);
4524 } else {
4525 DCHECK(GetAccessModeTemplate() == mirror::VarHandle::AccessModeTemplate::kCompareAndExchange);
4526 }
4527 return_success_ = return_success;
4528 strong_ = strong;
4529 }
4530
4531 void SetGetAndUpdateOp(GetAndUpdateOp get_and_update_op) {
4532 DCHECK(GetAccessModeTemplate() == mirror::VarHandle::AccessModeTemplate::kGetAndUpdate);
4533 get_and_update_op_ = get_and_update_op;
4534 }
4535
4536 void EmitNativeCode(CodeGenerator* codegen_in) override {
4537 if (GetByteArrayViewCheckLabel()->IsLinked()) {
4538 EmitByteArrayViewCode(codegen_in);
4539 }
4540 IntrinsicSlowPathARM64::EmitNativeCode(codegen_in);
4541 }
4542
4543 private:
4544 HInvoke* GetInvoke() const {
4545 return GetInstruction()->AsInvoke();
4546 }
4547
4548 mirror::VarHandle::AccessModeTemplate GetAccessModeTemplate() const {
4549 return mirror::VarHandle::GetAccessModeTemplateByIntrinsic(GetInvoke()->GetIntrinsic());
4550 }
4551
4552 void EmitByteArrayViewCode(CodeGenerator* codegen_in);
4553
4554 vixl::aarch64::Label byte_array_view_check_label_;
4555 vixl::aarch64::Label native_byte_order_label_;
4556 // Shared parameter for all VarHandle intrinsics.
4557 std::memory_order order_;
4558 // Extra arguments for GenerateVarHandleCompareAndSetOrExchange().
4559 bool return_success_;
4560 bool strong_;
4561 // Extra argument for GenerateVarHandleGetAndUpdate().
4562 GetAndUpdateOp get_and_update_op_;
4563 };
4564
4565 // Generate subtype check without read barriers.
4566 static void GenerateSubTypeObjectCheckNoReadBarrier(CodeGeneratorARM64* codegen,
4567 SlowPathCodeARM64* slow_path,
4568 Register object,
4569 Register type,
4570 bool object_can_be_null = true) {
4571 MacroAssembler* masm = codegen->GetVIXLAssembler();
4572
4573 const MemberOffset class_offset = mirror::Object::ClassOffset();
4574 const MemberOffset super_class_offset = mirror::Class::SuperClassOffset();
4575
4576 vixl::aarch64::Label success;
4577 if (object_can_be_null) {
4578 __ Cbz(object, &success);
4579 }
4580
4581 UseScratchRegisterScope temps(masm);
4582 Register temp = temps.AcquireW();
4583
4584 __ Ldr(temp, HeapOperand(object, class_offset.Int32Value()));
4585 codegen->GetAssembler()->MaybeUnpoisonHeapReference(temp);
4586 vixl::aarch64::Label loop;
4587 __ Bind(&loop);
4588 __ Cmp(type, temp);
4589 __ B(&success, eq);
4590 __ Ldr(temp, HeapOperand(temp, super_class_offset.Int32Value()));
4591 codegen->GetAssembler()->MaybeUnpoisonHeapReference(temp);
4592 __ Cbz(temp, slow_path->GetEntryLabel());
4593 __ B(&loop);
4594 __ Bind(&success);
4595 }
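
// The emitted loop walks the superclass chain without read barriers, so a stale
// (from-space) class pointer can cause a false negative that the slow path then
// resolves. C++ sketch of the check (illustration only; the accessor names are
// simplified stand-ins for the mirror:: APIs):
//
//   bool IsSubTypeNoReadBarrier(mirror::Object* object, mirror::Class* type) {
//     if (object == nullptr) return true;  // Only when object_can_be_null.
//     for (mirror::Class* k = object->GetClass(); k != nullptr; k = k->GetSuperClass()) {
//       if (k == type) return true;
//     }
//     return false;  // Generated code: branch to the slow path instead.
//   }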
4596
4597 // Check access mode and the primitive type from VarHandle.varType.
4598 // Check reference arguments against the VarHandle.varType; for references this is a subclass
4599 // check without read barrier, so it can have false negatives which we handle in the slow path.
4600 static void GenerateVarHandleAccessModeAndVarTypeChecks(HInvoke* invoke,
4601 CodeGeneratorARM64* codegen,
4602 SlowPathCodeARM64* slow_path,
4603 DataType::Type type) {
4604 mirror::VarHandle::AccessMode access_mode =
4605 mirror::VarHandle::GetAccessModeByIntrinsic(invoke->GetIntrinsic());
4606 Primitive::Type primitive_type = DataTypeToPrimitive(type);
4607
4608 MacroAssembler* masm = codegen->GetVIXLAssembler();
4609 Register varhandle = InputRegisterAt(invoke, 0);
4610
4611 const MemberOffset var_type_offset = mirror::VarHandle::VarTypeOffset();
4612 const MemberOffset access_mode_bit_mask_offset = mirror::VarHandle::AccessModesBitMaskOffset();
4613 const MemberOffset primitive_type_offset = mirror::Class::PrimitiveTypeOffset();
4614
4615 UseScratchRegisterScope temps(masm);
4616 Register var_type_no_rb = temps.AcquireW();
4617 Register temp2 = temps.AcquireW();
4618
4619 // Check that the operation is permitted and the primitive type of varhandle.varType.
4620 // We do not need a read barrier when the reference is loaded only to read a
4621 // constant primitive field through it. Use LDP to load the two fields together.
4622 DCHECK_EQ(var_type_offset.Int32Value() + 4, access_mode_bit_mask_offset.Int32Value());
4623 __ Ldp(var_type_no_rb, temp2, HeapOperand(varhandle, var_type_offset.Int32Value()));
4624 codegen->GetAssembler()->MaybeUnpoisonHeapReference(var_type_no_rb);
4625 __ Tbz(temp2, static_cast<uint32_t>(access_mode), slow_path->GetEntryLabel());
4626 __ Ldrh(temp2, HeapOperand(var_type_no_rb, primitive_type_offset.Int32Value()));
4627 if (primitive_type == Primitive::kPrimNot) {
4628 static_assert(Primitive::kPrimNot == 0);
4629 __ Cbnz(temp2, slow_path->GetEntryLabel());
4630 } else {
4631 __ Cmp(temp2, static_cast<uint16_t>(primitive_type));
4632 __ B(slow_path->GetEntryLabel(), ne);
4633 }
4634
4635 temps.Release(temp2);
4636
4637 if (type == DataType::Type::kReference) {
4638 // Check reference arguments against the varType.
4639 // False negatives due to varType being an interface or array type
4640 // or due to the missing read barrier are handled by the slow path.
4641 size_t expected_coordinates_count = GetExpectedVarHandleCoordinatesCount(invoke);
4642 uint32_t arguments_start = /* VarHandle object */ 1u + expected_coordinates_count;
4643 uint32_t number_of_arguments = invoke->GetNumberOfArguments();
4644 for (size_t arg_index = arguments_start; arg_index != number_of_arguments; ++arg_index) {
4645 HInstruction* arg = invoke->InputAt(arg_index);
4646 DCHECK_EQ(arg->GetType(), DataType::Type::kReference);
4647 if (!arg->IsNullConstant()) {
4648 Register arg_reg = WRegisterFrom(invoke->GetLocations()->InAt(arg_index));
4649 GenerateSubTypeObjectCheckNoReadBarrier(codegen, slow_path, arg_reg, var_type_no_rb);
4650 }
4651 }
4652 }
4653 }
4654
4655 static void GenerateVarHandleStaticFieldCheck(HInvoke* invoke,
4656 CodeGeneratorARM64* codegen,
4657 SlowPathCodeARM64* slow_path) {
4658 MacroAssembler* masm = codegen->GetVIXLAssembler();
4659 Register varhandle = InputRegisterAt(invoke, 0);
4660
4661 const MemberOffset coordinate_type0_offset = mirror::VarHandle::CoordinateType0Offset();
4662
4663 UseScratchRegisterScope temps(masm);
4664 Register temp = temps.AcquireW();
4665
4666 // Check that the VarHandle references a static field by checking that coordinateType0 == null.
4667 // Do not emit read barrier (or unpoison the reference) for comparing to null.
4668 __ Ldr(temp, HeapOperand(varhandle, coordinate_type0_offset.Int32Value()));
4669 __ Cbnz(temp, slow_path->GetEntryLabel());
4670 }
4671
4672 static void GenerateVarHandleInstanceFieldChecks(HInvoke* invoke,
4673 CodeGeneratorARM64* codegen,
4674 SlowPathCodeARM64* slow_path) {
4675 VarHandleOptimizations optimizations(invoke);
4676 MacroAssembler* masm = codegen->GetVIXLAssembler();
4677 Register varhandle = InputRegisterAt(invoke, 0);
4678 Register object = InputRegisterAt(invoke, 1);
4679
4680 const MemberOffset coordinate_type0_offset = mirror::VarHandle::CoordinateType0Offset();
4681 const MemberOffset coordinate_type1_offset = mirror::VarHandle::CoordinateType1Offset();
4682
4683 // Null-check the object.
4684 if (!optimizations.GetSkipObjectNullCheck()) {
4685 __ Cbz(object, slow_path->GetEntryLabel());
4686 }
4687
4688 if (!optimizations.GetUseKnownImageVarHandle()) {
4689 UseScratchRegisterScope temps(masm);
4690 Register temp = temps.AcquireW();
4691 Register temp2 = temps.AcquireW();
4692
4693 // Check that the VarHandle references an instance field by checking that
4694 // coordinateType1 == null. coordinateType0 should not be null, but this is handled by the
4695 // type compatibility check with the source object's type, which will fail for null.
4696 DCHECK_EQ(coordinate_type0_offset.Int32Value() + 4, coordinate_type1_offset.Int32Value());
4697 __ Ldp(temp, temp2, HeapOperand(varhandle, coordinate_type0_offset.Int32Value()));
4698 codegen->GetAssembler()->MaybeUnpoisonHeapReference(temp);
4699 // No need for read barrier or unpoisoning of coordinateType1 for comparison with null.
4700 __ Cbnz(temp2, slow_path->GetEntryLabel());
4701
4702 // Check that the object has the correct type.
4703 // We deliberately avoid the read barrier, letting the slow path handle the false negatives.
4704 temps.Release(temp2); // Needed by GenerateSubTypeObjectCheckNoReadBarrier().
4705 GenerateSubTypeObjectCheckNoReadBarrier(
4706 codegen, slow_path, object, temp, /*object_can_be_null=*/ false);
4707 }
4708 }
4709
4710 static void GenerateVarHandleArrayChecks(HInvoke* invoke,
4711 CodeGeneratorARM64* codegen,
4712 VarHandleSlowPathARM64* slow_path) {
4713 VarHandleOptimizations optimizations(invoke);
4714 MacroAssembler* masm = codegen->GetVIXLAssembler();
4715 Register varhandle = InputRegisterAt(invoke, 0);
4716 Register object = InputRegisterAt(invoke, 1);
4717 Register index = InputRegisterAt(invoke, 2);
4718 DataType::Type value_type =
4719 GetVarHandleExpectedValueType(invoke, /*expected_coordinates_count=*/ 2u);
4720 Primitive::Type primitive_type = DataTypeToPrimitive(value_type);
4721
4722 const MemberOffset coordinate_type0_offset = mirror::VarHandle::CoordinateType0Offset();
4723 const MemberOffset coordinate_type1_offset = mirror::VarHandle::CoordinateType1Offset();
4724 const MemberOffset component_type_offset = mirror::Class::ComponentTypeOffset();
4725 const MemberOffset primitive_type_offset = mirror::Class::PrimitiveTypeOffset();
4726 const MemberOffset class_offset = mirror::Object::ClassOffset();
4727 const MemberOffset array_length_offset = mirror::Array::LengthOffset();
4728
4729 // Null-check the object.
4730 if (!optimizations.GetSkipObjectNullCheck()) {
4731 __ Cbz(object, slow_path->GetEntryLabel());
4732 }
4733
4734 UseScratchRegisterScope temps(masm);
4735 Register temp = temps.AcquireW();
4736 Register temp2 = temps.AcquireW();
4737
4738 // Check that the VarHandle references an array, byte array view or ByteBuffer by checking
4739 // that coordinateType1 != null. If that's true, coordinateType1 shall be int.class and
4740 // coordinateType0 shall not be null but we do not explicitly verify that.
4741 DCHECK_EQ(coordinate_type0_offset.Int32Value() + 4, coordinate_type1_offset.Int32Value());
4742 __ Ldp(temp, temp2, HeapOperand(varhandle, coordinate_type0_offset.Int32Value()));
4743 codegen->GetAssembler()->MaybeUnpoisonHeapReference(temp);
4744 // No need for read barrier or unpoisoning of coordinateType1 for comparison with null.
4745 __ Cbz(temp2, slow_path->GetEntryLabel());
4746
4747 // Check object class against componentType0.
4748 //
4749 // This is an exact check and we defer other cases to the runtime. This includes
4750 // conversion to array of superclass references, which is valid but subsequently
4751 // requires all update operations to check that the value can indeed be stored.
4752 // We do not want to perform such extra checks in the intrinsified code.
4753 //
4754 // We do this check without read barrier, so there can be false negatives which we
4755 // defer to the slow path. There shall be no false negatives for array classes in the
4756 // boot image (including Object[] and primitive arrays) because they are non-movable.
4757 __ Ldr(temp2, HeapOperand(object, class_offset.Int32Value()));
4758 codegen->GetAssembler()->MaybeUnpoisonHeapReference(temp2);
4759 __ Cmp(temp, temp2);
4760 __ B(slow_path->GetEntryLabel(), ne);
4761
4762 // Check that the coordinateType0 is an array type. We do not need a read barrier
4763 // for loading constant reference fields (or chains of them) for comparison with null,
4764 // nor for finally loading a constant primitive field (primitive type) below.
4765 __ Ldr(temp2, HeapOperand(temp, component_type_offset.Int32Value()));
4766 codegen->GetAssembler()->MaybeUnpoisonHeapReference(temp2);
4767 __ Cbz(temp2, slow_path->GetEntryLabel());
4768
4769 // Check that the array component type matches the primitive type.
4770 __ Ldrh(temp2, HeapOperand(temp2, primitive_type_offset.Int32Value()));
4771 if (primitive_type == Primitive::kPrimNot) {
4772 static_assert(Primitive::kPrimNot == 0);
4773 __ Cbnz(temp2, slow_path->GetEntryLabel());
4774 } else {
4775 // With the exception of `kPrimNot` (handled above), `kPrimByte` and `kPrimBoolean`,
4776 // we shall check for a byte array view in the slow path.
4777 // The check requires the ByteArrayViewVarHandle.class to be in the boot image,
4778 // so we cannot emit that if we're JITting without boot image.
4779 bool boot_image_available =
4780 codegen->GetCompilerOptions().IsBootImage() ||
4781 !Runtime::Current()->GetHeap()->GetBootImageSpaces().empty();
4782 bool can_be_view = (DataType::Size(value_type) != 1u) && boot_image_available;
4783 vixl::aarch64::Label* slow_path_label =
4784 can_be_view ? slow_path->GetByteArrayViewCheckLabel() : slow_path->GetEntryLabel();
4785 __ Cmp(temp2, static_cast<uint16_t>(primitive_type));
4786 __ B(slow_path_label, ne);
4787 }
4788
4789 // Check for array index out of bounds.
4790 __ Ldr(temp, HeapOperand(object, array_length_offset.Int32Value()));
4791 __ Cmp(index, temp);
4792 __ B(slow_path->GetEntryLabel(), hs);
4793 }
4794
4795 static void GenerateVarHandleCoordinateChecks(HInvoke* invoke,
4796 CodeGeneratorARM64* codegen,
4797 VarHandleSlowPathARM64* slow_path) {
4798 size_t expected_coordinates_count = GetExpectedVarHandleCoordinatesCount(invoke);
4799 if (expected_coordinates_count == 0u) {
4800 GenerateVarHandleStaticFieldCheck(invoke, codegen, slow_path);
4801 } else if (expected_coordinates_count == 1u) {
4802 GenerateVarHandleInstanceFieldChecks(invoke, codegen, slow_path);
4803 } else {
4804 DCHECK_EQ(expected_coordinates_count, 2u);
4805 GenerateVarHandleArrayChecks(invoke, codegen, slow_path);
4806 }
4807 }
4808
4809 static VarHandleSlowPathARM64* GenerateVarHandleChecks(HInvoke* invoke,
4810 CodeGeneratorARM64* codegen,
4811 std::memory_order order,
4812 DataType::Type type) {
4813 size_t expected_coordinates_count = GetExpectedVarHandleCoordinatesCount(invoke);
4814 VarHandleOptimizations optimizations(invoke);
4815 if (optimizations.GetUseKnownImageVarHandle()) {
4816 DCHECK_NE(expected_coordinates_count, 2u);
4817 if (expected_coordinates_count == 0u || optimizations.GetSkipObjectNullCheck()) {
4818 return nullptr;
4819 }
4820 }
4821
4822 VarHandleSlowPathARM64* slow_path =
4823 new (codegen->GetScopedAllocator()) VarHandleSlowPathARM64(invoke, order);
4824 codegen->AddSlowPath(slow_path);
4825
4826 if (!optimizations.GetUseKnownImageVarHandle()) {
4827 GenerateVarHandleAccessModeAndVarTypeChecks(invoke, codegen, slow_path, type);
4828 }
4829 GenerateVarHandleCoordinateChecks(invoke, codegen, slow_path);
4830
4831 return slow_path;
4832 }
4833
4834 struct VarHandleTarget {
4835 Register object; // The object holding the value to operate on.
4836 Register offset; // The offset of the value to operate on.
4837 };
4838
4839 static VarHandleTarget GetVarHandleTarget(HInvoke* invoke) {
4840 size_t expected_coordinates_count = GetExpectedVarHandleCoordinatesCount(invoke);
4841 LocationSummary* locations = invoke->GetLocations();
4842
4843 VarHandleTarget target;
4844 // The temporary allocated for loading the offset.
4845 target.offset = WRegisterFrom(locations->GetTemp(0u));
4846 // The reference to the object that holds the value to operate on.
4847 target.object = (expected_coordinates_count == 0u)
4848 ? WRegisterFrom(locations->GetTemp(1u))
4849 : InputRegisterAt(invoke, 1);
4850 return target;
4851 }
4852
4853 static void GenerateVarHandleTarget(HInvoke* invoke,
4854 const VarHandleTarget& target,
4855 CodeGeneratorARM64* codegen) {
4856 MacroAssembler* masm = codegen->GetVIXLAssembler();
4857 Register varhandle = InputRegisterAt(invoke, 0);
4858 size_t expected_coordinates_count = GetExpectedVarHandleCoordinatesCount(invoke);
4859
4860 if (expected_coordinates_count <= 1u) {
4861 if (VarHandleOptimizations(invoke).GetUseKnownImageVarHandle()) {
4862 ScopedObjectAccess soa(Thread::Current());
4863 ArtField* target_field = GetBootImageVarHandleField(invoke);
4864 if (expected_coordinates_count == 0u) {
4865 ObjPtr<mirror::Class> declaring_class = target_field->GetDeclaringClass();
4866 if (Runtime::Current()->GetHeap()->ObjectIsInBootImageSpace(declaring_class)) {
4867 uint32_t boot_image_offset = CodeGenerator::GetBootImageOffset(declaring_class);
4868 codegen->LoadBootImageRelRoEntry(target.object, boot_image_offset);
4869 } else {
4870 codegen->LoadTypeForBootImageIntrinsic(
4871 target.object,
4872 TypeReference(&declaring_class->GetDexFile(), declaring_class->GetDexTypeIndex()));
4873 }
4874 }
4875 __ Mov(target.offset, target_field->GetOffset().Uint32Value());
4876 } else {
4877 // For static fields, we need to fill the `target.object` with the declaring class,
4878 // so we can use `target.object` as temporary for the `ArtField*`. For instance fields,
4879 // we do not need the declaring class, so we can forget the `ArtField*` when
4880 // we load the `target.offset`, so use the `target.offset` to hold the `ArtField*`.
4881 Register field = (expected_coordinates_count == 0) ? target.object : target.offset;
4882
4883 const MemberOffset art_field_offset = mirror::FieldVarHandle::ArtFieldOffset();
4884 const MemberOffset offset_offset = ArtField::OffsetOffset();
4885
4886 // Load the ArtField*, the offset and, if needed, declaring class.
4887 __ Ldr(field.X(), HeapOperand(varhandle, art_field_offset.Int32Value()));
4888 __ Ldr(target.offset, MemOperand(field.X(), offset_offset.Int32Value()));
4889 if (expected_coordinates_count == 0u) {
4890 codegen->GenerateGcRootFieldLoad(invoke,
4891 LocationFrom(target.object),
4892 field.X(),
4893 ArtField::DeclaringClassOffset().Int32Value(),
4894 /*fixup_label=*/nullptr,
4895 codegen->GetCompilerReadBarrierOption());
4896 }
4897 }
4898 } else {
4899 DCHECK_EQ(expected_coordinates_count, 2u);
4900 DataType::Type value_type =
4901 GetVarHandleExpectedValueType(invoke, /*expected_coordinates_count=*/ 2u);
4902 size_t size_shift = DataType::SizeShift(value_type);
4903 MemberOffset data_offset = mirror::Array::DataOffset(DataType::Size(value_type));
4904
4905 Register index = InputRegisterAt(invoke, 2);
4906 Register shifted_index = index;
4907 if (size_shift != 0u) {
4908 shifted_index = target.offset;
4909 __ Lsl(shifted_index, index, size_shift);
4910 }
4911 __ Add(target.offset, shifted_index, data_offset.Int32Value());
4912 }
4913 }
4914
4915 static LocationSummary* CreateVarHandleCommonLocations(HInvoke* invoke,
4916 CodeGeneratorARM64* codegen) {
4917 size_t expected_coordinates_count = GetExpectedVarHandleCoordinatesCount(invoke);
4918 DataType::Type return_type = invoke->GetType();
4919
4920 ArenaAllocator* allocator = invoke->GetBlock()->GetGraph()->GetAllocator();
4921 LocationSummary* locations =
4922 new (allocator) LocationSummary(invoke, LocationSummary::kCallOnSlowPath, kIntrinsified);
4923 locations->SetInAt(0, Location::RequiresRegister());
4924 // Require coordinates in registers. These are the object holding the value
4925 // to operate on (except for static fields) and index (for arrays and views).
4926 for (size_t i = 0; i != expected_coordinates_count; ++i) {
4927 locations->SetInAt(/* VarHandle object */ 1u + i, Location::RequiresRegister());
4928 }
4929 if (return_type != DataType::Type::kVoid) {
4930 if (DataType::IsFloatingPointType(return_type)) {
4931 locations->SetOut(Location::RequiresFpuRegister());
4932 } else {
4933 locations->SetOut(Location::RequiresRegister());
4934 }
4935 }
4936 uint32_t arguments_start = /* VarHandle object */ 1u + expected_coordinates_count;
4937 uint32_t number_of_arguments = invoke->GetNumberOfArguments();
4938 for (size_t arg_index = arguments_start; arg_index != number_of_arguments; ++arg_index) {
4939 HInstruction* arg = invoke->InputAt(arg_index);
4940 if (IsZeroBitPattern(arg)) {
4941 locations->SetInAt(arg_index, Location::ConstantLocation(arg));
4942 } else if (DataType::IsFloatingPointType(arg->GetType())) {
4943 locations->SetInAt(arg_index, Location::RequiresFpuRegister());
4944 } else {
4945 locations->SetInAt(arg_index, Location::RequiresRegister());
4946 }
4947 }
4948
4949 // Add a temporary for offset.
4950 if (codegen->EmitNonBakerReadBarrier() &&
4951 GetExpectedVarHandleCoordinatesCount(invoke) == 0u) { // For static fields.
4952 // To preserve the offset value across the non-Baker read barrier slow path
4953 // for loading the declaring class, use a fixed callee-save register.
4954 constexpr int first_callee_save = CTZ(kArm64CalleeSaveRefSpills);
4955 locations->AddTemp(Location::RegisterLocation(first_callee_save));
4956 } else {
4957 locations->AddTemp(Location::RequiresRegister());
4958 }
4959 if (expected_coordinates_count == 0u) {
4960 // Add a temporary to hold the declaring class.
4961 locations->AddTemp(Location::RequiresRegister());
4962 }
4963
4964 return locations;
4965 }
4966
4967 static void CreateVarHandleGetLocations(HInvoke* invoke, CodeGeneratorARM64* codegen) {
4968 VarHandleOptimizations optimizations(invoke);
4969 if (optimizations.GetDoNotIntrinsify()) {
4970 return;
4971 }
4972
4973 if (codegen->EmitNonBakerReadBarrier() &&
4974 invoke->GetType() == DataType::Type::kReference &&
4975 invoke->GetIntrinsic() != Intrinsics::kVarHandleGet &&
4976 invoke->GetIntrinsic() != Intrinsics::kVarHandleGetOpaque) {
4977 // Unsupported for non-Baker read barrier because the artReadBarrierSlow() ignores
4978 // the passed reference and reloads it from the field. This gets the memory visibility
4979 // wrong for Acquire/Volatile operations. b/173104084
4980 return;
4981 }
4982
4983 CreateVarHandleCommonLocations(invoke, codegen);
4984 }
4985
4986 static void GenerateVarHandleGet(HInvoke* invoke,
4987 CodeGeneratorARM64* codegen,
4988 std::memory_order order,
4989 bool byte_swap = false) {
4990 DataType::Type type = invoke->GetType();
4991 DCHECK_NE(type, DataType::Type::kVoid);
4992
4993 LocationSummary* locations = invoke->GetLocations();
4994 MacroAssembler* masm = codegen->GetVIXLAssembler();
4995 CPURegister out = helpers::OutputCPURegister(invoke);
4996
4997 VarHandleTarget target = GetVarHandleTarget(invoke);
4998 VarHandleSlowPathARM64* slow_path = nullptr;
4999 if (!byte_swap) {
5000 slow_path = GenerateVarHandleChecks(invoke, codegen, order, type);
5001 GenerateVarHandleTarget(invoke, target, codegen);
5002 if (slow_path != nullptr) {
5003 __ Bind(slow_path->GetNativeByteOrderLabel());
5004 }
5005 }
5006
5007 // ARM64 load-acquire instructions are implicitly sequentially consistent.
5008 bool use_load_acquire =
5009 (order == std::memory_order_acquire) || (order == std::memory_order_seq_cst);
5010 DCHECK(use_load_acquire || order == std::memory_order_relaxed);
5011
5012 // Load the value from the target location.
5013 if (type == DataType::Type::kReference && codegen->EmitBakerReadBarrier()) {
5014 // Piggy-back on the field load path using introspection for the Baker read barrier.
5015 // The `target.offset` is a temporary, use it for field address.
5016 Register tmp_ptr = target.offset.X();
5017 __ Add(tmp_ptr, target.object.X(), target.offset.X());
5018 codegen->GenerateFieldLoadWithBakerReadBarrier(invoke,
5019 locations->Out(),
5020 target.object,
5021 MemOperand(tmp_ptr),
5022 /*needs_null_check=*/ false,
5023 use_load_acquire);
5024 DCHECK(!byte_swap);
5025 } else {
5026 MemOperand address(target.object.X(), target.offset.X());
5027 CPURegister load_reg = out;
5028 DataType::Type load_type = type;
5029 UseScratchRegisterScope temps(masm);
5030 if (byte_swap) {
5031 if (type == DataType::Type::kInt16) {
5032 // Avoid unnecessary sign extension before REV16.
5033 load_type = DataType::Type::kUint16;
5034 } else if (type == DataType::Type::kFloat32) {
5035 load_type = DataType::Type::kInt32;
5036 load_reg = target.offset.W();
5037 } else if (type == DataType::Type::kFloat64) {
5038 load_type = DataType::Type::kInt64;
5039 load_reg = target.offset.X();
5040 }
5041 }
5042 if (use_load_acquire) {
5043 codegen->LoadAcquire(invoke, load_type, load_reg, address, /*needs_null_check=*/ false);
5044 } else {
5045 codegen->Load(load_type, load_reg, address);
5046 }
5047 if (type == DataType::Type::kReference) {
5048 DCHECK(!byte_swap);
5049 DCHECK(out.IsW());
5050 Location out_loc = locations->Out();
5051 Location object_loc = LocationFrom(target.object);
5052 Location offset_loc = LocationFrom(target.offset);
5053 codegen->MaybeGenerateReadBarrierSlow(invoke, out_loc, out_loc, object_loc, 0u, offset_loc);
5054 } else if (byte_swap) {
5055 GenerateReverseBytes(masm, type, load_reg, out);
5056 }
5057 }
5058
5059 if (slow_path != nullptr) {
5060 DCHECK(!byte_swap);
5061 __ Bind(slow_path->GetExitLabel());
5062 }
5063 }
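// Rough shape of the code emitted above for a non-reference target (an
// informal sketch; the exact instructions depend on the type and the read
// barrier configuration): get/getOpaque use a plain load (LDR/LDRH/LDRB) from
// [object + offset], getAcquire/getVolatile go through LoadAcquire(), and the
// byte-swapped view path performs the same load into a core register and then
// calls GenerateReverseBytes(), which also moves float/double results back to
// an FP register.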
5064
5065 void IntrinsicLocationsBuilderARM64::VisitVarHandleGet(HInvoke* invoke) {
5066 CreateVarHandleGetLocations(invoke, codegen_);
5067 }
5068
5069 void IntrinsicCodeGeneratorARM64::VisitVarHandleGet(HInvoke* invoke) {
5070 GenerateVarHandleGet(invoke, codegen_, std::memory_order_relaxed);
5071 }
5072
5073 void IntrinsicLocationsBuilderARM64::VisitVarHandleGetOpaque(HInvoke* invoke) {
5074 CreateVarHandleGetLocations(invoke, codegen_);
5075 }
5076
5077 void IntrinsicCodeGeneratorARM64::VisitVarHandleGetOpaque(HInvoke* invoke) {
5078 GenerateVarHandleGet(invoke, codegen_, std::memory_order_relaxed);
5079 }
5080
5081 void IntrinsicLocationsBuilderARM64::VisitVarHandleGetAcquire(HInvoke* invoke) {
5082 CreateVarHandleGetLocations(invoke, codegen_);
5083 }
5084
5085 void IntrinsicCodeGeneratorARM64::VisitVarHandleGetAcquire(HInvoke* invoke) {
5086 GenerateVarHandleGet(invoke, codegen_, std::memory_order_acquire);
5087 }
5088
5089 void IntrinsicLocationsBuilderARM64::VisitVarHandleGetVolatile(HInvoke* invoke) {
5090 CreateVarHandleGetLocations(invoke, codegen_);
5091 }
5092
5093 void IntrinsicCodeGeneratorARM64::VisitVarHandleGetVolatile(HInvoke* invoke) {
5094 GenerateVarHandleGet(invoke, codegen_, std::memory_order_seq_cst);
5095 }
5096
5097 static void CreateVarHandleSetLocations(HInvoke* invoke, CodeGeneratorARM64* codegen) {
5098 VarHandleOptimizations optimizations(invoke);
5099 if (optimizations.GetDoNotIntrinsify()) {
5100 return;
5101 }
5102
5103 CreateVarHandleCommonLocations(invoke, codegen);
5104 }
5105
5106 static void GenerateVarHandleSet(HInvoke* invoke,
5107 CodeGeneratorARM64* codegen,
5108 std::memory_order order,
5109 bool byte_swap = false) {
5110 uint32_t value_index = invoke->GetNumberOfArguments() - 1;
5111 DataType::Type value_type = GetDataTypeFromShorty(invoke, value_index);
5112
5113 MacroAssembler* masm = codegen->GetVIXLAssembler();
5114 CPURegister value = InputCPURegisterOrZeroRegAt(invoke, value_index);
5115
5116 VarHandleTarget target = GetVarHandleTarget(invoke);
5117 VarHandleSlowPathARM64* slow_path = nullptr;
5118 if (!byte_swap) {
5119 slow_path = GenerateVarHandleChecks(invoke, codegen, order, value_type);
5120 GenerateVarHandleTarget(invoke, target, codegen);
5121 if (slow_path != nullptr) {
5122 __ Bind(slow_path->GetNativeByteOrderLabel());
5123 }
5124 }
5125
5126 // ARM64 store-release instructions are implicitly sequentially consistent.
5127 bool use_store_release =
5128 (order == std::memory_order_release) || (order == std::memory_order_seq_cst);
5129 DCHECK(use_store_release || order == std::memory_order_relaxed);
5130
5131 // Store the value to the target location.
5132 {
5133 CPURegister source = value;
5134 UseScratchRegisterScope temps(masm);
5135 if (kPoisonHeapReferences && value_type == DataType::Type::kReference) {
5136 DCHECK(value.IsW());
5137 Register temp = temps.AcquireW();
5138 __ Mov(temp, value.W());
5139 codegen->GetAssembler()->PoisonHeapReference(temp);
5140 source = temp;
5141 }
5142 if (byte_swap) {
5143 DCHECK(!source.IsZero()); // We use the main path for zero as it does not need a byte swap.
5144 Register temp = source.Is64Bits() ? temps.AcquireX() : temps.AcquireW();
5145 if (value_type == DataType::Type::kInt16) {
5146 // Avoid unnecessary sign extension before storing.
5147 value_type = DataType::Type::kUint16;
5148 } else if (DataType::IsFloatingPointType(value_type)) {
5149 __ Fmov(temp, source.Is64Bits() ? source.D() : source.S());
5150 value_type = source.Is64Bits() ? DataType::Type::kInt64 : DataType::Type::kInt32;
5151 source = temp; // Source for the `GenerateReverseBytes()` below.
5152 }
5153 GenerateReverseBytes(masm, value_type, source, temp);
5154 source = temp;
5155 }
5156 MemOperand address(target.object.X(), target.offset.X());
5157 if (use_store_release) {
5158 codegen->StoreRelease(invoke, value_type, source, address, /*needs_null_check=*/ false);
5159 } else {
5160 codegen->Store(value_type, source, address);
5161 }
5162 }
5163
5164 if (CodeGenerator::StoreNeedsWriteBarrier(value_type, invoke->InputAt(value_index))) {
5165 codegen->MaybeMarkGCCard(target.object, Register(value), /* emit_null_check= */ true);
5166 }
5167
5168 if (slow_path != nullptr) {
5169 DCHECK(!byte_swap);
5170 __ Bind(slow_path->GetExitLabel());
5171 }
5172 }
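// Rough shape of the code emitted above (an informal sketch): the value is
// poisoned first if it is a reference and heap poisoning is enabled, then
// byte-swapped for the byte array/byte buffer view path, and finally stored
// with a plain store for set/setOpaque or with StoreRelease() for
// setRelease/setVolatile. Reference stores are followed by a GC card mark.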
5173
5174 void IntrinsicLocationsBuilderARM64::VisitVarHandleSet(HInvoke* invoke) {
5175 CreateVarHandleSetLocations(invoke, codegen_);
5176 }
5177
5178 void IntrinsicCodeGeneratorARM64::VisitVarHandleSet(HInvoke* invoke) {
5179 GenerateVarHandleSet(invoke, codegen_, std::memory_order_relaxed);
5180 }
5181
5182 void IntrinsicLocationsBuilderARM64::VisitVarHandleSetOpaque(HInvoke* invoke) {
5183 CreateVarHandleSetLocations(invoke, codegen_);
5184 }
5185
5186 void IntrinsicCodeGeneratorARM64::VisitVarHandleSetOpaque(HInvoke* invoke) {
5187 GenerateVarHandleSet(invoke, codegen_, std::memory_order_relaxed);
5188 }
5189
5190 void IntrinsicLocationsBuilderARM64::VisitVarHandleSetRelease(HInvoke* invoke) {
5191 CreateVarHandleSetLocations(invoke, codegen_);
5192 }
5193
5194 void IntrinsicCodeGeneratorARM64::VisitVarHandleSetRelease(HInvoke* invoke) {
5195 GenerateVarHandleSet(invoke, codegen_, std::memory_order_release);
5196 }
5197
5198 void IntrinsicLocationsBuilderARM64::VisitVarHandleSetVolatile(HInvoke* invoke) {
5199 CreateVarHandleSetLocations(invoke, codegen_);
5200 }
5201
5202 void IntrinsicCodeGeneratorARM64::VisitVarHandleSetVolatile(HInvoke* invoke) {
5203 GenerateVarHandleSet(invoke, codegen_, std::memory_order_seq_cst);
5204 }
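// Note on the ordering used by the Set visitors above: set and setOpaque map
// to std::memory_order_relaxed, setRelease to release and setVolatile to
// seq_cst. On ARM64 both of the latter are emitted as a store-release,
// relying on the property noted above that ARM64 store-release and
// load-acquire instructions are implicitly sequentially consistent.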
5205
5206 static void CreateVarHandleCompareAndSetOrExchangeLocations(HInvoke* invoke,
5207 CodeGeneratorARM64* codegen,
5208 bool return_success) {
5209 VarHandleOptimizations optimizations(invoke);
5210 if (optimizations.GetDoNotIntrinsify()) {
5211 return;
5212 }
5213
5214 uint32_t number_of_arguments = invoke->GetNumberOfArguments();
5215 DataType::Type value_type = GetDataTypeFromShorty(invoke, number_of_arguments - 1u);
5216 if (value_type == DataType::Type::kReference && codegen->EmitNonBakerReadBarrier()) {
5217 // Unsupported for non-Baker read barrier because the artReadBarrierSlow() ignores
5218 // the passed reference and reloads it from the field. This breaks the read barriers
5219 // in slow path in different ways. The marked old value may not actually be a to-space
5220 // reference to the same object as `old_value`, breaking slow path assumptions. And
5221 // for CompareAndExchange, marking the old value after comparison failure may actually
5222 // return the reference to `expected`, erroneously indicating success even though we
5223 // did not set the new value. (And it also gets the memory visibility wrong.) b/173104084
5224 return;
5225 }
5226
5227 LocationSummary* locations = CreateVarHandleCommonLocations(invoke, codegen);
5228
5229 if (codegen->EmitNonBakerReadBarrier()) {
5230 // We need callee-save registers for both the class object and offset instead of
5231 // the temporaries reserved in CreateVarHandleCommonLocations().
5232 static_assert(POPCOUNT(kArm64CalleeSaveRefSpills) >= 2u);
5233 uint32_t first_callee_save = CTZ(kArm64CalleeSaveRefSpills);
5234 uint32_t second_callee_save = CTZ(kArm64CalleeSaveRefSpills ^ (1u << first_callee_save));
5235 if (GetExpectedVarHandleCoordinatesCount(invoke) == 0u) { // For static fields.
5236 DCHECK_EQ(locations->GetTempCount(), 2u);
5237 DCHECK(locations->GetTemp(0u).Equals(Location::RequiresRegister()));
5238 DCHECK(locations->GetTemp(1u).Equals(Location::RegisterLocation(first_callee_save)));
5239 locations->SetTempAt(0u, Location::RegisterLocation(second_callee_save));
5240 } else {
5241 DCHECK_EQ(locations->GetTempCount(), 1u);
5242 DCHECK(locations->GetTemp(0u).Equals(Location::RequiresRegister()));
5243 locations->SetTempAt(0u, Location::RegisterLocation(first_callee_save));
5244 }
5245 }
5246 size_t old_temp_count = locations->GetTempCount();
5247 DCHECK_EQ(old_temp_count, (GetExpectedVarHandleCoordinatesCount(invoke) == 0) ? 2u : 1u);
5248 if (!return_success) {
5249 if (DataType::IsFloatingPointType(value_type)) {
5250 // Add a temporary for old value and exclusive store result if floating point
5251 // `expected` and/or `new_value` take scratch registers.
5252 size_t available_scratch_registers =
5253 (IsZeroBitPattern(invoke->InputAt(number_of_arguments - 1u)) ? 1u : 0u) +
5254 (IsZeroBitPattern(invoke->InputAt(number_of_arguments - 2u)) ? 1u : 0u);
5255 size_t temps_needed = /* pointer, old value, store result */ 3u - available_scratch_registers;
5256 // We can reuse the declaring class (if present) and offset temporary.
5257 if (temps_needed > old_temp_count) {
5258 locations->AddRegisterTemps(temps_needed - old_temp_count);
5259 }
5260 } else if ((value_type != DataType::Type::kReference && DataType::Size(value_type) != 1u) &&
5261 !IsZeroBitPattern(invoke->InputAt(number_of_arguments - 2u)) &&
5262 !IsZeroBitPattern(invoke->InputAt(number_of_arguments - 1u)) &&
5263 GetExpectedVarHandleCoordinatesCount(invoke) == 2u) {
5264 // Allocate a normal temporary for store result in the non-native byte order path
5265 // because scratch registers are used by the byte-swapped `expected` and `new_value`.
5266 DCHECK_EQ(old_temp_count, 1u);
5267 locations->AddTemp(Location::RequiresRegister());
5268 }
5269 }
5270 if (value_type == DataType::Type::kReference && codegen->EmitReadBarrier()) {
5271 // Add a temporary for the `old_value_temp` in slow path.
5272 locations->AddTemp(Location::RequiresRegister());
5273 }
5274 }
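// Informal summary of the extra locations reserved above: with a non-Baker
// read barrier the offset (and, for static fields, the declaring class) is
// pinned to callee-save registers; the exchange variants get extra temporaries
// when floating point `expected`/`new_value` are not the +0.0 bit pattern, or
// a store-result temporary for the byte-swapped path of multi-byte integral
// types; and with a read barrier a temporary is reserved for the old value
// used by the reference CAS slow path.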
5275
5276 static Register MoveToTempIfFpRegister(const CPURegister& cpu_reg,
5277 DataType::Type type,
5278 MacroAssembler* masm,
5279 UseScratchRegisterScope* temps) {
5280 if (cpu_reg.IsS()) {
5281 DCHECK_EQ(type, DataType::Type::kFloat32);
5282 Register reg = temps->AcquireW();
5283 __ Fmov(reg, cpu_reg.S());
5284 return reg;
5285 } else if (cpu_reg.IsD()) {
5286 DCHECK_EQ(type, DataType::Type::kFloat64);
5287 Register reg = temps->AcquireX();
5288 __ Fmov(reg, cpu_reg.D());
5289 return reg;
5290 } else {
5291 return DataType::Is64BitType(type) ? cpu_reg.X() : cpu_reg.W();
5292 }
5293 }
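// Moving float/double values into core registers here is what gives the CAS
// and exchange code below its bitwise comparison semantics for floating point
// values (see the comment at the call site): +0.0 and -0.0 compare as
// different values, and a NaN matches only the identical NaN bit pattern,
// unlike what an FP compare instruction would report.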
5294
5295 static void GenerateVarHandleCompareAndSetOrExchange(HInvoke* invoke,
5296 CodeGeneratorARM64* codegen,
5297 std::memory_order order,
5298 bool return_success,
5299 bool strong,
5300 bool byte_swap = false) {
5301 DCHECK(return_success || strong);
5302
5303 uint32_t expected_index = invoke->GetNumberOfArguments() - 2;
5304 uint32_t new_value_index = invoke->GetNumberOfArguments() - 1;
5305 DataType::Type value_type = GetDataTypeFromShorty(invoke, new_value_index);
5306 DCHECK_EQ(value_type, GetDataTypeFromShorty(invoke, expected_index));
5307
5308 MacroAssembler* masm = codegen->GetVIXLAssembler();
5309 LocationSummary* locations = invoke->GetLocations();
5310 CPURegister expected = InputCPURegisterOrZeroRegAt(invoke, expected_index);
5311 CPURegister new_value = InputCPURegisterOrZeroRegAt(invoke, new_value_index);
5312 CPURegister out = helpers::OutputCPURegister(invoke);
5313
5314 VarHandleTarget target = GetVarHandleTarget(invoke);
5315 VarHandleSlowPathARM64* slow_path = nullptr;
5316 if (!byte_swap) {
5317 slow_path = GenerateVarHandleChecks(invoke, codegen, order, value_type);
5318 GenerateVarHandleTarget(invoke, target, codegen);
5319 if (slow_path != nullptr) {
5320 slow_path->SetCompareAndSetOrExchangeArgs(return_success, strong);
5321 __ Bind(slow_path->GetNativeByteOrderLabel());
5322 }
5323 }
5324
5325 // This needs to be before the temp registers, as MarkGCCard also uses VIXL temps.
5326 if (CodeGenerator::StoreNeedsWriteBarrier(value_type, invoke->InputAt(new_value_index))) {
5327 // Mark card for object assuming new value is stored.
5328 bool new_value_can_be_null = true; // TODO: Worth finding out this information?
5329 codegen->MaybeMarkGCCard(target.object, new_value.W(), new_value_can_be_null);
5330 }
5331
5332 // Reuse the `offset` temporary for the pointer to the target location,
5333 // except for references that need the offset for the read barrier.
5334 UseScratchRegisterScope temps(masm);
5335 Register tmp_ptr = target.offset.X();
5336 if (value_type == DataType::Type::kReference && codegen->EmitReadBarrier()) {
5337 tmp_ptr = temps.AcquireX();
5338 }
5339 __ Add(tmp_ptr, target.object.X(), target.offset.X());
5340
5341 // Move floating point values to scratch registers.
5342 // Note that float/double CAS uses bitwise comparison, rather than the operator==.
5343 Register expected_reg = MoveToTempIfFpRegister(expected, value_type, masm, &temps);
5344 Register new_value_reg = MoveToTempIfFpRegister(new_value, value_type, masm, &temps);
5345 bool is_fp = DataType::IsFloatingPointType(value_type);
5346 DataType::Type cas_type = is_fp
5347 ? ((value_type == DataType::Type::kFloat64) ? DataType::Type::kInt64 : DataType::Type::kInt32)
5348 : value_type;
5349 // Avoid sign extension in the CAS loop by zero-extending `expected` before the loop. This adds
5350 // one instruction for CompareAndExchange as we shall need to sign-extend the returned value.
5351 if (value_type == DataType::Type::kInt16 && !expected.IsZero()) {
5352 Register temp = temps.AcquireW();
5353 __ Uxth(temp, expected_reg);
5354 expected_reg = temp;
5355 cas_type = DataType::Type::kUint16;
5356 } else if (value_type == DataType::Type::kInt8 && !expected.IsZero()) {
5357 Register temp = temps.AcquireW();
5358 __ Uxtb(temp, expected_reg);
5359 expected_reg = temp;
5360 cas_type = DataType::Type::kUint8;
5361 }
5362
5363 if (byte_swap) {
5364 // Do the byte swap and move values to scratch registers if needed.
5365 // Non-zero FP values and non-zero `expected` for `kInt16` are already in scratch registers.
5366 DCHECK_NE(value_type, DataType::Type::kInt8);
5367 if (!expected.IsZero()) {
5368 bool is_scratch = is_fp || (value_type == DataType::Type::kInt16);
5369 Register temp = is_scratch ? expected_reg : temps.AcquireSameSizeAs(expected_reg);
5370 GenerateReverseBytes(masm, cas_type, expected_reg, temp);
5371 expected_reg = temp;
5372 }
5373 if (!new_value.IsZero()) {
5374 Register temp = is_fp ? new_value_reg : temps.AcquireSameSizeAs(new_value_reg);
5375 GenerateReverseBytes(masm, cas_type, new_value_reg, temp);
5376 new_value_reg = temp;
5377 }
5378 }
5379
5380 // Prepare registers for old value and the result of the exclusive store.
5381 Register old_value;
5382 Register store_result;
5383 if (return_success) {
5384 // Use the output register for both old value and exclusive store result.
5385 old_value = (cas_type == DataType::Type::kInt64) ? out.X() : out.W();
5386 store_result = out.W();
5387 } else if (DataType::IsFloatingPointType(value_type)) {
5388 // We need two temporary registers but we have already used scratch registers for
5389 // holding the expected and new value unless they are zero bit pattern (+0.0f or
5390 // +0.0). We have allocated sufficient normal temporaries to handle that.
5391 size_t next_temp = 1u;
5392 if (expected.IsZero()) {
5393 old_value = (cas_type == DataType::Type::kInt64) ? temps.AcquireX() : temps.AcquireW();
5394 } else {
5395 Location temp = locations->GetTemp(next_temp);
5396 ++next_temp;
5397 old_value = (cas_type == DataType::Type::kInt64) ? XRegisterFrom(temp) : WRegisterFrom(temp);
5398 }
5399 store_result =
5400 new_value.IsZero() ? temps.AcquireW() : WRegisterFrom(locations->GetTemp(next_temp));
5401 DCHECK(!old_value.Is(tmp_ptr));
5402 DCHECK(!store_result.Is(tmp_ptr));
5403 } else {
5404 // Use the output register for the old value.
5405 old_value = (cas_type == DataType::Type::kInt64) ? out.X() : out.W();
5406 // Use scratch register for the store result, except when we have used up
5407 // scratch registers for byte-swapped `expected` and `new_value`.
5408 // In that case, we have allocated a normal temporary.
5409 store_result = (byte_swap && !expected.IsZero() && !new_value.IsZero())
5410 ? WRegisterFrom(locations->GetTemp(1))
5411 : temps.AcquireW();
5412 DCHECK(!store_result.Is(tmp_ptr));
5413 }
5414
5415 vixl::aarch64::Label exit_loop_label;
5416 vixl::aarch64::Label* exit_loop = &exit_loop_label;
5417 vixl::aarch64::Label* cmp_failure = &exit_loop_label;
5418
5419 if (value_type == DataType::Type::kReference && codegen->EmitReadBarrier()) {
5420 // The `old_value_temp` is used first for the marked `old_value` and then for the unmarked
5421 // reloaded old value for subsequent CAS in the slow path. It cannot be a scratch register.
5422 size_t expected_coordinates_count = GetExpectedVarHandleCoordinatesCount(invoke);
5423 Register old_value_temp =
5424 WRegisterFrom(locations->GetTemp((expected_coordinates_count == 0u) ? 2u : 1u));
5425 // For strong CAS, use a scratch register for the store result in slow path.
5426 // For weak CAS, we need to check the store result, so store it in `store_result`.
5427 Register slow_path_store_result = strong ? Register() : store_result;
5428 ReadBarrierCasSlowPathARM64* rb_slow_path =
5429 new (codegen->GetScopedAllocator()) ReadBarrierCasSlowPathARM64(
5430 invoke,
5431 order,
5432 strong,
5433 target.object,
5434 target.offset.X(),
5435 expected_reg,
5436 new_value_reg,
5437 old_value,
5438 old_value_temp,
5439 slow_path_store_result,
5440 /*update_old_value=*/ !return_success,
5441 codegen);
5442 codegen->AddSlowPath(rb_slow_path);
5443 exit_loop = rb_slow_path->GetExitLabel();
5444 cmp_failure = rb_slow_path->GetEntryLabel();
5445 }
5446
5447 GenerateCompareAndSet(codegen,
5448 cas_type,
5449 order,
5450 strong,
5451 cmp_failure,
5452 tmp_ptr,
5453 new_value_reg,
5454 old_value,
5455 store_result,
5456 expected_reg);
5457 __ Bind(exit_loop);
5458
5459 if (return_success) {
5460 if (strong) {
5461 __ Cset(out.W(), eq);
5462 } else {
5463 // On success, the Z flag is set and the store result is 1, see GenerateCompareAndSet().
5464 // On failure, either the Z flag is clear or the store result is 0.
5465 // Determine the final success value with a CSEL.
5466 __ Csel(out.W(), store_result, wzr, eq);
5467 }
5468 } else if (byte_swap) {
5469 // Also handles moving to FP registers.
5470 GenerateReverseBytes(masm, value_type, old_value, out);
5471 } else if (DataType::IsFloatingPointType(value_type)) {
5472 __ Fmov((value_type == DataType::Type::kFloat64) ? out.D() : out.S(), old_value);
5473 } else if (value_type == DataType::Type::kInt8) {
5474 __ Sxtb(out.W(), old_value);
5475 } else if (value_type == DataType::Type::kInt16) {
5476 __ Sxth(out.W(), old_value);
5477 }
5478
5479 if (slow_path != nullptr) {
5480 DCHECK(!byte_swap);
5481 __ Bind(slow_path->GetExitLabel());
5482 }
5483 }
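// Informal sketch of the loop emitted by GenerateCompareAndSet() above for a
// 32-bit value (register names are illustrative; the real sequence varies with
// the type, ordering and CAS strength):
//   retry: ldaxr w_old, [x_ptr]
//          cmp   w_old, w_expected
//          b.ne  cmp_failure
//          stlxr w_res, w_new, [x_ptr]
//          cbnz  w_res, retry            // Strong CAS retries spurious failures.
// The epilogue above then materializes the boolean result (Cset/Csel), or
// sign-extends / moves the old value for the exchange variants.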
5484
5485 void IntrinsicLocationsBuilderARM64::VisitVarHandleCompareAndExchange(HInvoke* invoke) {
5486 CreateVarHandleCompareAndSetOrExchangeLocations(invoke, codegen_, /*return_success=*/ false);
5487 }
5488
5489 void IntrinsicCodeGeneratorARM64::VisitVarHandleCompareAndExchange(HInvoke* invoke) {
5490 GenerateVarHandleCompareAndSetOrExchange(
5491 invoke, codegen_, std::memory_order_seq_cst, /*return_success=*/ false, /*strong=*/ true);
5492 }
5493
5494 void IntrinsicLocationsBuilderARM64::VisitVarHandleCompareAndExchangeAcquire(HInvoke* invoke) {
5495 CreateVarHandleCompareAndSetOrExchangeLocations(invoke, codegen_, /*return_success=*/ false);
5496 }
5497
5498 void IntrinsicCodeGeneratorARM64::VisitVarHandleCompareAndExchangeAcquire(HInvoke* invoke) {
5499 GenerateVarHandleCompareAndSetOrExchange(
5500 invoke, codegen_, std::memory_order_acquire, /*return_success=*/ false, /*strong=*/ true);
5501 }
5502
5503 void IntrinsicLocationsBuilderARM64::VisitVarHandleCompareAndExchangeRelease(HInvoke* invoke) {
5504 CreateVarHandleCompareAndSetOrExchangeLocations(invoke, codegen_, /*return_success=*/ false);
5505 }
5506
5507 void IntrinsicCodeGeneratorARM64::VisitVarHandleCompareAndExchangeRelease(HInvoke* invoke) {
5508 GenerateVarHandleCompareAndSetOrExchange(
5509 invoke, codegen_, std::memory_order_release, /*return_success=*/ false, /*strong=*/ true);
5510 }
5511
5512 void IntrinsicLocationsBuilderARM64::VisitVarHandleCompareAndSet(HInvoke* invoke) {
5513 CreateVarHandleCompareAndSetOrExchangeLocations(invoke, codegen_, /*return_success=*/ true);
5514 }
5515
5516 void IntrinsicCodeGeneratorARM64::VisitVarHandleCompareAndSet(HInvoke* invoke) {
5517 GenerateVarHandleCompareAndSetOrExchange(
5518 invoke, codegen_, std::memory_order_seq_cst, /*return_success=*/ true, /*strong=*/ true);
5519 }
5520
5521 void IntrinsicLocationsBuilderARM64::VisitVarHandleWeakCompareAndSet(HInvoke* invoke) {
5522 CreateVarHandleCompareAndSetOrExchangeLocations(invoke, codegen_, /*return_success=*/ true);
5523 }
5524
5525 void IntrinsicCodeGeneratorARM64::VisitVarHandleWeakCompareAndSet(HInvoke* invoke) {
5526 GenerateVarHandleCompareAndSetOrExchange(
5527 invoke, codegen_, std::memory_order_seq_cst, /*return_success=*/ true, /*strong=*/ false);
5528 }
5529
5530 void IntrinsicLocationsBuilderARM64::VisitVarHandleWeakCompareAndSetAcquire(HInvoke* invoke) {
5531 CreateVarHandleCompareAndSetOrExchangeLocations(invoke, codegen_, /*return_success=*/ true);
5532 }
5533
5534 void IntrinsicCodeGeneratorARM64::VisitVarHandleWeakCompareAndSetAcquire(HInvoke* invoke) {
5535 GenerateVarHandleCompareAndSetOrExchange(
5536 invoke, codegen_, std::memory_order_acquire, /*return_success=*/ true, /*strong=*/ false);
5537 }
5538
5539 void IntrinsicLocationsBuilderARM64::VisitVarHandleWeakCompareAndSetPlain(HInvoke* invoke) {
5540 CreateVarHandleCompareAndSetOrExchangeLocations(invoke, codegen_, /*return_success=*/ true);
5541 }
5542
5543 void IntrinsicCodeGeneratorARM64::VisitVarHandleWeakCompareAndSetPlain(HInvoke* invoke) {
5544 GenerateVarHandleCompareAndSetOrExchange(
5545 invoke, codegen_, std::memory_order_relaxed, /*return_success=*/ true, /*strong=*/ false);
5546 }
5547
5548 void IntrinsicLocationsBuilderARM64::VisitVarHandleWeakCompareAndSetRelease(HInvoke* invoke) {
5549 CreateVarHandleCompareAndSetOrExchangeLocations(invoke, codegen_, /*return_success=*/ true);
5550 }
5551
5552 void IntrinsicCodeGeneratorARM64::VisitVarHandleWeakCompareAndSetRelease(HInvoke* invoke) {
5553 GenerateVarHandleCompareAndSetOrExchange(
5554 invoke, codegen_, std::memory_order_release, /*return_success=*/ true, /*strong=*/ false);
5555 }
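// Memory order and strength used by the visitors above:
//   compareAndSet                          -> seq_cst, strong, returns success
//   weakCompareAndSet / ...Plain           -> seq_cst / relaxed, weak, returns success
//   weakCompareAndSetAcquire / ...Release  -> acquire / release, weak, returns success
//   compareAndExchange{,Acquire,Release}   -> seq_cst / acquire / release, strong, returns old value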
5556
5557 static void CreateVarHandleGetAndUpdateLocations(HInvoke* invoke,
5558 CodeGeneratorARM64* codegen,
5559 GetAndUpdateOp get_and_update_op) {
5560 VarHandleOptimizations optimizations(invoke);
5561 if (optimizations.GetDoNotIntrinsify()) {
5562 return;
5563 }
5564
5565 // Get the type from the shorty as the invokes may not return a value.
5566 uint32_t arg_index = invoke->GetNumberOfArguments() - 1;
5567 DataType::Type value_type = GetDataTypeFromShorty(invoke, arg_index);
5568 if (value_type == DataType::Type::kReference && codegen->EmitNonBakerReadBarrier()) {
5569 // Unsupported for non-Baker read barrier because the artReadBarrierSlow() ignores
5570 // the passed reference and reloads it from the field, thus seeing the new value
5571 // that we have just stored. (And it also gets the memory visibility wrong.) b/173104084
5572 return;
5573 }
5574
5575 LocationSummary* locations = CreateVarHandleCommonLocations(invoke, codegen);
5576 size_t old_temp_count = locations->GetTempCount();
5577
5578 DCHECK_EQ(old_temp_count, (GetExpectedVarHandleCoordinatesCount(invoke) == 0) ? 2u : 1u);
5579 if (DataType::IsFloatingPointType(value_type)) {
5580 if (get_and_update_op == GetAndUpdateOp::kAdd) {
5581 // For ADD, do not use ZR for zero bit pattern (+0.0f or +0.0).
5582 locations->SetInAt(invoke->GetNumberOfArguments() - 1u, Location::RequiresFpuRegister());
5583 } else {
5584 DCHECK(get_and_update_op == GetAndUpdateOp::kSet);
5585 // We can reuse the declaring class temporary if present.
5586 if (old_temp_count == 1u &&
5587 !IsZeroBitPattern(invoke->InputAt(invoke->GetNumberOfArguments() - 1u))) {
5588 // Add a temporary for `old_value` if floating point `new_value` takes a scratch register.
5589 locations->AddTemp(Location::RequiresRegister());
5590 }
5591 }
5592 }
5593 // We need a temporary for the byte-swap path for bitwise operations unless the argument is a
5594 // zero which does not need a byte-swap. We can reuse the declaring class temporary if present.
5595 if (old_temp_count == 1u &&
5596 (get_and_update_op != GetAndUpdateOp::kSet && get_and_update_op != GetAndUpdateOp::kAdd) &&
5597 GetExpectedVarHandleCoordinatesCount(invoke) == 2u &&
5598 !IsZeroBitPattern(invoke->InputAt(invoke->GetNumberOfArguments() - 1u))) {
5599 if (value_type != DataType::Type::kReference && DataType::Size(value_type) != 1u) {
5600 locations->AddTemp(Location::RequiresRegister());
5601 }
5602 }
5603
5604 // Request another temporary register for methods that don't return a value.
5605 // For the non-void case, we already set `out` in `CreateVarHandleCommonLocations`.
5606 DataType::Type return_type = invoke->GetType();
5607 const bool is_void = return_type == DataType::Type::kVoid;
5608 DCHECK_IMPLIES(!is_void, return_type == value_type);
5609 if (is_void) {
5610 if (DataType::IsFloatingPointType(value_type)) {
5611 locations->AddTemp(Location::RequiresFpuRegister());
5612 } else {
5613 locations->AddTemp(Location::RequiresRegister());
5614 }
5615 }
5616 }
5617
5618 static void GenerateVarHandleGetAndUpdate(HInvoke* invoke,
5619 CodeGeneratorARM64* codegen,
5620 GetAndUpdateOp get_and_update_op,
5621 std::memory_order order,
5622 bool byte_swap = false) {
5623 // Get the type from the shorty as the invokes may not return a value.
5624 uint32_t arg_index = invoke->GetNumberOfArguments() - 1;
5625 DataType::Type value_type = GetDataTypeFromShorty(invoke, arg_index);
5626 bool is_fp = DataType::IsFloatingPointType(value_type);
5627
5628 MacroAssembler* masm = codegen->GetVIXLAssembler();
5629 LocationSummary* locations = invoke->GetLocations();
5630 CPURegister arg = (is_fp && get_and_update_op == GetAndUpdateOp::kAdd)
5631 ? InputCPURegisterAt(invoke, arg_index)
5632 : InputCPURegisterOrZeroRegAt(invoke, arg_index);
5633 DataType::Type return_type = invoke->GetType();
5634 const bool is_void = return_type == DataType::Type::kVoid;
5635 DCHECK_IMPLIES(!is_void, return_type == value_type);
5636 // We use a temporary for void methods, as we don't return the value.
5637 CPURegister out_or_temp =
5638 is_void ? CPURegisterFrom(locations->GetTemp(locations->GetTempCount() - 1u), value_type) :
5639 helpers::OutputCPURegister(invoke);
5640
5641 VarHandleTarget target = GetVarHandleTarget(invoke);
5642 VarHandleSlowPathARM64* slow_path = nullptr;
5643 if (!byte_swap) {
5644 slow_path = GenerateVarHandleChecks(invoke, codegen, order, value_type);
5645 GenerateVarHandleTarget(invoke, target, codegen);
5646 if (slow_path != nullptr) {
5647 slow_path->SetGetAndUpdateOp(get_and_update_op);
5648 __ Bind(slow_path->GetNativeByteOrderLabel());
5649 }
5650 }
5651
5652 // This needs to be before the temp registers, as MarkGCCard also uses VIXL temps.
5653 if (CodeGenerator::StoreNeedsWriteBarrier(value_type, invoke->InputAt(arg_index))) {
5654 DCHECK(get_and_update_op == GetAndUpdateOp::kSet);
5655 // Mark card for object, the new value shall be stored.
5656 bool new_value_can_be_null = true; // TODO: Worth finding out this information?
5657 codegen->MaybeMarkGCCard(target.object, arg.W(), new_value_can_be_null);
5658 }
5659
5660 // Reuse the `target.offset` temporary for the pointer to the target location,
5661 // except for references that need the offset for the non-Baker read barrier.
5662 UseScratchRegisterScope temps(masm);
5663 Register tmp_ptr = target.offset.X();
5664 if (value_type == DataType::Type::kReference && codegen->EmitNonBakerReadBarrier()) {
5665 tmp_ptr = temps.AcquireX();
5666 }
5667 __ Add(tmp_ptr, target.object.X(), target.offset.X());
5668
5669 // The load/store type is never floating point.
5670 DataType::Type load_store_type = is_fp
5671 ? ((value_type == DataType::Type::kFloat32) ? DataType::Type::kInt32 : DataType::Type::kInt64)
5672 : value_type;
5673 // Avoid sign extension in the CAS loop. Sign-extend after the loop.
5674 // Note: Using unsigned values yields the same value to store (we do not store higher bits).
5675 if (value_type == DataType::Type::kInt8) {
5676 load_store_type = DataType::Type::kUint8;
5677 } else if (value_type == DataType::Type::kInt16) {
5678 load_store_type = DataType::Type::kUint16;
5679 }
5680
5681 // Prepare register for old value.
5682 CPURegister old_value = out_or_temp;
5683 if (get_and_update_op == GetAndUpdateOp::kSet) {
5684 // For floating point GetAndSet, do the GenerateGetAndUpdate() with core registers,
5685 // rather than moving between core and FP registers in the loop.
5686 arg = MoveToTempIfFpRegister(arg, value_type, masm, &temps);
5687 if (is_fp && !arg.IsZero()) {
5688 // We need a temporary register but we have already used a scratch register for
5689 // the new value unless it is zero bit pattern (+0.0f or +0.0) and need another one
5690 // in GenerateGetAndUpdate(). We have allocated a normal temporary to handle that.
5691 old_value = CPURegisterFrom(locations->GetTemp(1u), load_store_type);
5692 } else if (value_type == DataType::Type::kReference && codegen->EmitBakerReadBarrier()) {
5693 // Load the old value initially to a scratch register.
5694 // We shall move it to `out` later with a read barrier.
5695 old_value = temps.AcquireW();
5696 }
5697 }
5698
5699 if (byte_swap) {
5700 DCHECK_NE(value_type, DataType::Type::kReference);
5701 DCHECK_NE(DataType::Size(value_type), 1u);
5702 if (get_and_update_op == GetAndUpdateOp::kAdd) {
5703 // We need to do the byte swapping in the CAS loop for GetAndAdd.
5704 get_and_update_op = GetAndUpdateOp::kAddWithByteSwap;
5705 } else if (!arg.IsZero()) {
5706 // For other operations, avoid byte swap inside the CAS loop by providing an adjusted `arg`.
5707 // For GetAndSet use a scratch register; FP argument is already in a scratch register.
5708 // For bitwise operations GenerateGetAndUpdate() needs both scratch registers;
5709 // we have allocated a normal temporary to handle that.
5710 CPURegister temp = (get_and_update_op == GetAndUpdateOp::kSet)
5711 ? (is_fp ? arg : (arg.Is64Bits() ? temps.AcquireX() : temps.AcquireW()))
5712 : CPURegisterFrom(locations->GetTemp(1u), load_store_type);
5713 GenerateReverseBytes(masm, load_store_type, arg, temp);
5714 arg = temp;
5715 }
5716 }
5717
5718 GenerateGetAndUpdate(codegen, get_and_update_op, load_store_type, order, tmp_ptr, arg, old_value);
5719
5720 if (!is_void) {
5721 if (get_and_update_op == GetAndUpdateOp::kAddWithByteSwap) {
5722 // The only adjustment needed is sign-extension for `kInt16`.
5723 // Everything else has been done by the `GenerateGetAndUpdate()`.
5724 DCHECK(byte_swap);
5725 if (value_type == DataType::Type::kInt16) {
5726 DCHECK_EQ(load_store_type, DataType::Type::kUint16);
5727 __ Sxth(out_or_temp.W(), old_value.W());
5728 }
5729 } else if (byte_swap) {
5730 // Also handles moving to FP registers.
5731 GenerateReverseBytes(masm, value_type, old_value, out_or_temp);
5732 } else if (get_and_update_op == GetAndUpdateOp::kSet &&
5733 value_type == DataType::Type::kFloat64) {
5734 __ Fmov(out_or_temp.D(), old_value.X());
5735 } else if (get_and_update_op == GetAndUpdateOp::kSet &&
5736 value_type == DataType::Type::kFloat32) {
5737 __ Fmov(out_or_temp.S(), old_value.W());
5738 } else if (value_type == DataType::Type::kInt8) {
5739 __ Sxtb(out_or_temp.W(), old_value.W());
5740 } else if (value_type == DataType::Type::kInt16) {
5741 __ Sxth(out_or_temp.W(), old_value.W());
5742 } else if (value_type == DataType::Type::kReference && codegen->EmitReadBarrier()) {
5743 if (kUseBakerReadBarrier) {
5744 codegen->GenerateIntrinsicMoveWithBakerReadBarrier(out_or_temp.W(), old_value.W());
5745 } else {
5746 codegen->GenerateReadBarrierSlow(
5747 invoke,
5748 Location::RegisterLocation(out_or_temp.GetCode()),
5749 Location::RegisterLocation(old_value.GetCode()),
5750 Location::RegisterLocation(target.object.GetCode()),
5751 /*offset=*/0u,
5752 /*index=*/Location::RegisterLocation(target.offset.GetCode()));
5753 }
5754 }
5755 }
5756
5757 if (slow_path != nullptr) {
5758 DCHECK(!byte_swap);
5759 __ Bind(slow_path->GetExitLabel());
5760 }
5761 }
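// Informal sketch of what GenerateGetAndUpdate() expands to for these access
// modes (illustrative only; the exact sequence depends on the type and
// ordering): an exclusive load of the old value, then a register move
// (getAndSet), an add (getAndAdd) or an AND/ORR/EOR (the bitwise variants)
// applied to it, followed by an exclusive store and a retry on failure. The
// epilogue above sign-extends small integer results, moves float/double
// results back to FP registers, reverses bytes on the byte view path, and
// applies a read barrier to reference results.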
5762
5763 void IntrinsicLocationsBuilderARM64::VisitVarHandleGetAndSet(HInvoke* invoke) {
5764 CreateVarHandleGetAndUpdateLocations(invoke, codegen_, GetAndUpdateOp::kSet);
5765 }
5766
5767 void IntrinsicCodeGeneratorARM64::VisitVarHandleGetAndSet(HInvoke* invoke) {
5768 GenerateVarHandleGetAndUpdate(invoke, codegen_, GetAndUpdateOp::kSet, std::memory_order_seq_cst);
5769 }
5770
5771 void IntrinsicLocationsBuilderARM64::VisitVarHandleGetAndSetAcquire(HInvoke* invoke) {
5772 CreateVarHandleGetAndUpdateLocations(invoke, codegen_, GetAndUpdateOp::kSet);
5773 }
5774
5775 void IntrinsicCodeGeneratorARM64::VisitVarHandleGetAndSetAcquire(HInvoke* invoke) {
5776 GenerateVarHandleGetAndUpdate(invoke, codegen_, GetAndUpdateOp::kSet, std::memory_order_acquire);
5777 }
5778
5779 void IntrinsicLocationsBuilderARM64::VisitVarHandleGetAndSetRelease(HInvoke* invoke) {
5780 CreateVarHandleGetAndUpdateLocations(invoke, codegen_, GetAndUpdateOp::kSet);
5781 }
5782
5783 void IntrinsicCodeGeneratorARM64::VisitVarHandleGetAndSetRelease(HInvoke* invoke) {
5784 GenerateVarHandleGetAndUpdate(invoke, codegen_, GetAndUpdateOp::kSet, std::memory_order_release);
5785 }
5786
5787 void IntrinsicLocationsBuilderARM64::VisitVarHandleGetAndAdd(HInvoke* invoke) {
5788 CreateVarHandleGetAndUpdateLocations(invoke, codegen_, GetAndUpdateOp::kAdd);
5789 }
5790
5791 void IntrinsicCodeGeneratorARM64::VisitVarHandleGetAndAdd(HInvoke* invoke) {
5792 GenerateVarHandleGetAndUpdate(invoke, codegen_, GetAndUpdateOp::kAdd, std::memory_order_seq_cst);
5793 }
5794
5795 void IntrinsicLocationsBuilderARM64::VisitVarHandleGetAndAddAcquire(HInvoke* invoke) {
5796 CreateVarHandleGetAndUpdateLocations(invoke, codegen_, GetAndUpdateOp::kAdd);
5797 }
5798
5799 void IntrinsicCodeGeneratorARM64::VisitVarHandleGetAndAddAcquire(HInvoke* invoke) {
5800 GenerateVarHandleGetAndUpdate(invoke, codegen_, GetAndUpdateOp::kAdd, std::memory_order_acquire);
5801 }
5802
5803 void IntrinsicLocationsBuilderARM64::VisitVarHandleGetAndAddRelease(HInvoke* invoke) {
5804 CreateVarHandleGetAndUpdateLocations(invoke, codegen_, GetAndUpdateOp::kAdd);
5805 }
5806
5807 void IntrinsicCodeGeneratorARM64::VisitVarHandleGetAndAddRelease(HInvoke* invoke) {
5808 GenerateVarHandleGetAndUpdate(invoke, codegen_, GetAndUpdateOp::kAdd, std::memory_order_release);
5809 }
5810
5811 void IntrinsicLocationsBuilderARM64::VisitVarHandleGetAndBitwiseAnd(HInvoke* invoke) {
5812 CreateVarHandleGetAndUpdateLocations(invoke, codegen_, GetAndUpdateOp::kAnd);
5813 }
5814
5815 void IntrinsicCodeGeneratorARM64::VisitVarHandleGetAndBitwiseAnd(HInvoke* invoke) {
5816 GenerateVarHandleGetAndUpdate(invoke, codegen_, GetAndUpdateOp::kAnd, std::memory_order_seq_cst);
5817 }
5818
5819 void IntrinsicLocationsBuilderARM64::VisitVarHandleGetAndBitwiseAndAcquire(HInvoke* invoke) {
5820 CreateVarHandleGetAndUpdateLocations(invoke, codegen_, GetAndUpdateOp::kAnd);
5821 }
5822
5823 void IntrinsicCodeGeneratorARM64::VisitVarHandleGetAndBitwiseAndAcquire(HInvoke* invoke) {
5824 GenerateVarHandleGetAndUpdate(invoke, codegen_, GetAndUpdateOp::kAnd, std::memory_order_acquire);
5825 }
5826
5827 void IntrinsicLocationsBuilderARM64::VisitVarHandleGetAndBitwiseAndRelease(HInvoke* invoke) {
5828 CreateVarHandleGetAndUpdateLocations(invoke, codegen_, GetAndUpdateOp::kAnd);
5829 }
5830
5831 void IntrinsicCodeGeneratorARM64::VisitVarHandleGetAndBitwiseAndRelease(HInvoke* invoke) {
5832 GenerateVarHandleGetAndUpdate(invoke, codegen_, GetAndUpdateOp::kAnd, std::memory_order_release);
5833 }
5834
5835 void IntrinsicLocationsBuilderARM64::VisitVarHandleGetAndBitwiseOr(HInvoke* invoke) {
5836 CreateVarHandleGetAndUpdateLocations(invoke, codegen_, GetAndUpdateOp::kOr);
5837 }
5838
5839 void IntrinsicCodeGeneratorARM64::VisitVarHandleGetAndBitwiseOr(HInvoke* invoke) {
5840 GenerateVarHandleGetAndUpdate(invoke, codegen_, GetAndUpdateOp::kOr, std::memory_order_seq_cst);
5841 }
5842
5843 void IntrinsicLocationsBuilderARM64::VisitVarHandleGetAndBitwiseOrAcquire(HInvoke* invoke) {
5844 CreateVarHandleGetAndUpdateLocations(invoke, codegen_, GetAndUpdateOp::kOr);
5845 }
5846
5847 void IntrinsicCodeGeneratorARM64::VisitVarHandleGetAndBitwiseOrAcquire(HInvoke* invoke) {
5848 GenerateVarHandleGetAndUpdate(invoke, codegen_, GetAndUpdateOp::kOr, std::memory_order_acquire);
5849 }
5850
5851 void IntrinsicLocationsBuilderARM64::VisitVarHandleGetAndBitwiseOrRelease(HInvoke* invoke) {
5852 CreateVarHandleGetAndUpdateLocations(invoke, codegen_, GetAndUpdateOp::kOr);
5853 }
5854
5855 void IntrinsicCodeGeneratorARM64::VisitVarHandleGetAndBitwiseOrRelease(HInvoke* invoke) {
5856 GenerateVarHandleGetAndUpdate(invoke, codegen_, GetAndUpdateOp::kOr, std::memory_order_release);
5857 }
5858
5859 void IntrinsicLocationsBuilderARM64::VisitVarHandleGetAndBitwiseXor(HInvoke* invoke) {
5860 CreateVarHandleGetAndUpdateLocations(invoke, codegen_, GetAndUpdateOp::kXor);
5861 }
5862
5863 void IntrinsicCodeGeneratorARM64::VisitVarHandleGetAndBitwiseXor(HInvoke* invoke) {
5864 GenerateVarHandleGetAndUpdate(invoke, codegen_, GetAndUpdateOp::kXor, std::memory_order_seq_cst);
5865 }
5866
5867 void IntrinsicLocationsBuilderARM64::VisitVarHandleGetAndBitwiseXorAcquire(HInvoke* invoke) {
5868 CreateVarHandleGetAndUpdateLocations(invoke, codegen_, GetAndUpdateOp::kXor);
5869 }
5870
5871 void IntrinsicCodeGeneratorARM64::VisitVarHandleGetAndBitwiseXorAcquire(HInvoke* invoke) {
5872 GenerateVarHandleGetAndUpdate(invoke, codegen_, GetAndUpdateOp::kXor, std::memory_order_acquire);
5873 }
5874
5875 void IntrinsicLocationsBuilderARM64::VisitVarHandleGetAndBitwiseXorRelease(HInvoke* invoke) {
5876 CreateVarHandleGetAndUpdateLocations(invoke, codegen_, GetAndUpdateOp::kXor);
5877 }
5878
5879 void IntrinsicCodeGeneratorARM64::VisitVarHandleGetAndBitwiseXorRelease(HInvoke* invoke) {
5880 GenerateVarHandleGetAndUpdate(invoke, codegen_, GetAndUpdateOp::kXor, std::memory_order_release);
5881 }
5882
5883 void VarHandleSlowPathARM64::EmitByteArrayViewCode(CodeGenerator* codegen_in) {
5884 DCHECK(GetByteArrayViewCheckLabel()->IsLinked());
5885 CodeGeneratorARM64* codegen = down_cast<CodeGeneratorARM64*>(codegen_in);
5886 MacroAssembler* masm = codegen->GetVIXLAssembler();
5887 HInvoke* invoke = GetInvoke();
5888 mirror::VarHandle::AccessModeTemplate access_mode_template = GetAccessModeTemplate();
5889 DataType::Type value_type =
5890 GetVarHandleExpectedValueType(invoke, /*expected_coordinates_count=*/ 2u);
5891 DCHECK_NE(value_type, DataType::Type::kReference);
5892 size_t size = DataType::Size(value_type);
5893 DCHECK_GT(size, 1u);
5894 Register varhandle = InputRegisterAt(invoke, 0);
5895 Register object = InputRegisterAt(invoke, 1);
5896 Register index = InputRegisterAt(invoke, 2);
5897
5898 MemberOffset class_offset = mirror::Object::ClassOffset();
5899 MemberOffset array_length_offset = mirror::Array::LengthOffset();
5900 MemberOffset data_offset = mirror::Array::DataOffset(Primitive::kPrimByte);
5901 MemberOffset native_byte_order_offset = mirror::ByteArrayViewVarHandle::NativeByteOrderOffset();
5902
5903 __ Bind(GetByteArrayViewCheckLabel());
5904
5905 VarHandleTarget target = GetVarHandleTarget(invoke);
5906 {
5907 UseScratchRegisterScope temps(masm);
5908 Register temp = temps.AcquireW();
5909 Register temp2 = temps.AcquireW();
5910
5911 // The main path checked that the coordinateType0 is an array class that matches
5912 // the class of the actual coordinate argument but it does not match the value type.
5913 // Check if the `varhandle` references a ByteArrayViewVarHandle instance.
5914 __ Ldr(temp, HeapOperand(varhandle, class_offset.Int32Value()));
5915 codegen->GetAssembler()->MaybeUnpoisonHeapReference(temp);
5916 codegen->LoadClassRootForIntrinsic(temp2, ClassRoot::kJavaLangInvokeByteArrayViewVarHandle);
5917 __ Cmp(temp, temp2);
5918 __ B(GetEntryLabel(), ne);
5919
5920 // Check for array index out of bounds.
5921 __ Ldr(temp, HeapOperand(object, array_length_offset.Int32Value()));
5922 __ Subs(temp, temp, index);
5923 __ Ccmp(temp, size, NoFlag, hs); // If SUBS yields LO (C=false), keep the C flag clear.
5924 __ B(GetEntryLabel(), lo);
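// In other words, the SUBS/CCMP/B.LO sequence above takes the slow path when
// the subtraction borrows (index > length) or when `length - index < size`,
// i.e. unless the `size`-byte element starting at `index` fits entirely
// within the byte array.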
5925
5926 // Construct the target.
5927 __ Add(target.offset, index, data_offset.Int32Value());
5928
5929 // Alignment check. For unaligned access, go to the runtime.
5930 DCHECK(IsPowerOfTwo(size));
5931 if (size == 2u) {
5932 __ Tbnz(target.offset, 0, GetEntryLabel());
5933 } else {
5934 __ Tst(target.offset, size - 1u);
5935 __ B(GetEntryLabel(), ne);
5936 }
5937
5938 // Byte order check. For native byte order return to the main path.
5939 if (access_mode_template == mirror::VarHandle::AccessModeTemplate::kSet &&
5940 IsZeroBitPattern(invoke->InputAt(invoke->GetNumberOfArguments() - 1u))) {
5941 // There is no reason to differentiate between native byte order and byte-swap
5942 // for setting a zero bit pattern. Just return to the main path.
5943 __ B(GetNativeByteOrderLabel());
5944 return;
5945 }
5946 __ Ldr(temp, HeapOperand(varhandle, native_byte_order_offset.Int32Value()));
5947 __ Cbnz(temp, GetNativeByteOrderLabel());
5948 }
5949
5950 switch (access_mode_template) {
5951 case mirror::VarHandle::AccessModeTemplate::kGet:
5952 GenerateVarHandleGet(invoke, codegen, order_, /*byte_swap=*/ true);
5953 break;
5954 case mirror::VarHandle::AccessModeTemplate::kSet:
5955 GenerateVarHandleSet(invoke, codegen, order_, /*byte_swap=*/ true);
5956 break;
5957 case mirror::VarHandle::AccessModeTemplate::kCompareAndSet:
5958 case mirror::VarHandle::AccessModeTemplate::kCompareAndExchange:
5959 GenerateVarHandleCompareAndSetOrExchange(
5960 invoke, codegen, order_, return_success_, strong_, /*byte_swap=*/ true);
5961 break;
5962 case mirror::VarHandle::AccessModeTemplate::kGetAndUpdate:
5963 GenerateVarHandleGetAndUpdate(
5964 invoke, codegen, get_and_update_op_, order_, /*byte_swap=*/ true);
5965 break;
5966 }
5967 __ B(GetExitLabel());
5968 }
5969
5970 void IntrinsicLocationsBuilderARM64::VisitMethodHandleInvokeExact(HInvoke* invoke) {
5971 ArenaAllocator* allocator = invoke->GetBlock()->GetGraph()->GetAllocator();
5972 LocationSummary* locations = new (allocator)
5973 LocationSummary(invoke, LocationSummary::kCallOnMainAndSlowPath, kIntrinsified);
5974
5975 InvokeDexCallingConventionVisitorARM64 calling_convention;
5976 locations->SetOut(calling_convention.GetReturnLocation(invoke->GetType()));
5977
5978 locations->SetInAt(0, Location::RequiresRegister());
5979
5980 // Accommodating LocationSummary for underlying invoke-* call.
5981 uint32_t number_of_args = invoke->GetNumberOfArguments();
5982 for (uint32_t i = 1; i < number_of_args; ++i) {
5983 locations->SetInAt(i, calling_convention.GetNextLocation(invoke->InputAt(i)->GetType()));
5984 }
5985
5986 // The last input is the MethodType object corresponding to the call-site.
5987 locations->SetInAt(number_of_args, Location::RequiresRegister());
5988
5989 locations->AddTemp(Location::RequiresRegister());
5990 locations->AddTemp(calling_convention.GetMethodLocation());
5991 }
5992
5993 void IntrinsicCodeGeneratorARM64::VisitMethodHandleInvokeExact(HInvoke* invoke) {
5994 LocationSummary* locations = invoke->GetLocations();
5995
5996 Register method_handle = InputRegisterAt(invoke, 0);
5997
5998 SlowPathCodeARM64* slow_path =
5999 new (codegen_->GetScopedAllocator()) InvokePolymorphicSlowPathARM64(invoke, method_handle);
6000 codegen_->AddSlowPath(slow_path);
6001 MacroAssembler* masm = codegen_->GetVIXLAssembler();
6002
6003 Register call_site_type = InputRegisterAt(invoke, invoke->GetNumberOfArguments());
6004
6005 // Call site should match with MethodHandle's type.
6006 Register temp = WRegisterFrom(locations->GetTemp(0));
6007 __ Ldr(temp, HeapOperand(method_handle.W(), mirror::MethodHandle::MethodTypeOffset()));
6008 codegen_->GetAssembler()->MaybeUnpoisonHeapReference(temp);
6009 __ Cmp(call_site_type, temp);
6010 __ B(ne, slow_path->GetEntryLabel());
6011
6012 __ Ldr(temp, HeapOperand(method_handle.W(), mirror::MethodHandle::HandleKindOffset()));
6013 __ Cmp(temp, Operand(mirror::MethodHandle::Kind::kInvokeStatic));
6014 __ B(ne, slow_path->GetEntryLabel());
6015
6016 Register method = XRegisterFrom(locations->GetTemp(1));
6017 __ Ldr(method, HeapOperand(method_handle.W(), mirror::MethodHandle::ArtFieldOrMethodOffset()));
6018 Offset entry_point = ArtMethod::EntryPointFromQuickCompiledCodeOffset(kArm64PointerSize);
6019 __ Ldr(lr, MemOperand(method, entry_point.SizeValue()));
6020 __ Blr(lr);
6021 codegen_->RecordPcInfo(invoke, invoke->GetDexPc(), slow_path);
6022 __ Bind(slow_path->GetExitLabel());
6023 }
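// Informal summary of the MethodHandle.invokeExact() fast path above: it stays
// on the fast path only when the call-site MethodType is identical to the
// handle's type and the handle kind is kInvokeStatic; in that case the target
// ArtMethod is loaded from the handle and called directly through its quick
// entry point. Any mismatch, or any other handle kind, falls back to the
// InvokePolymorphic slow path, which implements the full invokeExact semantics.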
6024
6025 #define MARK_UNIMPLEMENTED(Name) UNIMPLEMENTED_INTRINSIC(ARM64, Name)
6026 UNIMPLEMENTED_INTRINSIC_LIST_ARM64(MARK_UNIMPLEMENTED);
6027 #undef MARK_UNIMPLEMENTED
6028
6029 UNREACHABLE_INTRINSICS(ARM64)
6030
6031 #undef __
6032
6033 } // namespace arm64
6034 } // namespace art
6035