xref: /aosp_15_r20/external/swiftshader/src/Reactor/SubzeroReactor.cpp (revision 03ce13f70fcc45d86ee91b7ee4cab1936a95046e)
1 // Copyright 2016 The SwiftShader Authors. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //    http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "Debug.hpp"
16 #include "Print.hpp"
17 #include "Reactor.hpp"
18 #include "ReactorDebugInfo.hpp"
19 #include "SIMD.hpp"
20 
21 #include "ExecutableMemory.hpp"
22 #include "Optimizer.hpp"
23 #include "PragmaInternals.hpp"
24 
25 #include "src/IceCfg.h"
26 #include "src/IceCfgNode.h"
27 #include "src/IceELFObjectWriter.h"
28 #include "src/IceELFStreamer.h"
29 #include "src/IceGlobalContext.h"
30 #include "src/IceGlobalInits.h"
31 #include "src/IceTypes.h"
32 
33 #include "llvm/Support/Compiler.h"
34 #include "llvm/Support/FileSystem.h"
35 #include "llvm/Support/ManagedStatic.h"
36 #include "llvm/Support/raw_os_ostream.h"
37 
38 #include "marl/event.h"
39 
40 #if __has_feature(memory_sanitizer)
41 #	include <sanitizer/msan_interface.h>
42 #endif
43 
44 #if defined(_WIN32)
45 #	ifndef WIN32_LEAN_AND_MEAN
46 #		define WIN32_LEAN_AND_MEAN
47 #	endif  // !WIN32_LEAN_AND_MEAN
48 #	ifndef NOMINMAX
49 #		define NOMINMAX
50 #	endif  // !NOMINMAX
51 #	include <Windows.h>
52 #endif
53 
54 #include <array>
55 #include <cmath>
56 #include <iostream>
57 #include <limits>
58 #include <mutex>
59 
60 // Subzero utility functions
61 // These functions only accept and return Subzero (Ice) types, and do not access any globals.
62 namespace {
63 namespace sz {
64 
createFunction(Ice::GlobalContext * context,Ice::Type returnType,const std::vector<Ice::Type> & paramTypes)65 Ice::Cfg *createFunction(Ice::GlobalContext *context, Ice::Type returnType, const std::vector<Ice::Type> &paramTypes)
66 {
67 	uint32_t sequenceNumber = 0;
68 	auto *function = Ice::Cfg::create(context, sequenceNumber).release();
69 
70 	function->setStackSizeLimit(512 * 1024);  // 512 KiB
71 
72 	Ice::CfgLocalAllocatorScope allocScope{ function };
73 
74 	for(auto type : paramTypes)
75 	{
76 		Ice::Variable *arg = function->makeVariable(type);
77 		function->addArg(arg);
78 	}
79 
80 	Ice::CfgNode *node = function->makeNode();
81 	function->setEntryNode(node);
82 
83 	return function;
84 }
85 
getPointerType(Ice::Type elementType)86 Ice::Type getPointerType(Ice::Type elementType)
87 {
88 	if(sizeof(void *) == 8)
89 	{
90 		return Ice::IceType_i64;
91 	}
92 	else
93 	{
94 		return Ice::IceType_i32;
95 	}
96 }
97 
allocateStackVariable(Ice::Cfg * function,Ice::Type type,int arraySize=0)98 Ice::Variable *allocateStackVariable(Ice::Cfg *function, Ice::Type type, int arraySize = 0)
99 {
100 	int typeSize = Ice::typeWidthInBytes(type);
101 	int totalSize = typeSize * (arraySize ? arraySize : 1);
102 
103 	auto bytes = Ice::ConstantInteger32::create(function->getContext(), Ice::IceType_i32, totalSize);
104 	auto address = function->makeVariable(getPointerType(type));
105 	auto alloca = Ice::InstAlloca::create(function, address, bytes, typeSize);  // SRoA depends on the alignment to match the type size.
106 	function->getEntryNode()->getInsts().push_front(alloca);
107 
108 	ASSERT(!rr::getPragmaState(rr::InitializeLocalVariables) && "Subzero does not support initializing local variables");
109 
110 	return address;
111 }
112 
getConstantPointer(Ice::GlobalContext * context,const void * ptr)113 Ice::Constant *getConstantPointer(Ice::GlobalContext *context, const void *ptr)
114 {
115 	if(sizeof(void *) == 8)
116 	{
117 		return context->getConstantInt64(reinterpret_cast<intptr_t>(ptr));
118 	}
119 	else
120 	{
121 		return context->getConstantInt32(reinterpret_cast<intptr_t>(ptr));
122 	}
123 }
124 
125 // TODO(amaiorano): remove this prototype once these are moved to separate header/cpp
126 Ice::Variable *createTruncate(Ice::Cfg *function, Ice::CfgNode *basicBlock, Ice::Operand *from, Ice::Type toType);
127 
128 // Wrapper for calls on C functions with Ice types
Call(Ice::Cfg * function,Ice::CfgNode * basicBlock,Ice::Type retTy,Ice::Operand * callTarget,const std::vector<Ice::Operand * > & iceArgs,bool isVariadic)129 Ice::Variable *Call(Ice::Cfg *function, Ice::CfgNode *basicBlock, Ice::Type retTy, Ice::Operand *callTarget, const std::vector<Ice::Operand *> &iceArgs, bool isVariadic)
130 {
131 	Ice::Variable *ret = nullptr;
132 
133 	// Subzero doesn't support boolean return values. Replace with an i32 temporarily,
134 	// then truncate result to bool.
135 	// TODO(b/151158858): Add support to Subzero's InstCall for bool-returning functions
136 	const bool returningBool = (retTy == Ice::IceType_i1);
137 	if(returningBool)
138 	{
139 		ret = function->makeVariable(Ice::IceType_i32);
140 	}
141 	else if(retTy != Ice::IceType_void)
142 	{
143 		ret = function->makeVariable(retTy);
144 	}
145 
146 	auto call = Ice::InstCall::create(function, iceArgs.size(), ret, callTarget, false, false, isVariadic);
147 	for(auto arg : iceArgs)
148 	{
149 		call->addArg(arg);
150 	}
151 
152 	basicBlock->appendInst(call);
153 
154 	if(returningBool)
155 	{
156 		// Truncate result to bool so that if any (lsb) bits were set, result will be true
157 		ret = createTruncate(function, basicBlock, ret, Ice::IceType_i1);
158 	}
159 
160 	return ret;
161 }
162 
Call(Ice::Cfg * function,Ice::CfgNode * basicBlock,Ice::Type retTy,const void * fptr,const std::vector<Ice::Operand * > & iceArgs,bool isVariadic)163 Ice::Variable *Call(Ice::Cfg *function, Ice::CfgNode *basicBlock, Ice::Type retTy, const void *fptr, const std::vector<Ice::Operand *> &iceArgs, bool isVariadic)
164 {
165 	Ice::Operand *callTarget = getConstantPointer(function->getContext(), fptr);
166 	return Call(function, basicBlock, retTy, callTarget, iceArgs, isVariadic);
167 }
168 
169 // Wrapper for calls on C functions with Ice types
170 template<typename Return, typename... CArgs, typename... RArgs>
Call(Ice::Cfg * function,Ice::CfgNode * basicBlock,Return (fptr)(CArgs...),RArgs &&...args)171 Ice::Variable *Call(Ice::Cfg *function, Ice::CfgNode *basicBlock, Return(fptr)(CArgs...), RArgs &&...args)
172 {
173 	static_assert(sizeof...(CArgs) == sizeof...(RArgs), "Expected number of args don't match");
174 
175 	Ice::Type retTy = T(rr::CToReactorT<Return>::type());
176 	std::vector<Ice::Operand *> iceArgs{ std::forward<RArgs>(args)... };
177 	return Call(function, basicBlock, retTy, reinterpret_cast<const void *>(fptr), iceArgs, false);
178 }
179 
createTruncate(Ice::Cfg * function,Ice::CfgNode * basicBlock,Ice::Operand * from,Ice::Type toType)180 Ice::Variable *createTruncate(Ice::Cfg *function, Ice::CfgNode *basicBlock, Ice::Operand *from, Ice::Type toType)
181 {
182 	Ice::Variable *to = function->makeVariable(toType);
183 	Ice::InstCast *cast = Ice::InstCast::create(function, Ice::InstCast::Trunc, to, from);
184 	basicBlock->appendInst(cast);
185 	return to;
186 }
187 
createLoad(Ice::Cfg * function,Ice::CfgNode * basicBlock,Ice::Operand * ptr,Ice::Type type,unsigned int align)188 Ice::Variable *createLoad(Ice::Cfg *function, Ice::CfgNode *basicBlock, Ice::Operand *ptr, Ice::Type type, unsigned int align)
189 {
190 	Ice::Variable *result = function->makeVariable(type);
191 	auto load = Ice::InstLoad::create(function, result, ptr, align);
192 	basicBlock->appendInst(load);
193 
194 	return result;
195 }
196 
197 }  // namespace sz
198 }  // namespace
199 
200 namespace rr {
201 class ELFMemoryStreamer;
202 class CoroutineGenerator;
203 }  // namespace rr
204 
205 namespace {
206 
207 // Used to automatically invoke llvm_shutdown() when driver is unloaded
208 llvm::llvm_shutdown_obj llvmShutdownObj;
209 
// Codegen state for the function currently under construction. SubzeroReactor
// is not thread safe; access is serialized by ::codegenMutex, which is held
// for the lifetime of each Nucleus (see Nucleus::Nucleus / ~Nucleus below).
210 Ice::GlobalContext *context = nullptr;
211 Ice::Cfg *function = nullptr;
212 Ice::CfgNode *entryBlock = nullptr;
213 Ice::CfgNode *basicBlockTop = nullptr;
214 Ice::CfgNode *basicBlock = nullptr;
215 Ice::CfgLocalAllocatorScope *allocator = nullptr;
216 rr::ELFMemoryStreamer *routine = nullptr;
217 
218 std::mutex codegenMutex;
219 
// Optional file-based ELF output (only used when writing out.o for debugging).
220 Ice::ELFFileStreamer *elfFile = nullptr;
221 Ice::Fdstream *out = nullptr;
222 
223 // Coroutine globals
224 rr::Type *coroYieldType = nullptr;
225 std::shared_ptr<rr::CoroutineGenerator> coroGen;
// Lazily constructs the process-wide marl scheduler (8 worker threads) on
// first use; subsequent calls return the same instance.
getOrCreateScheduler()226 marl::Scheduler &getOrCreateScheduler()
227 {
228 	static auto scheduler = [] {
229 		marl::Scheduler::Config cfg;
230 		cfg.setWorkerThreadCount(8);
231 		return std::make_unique<marl::Scheduler>(cfg);
232 	}();
233 
234 	return *scheduler;
235 }
236 
// One-shot optimizer report callback; consumed (reset to nullptr) by acquireRoutine.
237 rr::Nucleus::OptimizerCallback *optimizerCallback = nullptr;
238 
239 }  // Anonymous namespace
240 
241 namespace {
242 
243 #if !defined(__i386__) && defined(_M_IX86)
244 #	define __i386__ 1
245 #endif
246 
247 #if !defined(__x86_64__) && (defined(_M_AMD64) || defined(_M_X64))
248 #	define __x86_64__ 1
249 #endif
250 
toIce(int level)251 Ice::OptLevel toIce(int level)
252 {
253 	switch(level)
254 	{
255 	// Note that O0 and O1 are not implemented by Subzero
256 	case 0: return Ice::Opt_m1;
257 	case 1: return Ice::Opt_m1;
258 	case 2: return Ice::Opt_2;
259 	case 3: return Ice::Opt_2;
260 	default: UNREACHABLE("Unknown Optimization Level %d", int(level));
261 	}
262 	return Ice::Opt_2;
263 }
264 
stdToIceMemoryOrder(std::memory_order memoryOrder)265 Ice::Intrinsics::MemoryOrder stdToIceMemoryOrder(std::memory_order memoryOrder)
266 {
267 	switch(memoryOrder)
268 	{
269 	case std::memory_order_relaxed: return Ice::Intrinsics::MemoryOrderRelaxed;
270 	case std::memory_order_consume: return Ice::Intrinsics::MemoryOrderConsume;
271 	case std::memory_order_acquire: return Ice::Intrinsics::MemoryOrderAcquire;
272 	case std::memory_order_release: return Ice::Intrinsics::MemoryOrderRelease;
273 	case std::memory_order_acq_rel: return Ice::Intrinsics::MemoryOrderAcquireRelease;
274 	case std::memory_order_seq_cst: return Ice::Intrinsics::MemoryOrderSequentiallyConsistent;
275 	}
276 	return Ice::Intrinsics::MemoryOrderInvalid;
277 }
278 
// CPU feature detection used to select the target instruction set and
// intrinsic emulation paths. ARM is decided at compile time; SSE4.1 is
// probed at startup via the CPUID instruction.
279 class CPUID
280 {
281 public:
282 	const static bool ARM;
283 	const static bool SSE4_1;
284 
285 private:
	// Executes the CPUID instruction for the given leaf on x86; on other
	// architectures the output registers are zeroed.
cpuid(int registers[4],int info)286 	static void cpuid(int registers[4], int info)
287 	{
288 #if defined(__i386__) || defined(__x86_64__)
289 #	if defined(_WIN32)
290 		__cpuid(registers, info);
291 #	else
292 		__asm volatile("cpuid"
293 		               : "=a"(registers[0]), "=b"(registers[1]), "=c"(registers[2]), "=d"(registers[3])
294 		               : "a"(info));
295 #	endif
296 #else
297 		registers[0] = 0;
298 		registers[1] = 0;
299 		registers[2] = 0;
300 		registers[3] = 0;
301 #endif
302 	}
303 
	// True when compiling for ARM/AArch64 (compile-time decision).
detectARM()304 	constexpr static bool detectARM()
305 	{
306 #if defined(__arm__) || defined(__aarch64__)
307 		return true;
308 #elif defined(__i386__) || defined(__x86_64__)
309 		return false;
310 #elif defined(__mips__)
311 		return false;
312 #else
313 #	error "Unknown architecture"
314 #endif
315 	}
316 
	// Queries CPUID leaf 1 and tests the SSE4.1 feature flag (ECX bit 19,
	// mask 0x00080000) on x86; always false on other architectures.
detectSSE4_1()317 	static bool detectSSE4_1()
318 	{
319 #if defined(__i386__) || defined(__x86_64__)
320 		int registers[4];
321 		cpuid(registers, 1);
322 		return (registers[2] & 0x00080000) != 0;
323 #else
324 		return false;
325 #endif
326 	}
327 };
328 
// Static feature flags; ARM is a compile-time constant, SSE4.1 is probed once
// at static-initialization time.
329 constexpr bool CPUID::ARM = CPUID::detectARM();
330 const bool CPUID::SSE4_1 = CPUID::detectSSE4_1();
// Compile-time toggles. NOTE(review): emulateMismatchedBitCast is enabled on
// ARM — presumably mismatched-shape vector bitcasts need emulation there;
// confirm at the use sites.
331 constexpr bool emulateIntrinsics = false;
332 constexpr bool emulateMismatchedBitCast = CPUID::ARM;
333 
// Debug switches; both require Subzero to be built with ALLOW_DUMP=1
// (enforced by the static_asserts below).
334 constexpr bool subzeroDumpEnabled = false;
335 constexpr bool subzeroEmitTextAsm = false;
336 
337 #if !ALLOW_DUMP
338 static_assert(!subzeroDumpEnabled, "Compile Subzero with ALLOW_DUMP=1 for subzeroDumpEnabled");
339 static_assert(!subzeroEmitTextAsm, "Compile Subzero with ALLOW_DUMP=1 for subzeroEmitTextAsm");
340 #endif
341 
342 }  // anonymous namespace
343 
344 namespace rr {
345 
// Lane count of rr::SIMD vectors in the Subzero backend.
346 const int SIMD::Width = 4;
347 
// Name of the active Reactor JIT backend.
backendName()348 std::string Caps::backendName()
349 {
350 	return "Subzero";
351 }
352 
// The Subzero backend implements Reactor coroutines.
coroutinesSupported()353 bool Caps::coroutinesSupported()
354 {
355 	return true;
356 }
357 
// Whether fused multiply-add is cheap on this backend.
fmaIsFast()358 bool Caps::fmaIsFast()
359 {
360 	// TODO(b/214591655): Subzero currently never emits FMA instructions. std::fma() is called instead.
361 	return false;
362 }
363 
// Reactor exposes vector types narrower than Subzero's 128-bit vectors (e.g.
// v2i32). These are encoded as the wider Ice type with the real lane count
// stored in bits above EmulatedShift; T(Type *) masks the bits off again and
// typeSize() reports the narrow size.
364 enum EmulatedType
365 {
366 	EmulatedShift = 16,
367 	EmulatedV2 = 2 << EmulatedShift,
368 	EmulatedV4 = 4 << EmulatedShift,
369 	EmulatedV8 = 8 << EmulatedShift,
370 	EmulatedBits = EmulatedV2 | EmulatedV4 | EmulatedV8,
371 
372 	Type_v2i32 = Ice::IceType_v4i32 | EmulatedV2,
373 	Type_v4i16 = Ice::IceType_v8i16 | EmulatedV4,
374 	Type_v2i16 = Ice::IceType_v8i16 | EmulatedV2,
375 	Type_v8i8 = Ice::IceType_v16i8 | EmulatedV8,
376 	Type_v4i8 = Ice::IceType_v16i8 | EmulatedV4,
377 	Type_v2f32 = Ice::IceType_v4f32 | EmulatedV2,
378 };
379 
// Opaque handle types exposed through Reactor's public API. They carry no
// state of their own and are only ever reinterpret_cast'd to and from the
// corresponding Subzero IR classes (see the T/V/B helpers in this file).
380 class Value : public Ice::Operand
381 {};
382 class SwitchCases : public Ice::InstSwitch
383 {};
384 class BasicBlock : public Ice::CfgNode
385 {};
386 
T(Type * t)387 Ice::Type T(Type *t)
388 {
389 	static_assert(static_cast<unsigned int>(Ice::IceType_NUM) < static_cast<unsigned int>(EmulatedBits), "Ice::Type overlaps with our emulated types!");
390 	return (Ice::Type)(reinterpret_cast<std::intptr_t>(t) & ~EmulatedBits);
391 }
392 
// Packs a Subzero type into an opaque Reactor Type handle.
T(Ice::Type t)393 Type *T(Ice::Type t)
394 {
395 	return reinterpret_cast<Type *>(t);
396 }
397 
// Packs an emulated vector type into an opaque Reactor Type handle.
T(EmulatedType t)398 Type *T(EmulatedType t)
399 {
400 	return reinterpret_cast<Type *>(t);
401 }
402 
T(const std::vector<Type * > & types)403 std::vector<Ice::Type> T(const std::vector<Type *> &types)
404 {
405 	std::vector<Ice::Type> result;
406 	result.reserve(types.size());
407 	for(auto &t : types)
408 	{
409 		result.push_back(T(t));
410 	}
411 	return result;
412 }
413 
// Casts between Reactor Value handles and Subzero operands (no-op at runtime).
V(Ice::Operand * v)414 Value *V(Ice::Operand *v)
415 {
416 	return reinterpret_cast<Value *>(v);
417 }
418 
V(Value * v)419 Ice::Operand *V(Value *v)
420 {
421 	return reinterpret_cast<Ice::Operand *>(v);
422 }
423 
V(const std::vector<Value * > & values)424 std::vector<Ice::Operand *> V(const std::vector<Value *> &values)
425 {
426 	std::vector<Ice::Operand *> result;
427 	result.reserve(values.size());
428 	for(auto &v : values)
429 	{
430 		result.push_back(V(v));
431 	}
432 	return result;
433 }
434 
// Casts a Subzero CFG node to Reactor's opaque BasicBlock handle.
B(Ice::CfgNode * b)435 BasicBlock *B(Ice::CfgNode *b)
436 {
437 	return reinterpret_cast<BasicBlock *>(b);
438 }
439 
typeSize(Type * type)440 static size_t typeSize(Type *type)
441 {
442 	if(reinterpret_cast<std::intptr_t>(type) & EmulatedBits)
443 	{
444 		switch(reinterpret_cast<std::intptr_t>(type))
445 		{
446 		case Type_v2i32: return 8;
447 		case Type_v4i16: return 8;
448 		case Type_v2i16: return 4;
449 		case Type_v8i8: return 8;
450 		case Type_v4i8: return 4;
451 		case Type_v2f32: return 8;
452 		default: ASSERT(false);
453 		}
454 	}
455 
456 	return Ice::typeWidthInBytes(T(type));
457 }
458 
finalizeFunction()459 static void finalizeFunction()
460 {
461 	// Create a return if none was added
462 	if(::basicBlock->getInsts().empty() || ::basicBlock->getInsts().back().getKind() != Ice::Inst::Ret)
463 	{
464 		Nucleus::createRetVoid();
465 	}
466 
467 	// Connect the entry block to the top of the initial basic block
468 	auto br = Ice::InstBr::create(::function, ::basicBlockTop);
469 	::entryBlock->appendInst(br);
470 }
471 
472 using ElfHeader = std::conditional<sizeof(void *) == 8, Elf64_Ehdr, Elf32_Ehdr>::type;
473 using SectionHeader = std::conditional<sizeof(void *) == 8, Elf64_Shdr, Elf32_Shdr>::type;
474 
sectionHeader(const ElfHeader * elfHeader)475 inline const SectionHeader *sectionHeader(const ElfHeader *elfHeader)
476 {
477 	return reinterpret_cast<const SectionHeader *>((intptr_t)elfHeader + elfHeader->e_shoff);
478 }
479 
elfSection(const ElfHeader * elfHeader,int index)480 inline const SectionHeader *elfSection(const ElfHeader *elfHeader, int index)
481 {
482 	return &sectionHeader(elfHeader)[index];
483 }
484 
// Applies a single Elf32 REL relocation in place inside the in-memory image
// (ARM MOVW/MOVT or i386 absolute/PC-relative forms). Returns the resolved
// symbol address, or nullptr when the symbol cannot be resolved or the
// relocation type is unsupported.
relocateSymbol(const ElfHeader * elfHeader,const Elf32_Rel & relocation,const SectionHeader & relocationTable)485 static void *relocateSymbol(const ElfHeader *elfHeader, const Elf32_Rel &relocation, const SectionHeader &relocationTable)
486 {
	// Section whose contents get patched (sh_info of the relocation table).
487 	const SectionHeader *target = elfSection(elfHeader, relocationTable.sh_info);
488 
489 	uint32_t index = relocation.getSymbol();
490 	int table = relocationTable.sh_link;
491 	void *symbolValue = nullptr;
492 
493 	if(index != SHN_UNDEF)
494 	{
495 		if(table == SHN_UNDEF) return nullptr;
496 		const SectionHeader *symbolTable = elfSection(elfHeader, table);
497 
498 		uint32_t symtab_entries = symbolTable->sh_size / symbolTable->sh_entsize;
499 		if(index >= symtab_entries)
500 		{
501 			ASSERT(index < symtab_entries && "Symbol Index out of range");
502 			return nullptr;
503 		}
504 
	// Resolve the symbol to an absolute address within the image.
505 		intptr_t symbolAddress = (intptr_t)elfHeader + symbolTable->sh_offset;
506 		Elf32_Sym &symbol = ((Elf32_Sym *)symbolAddress)[index];
507 		uint16_t section = symbol.st_shndx;
508 
509 		if(section != SHN_UNDEF && section < SHN_LORESERVE)
510 		{
511 			const SectionHeader *target = elfSection(elfHeader, symbol.st_shndx);
512 			symbolValue = reinterpret_cast<void *>((intptr_t)elfHeader + symbol.st_value + target->sh_offset);
513 		}
514 		else
515 		{
	// Undefined symbol or reserved section index: cannot resolve.
516 			return nullptr;
517 		}
518 	}
519 
	// Patch site may be unaligned within the code stream, hence unaligned_ptr.
520 	intptr_t address = (intptr_t)elfHeader + target->sh_offset;
521 	unaligned_ptr<int32_t> patchSite = (void *)(address + relocation.r_offset);
522 
523 	if(CPUID::ARM)
524 	{
525 		switch(relocation.getType())
526 		{
527 		case R_ARM_NONE:
528 			// No relocation
529 			break;
530 		case R_ARM_MOVW_ABS_NC:
531 			{
532 				uint32_t thumb = 0;  // Calls to Thumb code not supported.
533 				uint32_t lo = (uint32_t)(intptr_t)symbolValue | thumb;
534 				*patchSite = (*patchSite & 0xFFF0F000) | ((lo & 0xF000) << 4) | (lo & 0x0FFF);
535 			}
536 			break;
537 		case R_ARM_MOVT_ABS:
538 			{
539 				uint32_t hi = (uint32_t)(intptr_t)(symbolValue) >> 16;
540 				*patchSite = (*patchSite & 0xFFF0F000) | ((hi & 0xF000) << 4) | (hi & 0x0FFF);
541 			}
542 			break;
543 		default:
544 			ASSERT(false && "Unsupported relocation type");
545 			return nullptr;
546 		}
547 	}
548 	else
549 	{
550 		switch(relocation.getType())
551 		{
552 		case R_386_NONE:
553 			// No relocation
554 			break;
555 		case R_386_32:
556 			*patchSite = (int32_t)((intptr_t)symbolValue + *patchSite);
557 			break;
558 		case R_386_PC32:
559 			*patchSite = (int32_t)((intptr_t)symbolValue + *patchSite - (intptr_t)patchSite);
560 			break;
561 		default:
562 			ASSERT(false && "Unsupported relocation type");
563 			return nullptr;
564 		}
565 	}
566 
567 	return symbolValue;
568 }
569 
// Applies a single Elf64 RELA relocation (x86-64 forms, with explicit addend)
// in place inside the in-memory image. Returns the resolved symbol address,
// or nullptr on failure.
relocateSymbol(const ElfHeader * elfHeader,const Elf64_Rela & relocation,const SectionHeader & relocationTable)570 static void *relocateSymbol(const ElfHeader *elfHeader, const Elf64_Rela &relocation, const SectionHeader &relocationTable)
571 {
	// Section whose contents get patched (sh_info of the relocation table).
572 	const SectionHeader *target = elfSection(elfHeader, relocationTable.sh_info);
573 
574 	uint32_t index = relocation.getSymbol();
575 	int table = relocationTable.sh_link;
576 	void *symbolValue = nullptr;
577 
578 	if(index != SHN_UNDEF)
579 	{
580 		if(table == SHN_UNDEF) return nullptr;
581 		const SectionHeader *symbolTable = elfSection(elfHeader, table);
582 
583 		uint32_t symtab_entries = symbolTable->sh_size / symbolTable->sh_entsize;
584 		if(index >= symtab_entries)
585 		{
586 			ASSERT(index < symtab_entries && "Symbol Index out of range");
587 			return nullptr;
588 		}
589 
	// Resolve the symbol to an absolute address within the image.
590 		intptr_t symbolAddress = (intptr_t)elfHeader + symbolTable->sh_offset;
591 		Elf64_Sym &symbol = ((Elf64_Sym *)symbolAddress)[index];
592 		uint16_t section = symbol.st_shndx;
593 
594 		if(section != SHN_UNDEF && section < SHN_LORESERVE)
595 		{
596 			const SectionHeader *target = elfSection(elfHeader, symbol.st_shndx);
597 			symbolValue = reinterpret_cast<void *>((intptr_t)elfHeader + symbol.st_value + target->sh_offset);
598 		}
599 		else
600 		{
	// Undefined symbol or reserved section index: cannot resolve.
601 			return nullptr;
602 		}
603 	}
604 
	// The same patch site viewed as 32- and 64-bit; which one is written
	// depends on the relocation type. May be unaligned in the code stream.
605 	intptr_t address = (intptr_t)elfHeader + target->sh_offset;
606 	unaligned_ptr<int32_t> patchSite32 = (void *)(address + relocation.r_offset);
607 	unaligned_ptr<int64_t> patchSite64 = (void *)(address + relocation.r_offset);
608 
609 	switch(relocation.getType())
610 	{
611 	case R_X86_64_NONE:
612 		// No relocation
613 		break;
614 	case R_X86_64_64:
615 		*patchSite64 = (int64_t)((intptr_t)symbolValue + *patchSite64 + relocation.r_addend);
616 		break;
617 	case R_X86_64_PC32:
618 		*patchSite32 = (int32_t)((intptr_t)symbolValue + *patchSite32 - (intptr_t)patchSite32 + relocation.r_addend);
619 		break;
620 	case R_X86_64_32S:
621 		*patchSite32 = (int32_t)((intptr_t)symbolValue + *patchSite32 + relocation.r_addend);
622 		break;
623 	default:
624 		ASSERT(false && "Unsupported relocation type");
625 		return nullptr;
626 	}
627 
628 	return symbolValue;
629 }
630 
// A function entry point extracted from a loaded ELF image by loadImage().
// Both members are in-class initialized so a default-constructed EntryPoint
// is well-defined (previously `entry` was left indeterminate).
struct EntryPoint
{
	const void *entry = nullptr;  // start of the function's code within the image
	size_t codeSize = 0;          // size of that code section, in bytes
};
636 
// Parses an in-memory ELF object: applies all REL/RELA relocations in place
// and locates one executable section per requested function name (functions
// were emitted into separate sections; see setFunctionSections in Nucleus).
// Returns one EntryPoint per name, or an empty vector if the image is not ELF.
loadImage(uint8_t * const elfImage,const std::vector<const char * > & functionNames)637 std::vector<EntryPoint> loadImage(uint8_t *const elfImage, const std::vector<const char *> &functionNames)
638 {
639 	ASSERT(functionNames.size() > 0);
640 	std::vector<EntryPoint> entryPoints(functionNames.size());
641 
642 	ElfHeader *elfHeader = (ElfHeader *)elfImage;
643 
644 	// TODO: assert?
645 	if(!elfHeader->checkMagic())
646 	{
647 		return {};
648 	}
649 
650 	// Expect ELF bitness to match platform
651 	ASSERT(sizeof(void *) == 8 ? elfHeader->getFileClass() == ELFCLASS64 : elfHeader->getFileClass() == ELFCLASS32);
652 #if defined(__i386__)
653 	ASSERT(sizeof(void *) == 4 && elfHeader->e_machine == EM_386);
654 #elif defined(__x86_64__)
655 	ASSERT(sizeof(void *) == 8 && elfHeader->e_machine == EM_X86_64);
656 #elif defined(__arm__)
657 	ASSERT(sizeof(void *) == 4 && elfHeader->e_machine == EM_ARM);
658 #elif defined(__aarch64__)
659 	ASSERT(sizeof(void *) == 8 && elfHeader->e_machine == EM_AARCH64);
660 #elif defined(__mips__)
661 	ASSERT(sizeof(void *) == 4 && elfHeader->e_machine == EM_MIPS);
662 #else
663 #	error "Unsupported platform"
664 #endif
665 
666 	SectionHeader *sectionHeader = (SectionHeader *)(elfImage + elfHeader->e_shoff);
667 
667 	// Walk every section: executable PROGBITS sections are matched to the
	// requested function names; REL/RELA sections are applied immediately.
668 	for(int i = 0; i < elfHeader->e_shnum; i++)
669 	{
670 		if(sectionHeader[i].sh_type == SHT_PROGBITS)
671 		{
672 			if(sectionHeader[i].sh_flags & SHF_EXECINSTR)
673 			{
	// Match the section's name (from the section-name string table)
	// against the requested function names by substring search.
674 				auto findSectionNameEntryIndex = [&]() -> size_t {
675 					auto sectionNameOffset = sectionHeader[elfHeader->e_shstrndx].sh_offset + sectionHeader[i].sh_name;
676 					const char *sectionName = reinterpret_cast<const char *>(elfImage + sectionNameOffset);
677 
678 					for(size_t j = 0; j < functionNames.size(); ++j)
679 					{
680 						if(strstr(sectionName, functionNames[j]) != nullptr)
681 						{
682 							return j;
683 						}
684 					}
685 
686 					UNREACHABLE("Failed to find executable section that matches input function names");
687 					return static_cast<size_t>(-1);
688 				};
689 
690 				size_t index = findSectionNameEntryIndex();
691 				entryPoints[index].entry = elfImage + sectionHeader[i].sh_offset;
692 				entryPoints[index].codeSize = sectionHeader[i].sh_size;
693 			}
694 		}
695 		else if(sectionHeader[i].sh_type == SHT_REL)
696 		{
697 			ASSERT(sizeof(void *) == 4 && "UNIMPLEMENTED");  // Only expected/implemented for 32-bit code
698 
699 			for(Elf32_Word index = 0; index < sectionHeader[i].sh_size / sectionHeader[i].sh_entsize; index++)
700 			{
701 				const Elf32_Rel &relocation = ((const Elf32_Rel *)(elfImage + sectionHeader[i].sh_offset))[index];
702 				relocateSymbol(elfHeader, relocation, sectionHeader[i]);
703 			}
704 		}
705 		else if(sectionHeader[i].sh_type == SHT_RELA)
706 		{
707 			ASSERT(sizeof(void *) == 8 && "UNIMPLEMENTED");  // Only expected/implemented for 64-bit code
708 
709 			for(Elf32_Word index = 0; index < sectionHeader[i].sh_size / sectionHeader[i].sh_entsize; index++)
710 			{
711 				const Elf64_Rela &relocation = ((const Elf64_Rela *)(elfImage + sectionHeader[i].sh_offset))[index];
712 				relocateSymbol(elfHeader, relocation, sectionHeader[i]);
713 			}
714 		}
715 	}
716 
717 	return entryPoints;
718 }
719 
720 template<typename T>
721 struct ExecutableAllocator
722 {
ExecutableAllocatorrr::ExecutableAllocator723 	ExecutableAllocator() {}
724 	template<class U>
ExecutableAllocatorrr::ExecutableAllocator725 	ExecutableAllocator(const ExecutableAllocator<U> &other)
726 	{}
727 
728 	using value_type = T;
729 	using size_type = std::size_t;
730 
allocaterr::ExecutableAllocator731 	T *allocate(size_type n)
732 	{
733 		return (T *)allocateMemoryPages(
734 		    sizeof(T) * n, PERMISSION_READ | PERMISSION_WRITE, true);
735 	}
736 
deallocaterr::ExecutableAllocator737 	void deallocate(T *p, size_type n)
738 	{
739 		deallocateMemoryPages(p, sizeof(T) * n);
740 	}
741 };
742 
// Receives Subzero's ELF output into an in-memory, page-backed buffer, then
// loads/relocates it in place and exposes the resulting function entry points
// as a Reactor Routine. Not copyable: owns the executable buffer.
743 class ELFMemoryStreamer : public Ice::ELFStreamer, public Routine
744 {
745 	ELFMemoryStreamer(const ELFMemoryStreamer &) = delete;
746 	ELFMemoryStreamer &operator=(const ELFMemoryStreamer &) = delete;
747 
748 public:
ELFMemoryStreamer()749 	ELFMemoryStreamer()
750 	    : Routine()
751 	{
752 		position = 0;
753 		buffer.reserve(0x1000);
754 	}
755 
~ELFMemoryStreamer()756 	~ELFMemoryStreamer() override
757 	{
758 	}
759 
	// Writes one byte at the current position: appends at the end, overwrites
	// in the middle; writing past the end is unsupported.
write8(uint8_t Value)760 	void write8(uint8_t Value) override
761 	{
762 		if(position == (uint64_t)buffer.size())
763 		{
764 			buffer.push_back(Value);
765 			position++;
766 		}
767 		else if(position < (uint64_t)buffer.size())
768 		{
769 			buffer[position] = Value;
770 			position++;
771 		}
772 		else
773 			ASSERT(false && "UNIMPLEMENTED");
774 	}
775 
	// Appends a run of bytes at the end of the buffer.
	// NOTE(review): unlike write8, this always appends at the buffer's end
	// while advancing `position` — it assumes position == buffer.size().
writeBytes(llvm::StringRef Bytes)776 	void writeBytes(llvm::StringRef Bytes) override
777 	{
778 		std::size_t oldSize = buffer.size();
779 		buffer.resize(oldSize + Bytes.size());
780 		memcpy(&buffer[oldSize], Bytes.begin(), Bytes.size());
781 		position += Bytes.size();
782 	}
783 
	// Current write position within the ELF output stream.
tell() const784 	uint64_t tell() const override
785 	{
786 		return position;
787 	}
788 
seek(uint64_t Off)789 	void seek(uint64_t Off) override
790 	{
791 		position = Off;
792 	}
793 
	// Relocates the buffered ELF image in place and returns the entry points
	// matching the given function names, flushing the instruction cache so
	// the freshly written code is safe to execute.
loadImageAndGetEntryPoints(const std::vector<const char * > & functionNames)794 	std::vector<EntryPoint> loadImageAndGetEntryPoints(const std::vector<const char *> &functionNames)
795 	{
796 		auto entryPoints = loadImage(&buffer[0], functionNames);
797 
798 #if defined(_WIN32)
799 		FlushInstructionCache(GetCurrentProcess(), NULL, 0);
800 #else
801 		for(auto &entryPoint : entryPoints)
802 		{
803 			__builtin___clear_cache((char *)entryPoint.entry, (char *)entryPoint.entry + entryPoint.codeSize);
804 		}
805 #endif
806 
807 		return entryPoints;
808 	}
809 
	// Seals the buffer: no further streaming, and the pages become
	// read+execute so the contained code can run.
finalize()810 	void finalize()
811 	{
812 		position = std::numeric_limits<std::size_t>::max();  // Can't stream more data after this
813 
814 		protectMemoryPages(&buffer[0], buffer.size(), PERMISSION_READ | PERMISSION_EXECUTE);
815 	}
816 
	// Registers/retrieves a named entry point by coroutine-entry index.
setEntry(int index,const void * func)817 	void setEntry(int index, const void *func)
818 	{
819 		ASSERT(func);
820 		funcs[index] = func;
821 	}
822 
getEntry(int index) const823 	const void *getEntry(int index) const override
824 	{
825 		ASSERT(funcs[index]);
826 		return funcs[index];
827 	}
828 
	// Interns a read-only constant blob, deduplicating identical data with a
	// compatible alignment; returns a stable pointer the JIT'd code can
	// reference for the lifetime of this Routine.
addConstantData(const void * data,size_t size,size_t alignment=1)829 	const void *addConstantData(const void *data, size_t size, size_t alignment = 1)
830 	{
831 		// Check if we already have a suitable constant.
832 		for(const auto &c : constantsPool)
833 		{
834 			void *ptr = c.data.get();
835 			size_t space = c.space;
836 
837 			void *alignedPtr = std::align(alignment, size, ptr, space);
838 
839 			if(space < size)
840 			{
841 				continue;
842 			}
843 
844 			if(memcmp(data, alignedPtr, size) == 0)
845 			{
846 				return alignedPtr;
847 			}
848 		}
849 
850 		// TODO(b/148086935): Replace with a buffer allocator.
851 		size_t space = size + alignment;
852 		auto buf = std::unique_ptr<uint8_t[]>(new uint8_t[space]);
853 		void *ptr = buf.get();
854 		void *alignedPtr = std::align(alignment, size, ptr, space);
855 		ASSERT(alignedPtr);
856 		memcpy(alignedPtr, data, size);
857 		constantsPool.emplace_back(std::move(buf), space);
858 
859 		return alignedPtr;
860 	}
861 
862 private:
	// An owned constant blob plus the usable space in its allocation.
863 	struct Constant
864 	{
Constantrr::ELFMemoryStreamer::Constant865 		Constant(std::unique_ptr<uint8_t[]> data, size_t space)
866 		    : data(std::move(data))
867 		    , space(space)
868 		{}
869 
870 		std::unique_ptr<uint8_t[]> data;
871 		size_t space;
872 	};
873 
874 	std::array<const void *, Nucleus::CoroutineEntryCount> funcs = {};
	// ELF image storage; allocated in whole pages so finalize() can flip
	// the protection to read+execute.
875 	std::vector<uint8_t, ExecutableAllocator<uint8_t>> buffer;
876 	std::size_t position;
877 	std::vector<Constant> constantsPool;
878 };
879 
880 #ifdef ENABLE_RR_PRINT
// Emits a variadic call to rr::DebugPrintf with the given operands into the
// basic block currently under construction.
VPrintf(const std::vector<Value * > & vals)881 void VPrintf(const std::vector<Value *> &vals)
882 {
883 	sz::Call(::function, ::basicBlock, Ice::IceType_i32, reinterpret_cast<const void *>(rr::DebugPrintf), V(vals), true);
884 }
885 #endif  // ENABLE_RR_PRINT
886 
// Begins a code-generation session: takes the global codegen lock, configures
// Subzero's target/optimization flags for this platform, and creates the
// global context that emits ELF into an in-memory streamer.
Nucleus()887 Nucleus::Nucleus()
888 {
889 	::codegenMutex.lock();  // SubzeroReactor is currently not thread safe
890 
891 	Ice::ClFlags &Flags = Ice::ClFlags::Flags;
892 	Ice::ClFlags::getParsedClFlags(Flags);
893 
894 #if defined(__arm__)
895 	Flags.setTargetArch(Ice::Target_ARM32);
896 	Flags.setTargetInstructionSet(Ice::ARM32InstructionSet_HWDivArm);
897 #elif defined(__mips__)
898 	Flags.setTargetArch(Ice::Target_MIPS32);
899 	Flags.setTargetInstructionSet(Ice::BaseInstructionSet);
900 #else  // x86
901 	Flags.setTargetArch(sizeof(void *) == 8 ? Ice::Target_X8664 : Ice::Target_X8632);
902 	Flags.setTargetInstructionSet(CPUID::SSE4_1 ? Ice::X86InstructionSet_SSE4_1 : Ice::X86InstructionSet_SSE2);
903 #endif
904 	Flags.setOutFileType(Ice::FT_Elf);
905 	Flags.setOptLevel(toIce(rr::getPragmaState(rr::OptimizationLevel)));
906 	Flags.setVerbose(subzeroDumpEnabled ? Ice::IceV_Most : Ice::IceV_None);
907 	Flags.setDisableHybridAssembly(true);
908 
909 	// Emit functions into separate sections in the ELF so we can find them by name
910 	Flags.setFunctionSections(true);
911 
912 	static llvm::raw_os_ostream cout(std::cout);
913 	static llvm::raw_os_ostream cerr(std::cerr);
914 
915 	if(subzeroEmitTextAsm)
916 	{
917 		// Decorate text asm with liveness info
918 		Flags.setDecorateAsm(true);
919 	}
920 
	// Debug path: stream the ELF to an out.o file instead of memory.
921 	if(false)  // Write out to a file
922 	{
923 		std::error_code errorCode;
924 		::out = new Ice::Fdstream("out.o", errorCode, llvm::sys::fs::F_None);
925 		::elfFile = new Ice::ELFFileStreamer(*out);
926 		::context = new Ice::GlobalContext(&cout, &cout, &cerr, elfFile);
927 	}
928 	else
929 	{
930 		ELFMemoryStreamer *elfMemory = new ELFMemoryStreamer();
931 		::context = new Ice::GlobalContext(&cout, &cout, &cerr, elfMemory);
932 		::routine = elfMemory;
933 	}
934 
935 #if !__has_feature(memory_sanitizer)
936 	// thread_local variables in shared libraries are initialized at load-time,
937 	// but this is not observed by MemorySanitizer if the loader itself was not
938 	// instrumented, leading to false-positive uninitialized variable errors.
939 	ASSERT(Variable::unmaterializedVariables == nullptr);
940 #endif
941 	Variable::unmaterializedVariables = new Variable::UnmaterializedVariables{};
942 }
943 
// Ends the code-generation session: tears down per-session state (allocator
// before function, function before context) and releases the global codegen
// lock last so the next Nucleus sees a clean slate.
~Nucleus()944 Nucleus::~Nucleus()
945 {
946 	delete Variable::unmaterializedVariables;
947 	Variable::unmaterializedVariables = nullptr;
948 
949 	delete ::routine;
950 	::routine = nullptr;
951 
952 	delete ::allocator;
953 	::allocator = nullptr;
954 
955 	delete ::function;
956 	::function = nullptr;
957 
958 	delete ::context;
959 	::context = nullptr;
960 
961 	delete ::elfFile;
962 	::elfFile = nullptr;
963 
964 	delete ::out;
965 	::out = nullptr;
966 
	// Raw node pointers are owned by ::function; just clear the references.
966 	::entryBlock = nullptr;
967 	::basicBlock = nullptr;
968 	::basicBlockTop = nullptr;
969 
970 	::codegenMutex.unlock();
971 }
973 
974 // This function lowers and produces executable binary code in memory for the input functions,
975 // and returns a Routine with the entry points to these functions.
template<size_t Count>
static std::shared_ptr<Routine> acquireRoutine(Ice::Cfg *const (&functions)[Count], const char *const (&names)[Count])
{
	// This logic is modeled after the IceCompiler, as well as GlobalContext::translateFunctions
	// and GlobalContext::emitItems.

	if(subzeroDumpEnabled)
	{
		// Output dump strings immediately, rather than once buffer is full. Useful for debugging.
		::context->getStrDump().SetUnbuffered();
	}

	::context->emitFileHeader();

	// Translate

	for(size_t i = 0; i < Count; ++i)
	{
		Ice::Cfg *currFunc = functions[i];

		// Install function allocator in TLS for Cfg-specific container allocators
		Ice::CfgLocalAllocatorScope allocScope(currFunc);

		currFunc->setFunctionName(Ice::GlobalString::createWithString(::context, names[i]));

		// The optimizer callback, if set, receives a report of the Reactor-level
		// optimizations and is consumed (one-shot) after the first function.
		if(::optimizerCallback)
		{
			Nucleus::OptimizerReport report;
			rr::optimize(currFunc, &report);
			::optimizerCallback(&report);
			::optimizerCallback = nullptr;
		}
		else
		{
			rr::optimize(currFunc);
		}

		currFunc->computeInOutEdges();
		ASSERT_MSG(!currFunc->hasError(), "%s", currFunc->getError().c_str());

		currFunc->translate();
		ASSERT_MSG(!currFunc->hasError(), "%s", currFunc->getError().c_str());

		currFunc->getAssembler<>()->setInternal(currFunc->getInternal());

		if(subzeroEmitTextAsm)
		{
			currFunc->emit();
		}

		// Integrated assembler: produces the machine code buffer.
		currFunc->emitIAS();

		if(currFunc->hasError())
		{
			return nullptr;
		}
	}

	// Emit items

	::context->lowerGlobals("");

	auto objectWriter = ::context->getObjectWriter();

	for(size_t i = 0; i < Count; ++i)
	{
		Ice::Cfg *currFunc = functions[i];

		// Accumulate globals from functions to emit into the "last" section at the end
		auto globals = currFunc->getGlobalInits();
		if(globals && !globals->empty())
		{
			::context->getGlobals()->merge(globals.get());
		}

		auto assembler = currFunc->releaseAssembler();
		assembler->alignFunction();
		objectWriter->writeFunctionCode(currFunc->getFunctionName(), currFunc->getInternal(), assembler.get());
	}

	::context->lowerGlobals("last");
	::context->lowerConstants();
	::context->lowerJumpTables();

	objectWriter->setUndefinedSyms(::context->getConstantExternSyms());
	::context->emitTargetRODataSections();
	objectWriter->writeNonUserSections();

	// Done compiling functions, get entry pointers to each of them
	auto entryPoints = ::routine->loadImageAndGetEntryPoints({ names, names + Count });
	ASSERT(entryPoints.size() == Count);
	for(size_t i = 0; i < entryPoints.size(); ++i)
	{
		::routine->setEntry(i, entryPoints[i].entry);
	}

	::routine->finalize();

	// Ownership of the routine transfers to the returned shared_ptr; the
	// global is cleared so the destructor does not double-delete it.
	Routine *handoffRoutine = ::routine;
	::routine = nullptr;

	return std::shared_ptr<Routine>(handoffRoutine);
}
1079 
// Finalizes the single function under construction and compiles it into an
// executable Routine with one entry point, registered under 'name'.
std::shared_ptr<Routine> Nucleus::acquireRoutine(const char *name)
{
	finalizeFunction();
	return rr::acquireRoutine({ ::function }, { name });
}
1085 
allocateStackVariable(Type * t,int arraySize)1086 Value *Nucleus::allocateStackVariable(Type *t, int arraySize)
1087 {
1088 	Ice::Type type = T(t);
1089 	int typeSize = Ice::typeWidthInBytes(type);
1090 	int totalSize = typeSize * (arraySize ? arraySize : 1);
1091 
1092 	auto bytes = Ice::ConstantInteger32::create(::context, Ice::IceType_i32, totalSize);
1093 	auto address = ::function->makeVariable(T(getPointerType(t)));
1094 	auto alloca = Ice::InstAlloca::create(::function, address, bytes, typeSize);  // SRoA depends on the alignment to match the type size.
1095 	::function->getEntryNode()->getInsts().push_front(alloca);
1096 
1097 	return V(address);
1098 }
1099 
// Creates a new, empty basic block in the function under construction.
BasicBlock *Nucleus::createBasicBlock()
{
	return B(::function->makeNode());
}
1104 
// Returns the basic block that new instructions are currently appended to.
BasicBlock *Nucleus::getInsertBlock()
{
	return B(::basicBlock);
}
1109 
// Redirects instruction emission to the given basic block.
void Nucleus::setInsertBlock(BasicBlock *basicBlock)
{
	// ASSERT(::basicBlock->getInsts().back().getTerminatorEdges().size() >= 0 && "Previous basic block must have a terminator");

	::basicBlock = basicBlock;
}
1116 
// Begins construction of a new function with the given signature, setting up
// the global Cfg, its allocator scope, and the initial basic blocks.
void Nucleus::createFunction(Type *returnType, const std::vector<Type *> &paramTypes)
{
	// Only one function may be under construction at a time.
	ASSERT(::function == nullptr);
	ASSERT(::allocator == nullptr);
	ASSERT(::entryBlock == nullptr);
	ASSERT(::basicBlock == nullptr);
	ASSERT(::basicBlockTop == nullptr);

	::function = sz::createFunction(::context, T(returnType), T(paramTypes));

	// NOTE: The scoped allocator sets the TLS allocator to the one in the function. This global one
	// becomes invalid if another one is created; for example, when creating await and destroy functions
	// for coroutines, in which case, we must make sure to create a new scoped allocator for ::function again.
	// TODO: Get rid of this as a global, and create scoped allocs in every Nucleus function instead.
	::allocator = new Ice::CfgLocalAllocatorScope(::function);

	::entryBlock = ::function->getEntryNode();
	::basicBlock = ::function->makeNode();
	::basicBlockTop = ::basicBlock;
}
1137 
// Returns the function argument at the given index (no bounds checking).
Value *Nucleus::getArgument(unsigned int index)
{
	return V(::function->getArgs()[index]);
}
1142 
// Appends a void return instruction, terminating the current basic block.
void Nucleus::createRetVoid()
{
	RR_DEBUG_INFO_UPDATE_LOC();

	// Code generated after this point is unreachable, so any variables
	// being read can safely return an undefined value. We have to avoid
	// materializing variables after the terminator ret instruction.
	Variable::killUnmaterialized();

	Ice::InstRet *ret = Ice::InstRet::create(::function);
	::basicBlock->appendInst(ret);
}
1155 
// Appends a return instruction yielding 'v', terminating the current basic block.
void Nucleus::createRet(Value *v)
{
	RR_DEBUG_INFO_UPDATE_LOC();

	// Code generated after this point is unreachable, so any variables
	// being read can safely return an undefined value. We have to avoid
	// materializing variables after the terminator ret instruction.
	Variable::killUnmaterialized();

	Ice::InstRet *ret = Ice::InstRet::create(::function, v);
	::basicBlock->appendInst(ret);
}
1168 
// Appends an unconditional branch to 'dest'. All pending variables are
// materialized first so their values are defined along the edge.
void Nucleus::createBr(BasicBlock *dest)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	Variable::materializeAll();

	auto br = Ice::InstBr::create(::function, dest);
	::basicBlock->appendInst(br);
}
1177 
// Appends a conditional branch on 'cond' to ifTrue/ifFalse. All pending
// variables are materialized first so their values are defined on both edges.
void Nucleus::createCondBr(Value *cond, BasicBlock *ifTrue, BasicBlock *ifFalse)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	Variable::materializeAll();

	auto br = Ice::InstBr::create(::function, cond, ifTrue, ifFalse);
	::basicBlock->appendInst(br);
}
1186 
isCommutative(Ice::InstArithmetic::OpKind op)1187 static bool isCommutative(Ice::InstArithmetic::OpKind op)
1188 {
1189 	switch(op)
1190 	{
1191 	case Ice::InstArithmetic::Add:
1192 	case Ice::InstArithmetic::Fadd:
1193 	case Ice::InstArithmetic::Mul:
1194 	case Ice::InstArithmetic::Fmul:
1195 	case Ice::InstArithmetic::And:
1196 	case Ice::InstArithmetic::Or:
1197 	case Ice::InstArithmetic::Xor:
1198 		return true;
1199 	default:
1200 		return false;
1201 	}
1202 }
1203 
createArithmetic(Ice::InstArithmetic::OpKind op,Value * lhs,Value * rhs)1204 static Value *createArithmetic(Ice::InstArithmetic::OpKind op, Value *lhs, Value *rhs)
1205 {
1206 	ASSERT(lhs->getType() == rhs->getType() || llvm::isa<Ice::Constant>(rhs));
1207 
1208 	bool swapOperands = llvm::isa<Ice::Constant>(lhs) && isCommutative(op);
1209 
1210 	Ice::Variable *result = ::function->makeVariable(lhs->getType());
1211 	Ice::InstArithmetic *arithmetic = Ice::InstArithmetic::create(::function, op, result, swapOperands ? rhs : lhs, swapOperands ? lhs : rhs);
1212 	::basicBlock->appendInst(arithmetic);
1213 
1214 	return V(result);
1215 }
1216 
// Integer addition (scalar or vector operand types).
Value *Nucleus::createAdd(Value *lhs, Value *rhs)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return createArithmetic(Ice::InstArithmetic::Add, lhs, rhs);
}
1222 
// Integer subtraction: lhs - rhs.
Value *Nucleus::createSub(Value *lhs, Value *rhs)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return createArithmetic(Ice::InstArithmetic::Sub, lhs, rhs);
}
1228 
// Integer multiplication.
Value *Nucleus::createMul(Value *lhs, Value *rhs)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return createArithmetic(Ice::InstArithmetic::Mul, lhs, rhs);
}
1234 
// Unsigned integer division.
Value *Nucleus::createUDiv(Value *lhs, Value *rhs)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return createArithmetic(Ice::InstArithmetic::Udiv, lhs, rhs);
}
1240 
// Signed integer division.
Value *Nucleus::createSDiv(Value *lhs, Value *rhs)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return createArithmetic(Ice::InstArithmetic::Sdiv, lhs, rhs);
}
1246 
// Floating-point addition.
Value *Nucleus::createFAdd(Value *lhs, Value *rhs)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return createArithmetic(Ice::InstArithmetic::Fadd, lhs, rhs);
}
1252 
// Floating-point subtraction: lhs - rhs.
Value *Nucleus::createFSub(Value *lhs, Value *rhs)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return createArithmetic(Ice::InstArithmetic::Fsub, lhs, rhs);
}
1258 
// Floating-point multiplication.
Value *Nucleus::createFMul(Value *lhs, Value *rhs)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return createArithmetic(Ice::InstArithmetic::Fmul, lhs, rhs);
}
1264 
// Floating-point division.
Value *Nucleus::createFDiv(Value *lhs, Value *rhs)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return createArithmetic(Ice::InstArithmetic::Fdiv, lhs, rhs);
}
1270 
// Unsigned integer remainder.
Value *Nucleus::createURem(Value *lhs, Value *rhs)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return createArithmetic(Ice::InstArithmetic::Urem, lhs, rhs);
}
1276 
// Signed integer remainder.
Value *Nucleus::createSRem(Value *lhs, Value *rhs)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return createArithmetic(Ice::InstArithmetic::Srem, lhs, rhs);
}
1282 
// Floating-point remainder — intentionally unimplemented in this backend.
Value *Nucleus::createFRem(Value *lhs, Value *rhs)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	// TODO(b/148139679) Fix Subzero generating invalid code for FRem on vector types
	// createArithmetic(Ice::InstArithmetic::Frem, lhs, rhs);
	UNIMPLEMENTED("b/148139679 Nucleus::createFRem");
	return nullptr;
}
1291 
// Left shift: lhs << rhs.
Value *Nucleus::createShl(Value *lhs, Value *rhs)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return createArithmetic(Ice::InstArithmetic::Shl, lhs, rhs);
}
1297 
// Logical (zero-filling) right shift.
Value *Nucleus::createLShr(Value *lhs, Value *rhs)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return createArithmetic(Ice::InstArithmetic::Lshr, lhs, rhs);
}
1303 
// Arithmetic (sign-extending) right shift.
Value *Nucleus::createAShr(Value *lhs, Value *rhs)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return createArithmetic(Ice::InstArithmetic::Ashr, lhs, rhs);
}
1309 
// Bitwise AND.
Value *Nucleus::createAnd(Value *lhs, Value *rhs)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return createArithmetic(Ice::InstArithmetic::And, lhs, rhs);
}
1315 
// Bitwise OR.
Value *Nucleus::createOr(Value *lhs, Value *rhs)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return createArithmetic(Ice::InstArithmetic::Or, lhs, rhs);
}
1321 
// Bitwise XOR.
Value *Nucleus::createXor(Value *lhs, Value *rhs)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return createArithmetic(Ice::InstArithmetic::Xor, lhs, rhs);
}
1327 
// Integer negation, implemented as 0 - v.
Value *Nucleus::createNeg(Value *v)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return createSub(createNullValue(T(v->getType())), v);
}
1333 
createFNeg(Value * v)1334 Value *Nucleus::createFNeg(Value *v)
1335 {
1336 	RR_DEBUG_INFO_UPDATE_LOC();
1337 	std::vector<double> c = { -0.0 };
1338 	Value *negativeZero = Ice::isVectorType(v->getType()) ? createConstantVector(c, T(v->getType())) : V(::context->getConstantFloat(-0.0f));
1339 
1340 	return createFSub(negativeZero, v);
1341 }
1342 
createNot(Value * v)1343 Value *Nucleus::createNot(Value *v)
1344 {
1345 	RR_DEBUG_INFO_UPDATE_LOC();
1346 	if(Ice::isScalarIntegerType(v->getType()))
1347 	{
1348 		return createXor(v, V(::context->getConstantInt(v->getType(), -1)));
1349 	}
1350 	else  // Vector
1351 	{
1352 		std::vector<int64_t> c = { -1 };
1353 		return createXor(v, createConstantVector(c, T(v->getType())));
1354 	}
1355 }
1356 
// Checks that the requested atomicity and memory ordering are supported on
// the current target; traps via UNIMPLEMENTED otherwise.
static void validateAtomicAndMemoryOrderArgs(bool atomic, std::memory_order memoryOrder)
{
#if defined(__i386__) || defined(__x86_64__)
	// We're good, atomics and strictest memory order (except seq_cst) are guaranteed.
	// Note that sequential memory ordering could be guaranteed by using x86's LOCK prefix.
	// Note also that relaxed memory order could be implemented using MOVNTPS and friends.
#else
	if(atomic)
	{
		UNIMPLEMENTED("b/150475088 Atomic load/store not implemented for current platform");
	}
	if(memoryOrder != std::memory_order_relaxed)
	{
		UNIMPLEMENTED("b/150475088 Memory order other than memory_order_relaxed not implemented for current platform");
	}
#endif

	// Vulkan doesn't allow sequential memory order
	ASSERT(memoryOrder != std::memory_order_seq_cst);
}
1377 
// Loads a value of 'type' from 'ptr'. Emulated narrow vector types (tagged
// via EmulatedBits in the type pointer) that are not on the stack (align != 0)
// are loaded either through scalar loads + inserts (emulateIntrinsics) or via
// the LoadSubVector intrinsic; everything else is a plain load.
Value *Nucleus::createLoad(Value *ptr, Type *type, bool isVolatile, unsigned int align, bool atomic, std::memory_order memoryOrder)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	validateAtomicAndMemoryOrderArgs(atomic, memoryOrder);

	// Reactor encodes emulated-type tags in the low bits of the Type pointer.
	int valueType = (int)reinterpret_cast<intptr_t>(type);
	Ice::Variable *result = nullptr;

	if((valueType & EmulatedBits) && (align != 0))  // Narrow vector not stored on stack.
	{
		if(emulateIntrinsics)
		{
			if(typeSize(type) == 4)
			{
				// Load one 32-bit scalar and place it in lane 0.
				auto pointer = RValue<Pointer<Byte>>(ptr);
				Int x = *Pointer<Int>(pointer);

				Int4 vector;
				vector = Insert(vector, x, 0);

				result = ::function->makeVariable(T(type));
				auto bitcast = Ice::InstCast::create(::function, Ice::InstCast::Bitcast, result, vector.loadValue());
				::basicBlock->appendInst(bitcast);
			}
			else if(typeSize(type) == 8)
			{
				// Two separate 32-bit loads; not a single atomic access.
				ASSERT_MSG(!atomic, "Emulated 64-bit loads are not atomic");
				auto pointer = RValue<Pointer<Byte>>(ptr);
				Int x = *Pointer<Int>(pointer);
				Int y = *Pointer<Int>(pointer + 4);

				Int4 vector;
				vector = Insert(vector, x, 0);
				vector = Insert(vector, y, 1);

				result = ::function->makeVariable(T(type));
				auto bitcast = Ice::InstCast::create(::function, Ice::InstCast::Bitcast, result, vector.loadValue());
				::basicBlock->appendInst(bitcast);
			}
			else
				UNREACHABLE("typeSize(type): %d", int(typeSize(type)));
		}
		else
		{
			// LoadSubVector(ptr, byteCount): loads only typeSize(type) bytes.
			const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::LoadSubVector, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
			result = ::function->makeVariable(T(type));
			auto load = Ice::InstIntrinsic::create(::function, 2, result, intrinsic);
			load->addArg(ptr);
			load->addArg(::context->getConstantInt32(typeSize(type)));
			::basicBlock->appendInst(load);
		}
	}
	else
	{
		result = sz::createLoad(::function, ::basicBlock, V(ptr), T(type), align);
	}

	ASSERT(result);
	return V(result);
}
1438 
// Stores 'value' of 'type' to 'ptr'. Mirrors createLoad: emulated narrow
// vector types not on the stack are stored via scalar extracts + stores
// (emulateIntrinsics) or the StoreSubVector intrinsic; everything else is a
// plain store. Returns 'value' unchanged.
Value *Nucleus::createStore(Value *value, Value *ptr, Type *type, bool isVolatile, unsigned int align, bool atomic, std::memory_order memoryOrder)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	validateAtomicAndMemoryOrderArgs(atomic, memoryOrder);

#if __has_feature(memory_sanitizer)
	// Mark all (non-stack) memory writes as initialized by calling __msan_unpoison
	if(align != 0)
	{
		auto call = Ice::InstCall::create(::function, 2, nullptr, ::context->getConstantInt64(reinterpret_cast<intptr_t>(__msan_unpoison)), false);
		call->addArg(ptr);
		call->addArg(::context->getConstantInt64(typeSize(type)));
		::basicBlock->appendInst(call);
	}
#endif

	// Reactor encodes emulated-type tags in the low bits of the Type pointer.
	int valueType = (int)reinterpret_cast<intptr_t>(type);

	if((valueType & EmulatedBits) && (align != 0))  // Narrow vector not stored on stack.
	{
		if(emulateIntrinsics)
		{
			if(typeSize(type) == 4)
			{
				// Reinterpret as v4i32, then store only lane 0.
				Ice::Variable *vector = ::function->makeVariable(Ice::IceType_v4i32);
				auto bitcast = Ice::InstCast::create(::function, Ice::InstCast::Bitcast, vector, value);
				::basicBlock->appendInst(bitcast);

				RValue<Int4> v(V(vector));

				auto pointer = RValue<Pointer<Byte>>(ptr);
				Int x = Extract(v, 0);
				*Pointer<Int>(pointer) = x;
			}
			else if(typeSize(type) == 8)
			{
				// Two separate 32-bit stores; not a single atomic access.
				ASSERT_MSG(!atomic, "Emulated 64-bit stores are not atomic");
				Ice::Variable *vector = ::function->makeVariable(Ice::IceType_v4i32);
				auto bitcast = Ice::InstCast::create(::function, Ice::InstCast::Bitcast, vector, value);
				::basicBlock->appendInst(bitcast);

				RValue<Int4> v(V(vector));

				auto pointer = RValue<Pointer<Byte>>(ptr);
				Int x = Extract(v, 0);
				*Pointer<Int>(pointer) = x;
				Int y = Extract(v, 1);
				*Pointer<Int>(pointer + 4) = y;
			}
			else
				UNREACHABLE("typeSize(type): %d", int(typeSize(type)));
		}
		else
		{
			// StoreSubVector(value, ptr, byteCount): stores only typeSize(type) bytes.
			const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::StoreSubVector, Ice::Intrinsics::SideEffects_T, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_T };
			auto store = Ice::InstIntrinsic::create(::function, 3, nullptr, intrinsic);
			store->addArg(value);
			store->addArg(ptr);
			store->addArg(::context->getConstantInt32(typeSize(type)));
			::basicBlock->appendInst(store);
		}
	}
	else
	{
		ASSERT(value->getType() == T(type));

		auto store = Ice::InstStore::create(::function, V(value), V(ptr), align);
		::basicBlock->appendInst(store);
	}

	return value;
}
1511 
// Computes ptr + index * sizeof(type) as plain pointer arithmetic (Subzero
// has no GEP instruction). Constant indices fold to a single add; variable
// indices are scaled and widened to the pointer width first.
Value *Nucleus::createGEP(Value *ptr, Type *type, Value *index, bool unsignedIndex)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	ASSERT(index->getType() == Ice::IceType_i32);

	if(auto *constant = llvm::dyn_cast<Ice::ConstantInteger32>(index))
	{
		int32_t offset = constant->getValue() * (int)typeSize(type);

		if(offset == 0)
		{
			return ptr;
		}

		return createAdd(ptr, createConstantInt(offset));
	}

	// Byte-sized element types need no scaling.
	if(!Ice::isByteSizedType(T(type)))
	{
		index = createMul(index, createConstantInt((int)typeSize(type)));
	}

	// On 64-bit targets the 32-bit index must be widened to pointer width;
	// the extension kind follows the caller's signedness.
	if(sizeof(void *) == 8)
	{
		if(unsignedIndex)
		{
			index = createZExt(index, T(Ice::IceType_i64));
		}
		else
		{
			index = createSExt(index, T(Ice::IceType_i64));
		}
	}

	return createAdd(ptr, index);
}
1548 
// Emits an AtomicRMW intrinsic performing 'rmwOp' on *ptr with 'value' and
// returns the previous value. Argument order (op, ptr, value, order) is the
// intrinsic's expected signature — do not reorder.
static Value *createAtomicRMW(Ice::Intrinsics::AtomicRMWOperation rmwOp, Value *ptr, Value *value, std::memory_order memoryOrder)
{
	Ice::Variable *result = ::function->makeVariable(value->getType());

	const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::AtomicRMW, Ice::Intrinsics::SideEffects_T, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_T };
	auto inst = Ice::InstIntrinsic::create(::function, 0, result, intrinsic);
	auto op = ::context->getConstantInt32(rmwOp);
	auto order = ::context->getConstantInt32(stdToIceMemoryOrder(memoryOrder));
	inst->addArg(op);
	inst->addArg(ptr);
	inst->addArg(value);
	inst->addArg(order);
	::basicBlock->appendInst(inst);

	return V(result);
}
1565 
// Atomic fetch-and-add on *ptr; returns the previous value.
Value *Nucleus::createAtomicAdd(Value *ptr, Value *value, std::memory_order memoryOrder)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return createAtomicRMW(Ice::Intrinsics::AtomicAdd, ptr, value, memoryOrder);
}
1571 
// Atomic fetch-and-subtract on *ptr; returns the previous value.
Value *Nucleus::createAtomicSub(Value *ptr, Value *value, std::memory_order memoryOrder)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return createAtomicRMW(Ice::Intrinsics::AtomicSub, ptr, value, memoryOrder);
}
1577 
// Atomic fetch-and-AND on *ptr; returns the previous value.
Value *Nucleus::createAtomicAnd(Value *ptr, Value *value, std::memory_order memoryOrder)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return createAtomicRMW(Ice::Intrinsics::AtomicAnd, ptr, value, memoryOrder);
}
1583 
// Atomic fetch-and-OR on *ptr; returns the previous value.
Value *Nucleus::createAtomicOr(Value *ptr, Value *value, std::memory_order memoryOrder)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return createAtomicRMW(Ice::Intrinsics::AtomicOr, ptr, value, memoryOrder);
}
1589 
// Atomic fetch-and-XOR on *ptr; returns the previous value.
Value *Nucleus::createAtomicXor(Value *ptr, Value *value, std::memory_order memoryOrder)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return createAtomicRMW(Ice::Intrinsics::AtomicXor, ptr, value, memoryOrder);
}
1595 
// Atomic exchange: stores 'value' to *ptr and returns the previous value.
Value *Nucleus::createAtomicExchange(Value *ptr, Value *value, std::memory_order memoryOrder)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return createAtomicRMW(Ice::Intrinsics::AtomicExchange, ptr, value, memoryOrder);
}
1601 
// Atomic compare-and-exchange: if *ptr equals 'compare', stores 'value';
// returns the value previously held at *ptr. Argument order
// (ptr, compare, value, orderEq, orderNeq) is the intrinsic's expected
// signature — do not reorder.
Value *Nucleus::createAtomicCompareExchange(Value *ptr, Value *value, Value *compare, std::memory_order memoryOrderEqual, std::memory_order memoryOrderUnequal)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	Ice::Variable *result = ::function->makeVariable(value->getType());

	const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::AtomicCmpxchg, Ice::Intrinsics::SideEffects_T, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_T };
	auto inst = Ice::InstIntrinsic::create(::function, 0, result, intrinsic);
	auto orderEq = ::context->getConstantInt32(stdToIceMemoryOrder(memoryOrderEqual));
	auto orderNeq = ::context->getConstantInt32(stdToIceMemoryOrder(memoryOrderUnequal));
	inst->addArg(ptr);
	inst->addArg(compare);
	inst->addArg(value);
	inst->addArg(orderEq);
	inst->addArg(orderNeq);
	::basicBlock->appendInst(inst);

	return V(result);
}
1620 
createCast(Ice::InstCast::OpKind op,Value * v,Type * destType)1621 static Value *createCast(Ice::InstCast::OpKind op, Value *v, Type *destType)
1622 {
1623 	if(v->getType() == T(destType))
1624 	{
1625 		return v;
1626 	}
1627 
1628 	Ice::Variable *result = ::function->makeVariable(T(destType));
1629 	Ice::InstCast *cast = Ice::InstCast::create(::function, op, result, v);
1630 	::basicBlock->appendInst(cast);
1631 
1632 	return V(result);
1633 }
1634 
// Integer truncation to a narrower type.
Value *Nucleus::createTrunc(Value *v, Type *destType)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return createCast(Ice::InstCast::Trunc, v, destType);
}
1640 
// Zero-extension to a wider integer type.
Value *Nucleus::createZExt(Value *v, Type *destType)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return createCast(Ice::InstCast::Zext, v, destType);
}
1646 
// Sign-extension to a wider integer type.
Value *Nucleus::createSExt(Value *v, Type *destType)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return createCast(Ice::InstCast::Sext, v, destType);
}
1652 
// Floating-point to unsigned integer conversion.
Value *Nucleus::createFPToUI(Value *v, Type *destType)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return createCast(Ice::InstCast::Fptoui, v, destType);
}
1658 
// Floating-point to signed integer conversion.
Value *Nucleus::createFPToSI(Value *v, Type *destType)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return createCast(Ice::InstCast::Fptosi, v, destType);
}
1664 
// Signed integer to floating-point conversion.
Value *Nucleus::createSIToFP(Value *v, Type *destType)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return createCast(Ice::InstCast::Sitofp, v, destType);
}
1670 
// Floating-point truncation to a narrower floating-point type.
Value *Nucleus::createFPTrunc(Value *v, Type *destType)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return createCast(Ice::InstCast::Fptrunc, v, destType);
}
1676 
// Floating-point extension to a wider floating-point type.
Value *Nucleus::createFPExt(Value *v, Type *destType)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return createCast(Ice::InstCast::Fpext, v, destType);
}
1682 
// Reinterprets the bits of 'v' as 'destType' without changing them.
Value *Nucleus::createBitCast(Value *v, Type *destType)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	// Bitcasts must be between types of the same logical size. But with emulated narrow vectors we need
	// support for casting between scalars and wide vectors. For platforms where this is not supported,
	// emulate them by writing to the stack and reading back as the destination type.
	if(emulateMismatchedBitCast || (v->getType() == Ice::Type::IceType_i64))
	{
		// Scalar -> vector: spill the scalar, reload as the vector type.
		if(!Ice::isVectorType(v->getType()) && Ice::isVectorType(T(destType)))
		{
			Value *address = allocateStackVariable(destType);
			createStore(v, address, T(v->getType()));
			return createLoad(address, destType);
		}
		// Vector -> scalar: spill the vector, reload as the scalar type.
		else if(Ice::isVectorType(v->getType()) && !Ice::isVectorType(T(destType)))
		{
			Value *address = allocateStackVariable(T(v->getType()));
			createStore(v, address, T(v->getType()));
			return createLoad(address, destType);
		}
	}

	return createCast(Ice::InstCast::Bitcast, v, destType);
}
1707 
createIntCompare(Ice::InstIcmp::ICond condition,Value * lhs,Value * rhs)1708 static Value *createIntCompare(Ice::InstIcmp::ICond condition, Value *lhs, Value *rhs)
1709 {
1710 	ASSERT(lhs->getType() == rhs->getType());
1711 
1712 	auto result = ::function->makeVariable(Ice::isScalarIntegerType(lhs->getType()) ? Ice::IceType_i1 : lhs->getType());
1713 	auto cmp = Ice::InstIcmp::create(::function, condition, result, lhs, rhs);
1714 	::basicBlock->appendInst(cmp);
1715 
1716 	return V(result);
1717 }
1718 
// Integer equality comparison.
Value *Nucleus::createICmpEQ(Value *lhs, Value *rhs)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return createIntCompare(Ice::InstIcmp::Eq, lhs, rhs);
}
1724 
// Integer inequality comparison.
Value *Nucleus::createICmpNE(Value *lhs, Value *rhs)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return createIntCompare(Ice::InstIcmp::Ne, lhs, rhs);
}
1730 
// Unsigned greater-than comparison.
Value *Nucleus::createICmpUGT(Value *lhs, Value *rhs)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return createIntCompare(Ice::InstIcmp::Ugt, lhs, rhs);
}
1736 
// Unsigned greater-or-equal comparison.
Value *Nucleus::createICmpUGE(Value *lhs, Value *rhs)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return createIntCompare(Ice::InstIcmp::Uge, lhs, rhs);
}
1742 
// Unsigned less-than comparison.
Value *Nucleus::createICmpULT(Value *lhs, Value *rhs)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return createIntCompare(Ice::InstIcmp::Ult, lhs, rhs);
}
1748 
// Unsigned less-or-equal comparison.
Value *Nucleus::createICmpULE(Value *lhs, Value *rhs)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return createIntCompare(Ice::InstIcmp::Ule, lhs, rhs);
}
1754 
// Signed greater-than comparison.
Value *Nucleus::createICmpSGT(Value *lhs, Value *rhs)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return createIntCompare(Ice::InstIcmp::Sgt, lhs, rhs);
}
1760 
// Signed greater-or-equal comparison.
Value *Nucleus::createICmpSGE(Value *lhs, Value *rhs)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return createIntCompare(Ice::InstIcmp::Sge, lhs, rhs);
}
1766 
// Signed less-than comparison.
Value *Nucleus::createICmpSLT(Value *lhs, Value *rhs)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return createIntCompare(Ice::InstIcmp::Slt, lhs, rhs);
}
1772 
// Signed less-or-equal comparison.
Value *Nucleus::createICmpSLE(Value *lhs, Value *rhs)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return createIntCompare(Ice::InstIcmp::Sle, lhs, rhs);
}
1778 
createFloatCompare(Ice::InstFcmp::FCond condition,Value * lhs,Value * rhs)1779 static Value *createFloatCompare(Ice::InstFcmp::FCond condition, Value *lhs, Value *rhs)
1780 {
1781 	ASSERT(lhs->getType() == rhs->getType());
1782 	ASSERT(Ice::isScalarFloatingType(lhs->getType()) || lhs->getType() == Ice::IceType_v4f32);
1783 
1784 	auto result = ::function->makeVariable(Ice::isScalarFloatingType(lhs->getType()) ? Ice::IceType_i1 : Ice::IceType_v4i32);
1785 	auto cmp = Ice::InstFcmp::create(::function, condition, result, lhs, rhs);
1786 	::basicBlock->appendInst(cmp);
1787 
1788 	return V(result);
1789 }
1790 
// Ordered floating-point equality comparison.
Value *Nucleus::createFCmpOEQ(Value *lhs, Value *rhs)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return createFloatCompare(Ice::InstFcmp::Oeq, lhs, rhs);
}
1796 
// Ordered floating-point greater-than comparison.
Value *Nucleus::createFCmpOGT(Value *lhs, Value *rhs)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return createFloatCompare(Ice::InstFcmp::Ogt, lhs, rhs);
}
1802 
// Ordered floating-point greater-or-equal comparison.
Value *Nucleus::createFCmpOGE(Value *lhs, Value *rhs)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return createFloatCompare(Ice::InstFcmp::Oge, lhs, rhs);
}
1808 
createFCmpOLT(Value * lhs,Value * rhs)1809 Value *Nucleus::createFCmpOLT(Value *lhs, Value *rhs)
1810 {
1811 	RR_DEBUG_INFO_UPDATE_LOC();
1812 	return createFloatCompare(Ice::InstFcmp::Olt, lhs, rhs);
1813 }
1814 
createFCmpOLE(Value * lhs,Value * rhs)1815 Value *Nucleus::createFCmpOLE(Value *lhs, Value *rhs)
1816 {
1817 	RR_DEBUG_INFO_UPDATE_LOC();
1818 	return createFloatCompare(Ice::InstFcmp::Ole, lhs, rhs);
1819 }
1820 
createFCmpONE(Value * lhs,Value * rhs)1821 Value *Nucleus::createFCmpONE(Value *lhs, Value *rhs)
1822 {
1823 	RR_DEBUG_INFO_UPDATE_LOC();
1824 	return createFloatCompare(Ice::InstFcmp::One, lhs, rhs);
1825 }
1826 
createFCmpORD(Value * lhs,Value * rhs)1827 Value *Nucleus::createFCmpORD(Value *lhs, Value *rhs)
1828 {
1829 	RR_DEBUG_INFO_UPDATE_LOC();
1830 	return createFloatCompare(Ice::InstFcmp::Ord, lhs, rhs);
1831 }
1832 
createFCmpUNO(Value * lhs,Value * rhs)1833 Value *Nucleus::createFCmpUNO(Value *lhs, Value *rhs)
1834 {
1835 	RR_DEBUG_INFO_UPDATE_LOC();
1836 	return createFloatCompare(Ice::InstFcmp::Uno, lhs, rhs);
1837 }
1838 
createFCmpUEQ(Value * lhs,Value * rhs)1839 Value *Nucleus::createFCmpUEQ(Value *lhs, Value *rhs)
1840 {
1841 	RR_DEBUG_INFO_UPDATE_LOC();
1842 	return createFloatCompare(Ice::InstFcmp::Ueq, lhs, rhs);
1843 }
1844 
createFCmpUGT(Value * lhs,Value * rhs)1845 Value *Nucleus::createFCmpUGT(Value *lhs, Value *rhs)
1846 {
1847 	RR_DEBUG_INFO_UPDATE_LOC();
1848 	return createFloatCompare(Ice::InstFcmp::Ugt, lhs, rhs);
1849 }
1850 
createFCmpUGE(Value * lhs,Value * rhs)1851 Value *Nucleus::createFCmpUGE(Value *lhs, Value *rhs)
1852 {
1853 	RR_DEBUG_INFO_UPDATE_LOC();
1854 	return createFloatCompare(Ice::InstFcmp::Uge, lhs, rhs);
1855 }
1856 
createFCmpULT(Value * lhs,Value * rhs)1857 Value *Nucleus::createFCmpULT(Value *lhs, Value *rhs)
1858 {
1859 	RR_DEBUG_INFO_UPDATE_LOC();
1860 	return createFloatCompare(Ice::InstFcmp::Ult, lhs, rhs);
1861 }
1862 
createFCmpULE(Value * lhs,Value * rhs)1863 Value *Nucleus::createFCmpULE(Value *lhs, Value *rhs)
1864 {
1865 	RR_DEBUG_INFO_UPDATE_LOC();
1866 	return createFloatCompare(Ice::InstFcmp::Ule, lhs, rhs);
1867 }
1868 
createFCmpUNE(Value * lhs,Value * rhs)1869 Value *Nucleus::createFCmpUNE(Value *lhs, Value *rhs)
1870 {
1871 	RR_DEBUG_INFO_UPDATE_LOC();
1872 	return createFloatCompare(Ice::InstFcmp::Une, lhs, rhs);
1873 }
1874 
createExtractElement(Value * vector,Type * type,int index)1875 Value *Nucleus::createExtractElement(Value *vector, Type *type, int index)
1876 {
1877 	RR_DEBUG_INFO_UPDATE_LOC();
1878 	auto result = ::function->makeVariable(T(type));
1879 	auto extract = Ice::InstExtractElement::create(::function, result, V(vector), ::context->getConstantInt32(index));
1880 	::basicBlock->appendInst(extract);
1881 
1882 	return V(result);
1883 }
1884 
createInsertElement(Value * vector,Value * element,int index)1885 Value *Nucleus::createInsertElement(Value *vector, Value *element, int index)
1886 {
1887 	RR_DEBUG_INFO_UPDATE_LOC();
1888 	auto result = ::function->makeVariable(vector->getType());
1889 	auto insert = Ice::InstInsertElement::create(::function, result, vector, element, ::context->getConstantInt32(index));
1890 	::basicBlock->appendInst(insert);
1891 
1892 	return V(result);
1893 }
1894 
// Shuffles lanes of V1/V2 according to 'select'. Indices refer to the
// concatenation of V1 and V2. If 'select' has fewer entries than the result
// has lanes, the pattern repeats (i % selectSize), so a short selector
// broadcasts across the full vector.
Value *Nucleus::createShuffleVector(Value *V1, Value *V2, std::vector<int> select)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	ASSERT(V1->getType() == V2->getType());

	size_t size = Ice::typeNumElements(V1->getType());
	auto result = ::function->makeVariable(V1->getType());
	auto shuffle = Ice::InstShuffleVector::create(::function, result, V1, V2);

	const size_t selectSize = select.size();
	for(size_t i = 0; i < size; i++)
	{
		shuffle->addIndex(llvm::cast<Ice::ConstantInteger32>(::context->getConstantInt32(select[i % selectSize])));
	}

	::basicBlock->appendInst(shuffle);

	return V(result);
}
1914 
createSelect(Value * C,Value * ifTrue,Value * ifFalse)1915 Value *Nucleus::createSelect(Value *C, Value *ifTrue, Value *ifFalse)
1916 {
1917 	RR_DEBUG_INFO_UPDATE_LOC();
1918 	ASSERT(ifTrue->getType() == ifFalse->getType());
1919 
1920 	auto result = ::function->makeVariable(ifTrue->getType());
1921 	auto *select = Ice::InstSelect::create(::function, result, C, ifTrue, ifFalse);
1922 	::basicBlock->appendInst(select);
1923 
1924 	return V(result);
1925 }
1926 
// Creates a switch on 'control' with 'numCases' reserved case slots and the
// given default target. The returned handle is the Ice::InstSwitch itself,
// opaquely exposed as SwitchCases* for use with addSwitchCase().
SwitchCases *Nucleus::createSwitch(Value *control, BasicBlock *defaultBranch, unsigned numCases)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	auto switchInst = Ice::InstSwitch::create(::function, numCases, control, defaultBranch);
	::basicBlock->appendInst(switchInst);

	return reinterpret_cast<SwitchCases *>(switchInst);
}

// Registers one case on a switch created by createSwitch(). 'label' serves
// as both the case slot index and the case value passed to addBranch().
void Nucleus::addSwitchCase(SwitchCases *switchCases, int label, BasicBlock *branch)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	switchCases->addBranch(label, label, branch);
}

// Terminates the current basic block with an unreachable instruction.
void Nucleus::createUnreachable()
{
	RR_DEBUG_INFO_UPDATE_LOC();
	Ice::InstUnreachable *unreachable = Ice::InstUnreachable::create(::function);
	::basicBlock->appendInst(unreachable);
}
1948 
// Returns the Reactor type of the given value.
Type *Nucleus::getType(Value *value)
{
	return T(V(value)->getType());
}
1953 
getContainedType(Type * vectorType)1954 Type *Nucleus::getContainedType(Type *vectorType)
1955 {
1956 	Ice::Type vecTy = T(vectorType);
1957 	switch(vecTy)
1958 	{
1959 	case Ice::IceType_v4i1: return T(Ice::IceType_i1);
1960 	case Ice::IceType_v8i1: return T(Ice::IceType_i1);
1961 	case Ice::IceType_v16i1: return T(Ice::IceType_i1);
1962 	case Ice::IceType_v16i8: return T(Ice::IceType_i8);
1963 	case Ice::IceType_v8i16: return T(Ice::IceType_i16);
1964 	case Ice::IceType_v4i32: return T(Ice::IceType_i32);
1965 	case Ice::IceType_v4f32: return T(Ice::IceType_f32);
1966 	default:
1967 		ASSERT_MSG(false, "getContainedType: input type is not a vector type");
1968 		return {};
1969 	}
1970 }
1971 
// Returns the Reactor type used to represent a pointer to 'ElementType'.
Type *Nucleus::getPointerType(Type *ElementType)
{
	return T(sz::getPointerType(T(ElementType)));
}

// Returns the Ice integer type matching the host's 'int' width (i32 or i64).
static constexpr Ice::Type getNaturalIntType()
{
	constexpr size_t intSize = sizeof(int);
	static_assert(intSize == 4 || intSize == 8, "");
	return intSize == 4 ? Ice::IceType_i32 : Ice::IceType_i64;
}
1983 
// Returns the type a value must be widened to before being passed to a
// printf-style vararg call: i32 promotes to the natural int width and f32
// promotes to f64, mirroring C's default argument promotions.
Type *Nucleus::getPrintfStorageType(Type *valueType)
{
	Ice::Type valueTy = T(valueType);
	switch(valueTy)
	{
	case Ice::IceType_i32:
		return T(getNaturalIntType());

	case Ice::IceType_f32:
		return T(Ice::IceType_f64);

	default:
		UNIMPLEMENTED_NO_BUG("getPrintfStorageType: add more cases as needed");
		return {};
	}
}
2000 
// Returns the all-zero value of type 'Ty'. Vector zeros are built through
// createConstantVector (a single 0 entry broadcasts to every lane); scalars
// use the context's constant-zero pool directly.
Value *Nucleus::createNullValue(Type *Ty)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	if(Ice::isVectorType(T(Ty)))
	{
		ASSERT(Ice::typeNumElements(T(Ty)) <= 16);
		std::vector<int64_t> c = { 0 };
		return createConstantVector(c, Ty);
	}
	else
	{
		return V(::context->getConstantZero(T(Ty)));
	}
}
2015 
// Scalar constant factories. Each wraps the corresponding pooled constant
// from the global Ice context. Signed and unsigned variants of the same
// width share one underlying representation.

Value *Nucleus::createConstantLong(int64_t i)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return V(::context->getConstantInt64(i));
}

Value *Nucleus::createConstantInt(int i)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return V(::context->getConstantInt32(i));
}

Value *Nucleus::createConstantInt(unsigned int i)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return V(::context->getConstantInt32(i));
}

Value *Nucleus::createConstantBool(bool b)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return V(::context->getConstantInt1(b));
}

Value *Nucleus::createConstantByte(signed char i)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return V(::context->getConstantInt8(i));
}

Value *Nucleus::createConstantByte(unsigned char i)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return V(::context->getConstantInt8(i));
}

Value *Nucleus::createConstantShort(short i)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return V(::context->getConstantInt16(i));
}

Value *Nucleus::createConstantShort(unsigned short i)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return V(::context->getConstantInt16(i));
}

Value *Nucleus::createConstantFloat(float x)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return V(::context->getConstantFloat(x));
}
2069 
// Returns a null pointer constant. 'Ty' (the pointee type) is intentionally
// unused: all pointers share one integer representation of pointer width.
Value *Nucleus::createNullPointer(Type *Ty)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return createNullValue(T(sizeof(void *) == 8 ? Ice::IceType_i64 : Ice::IceType_i32));
}
2075 
// Copies 'size' bytes of 'data' into the routine's constant pool (with the
// requested alignment) and returns a pooled pointer constant to the copy.
static Ice::Constant *IceConstantData(const void *data, size_t size, size_t alignment = 1)
{
	return sz::getConstantPointer(::context, ::routine->addConstantData(data, size, alignment));
}
2080 
createConstantVector(std::vector<int64_t> constants,Type * type)2081 Value *Nucleus::createConstantVector(std::vector<int64_t> constants, Type *type)
2082 {
2083 	RR_DEBUG_INFO_UPDATE_LOC();
2084 	const int vectorSize = 16;
2085 	ASSERT(Ice::typeWidthInBytes(T(type)) == vectorSize);
2086 	const int alignment = vectorSize;
2087 
2088 	const auto &i = constants;
2089 	const size_t s = constants.size();
2090 
2091 	// TODO(b/148082873): Fix global variable constants when generating multiple functions
2092 	Ice::Constant *ptr = nullptr;
2093 
2094 	switch((int)reinterpret_cast<intptr_t>(type))
2095 	{
2096 	case Ice::IceType_v4i32:
2097 	case Ice::IceType_v4i1:
2098 		{
2099 			const int initializer[4] = { (int)i[0 % s], (int)i[1 % s], (int)i[2 % s], (int)i[3 % s] };
2100 			static_assert(sizeof(initializer) == vectorSize);
2101 			ptr = IceConstantData(initializer, vectorSize, alignment);
2102 		}
2103 		break;
2104 	case Ice::IceType_v8i16:
2105 	case Ice::IceType_v8i1:
2106 		{
2107 			const short initializer[8] = { (short)i[0 % s], (short)i[1 % s], (short)i[2 % s], (short)i[3 % s], (short)i[4 % s], (short)i[5 % s], (short)i[6 % s], (short)i[7 % s] };
2108 			static_assert(sizeof(initializer) == vectorSize);
2109 			ptr = IceConstantData(initializer, vectorSize, alignment);
2110 		}
2111 		break;
2112 	case Ice::IceType_v16i8:
2113 	case Ice::IceType_v16i1:
2114 		{
2115 			const char initializer[16] = { (char)i[0 % s], (char)i[1 % s], (char)i[2 % s], (char)i[3 % s], (char)i[4 % s], (char)i[5 % s], (char)i[6 % s], (char)i[7 % s],
2116 				                           (char)i[8 % s], (char)i[9 % s], (char)i[10 % s], (char)i[11 % s], (char)i[12 % s], (char)i[13 % s], (char)i[14 % s], (char)i[15 % s] };
2117 			static_assert(sizeof(initializer) == vectorSize);
2118 			ptr = IceConstantData(initializer, vectorSize, alignment);
2119 		}
2120 		break;
2121 	case Type_v2i32:
2122 		{
2123 			const int initializer[4] = { (int)i[0 % s], (int)i[1 % s], (int)i[0 % s], (int)i[1 % s] };
2124 			static_assert(sizeof(initializer) == vectorSize);
2125 			ptr = IceConstantData(initializer, vectorSize, alignment);
2126 		}
2127 		break;
2128 	case Type_v4i16:
2129 		{
2130 			const short initializer[8] = { (short)i[0 % s], (short)i[1 % s], (short)i[2 % s], (short)i[3 % s], (short)i[0 % s], (short)i[1 % s], (short)i[2 % s], (short)i[3 % s] };
2131 			static_assert(sizeof(initializer) == vectorSize);
2132 			ptr = IceConstantData(initializer, vectorSize, alignment);
2133 		}
2134 		break;
2135 	case Type_v8i8:
2136 		{
2137 			const char initializer[16] = { (char)i[0 % s], (char)i[1 % s], (char)i[2 % s], (char)i[3 % s], (char)i[4 % s], (char)i[5 % s], (char)i[6 % s], (char)i[7 % s], (char)i[0 % s], (char)i[1 % s], (char)i[2 % s], (char)i[3 % s], (char)i[4 % s], (char)i[5 % s], (char)i[6 % s], (char)i[7 % s] };
2138 			static_assert(sizeof(initializer) == vectorSize);
2139 			ptr = IceConstantData(initializer, vectorSize, alignment);
2140 		}
2141 		break;
2142 	case Type_v4i8:
2143 		{
2144 			const char initializer[16] = { (char)i[0 % s], (char)i[1 % s], (char)i[2 % s], (char)i[3 % s], (char)i[0 % s], (char)i[1 % s], (char)i[2 % s], (char)i[3 % s], (char)i[0 % s], (char)i[1 % s], (char)i[2 % s], (char)i[3 % s], (char)i[0 % s], (char)i[1 % s], (char)i[2 % s], (char)i[3 % s] };
2145 			static_assert(sizeof(initializer) == vectorSize);
2146 			ptr = IceConstantData(initializer, vectorSize, alignment);
2147 		}
2148 		break;
2149 	default:
2150 		UNREACHABLE("Unknown constant vector type: %d", (int)reinterpret_cast<intptr_t>(type));
2151 	}
2152 
2153 	ASSERT(ptr);
2154 
2155 	Ice::Variable *result = sz::createLoad(::function, ::basicBlock, ptr, T(type), alignment);
2156 	return V(result);
2157 }
2158 
createConstantVector(std::vector<double> constants,Type * type)2159 Value *Nucleus::createConstantVector(std::vector<double> constants, Type *type)
2160 {
2161 	RR_DEBUG_INFO_UPDATE_LOC();
2162 	const int vectorSize = 16;
2163 	ASSERT(Ice::typeWidthInBytes(T(type)) == vectorSize);
2164 	const int alignment = vectorSize;
2165 
2166 	const auto &f = constants;
2167 	const size_t s = constants.size();
2168 
2169 	// TODO(b/148082873): Fix global variable constants when generating multiple functions
2170 	Ice::Constant *ptr = nullptr;
2171 
2172 	switch((int)reinterpret_cast<intptr_t>(type))
2173 	{
2174 	case Ice::IceType_v4f32:
2175 		{
2176 			const float initializer[4] = { (float)f[0 % s], (float)f[1 % s], (float)f[2 % s], (float)f[3 % s] };
2177 			static_assert(sizeof(initializer) == vectorSize);
2178 			ptr = IceConstantData(initializer, vectorSize, alignment);
2179 		}
2180 		break;
2181 	case Type_v2f32:
2182 		{
2183 			const float initializer[4] = { (float)f[0 % s], (float)f[1 % s], (float)f[0 % s], (float)f[1 % s] };
2184 			static_assert(sizeof(initializer) == vectorSize);
2185 			ptr = IceConstantData(initializer, vectorSize, alignment);
2186 		}
2187 		break;
2188 	default:
2189 		UNREACHABLE("Unknown constant vector type: %d", (int)reinterpret_cast<intptr_t>(type));
2190 	}
2191 
2192 	ASSERT(ptr);
2193 
2194 	Ice::Variable *result = sz::createLoad(::function, ::basicBlock, ptr, T(type), alignment);
2195 	return V(result);
2196 }
2197 
// Pools the NUL-terminated string (including its terminator) as constant
// data and returns a pointer constant to it.
Value *Nucleus::createConstantString(const char *v)
{
	// NOTE: Do not call RR_DEBUG_INFO_UPDATE_LOC() here to avoid recursion when called from rr::Printv
	return V(IceConstantData(v, strlen(v) + 1));
}

// Installs the callback invoked with optimizer diagnostics/results.
void Nucleus::setOptimizerCallback(OptimizerCallback *callback)
{
	::optimizerCallback = callback;
}
2208 
// Reactor-type to Ice-type mappings for the scalar and small-vector classes.
// Signed/unsigned pairs share a representation; Byte4/SByte4 use the
// emulated Type_v4i8 (padded to a full 128-bit vector internally).

Type *Void::type()
{
	return T(Ice::IceType_void);
}

Type *Bool::type()
{
	return T(Ice::IceType_i1);
}

Type *Byte::type()
{
	return T(Ice::IceType_i8);
}

Type *SByte::type()
{
	return T(Ice::IceType_i8);
}

Type *Short::type()
{
	return T(Ice::IceType_i16);
}

Type *UShort::type()
{
	return T(Ice::IceType_i16);
}

Type *Byte4::type()
{
	return T(Type_v4i8);
}

Type *SByte4::type()
{
	return T(Type_v4i8);
}
2248 
// Clamps a 16-bit value to the unsigned byte range [0, 255].
static RValue<Byte> SaturateUnsigned(RValue<Short> x)
{
	return Byte(IfThenElse(Int(x) > 0xFF, Int(0xFF), IfThenElse(Int(x) < 0, Int(0), Int(x))));
}

// Extracts byte lane 'i' from a Byte8 vector.
static RValue<Byte> Extract(RValue<Byte8> val, int i)
{
	return RValue<Byte>(Nucleus::createExtractElement(val.value(), Byte::type(), i));
}

// Returns 'val' with byte lane 'i' replaced by 'element'.
static RValue<Byte8> Insert(RValue<Byte8> val, RValue<Byte> element, int i)
{
	return RValue<Byte8>(Nucleus::createInsertElement(val.value(), element.value(), i));
}
2263 
// Per-lane unsigned saturating add of two Byte8 vectors. Uses the
// AddSaturateUnsigned intrinsic (e.g. x86 PADDUSB) when available, otherwise
// scalarizes through SaturateUnsigned.
RValue<Byte8> AddSat(RValue<Byte8> x, RValue<Byte8> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	if(emulateIntrinsics)
	{
		return Scalarize([](auto a, auto b) { return SaturateUnsigned(Short(Int(a) + Int(b))); }, x, y);
	}
	else
	{
		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v16i8);
		const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::AddSaturateUnsigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
		auto paddusb = Ice::InstIntrinsic::create(::function, 2, result, intrinsic);
		paddusb->addArg(x.value());
		paddusb->addArg(y.value());
		::basicBlock->appendInst(paddusb);

		return RValue<Byte8>(V(result));
	}
}

// Per-lane unsigned saturating subtract of two Byte8 vectors. Uses the
// SubtractSaturateUnsigned intrinsic (e.g. x86 PSUBUSB) when available,
// otherwise scalarizes.
RValue<Byte8> SubSat(RValue<Byte8> x, RValue<Byte8> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	if(emulateIntrinsics)
	{
		return Scalarize([](auto a, auto b) { return SaturateUnsigned(Short(Int(a) - Int(b))); }, x, y);
	}
	else
	{
		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v16i8);
		const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::SubtractSaturateUnsigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
		auto psubusw = Ice::InstIntrinsic::create(::function, 2, result, intrinsic);
		psubusw->addArg(x.value());
		psubusw->addArg(y.value());
		::basicBlock->appendInst(psubusw);

		return RValue<Byte8>(V(result));
	}
}
2303 
// Extracts byte lane 'i' from an SByte8 vector.
RValue<SByte> Extract(RValue<SByte8> val, int i)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return RValue<SByte>(Nucleus::createExtractElement(val.value(), SByte::type(), i));
}

// Returns 'val' with byte lane 'i' replaced by 'element'.
RValue<SByte8> Insert(RValue<SByte8> val, RValue<SByte> element, int i)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return RValue<SByte8>(Nucleus::createInsertElement(val.value(), element.value(), i));
}
2315 
// Per-lane arithmetic right shift of an SByte8 by an immediate count.
RValue<SByte8> operator>>(RValue<SByte8> lhs, unsigned char rhs)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	if(emulateIntrinsics)
	{
		return Scalarize([rhs](auto a) { return a >> SByte(rhs); }, lhs);
	}
	else
	{
#if defined(__i386__) || defined(__x86_64__)
		// SSE2 doesn't support byte vector shifts, so shift as shorts and recombine.
		// hi: shift the 16-bit lanes and keep only the high bytes (sign-extended).
		RValue<Short4> hi = (As<Short4>(lhs) >> rhs) & Short4(0xFF00u);
		// lo: move each low byte to the high half, arithmetic-shift, then
		// logical-shift back down so the result stays confined to the low byte.
		RValue<Short4> lo = As<Short4>(As<UShort4>((As<Short4>(lhs) << 8) >> rhs) >> 8);

		return As<SByte8>(hi | lo);
#else
		return RValue<SByte8>(Nucleus::createAShr(lhs.value(), V(::context->getConstantInt32(rhs))));
#endif
	}
}
2336 
// Collects the sign bit of each Byte8 lane into the low 8 bits of an Int
// (lane n -> bit n). Uses the SignMask intrinsic (e.g. x86 PMOVMSKB) when
// available; the emulated path weights each lane's sign by 1 << lane and
// ORs the lanes together.
RValue<Int> SignMask(RValue<Byte8> x)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	if(emulateIntrinsics || CPUID::ARM)
	{
		Byte8 xx = As<Byte8>(As<SByte8>(x) >> 7) & Byte8(0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80);
		return Int(Extract(xx, 0)) | Int(Extract(xx, 1)) | Int(Extract(xx, 2)) | Int(Extract(xx, 3)) | Int(Extract(xx, 4)) | Int(Extract(xx, 5)) | Int(Extract(xx, 6)) | Int(Extract(xx, 7));
	}
	else
	{
		Ice::Variable *result = ::function->makeVariable(Ice::IceType_i32);
		const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::SignMask, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
		auto movmsk = Ice::InstIntrinsic::create(::function, 1, result, intrinsic);
		movmsk->addArg(x.value());
		::basicBlock->appendInst(movmsk);

		// The intrinsic operates on 16 lanes; mask to the 8 lanes of Byte8.
		return RValue<Int>(V(result)) & 0xFF;
	}
}
2356 
2357 //	RValue<Byte8> CmpGT(RValue<Byte8> x, RValue<Byte8> y)
2358 //	{
2359 //		return RValue<Byte8>(createIntCompare(Ice::InstIcmp::Ugt, x.value(), y.value()));
2360 //	}
2361 
// Per-lane equality compare; each Byte8 lane becomes all-ones or all-zeros.
RValue<Byte8> CmpEQ(RValue<Byte8> x, RValue<Byte8> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return RValue<Byte8>(Nucleus::createICmpEQ(x.value(), y.value()));
}

Type *Byte8::type()
{
	return T(Type_v8i8);
}
2372 
2373 //	RValue<SByte8> operator<<(RValue<SByte8> lhs, unsigned char rhs)
2374 //	{
2375 //		return RValue<SByte8>(Nucleus::createShl(lhs.value(), V(::context->getConstantInt32(rhs))));
2376 //	}
2377 
2378 //	RValue<SByte8> operator>>(RValue<SByte8> lhs, unsigned char rhs)
2379 //	{
2380 //		return RValue<SByte8>(Nucleus::createAShr(lhs.value(), V(::context->getConstantInt32(rhs))));
2381 //	}
2382 
// Clamps a 16-bit value to the signed byte range [-128, 127]. The 0x80
// lower-bound result truncates to -0x80 when narrowed to SByte.
RValue<SByte> SaturateSigned(RValue<Short> x)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return SByte(IfThenElse(Int(x) > 0x7F, Int(0x7F), IfThenElse(Int(x) < -0x80, Int(0x80), Int(x))));
}
2388 
// Per-lane signed saturating add of two SByte8 vectors. Uses the
// AddSaturateSigned intrinsic (e.g. x86 PADDSB) when available, otherwise
// scalarizes through SaturateSigned.
RValue<SByte8> AddSat(RValue<SByte8> x, RValue<SByte8> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	if(emulateIntrinsics)
	{
		return Scalarize([](auto a, auto b) { return SaturateSigned(Short(Int(a) + Int(b))); }, x, y);
	}
	else
	{
		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v16i8);
		const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::AddSaturateSigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
		auto paddsb = Ice::InstIntrinsic::create(::function, 2, result, intrinsic);
		paddsb->addArg(x.value());
		paddsb->addArg(y.value());
		::basicBlock->appendInst(paddsb);

		return RValue<SByte8>(V(result));
	}
}

// Per-lane signed saturating subtract of two SByte8 vectors. Uses the
// SubtractSaturateSigned intrinsic (e.g. x86 PSUBSB) when available,
// otherwise scalarizes.
RValue<SByte8> SubSat(RValue<SByte8> x, RValue<SByte8> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	if(emulateIntrinsics)
	{
		return Scalarize([](auto a, auto b) { return SaturateSigned(Short(Int(a) - Int(b))); }, x, y);
	}
	else
	{
		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v16i8);
		const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::SubtractSaturateSigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
		auto psubsb = Ice::InstIntrinsic::create(::function, 2, result, intrinsic);
		psubsb->addArg(x.value());
		psubsb->addArg(y.value());
		::basicBlock->appendInst(psubsb);

		return RValue<SByte8>(V(result));
	}
}
2428 
// Collects the sign bit of each SByte8 lane into the low 8 bits of an Int
// (lane n -> bit n); same scheme as the Byte8 overload.
RValue<Int> SignMask(RValue<SByte8> x)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	if(emulateIntrinsics || CPUID::ARM)
	{
		// Arithmetic shift replicates the sign across the lane; the mask
		// keeps only the per-lane weight bit 1 << lane.
		SByte8 xx = (x >> 7) & SByte8(0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80);
		return Int(Extract(xx, 0)) | Int(Extract(xx, 1)) | Int(Extract(xx, 2)) | Int(Extract(xx, 3)) | Int(Extract(xx, 4)) | Int(Extract(xx, 5)) | Int(Extract(xx, 6)) | Int(Extract(xx, 7));
	}
	else
	{
		Ice::Variable *result = ::function->makeVariable(Ice::IceType_i32);
		const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::SignMask, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
		auto movmsk = Ice::InstIntrinsic::create(::function, 1, result, intrinsic);
		movmsk->addArg(x.value());
		::basicBlock->appendInst(movmsk);

		// The intrinsic operates on 16 lanes; mask to the 8 lanes of SByte8.
		return RValue<Int>(V(result)) & 0xFF;
	}
}
2448 
// Per-lane signed greater-than compare; each lane becomes all-ones/all-zeros.
RValue<Byte8> CmpGT(RValue<SByte8> x, RValue<SByte8> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return RValue<Byte8>(createIntCompare(Ice::InstIcmp::Sgt, x.value(), y.value()));
}

// Per-lane equality compare.
RValue<Byte8> CmpEQ(RValue<SByte8> x, RValue<SByte8> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return RValue<Byte8>(Nucleus::createICmpEQ(x.value(), y.value()));
}

Type *SByte8::type()
{
	return T(Type_v8i8);
}
2465 
// 16-lane byte vectors map directly to the native v16i8 Ice type; the
// 2-lane short vectors use the emulated Type_v2i16.

Type *Byte16::type()
{
	return T(Ice::IceType_v16i8);
}

Type *SByte16::type()
{
	return T(Ice::IceType_v16i8);
}

Type *Short2::type()
{
	return T(Type_v2i16);
}

Type *UShort2::type()
{
	return T(Type_v2i16);
}
2485 
// Truncating conversion: keeps the low 16 bits of each Int4 lane.
Short4::Short4(RValue<Int4> cast)
{
	// View the Int4 as eight shorts and gather the even (low-half) lanes.
	std::vector<int> select = { 0, 2, 4, 6, 0, 2, 4, 6 };
	Value *short8 = Nucleus::createBitCast(cast.value(), Short8::type());
	Value *packed = Nucleus::createShuffleVector(short8, short8, select);

	// Narrow the packed result to the 64-bit Short4 representation.
	Value *int2 = RValue<Int2>(Int2(As<Int4>(packed))).value();
	Value *short4 = Nucleus::createBitCast(int2, Short4::type());

	storeValue(short4);
}
2497 
2498 //	Short4::Short4(RValue<Float> cast)
2499 //	{
2500 //	}
2501 
// Saturating float-to-short conversion: clamps each lane to the short range
// before the integer conversion and truncation.
Short4::Short4(RValue<Float4> cast)
{
	// TODO(b/150791192): Generalize and optimize
	auto smin = std::numeric_limits<short>::min();
	auto smax = std::numeric_limits<short>::max();
	*this = Short4(Int4(Max(Min(cast, Float4(smax)), Float4(smin))));
}
2509 
// Per-lane left shift of a Short4 by an immediate count.
RValue<Short4> operator<<(RValue<Short4> lhs, unsigned char rhs)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	if(emulateIntrinsics)
	{
		return Scalarize([rhs](auto x) { return x << Short(rhs); }, lhs);
	}
	else
	{
		return RValue<Short4>(Nucleus::createShl(lhs.value(), V(::context->getConstantInt32(rhs))));
	}
}

// Per-lane arithmetic right shift of a Short4 by an immediate count.
RValue<Short4> operator>>(RValue<Short4> lhs, unsigned char rhs)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	if(emulateIntrinsics)
	{
		return Scalarize([rhs](auto x) { return x >> Short(rhs); }, lhs);
	}
	else
	{
		return RValue<Short4>(Nucleus::createAShr(lhs.value(), V(::context->getConstantInt32(rhs))));
	}
}
2535 
// Per-lane signed maximum, built from an icmp + select pair (the predicate
// is inverted so the select picks y when x <= y, else x).
RValue<Short4> Max(RValue<Short4> x, RValue<Short4> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	Ice::Variable *condition = ::function->makeVariable(Ice::IceType_v8i1);
	auto cmp = Ice::InstIcmp::create(::function, Ice::InstIcmp::Sle, condition, x.value(), y.value());
	::basicBlock->appendInst(cmp);

	Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
	auto select = Ice::InstSelect::create(::function, result, condition, y.value(), x.value());
	::basicBlock->appendInst(select);

	return RValue<Short4>(V(result));
}

// Per-lane signed minimum: picks y when x > y, else x.
RValue<Short4> Min(RValue<Short4> x, RValue<Short4> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	Ice::Variable *condition = ::function->makeVariable(Ice::IceType_v8i1);
	auto cmp = Ice::InstIcmp::create(::function, Ice::InstIcmp::Sgt, condition, x.value(), y.value());
	::basicBlock->appendInst(cmp);

	Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
	auto select = Ice::InstSelect::create(::function, result, condition, y.value(), x.value());
	::basicBlock->appendInst(select);

	return RValue<Short4>(V(result));
}
2563 
// Clamps a 32-bit value to the signed short range [-32768, 32767]. The
// 0x8000 lower-bound result truncates to -0x8000 when narrowed to Short.
RValue<Short> SaturateSigned(RValue<Int> x)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return Short(IfThenElse(x > 0x7FFF, Int(0x7FFF), IfThenElse(x < -0x8000, Int(0x8000), x)));
}
2569 
// Per-lane signed saturating add of two Short4 vectors. Uses the
// AddSaturateSigned intrinsic (e.g. x86 PADDSW) when available, otherwise
// scalarizes through SaturateSigned.
RValue<Short4> AddSat(RValue<Short4> x, RValue<Short4> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	if(emulateIntrinsics)
	{
		return Scalarize([](auto a, auto b) { return SaturateSigned(Int(a) + Int(b)); }, x, y);
	}
	else
	{
		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
		const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::AddSaturateSigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
		auto paddsw = Ice::InstIntrinsic::create(::function, 2, result, intrinsic);
		paddsw->addArg(x.value());
		paddsw->addArg(y.value());
		::basicBlock->appendInst(paddsw);

		return RValue<Short4>(V(result));
	}
}

// Per-lane signed saturating subtract of two Short4 vectors. Uses the
// SubtractSaturateSigned intrinsic (e.g. x86 PSUBSW) when available,
// otherwise scalarizes.
RValue<Short4> SubSat(RValue<Short4> x, RValue<Short4> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	if(emulateIntrinsics)
	{
		return Scalarize([](auto a, auto b) { return SaturateSigned(Int(a) - Int(b)); }, x, y);
	}
	else
	{
		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
		const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::SubtractSaturateSigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
		auto psubsw = Ice::InstIntrinsic::create(::function, 2, result, intrinsic);
		psubsw->addArg(x.value());
		psubsw->addArg(y.value());
		::basicBlock->appendInst(psubsw);

		return RValue<Short4>(V(result));
	}
}
2609 
// Per-lane signed multiply returning the high 16 bits of each 32-bit
// product. Uses the MultiplyHighSigned intrinsic (e.g. x86 PMULHW) when
// available, otherwise scalarizes.
RValue<Short4> MulHigh(RValue<Short4> x, RValue<Short4> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	if(emulateIntrinsics)
	{
		return Scalarize([](auto a, auto b) { return Short((Int(a) * Int(b)) >> 16); }, x, y);
	}
	else
	{
		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
		const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::MultiplyHighSigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
		auto pmulhw = Ice::InstIntrinsic::create(::function, 2, result, intrinsic);
		pmulhw->addArg(x.value());
		pmulhw->addArg(y.value());
		::basicBlock->appendInst(pmulhw);

		return RValue<Short4>(V(result));
	}
}
2629 
// Multiplies corresponding 16-bit lanes and adds adjacent 32-bit products:
// result[j] = x[2j]*y[2j] + x[2j+1]*y[2j+1]. Uses the MultiplyAddPairs
// intrinsic (e.g. x86 PMADDWD) when available, otherwise computes the two
// pair sums explicitly.
RValue<Int2> MulAdd(RValue<Short4> x, RValue<Short4> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	if(emulateIntrinsics)
	{
		Int2 result;
		result = Insert(result, Int(Extract(x, 0)) * Int(Extract(y, 0)) + Int(Extract(x, 1)) * Int(Extract(y, 1)), 0);
		result = Insert(result, Int(Extract(x, 2)) * Int(Extract(y, 2)) + Int(Extract(x, 3)) * Int(Extract(y, 3)), 1);

		return result;
	}
	else
	{
		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
		const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::MultiplyAddPairs, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
		auto pmaddwd = Ice::InstIntrinsic::create(::function, 2, result, intrinsic);
		pmaddwd->addArg(x.value());
		pmaddwd->addArg(y.value());
		::basicBlock->appendInst(pmaddwd);

		return As<Int2>(V(result));
	}
}
2653 
// Packs two Short4 vectors into one SByte8 with signed saturation:
// lanes 0-3 come from x, lanes 4-7 from y, each clamped to [-128, 127].
RValue<SByte8> PackSigned(RValue<Short4> x, RValue<Short4> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	if(emulateIntrinsics)
	{
		SByte8 result;
		result = Insert(result, SaturateSigned(Extract(x, 0)), 0);
		result = Insert(result, SaturateSigned(Extract(x, 1)), 1);
		result = Insert(result, SaturateSigned(Extract(x, 2)), 2);
		result = Insert(result, SaturateSigned(Extract(x, 3)), 3);
		result = Insert(result, SaturateSigned(Extract(y, 0)), 4);
		result = Insert(result, SaturateSigned(Extract(y, 1)), 5);
		result = Insert(result, SaturateSigned(Extract(y, 2)), 6);
		result = Insert(result, SaturateSigned(Extract(y, 3)), 7);

		return result;
	}
	else
	{
		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v16i8);
		const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::VectorPackSigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
		auto pack = Ice::InstIntrinsic::create(::function, 2, result, intrinsic);
		pack->addArg(x.value());
		pack->addArg(y.value());
		::basicBlock->appendInst(pack);

		// The 128-bit pack places x's bytes in dword 0 and y's bytes in dword 2;
		// the 0x0202 swizzle gathers dwords {0,2} into the low 64 bits.
		return As<SByte8>(Swizzle(As<Int4>(V(result)), 0x0202));
	}
}
2683 
// Packs two Short4 vectors into one Byte8 with unsigned saturation:
// lanes 0-3 come from x, lanes 4-7 from y, each clamped to [0, 255].
RValue<Byte8> PackUnsigned(RValue<Short4> x, RValue<Short4> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	if(emulateIntrinsics)
	{
		Byte8 result;
		result = Insert(result, SaturateUnsigned(Extract(x, 0)), 0);
		result = Insert(result, SaturateUnsigned(Extract(x, 1)), 1);
		result = Insert(result, SaturateUnsigned(Extract(x, 2)), 2);
		result = Insert(result, SaturateUnsigned(Extract(x, 3)), 3);
		result = Insert(result, SaturateUnsigned(Extract(y, 0)), 4);
		result = Insert(result, SaturateUnsigned(Extract(y, 1)), 5);
		result = Insert(result, SaturateUnsigned(Extract(y, 2)), 6);
		result = Insert(result, SaturateUnsigned(Extract(y, 3)), 7);

		return result;
	}
	else
	{
		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v16i8);
		const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::VectorPackUnsigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
		auto pack = Ice::InstIntrinsic::create(::function, 2, result, intrinsic);
		pack->addArg(x.value());
		pack->addArg(y.value());
		::basicBlock->appendInst(pack);

		// The 128-bit pack places x's bytes in dword 0 and y's bytes in dword 2;
		// the 0x0202 swizzle gathers dwords {0,2} into the low 64 bits.
		return As<Byte8>(Swizzle(As<Int4>(V(result)), 0x0202));
	}
}
2713 
// Per-lane signed greater-than; each result lane is all-ones or all-zeros.
RValue<Short4> CmpGT(RValue<Short4> x, RValue<Short4> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return RValue<Short4>(createIntCompare(Ice::InstIcmp::Sgt, x.value(), y.value()));
}
2719 
// Per-lane equality; each result lane is all-ones or all-zeros.
RValue<Short4> CmpEQ(RValue<Short4> x, RValue<Short4> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return RValue<Short4>(Nucleus::createICmpEQ(x.value(), y.value()));
}
2725 
// Backend type descriptor for Short4 (4 x i16 packed in a 64-bit half-vector).
Type *Short4::type()
{
	return T(Type_v4i16);
}
2730 
// Converts four floats to four unsigned 16-bit integers, optionally
// saturating to [0, 0xFFFF] instead of wrapping.
UShort4::UShort4(RValue<Float4> cast, bool saturate)
{
	if(saturate)
	{
		if(CPUID::SSE4_1)
		{
			// x86 produces 0x80000000 on 32-bit integer overflow/underflow.
			// PackUnsigned takes care of 0x0000 saturation.
			Int4 int4(Min(cast, Float4(0xFFFF)));
			*this = As<UShort4>(PackUnsigned(int4, int4));
		}
		else if(CPUID::ARM)
		{
			// ARM saturates the 32-bit integer result on overflow/underflow.
			Int4 int4(cast);
			*this = As<UShort4>(PackUnsigned(int4, int4));
		}
		else
		{
			// Generic fallback: clamp in floating point before converting.
			*this = Short4(Int4(Max(Min(cast, Float4(0xFFFF)), Float4(0x0000))));
		}
	}
	else
	{
		*this = Short4(Int4(cast));
	}
}
2758 
// Extracts lane i (0-3) from a UShort4.
RValue<UShort> Extract(RValue<UShort4> val, int i)
{
	return RValue<UShort>(Nucleus::createExtractElement(val.value(), UShort::type(), i));
}
2763 
// Per-lane left shift by an immediate count; scalarized when vector
// shift support must be emulated.
RValue<UShort4> operator<<(RValue<UShort4> lhs, unsigned char rhs)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	if(emulateIntrinsics)
	{
		return Scalarize([rhs](auto x) { return x << UShort(rhs); }, lhs);
	}
	else
	{
		return RValue<UShort4>(Nucleus::createShl(lhs.value(), V(::context->getConstantInt32(rhs))));
	}
}
2776 
// Per-lane logical right shift by an immediate count (unsigned lanes).
RValue<UShort4> operator>>(RValue<UShort4> lhs, unsigned char rhs)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	if(emulateIntrinsics)
	{
		return Scalarize([rhs](auto x) { return x >> UShort(rhs); }, lhs);
	}
	else
	{
		return RValue<UShort4>(Nucleus::createLShr(lhs.value(), V(::context->getConstantInt32(rhs))));
	}
}
2789 
// Per-lane unsigned maximum, built from an unsigned compare plus select.
RValue<UShort4> Max(RValue<UShort4> x, RValue<UShort4> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	// condition = (x <= y), unsigned; select y where true, x otherwise.
	Ice::Variable *condition = ::function->makeVariable(Ice::IceType_v8i1);
	auto cmp = Ice::InstIcmp::create(::function, Ice::InstIcmp::Ule, condition, x.value(), y.value());
	::basicBlock->appendInst(cmp);

	Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
	auto select = Ice::InstSelect::create(::function, result, condition, y.value(), x.value());
	::basicBlock->appendInst(select);

	return RValue<UShort4>(V(result));
}
2803 
Min(RValue<UShort4> x,RValue<UShort4> y)2804 RValue<UShort4> Min(RValue<UShort4> x, RValue<UShort4> y)
2805 {
2806 	Ice::Variable *condition = ::function->makeVariable(Ice::IceType_v8i1);
2807 	auto cmp = Ice::InstIcmp::create(::function, Ice::InstIcmp::Ugt, condition, x.value(), y.value());
2808 	::basicBlock->appendInst(cmp);
2809 
2810 	Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
2811 	auto select = Ice::InstSelect::create(::function, result, condition, y.value(), x.value());
2812 	::basicBlock->appendInst(select);
2813 
2814 	return RValue<UShort4>(V(result));
2815 }
2816 
// Clamps a signed 32-bit value to the unsigned 16-bit range [0, 0xFFFF].
RValue<UShort> SaturateUnsigned(RValue<Int> x)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return UShort(IfThenElse(x > 0xFFFF, Int(0xFFFF), IfThenElse(x < 0, Int(0), x)));
}
2822 
// Per-lane unsigned saturating addition (x86 paddusw semantics).
RValue<UShort4> AddSat(RValue<UShort4> x, RValue<UShort4> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	if(emulateIntrinsics)
	{
		// Widen to 32-bit, add, then clamp back to [0, 0xFFFF].
		return Scalarize([](auto a, auto b) { return SaturateUnsigned(Int(a) + Int(b)); }, x, y);
	}
	else
	{
		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
		const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::AddSaturateUnsigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
		auto paddusw = Ice::InstIntrinsic::create(::function, 2, result, intrinsic);
		paddusw->addArg(x.value());
		paddusw->addArg(y.value());
		::basicBlock->appendInst(paddusw);

		return RValue<UShort4>(V(result));
	}
}
2842 
// Per-lane unsigned saturating subtraction (x86 psubusw semantics).
RValue<UShort4> SubSat(RValue<UShort4> x, RValue<UShort4> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	if(emulateIntrinsics)
	{
		// Widen to 32-bit, subtract, then clamp back to [0, 0xFFFF].
		return Scalarize([](auto a, auto b) { return SaturateUnsigned(Int(a) - Int(b)); }, x, y);
	}
	else
	{
		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
		const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::SubtractSaturateUnsigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
		auto psubusw = Ice::InstIntrinsic::create(::function, 2, result, intrinsic);
		psubusw->addArg(x.value());
		psubusw->addArg(y.value());
		::basicBlock->appendInst(psubusw);

		return RValue<UShort4>(V(result));
	}
}
2862 
// Returns the high 16 bits of the 32-bit product of each pair of unsigned
// 16-bit lanes (x86 pmulhuw semantics).
RValue<UShort4> MulHigh(RValue<UShort4> x, RValue<UShort4> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	if(emulateIntrinsics)
	{
		return Scalarize([](auto a, auto b) { return UShort((UInt(a) * UInt(b)) >> 16); }, x, y);
	}
	else
	{
		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
		const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::MultiplyHighUnsigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
		auto pmulhuw = Ice::InstIntrinsic::create(::function, 2, result, intrinsic);
		pmulhuw->addArg(x.value());
		pmulhuw->addArg(y.value());
		::basicBlock->appendInst(pmulhuw);

		return RValue<UShort4>(V(result));
	}
}
2882 
// Returns the high 32 bits of the 64-bit product of each pair of signed
// 32-bit lanes. Always scalarized via 64-bit math.
RValue<Int4> MulHigh(RValue<Int4> x, RValue<Int4> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	// TODO: For x86, build an intrinsics version of this which uses shuffles + pmuludq.

	return Scalarize([](auto a, auto b) { return Int((Long(a) * Long(b)) >> Long(Int(32))); }, x, y);
}
2890 
// Returns the high 32 bits of the 64-bit product of each pair of unsigned
// 32-bit lanes. Always scalarized via 64-bit math.
RValue<UInt4> MulHigh(RValue<UInt4> x, RValue<UInt4> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	// TODO: For x86, build an intrinsics version of this which uses shuffles + pmuludq.

	// Disabled alternative that computes the high word from 16x16-bit
	// partial products, kept for reference.
	if(false)  // Partial product based implementation.
	{
		auto xh = x >> 16;
		auto yh = y >> 16;
		auto xl = x & UInt4(0x0000FFFF);
		auto yl = y & UInt4(0x0000FFFF);
		auto xlyh = xl * yh;
		auto xhyl = xh * yl;
		auto xlyhh = xlyh >> 16;
		auto xhylh = xhyl >> 16;
		auto xlyhl = xlyh & UInt4(0x0000FFFF);
		auto xhyll = xhyl & UInt4(0x0000FFFF);
		auto xlylh = (xl * yl) >> 16;
		auto oflow = (xlyhl + xhyll + xlylh) >> 16;

		return (xh * yh) + (xlyhh + xhylh) + oflow;
	}

	return Scalarize([](auto a, auto b) { return UInt((Long(a) * Long(b)) >> Long(Int(32))); }, x, y);
}
2916 
// Per-lane unsigned rounding average — intentionally unimplemented in the
// Subzero backend; logs and returns zero.
RValue<UShort4> Average(RValue<UShort4> x, RValue<UShort4> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	UNIMPLEMENTED_NO_BUG("RValue<UShort4> Average(RValue<UShort4> x, RValue<UShort4> y)");
	return UShort4(0);
}
2923 
// Backend type descriptor for UShort4 (4 x i16 packed in a 64-bit half-vector).
Type *UShort4::type()
{
	return T(Type_v4i16);
}
2928 
// Extracts lane i (0-7) from a Short8.
RValue<Short> Extract(RValue<Short8> val, int i)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return RValue<Short>(Nucleus::createExtractElement(val.value(), Short::type(), i));
}
2934 
// Returns a copy of val with lane i (0-7) replaced by element.
RValue<Short8> Insert(RValue<Short8> val, RValue<Short> element, int i)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return RValue<Short8>(Nucleus::createInsertElement(val.value(), element.value(), i));
}
2940 
// Per-lane left shift by an immediate count.
RValue<Short8> operator<<(RValue<Short8> lhs, unsigned char rhs)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	if(emulateIntrinsics)
	{
		return Scalarize([rhs](auto x) { return x << Short(rhs); }, lhs);
	}
	else
	{
		return RValue<Short8>(Nucleus::createShl(lhs.value(), V(::context->getConstantInt32(rhs))));
	}
}
2953 
// Per-lane arithmetic right shift by an immediate count (signed lanes).
RValue<Short8> operator>>(RValue<Short8> lhs, unsigned char rhs)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	if(emulateIntrinsics)
	{
		return Scalarize([rhs](auto x) { return x >> Short(rhs); }, lhs);
	}
	else
	{
		return RValue<Short8>(Nucleus::createAShr(lhs.value(), V(::context->getConstantInt32(rhs))));
	}
}
2966 
// 8-lane multiply-add-pairs — intentionally unimplemented in the Subzero
// backend; logs and returns zero.
RValue<Int4> MulAdd(RValue<Short8> x, RValue<Short8> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	UNIMPLEMENTED_NO_BUG("RValue<Int4> MulAdd(RValue<Short8> x, RValue<Short8> y)");
	return Int4(0);
}
2973 
// 8-lane signed multiply-high — intentionally unimplemented in the Subzero
// backend; logs and returns zero.
RValue<Short8> MulHigh(RValue<Short8> x, RValue<Short8> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	UNIMPLEMENTED_NO_BUG("RValue<Short8> MulHigh(RValue<Short8> x, RValue<Short8> y)");
	return Short8(0);
}
2980 
// Backend type descriptor for Short8 (8 x i16 full vector).
Type *Short8::type()
{
	return T(Ice::IceType_v8i16);
}
2985 
// Extracts lane i (0-7) from a UShort8.
RValue<UShort> Extract(RValue<UShort8> val, int i)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return RValue<UShort>(Nucleus::createExtractElement(val.value(), UShort::type(), i));
}
2991 
// Returns a copy of val with lane i (0-7) replaced by element.
RValue<UShort8> Insert(RValue<UShort8> val, RValue<UShort> element, int i)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return RValue<UShort8>(Nucleus::createInsertElement(val.value(), element.value(), i));
}
2997 
// Per-lane left shift by an immediate count.
RValue<UShort8> operator<<(RValue<UShort8> lhs, unsigned char rhs)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	if(emulateIntrinsics)
	{
		return Scalarize([rhs](auto x) { return x << UShort(rhs); }, lhs);
	}
	else
	{
		return RValue<UShort8>(Nucleus::createShl(lhs.value(), V(::context->getConstantInt32(rhs))));
	}
}
3010 
// Per-lane logical right shift by an immediate count (unsigned lanes).
RValue<UShort8> operator>>(RValue<UShort8> lhs, unsigned char rhs)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	if(emulateIntrinsics)
	{
		return Scalarize([rhs](auto x) { return x >> UShort(rhs); }, lhs);
	}
	else
	{
		return RValue<UShort8>(Nucleus::createLShr(lhs.value(), V(::context->getConstantInt32(rhs))));
	}
}
3023 
// 8-lane unsigned multiply-high — intentionally unimplemented in the Subzero
// backend; logs and returns zero.
RValue<UShort8> MulHigh(RValue<UShort8> x, RValue<UShort8> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	UNIMPLEMENTED_NO_BUG("RValue<UShort8> MulHigh(RValue<UShort8> x, RValue<UShort8> y)");
	return UShort8(0);
}
3030 
// Backend type descriptor for UShort8 (8 x i16 full vector).
Type *UShort8::type()
{
	return T(Ice::IceType_v8i16);
}
3035 
// Post-increment: returns the value before incrementing.
RValue<Int> operator++(Int &val, int)  // Post-increment
{
	RR_DEBUG_INFO_UPDATE_LOC();
	RValue<Int> res = val;
	val += 1;
	return res;
}
3043 
// Pre-increment: increments and returns the updated variable.
const Int &operator++(Int &val)  // Pre-increment
{
	RR_DEBUG_INFO_UPDATE_LOC();
	val += 1;
	return val;
}
3050 
// Post-decrement: returns the value before decrementing.
RValue<Int> operator--(Int &val, int)  // Post-decrement
{
	RR_DEBUG_INFO_UPDATE_LOC();
	RValue<Int> res = val;
	val -= 1;
	return res;
}
3058 
// Pre-decrement: decrements and returns the updated variable.
const Int &operator--(Int &val)  // Pre-decrement
{
	RR_DEBUG_INFO_UPDATE_LOC();
	val -= 1;
	return val;
}
3065 
// Rounds a float to the nearest integer (round-to-nearest-even via the
// nearbyint intrinsic when available).
RValue<Int> RoundInt(RValue<Float> cast)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	if(emulateIntrinsics || CPUID::ARM)
	{
		// Push the fractional part off the mantissa. Accurate up to +/-2^22.
		return Int((cast + Float(0x00C00000)) - Float(0x00C00000));
	}
	else
	{
		Ice::Variable *result = ::function->makeVariable(Ice::IceType_i32);
		const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::Nearbyint, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
		auto nearbyint = Ice::InstIntrinsic::create(::function, 1, result, intrinsic);
		nearbyint->addArg(cast.value());
		::basicBlock->appendInst(nearbyint);

		return RValue<Int>(V(result));
	}
}
3085 
// Backend type descriptor for Int (scalar i32).
Type *Int::type()
{
	return T(Ice::IceType_i32);
}
3090 
// Backend type descriptor for Long (scalar i64).
Type *Long::type()
{
	return T(Ice::IceType_i64);
}
3095 
// Converts a float to an unsigned 32-bit integer. Values >= 2^31 are handled
// by biasing through the signed conversion; negative inputs produce 0.
UInt::UInt(RValue<Float> cast)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	// Smallest positive value representable in UInt, but not in Int
	const unsigned int ustart = 0x80000000u;
	const float ustartf = float(ustart);

	// If the value is negative, store 0, otherwise store the result of the conversion
	storeValue((~(As<Int>(cast) >> 31) &
	            // Check if the value can be represented as an Int
	            IfThenElse(cast >= ustartf,
	                       // If the value is too large, subtract ustart and re-add it after conversion.
	                       As<Int>(As<UInt>(Int(cast - Float(ustartf))) + UInt(ustart)),
	                       // Otherwise, just convert normally
	                       Int(cast)))
	               .value());
}
3113 
// Post-increment: returns the value before incrementing.
RValue<UInt> operator++(UInt &val, int)  // Post-increment
{
	RR_DEBUG_INFO_UPDATE_LOC();
	RValue<UInt> res = val;
	val += 1;
	return res;
}
3121 
// Pre-increment: increments and returns the updated variable.
const UInt &operator++(UInt &val)  // Pre-increment
{
	RR_DEBUG_INFO_UPDATE_LOC();
	val += 1;
	return val;
}
3128 
// Post-decrement: returns the value before decrementing.
RValue<UInt> operator--(UInt &val, int)  // Post-decrement
{
	RR_DEBUG_INFO_UPDATE_LOC();
	RValue<UInt> res = val;
	val -= 1;
	return res;
}
3136 
// Pre-decrement: decrements and returns the updated variable.
const UInt &operator--(UInt &val)  // Pre-decrement
{
	RR_DEBUG_INFO_UPDATE_LOC();
	val -= 1;
	return val;
}
3143 
3144 //	RValue<UInt> RoundUInt(RValue<Float> cast)
3145 //	{
3146 //		ASSERT(false && "UNIMPLEMENTED"); return RValue<UInt>(V(nullptr));
3147 //	}
3148 
// Backend type descriptor for UInt (scalar i32; signedness is by-operation).
Type *UInt::type()
{
	return T(Ice::IceType_i32);
}
3153 
3154 //	Int2::Int2(RValue<Int> cast)
3155 //	{
3156 //		Value *extend = Nucleus::createZExt(cast.value(), Long::type());
3157 //		Value *vector = Nucleus::createBitCast(extend, Int2::type());
3158 //
3159 //		Constant *shuffle[2];
3160 //		shuffle[0] = Nucleus::createConstantInt(0);
3161 //		shuffle[1] = Nucleus::createConstantInt(0);
3162 //
3163 //		Value *replicate = Nucleus::createShuffleVector(vector, UndefValue::get(Int2::type()), Nucleus::createConstantVector(shuffle, 2));
3164 //
3165 //		storeValue(replicate);
3166 //	}
3167 
// Per-lane left shift by an immediate count.
RValue<Int2> operator<<(RValue<Int2> lhs, unsigned char rhs)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	if(emulateIntrinsics)
	{
		return Scalarize([rhs](auto x) { return x << rhs; }, lhs);
	}
	else
	{
		return RValue<Int2>(Nucleus::createShl(lhs.value(), V(::context->getConstantInt32(rhs))));
	}
}
3180 
// Per-lane arithmetic right shift by an immediate count (signed lanes).
RValue<Int2> operator>>(RValue<Int2> lhs, unsigned char rhs)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	if(emulateIntrinsics)
	{
		return Scalarize([rhs](auto x) { return x >> rhs; }, lhs);
	}
	else
	{
		return RValue<Int2>(Nucleus::createAShr(lhs.value(), V(::context->getConstantInt32(rhs))));
	}
}
3193 
// Backend type descriptor for Int2 (2 x i32 packed in a 64-bit half-vector).
Type *Int2::type()
{
	return T(Type_v2i32);
}
3198 
// Per-lane left shift by an immediate count.
RValue<UInt2> operator<<(RValue<UInt2> lhs, unsigned char rhs)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	if(emulateIntrinsics)
	{
		return Scalarize([rhs](auto x) { return x << rhs; }, lhs);
	}
	else
	{
		return RValue<UInt2>(Nucleus::createShl(lhs.value(), V(::context->getConstantInt32(rhs))));
	}
}
3211 
// Per-lane logical right shift by an immediate count (unsigned lanes).
RValue<UInt2> operator>>(RValue<UInt2> lhs, unsigned char rhs)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	if(emulateIntrinsics)
	{
		return Scalarize([rhs](auto x) { return x >> rhs; }, lhs);
	}
	else
	{
		return RValue<UInt2>(Nucleus::createLShr(lhs.value(), V(::context->getConstantInt32(rhs))));
	}
}
3224 
// Backend type descriptor for UInt2 (2 x i32 packed in a 64-bit half-vector).
Type *UInt2::type()
{
	return T(Type_v2i32);
}
3229 
// Zero-extends four unsigned bytes to four 32-bit lanes by interleaving
// with zero vectors twice (bytes -> shorts -> ints).
Int4::Int4(RValue<Byte4> cast)
    : XYZW(this)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	// Place the four source bytes into element 0 of a full-width vector.
	Value *x = Nucleus::createBitCast(cast.value(), Int::type());
	Value *a = Nucleus::createInsertElement(loadValue(), x, 0);

	Value *e;
	// Interleave with a zero vector: each byte gains a zero high byte.
	std::vector<int> swizzle = { 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 };
	Value *b = Nucleus::createBitCast(a, Byte16::type());
	Value *c = Nucleus::createShuffleVector(b, Nucleus::createNullValue(Byte16::type()), swizzle);

	// Interleave again: each 16-bit value gains a zero high half.
	std::vector<int> swizzle2 = { 0, 8, 1, 9, 2, 10, 3, 11 };
	Value *d = Nucleus::createBitCast(c, Short8::type());
	e = Nucleus::createShuffleVector(d, Nucleus::createNullValue(Short8::type()), swizzle2);

	Value *f = Nucleus::createBitCast(e, Int4::type());
	storeValue(f);
}
3249 
// Sign-extends four signed bytes to four 32-bit lanes by duplicating each
// byte into the high position and arithmetic-shifting right by 24.
Int4::Int4(RValue<SByte4> cast)
    : XYZW(this)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	// Place the four source bytes into element 0 of a full-width vector.
	Value *x = Nucleus::createBitCast(cast.value(), Int::type());
	Value *a = Nucleus::createInsertElement(loadValue(), x, 0);

	// Duplicate each byte so it ends up in the top byte of its 32-bit lane.
	std::vector<int> swizzle = { 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7 };
	Value *b = Nucleus::createBitCast(a, Byte16::type());
	Value *c = Nucleus::createShuffleVector(b, b, swizzle);

	std::vector<int> swizzle2 = { 0, 0, 1, 1, 2, 2, 3, 3 };
	Value *d = Nucleus::createBitCast(c, Short8::type());
	Value *e = Nucleus::createShuffleVector(d, d, swizzle2);

	// Arithmetic shift propagates the sign bit through the lane.
	*this = As<Int4>(e) >> 24;
}
3267 
// Sign-extends four signed 16-bit values to 32-bit lanes: duplicate each
// short into the high half, then arithmetic-shift right by 16.
Int4::Int4(RValue<Short4> cast)
    : XYZW(this)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	std::vector<int> swizzle = { 0, 0, 1, 1, 2, 2, 3, 3 };
	Value *c = Nucleus::createShuffleVector(cast.value(), cast.value(), swizzle);

	*this = As<Int4>(c) >> 16;
}
3277 
// Zero-extends four unsigned 16-bit values to 32-bit lanes by interleaving
// with a zero vector.
Int4::Int4(RValue<UShort4> cast)
    : XYZW(this)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	std::vector<int> swizzle = { 0, 8, 1, 9, 2, 10, 3, 11 };
	Value *c = Nucleus::createShuffleVector(cast.value(), Short8(0, 0, 0, 0, 0, 0, 0, 0).loadValue(), swizzle);
	Value *d = Nucleus::createBitCast(c, Int4::type());
	storeValue(d);
}
3287 
// Broadcasts a scalar Int into all four lanes.
Int4::Int4(RValue<Int> rhs)
    : XYZW(this)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	Value *vector = Nucleus::createBitCast(rhs.value(), Int4::type());

	// A single-index shuffle mask replicates element 0 across the vector.
	std::vector<int> swizzle = { 0 };
	Value *replicate = Nucleus::createShuffleVector(vector, vector, swizzle);

	storeValue(replicate);
}
3299 
// Per-lane left shift by an immediate count.
RValue<Int4> operator<<(RValue<Int4> lhs, unsigned char rhs)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	if(emulateIntrinsics)
	{
		return Scalarize([rhs](auto x) { return x << rhs; }, lhs);
	}
	else
	{
		return RValue<Int4>(Nucleus::createShl(lhs.value(), V(::context->getConstantInt32(rhs))));
	}
}
3312 
// Per-lane arithmetic right shift by an immediate count (signed lanes).
RValue<Int4> operator>>(RValue<Int4> lhs, unsigned char rhs)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	if(emulateIntrinsics)
	{
		return Scalarize([rhs](auto x) { return x >> rhs; }, lhs);
	}
	else
	{
		return RValue<Int4>(Nucleus::createAShr(lhs.value(), V(::context->getConstantInt32(rhs))));
	}
}
3325 
// Per-lane equality; each result lane is all-ones or all-zeros.
RValue<Int4> CmpEQ(RValue<Int4> x, RValue<Int4> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return RValue<Int4>(Nucleus::createICmpEQ(x.value(), y.value()));
}
3331 
// Per-lane signed less-than.
RValue<Int4> CmpLT(RValue<Int4> x, RValue<Int4> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return RValue<Int4>(Nucleus::createICmpSLT(x.value(), y.value()));
}
3337 
// Per-lane signed less-than-or-equal.
RValue<Int4> CmpLE(RValue<Int4> x, RValue<Int4> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return RValue<Int4>(Nucleus::createICmpSLE(x.value(), y.value()));
}
3343 
// Per-lane inequality.
RValue<Int4> CmpNEQ(RValue<Int4> x, RValue<Int4> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return RValue<Int4>(Nucleus::createICmpNE(x.value(), y.value()));
}
3349 
// Per-lane "not less-than", i.e. signed greater-than-or-equal.
RValue<Int4> CmpNLT(RValue<Int4> x, RValue<Int4> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return RValue<Int4>(Nucleus::createICmpSGE(x.value(), y.value()));
}
3355 
// Per-lane "not less-than-or-equal", i.e. signed greater-than.
RValue<Int4> CmpNLE(RValue<Int4> x, RValue<Int4> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return RValue<Int4>(Nucleus::createICmpSGT(x.value(), y.value()));
}
3361 
Abs(RValue<Int4> x)3362 RValue<Int4> Abs(RValue<Int4> x)
3363 {
3364 	// TODO: Optimize.
3365 	auto negative = x >> 31;
3366 	return (x ^ negative) - negative;
3367 }
3368 
// Per-lane signed maximum, built from a signed compare plus select.
RValue<Int4> Max(RValue<Int4> x, RValue<Int4> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	// condition = (x <= y), signed; select y where true, x otherwise.
	Ice::Variable *condition = ::function->makeVariable(Ice::IceType_v4i1);
	auto cmp = Ice::InstIcmp::create(::function, Ice::InstIcmp::Sle, condition, x.value(), y.value());
	::basicBlock->appendInst(cmp);

	Ice::Variable *result = ::function->makeVariable(Ice::IceType_v4i32);
	auto select = Ice::InstSelect::create(::function, result, condition, y.value(), x.value());
	::basicBlock->appendInst(select);

	return RValue<Int4>(V(result));
}
3382 
// Per-lane signed minimum, built from a signed compare plus select.
RValue<Int4> Min(RValue<Int4> x, RValue<Int4> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	// condition = (x > y), signed; select y where true, x otherwise.
	Ice::Variable *condition = ::function->makeVariable(Ice::IceType_v4i1);
	auto cmp = Ice::InstIcmp::create(::function, Ice::InstIcmp::Sgt, condition, x.value(), y.value());
	::basicBlock->appendInst(cmp);

	Ice::Variable *result = ::function->makeVariable(Ice::IceType_v4i32);
	auto select = Ice::InstSelect::create(::function, result, condition, y.value(), x.value());
	::basicBlock->appendInst(select);

	return RValue<Int4>(V(result));
}
3396 
// Rounds four floats to the nearest integers (via the nearbyint intrinsic
// when available).
RValue<Int4> RoundInt(RValue<Float4> cast)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	if(emulateIntrinsics || CPUID::ARM)
	{
		// Push the fractional part off the mantissa. Accurate up to +/-2^22.
		return Int4((cast + Float4(0x00C00000)) - Float4(0x00C00000));
	}
	else
	{
		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v4i32);
		const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::Nearbyint, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
		auto nearbyint = Ice::InstIntrinsic::create(::function, 1, result, intrinsic);
		nearbyint->addArg(cast.value());
		::basicBlock->appendInst(nearbyint);

		return RValue<Int4>(V(result));
	}
}
3416 
// Like RoundInt(Float4), but first clamps the input so x86's cvtps2dq
// cannot produce the 0x80000000 "integer indefinite" result for large
// positive inputs.
RValue<Int4> RoundIntClamped(RValue<Float4> cast)
{
	RR_DEBUG_INFO_UPDATE_LOC();

	// cvtps2dq produces 0x80000000, a negative value, for input larger than
	// 2147483520.0, so clamp to 2147483520. Values less than -2147483520.0
	// saturate to 0x80000000.
	RValue<Float4> clamped = Min(cast, Float4(0x7FFFFF80));

	if(emulateIntrinsics || CPUID::ARM)
	{
		// Push the fractional part off the mantissa. Accurate up to +/-2^22.
		return Int4((clamped + Float4(0x00C00000)) - Float4(0x00C00000));
	}
	else
	{
		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v4i32);
		const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::Nearbyint, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
		auto nearbyint = Ice::InstIntrinsic::create(::function, 1, result, intrinsic);
		nearbyint->addArg(clamped.value());
		::basicBlock->appendInst(nearbyint);

		return RValue<Int4>(V(result));
	}
}
3442 
// Packs two Int4 vectors into one Short8 with signed saturation:
// lanes 0-3 come from x, lanes 4-7 from y, each clamped to [-32768, 32767].
RValue<Short8> PackSigned(RValue<Int4> x, RValue<Int4> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	if(emulateIntrinsics)
	{
		Short8 result;
		result = Insert(result, SaturateSigned(Extract(x, 0)), 0);
		result = Insert(result, SaturateSigned(Extract(x, 1)), 1);
		result = Insert(result, SaturateSigned(Extract(x, 2)), 2);
		result = Insert(result, SaturateSigned(Extract(x, 3)), 3);
		result = Insert(result, SaturateSigned(Extract(y, 0)), 4);
		result = Insert(result, SaturateSigned(Extract(y, 1)), 5);
		result = Insert(result, SaturateSigned(Extract(y, 2)), 6);
		result = Insert(result, SaturateSigned(Extract(y, 3)), 7);

		return result;
	}
	else
	{
		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
		const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::VectorPackSigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
		auto pack = Ice::InstIntrinsic::create(::function, 2, result, intrinsic);
		pack->addArg(x.value());
		pack->addArg(y.value());
		::basicBlock->appendInst(pack);

		return RValue<Short8>(V(result));
	}
}
3472 
// Packs two Int4 vectors into one UShort8 with unsigned saturation:
// lanes 0-3 come from x, lanes 4-7 from y, each clamped to [0, 0xFFFF].
RValue<UShort8> PackUnsigned(RValue<Int4> x, RValue<Int4> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	if(emulateIntrinsics || !(CPUID::SSE4_1 || CPUID::ARM))
	{
		// Emulate the unsigned pack with the signed pack: clamp negatives
		// to zero, bias into signed range, pack, then undo the bias.
		RValue<Int4> sx = As<Int4>(x);
		RValue<Int4> bx = (sx & ~(sx >> 31)) - Int4(0x8000);

		RValue<Int4> sy = As<Int4>(y);
		RValue<Int4> by = (sy & ~(sy >> 31)) - Int4(0x8000);

		return As<UShort8>(PackSigned(bx, by) + Short8(0x8000u));
	}
	else
	{
		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
		const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::VectorPackUnsigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
		auto pack = Ice::InstIntrinsic::create(::function, 2, result, intrinsic);
		pack->addArg(x.value());
		pack->addArg(y.value());
		::basicBlock->appendInst(pack);

		return RValue<UShort8>(V(result));
	}
}
3498 
// Collects the sign bit of each 32-bit lane into the low four bits of a
// scalar Int (x86 movmskps semantics).
RValue<Int> SignMask(RValue<Int4> x)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	if(emulateIntrinsics || CPUID::ARM)
	{
		// Turn each sign bit into its positional bit value, then OR lanes together.
		Int4 xx = (x >> 31) & Int4(0x00000001, 0x00000002, 0x00000004, 0x00000008);
		return Extract(xx, 0) | Extract(xx, 1) | Extract(xx, 2) | Extract(xx, 3);
	}
	else
	{
		Ice::Variable *result = ::function->makeVariable(Ice::IceType_i32);
		const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::SignMask, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
		auto movmsk = Ice::InstIntrinsic::create(::function, 1, result, intrinsic);
		movmsk->addArg(x.value());
		::basicBlock->appendInst(movmsk);

		return RValue<Int>(V(result));
	}
}
3518 
// Backend type descriptor for Int4 (4 x i32 full vector).
Type *Int4::type()
{
	return T(Ice::IceType_v4i32);
}
3523 
// Converts four floats to unsigned 32-bit integers. Lanes >= 2^31 are handled
// by biasing through the signed conversion; negative lanes produce 0.
UInt4::UInt4(RValue<Float4> cast)
    : XYZW(this)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	// Smallest positive value representable in UInt, but not in Int
	const unsigned int ustart = 0x80000000u;
	const float ustartf = float(ustart);

	// Check if the value can be represented as an Int
	Int4 uiValue = CmpNLT(cast, Float4(ustartf));
	// If the value is too large, subtract ustart and re-add it after conversion.
	uiValue = (uiValue & As<Int4>(As<UInt4>(Int4(cast - Float4(ustartf))) + UInt4(ustart))) |
	          // Otherwise, just convert normally
	          (~uiValue & Int4(cast));
	// If the value is negative, store 0, otherwise store the result of the conversion
	storeValue((~(As<Int4>(cast) >> 31) & uiValue).value());
}
3541 
UInt4(RValue<UInt> rhs)3542 UInt4::UInt4(RValue<UInt> rhs)
3543     : XYZW(this)
3544 {
3545 	RR_DEBUG_INFO_UPDATE_LOC();
3546 	Value *vector = Nucleus::createBitCast(rhs.value(), UInt4::type());
3547 
3548 	std::vector<int> swizzle = { 0 };
3549 	Value *replicate = Nucleus::createShuffleVector(vector, vector, swizzle);
3550 
3551 	storeValue(replicate);
3552 }
3553 
operator <<(RValue<UInt4> lhs,unsigned char rhs)3554 RValue<UInt4> operator<<(RValue<UInt4> lhs, unsigned char rhs)
3555 {
3556 	RR_DEBUG_INFO_UPDATE_LOC();
3557 	if(emulateIntrinsics)
3558 	{
3559 		return Scalarize([rhs](auto x) { return x << rhs; }, lhs);
3560 	}
3561 	else
3562 	{
3563 		return RValue<UInt4>(Nucleus::createShl(lhs.value(), V(::context->getConstantInt32(rhs))));
3564 	}
3565 }
3566 
operator >>(RValue<UInt4> lhs,unsigned char rhs)3567 RValue<UInt4> operator>>(RValue<UInt4> lhs, unsigned char rhs)
3568 {
3569 	RR_DEBUG_INFO_UPDATE_LOC();
3570 	if(emulateIntrinsics)
3571 	{
3572 		return Scalarize([rhs](auto x) { return x >> rhs; }, lhs);
3573 	}
3574 	else
3575 	{
3576 		return RValue<UInt4>(Nucleus::createLShr(lhs.value(), V(::context->getConstantInt32(rhs))));
3577 	}
3578 }
3579 
// Lanewise compare: x == y.
RValue<UInt4> CmpEQ(RValue<UInt4> x, RValue<UInt4> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return RValue<UInt4>(Nucleus::createICmpEQ(x.value(), y.value()));
}
3585 
// Lanewise unsigned compare: x < y.
RValue<UInt4> CmpLT(RValue<UInt4> x, RValue<UInt4> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return RValue<UInt4>(Nucleus::createICmpULT(x.value(), y.value()));
}
3591 
// Lanewise unsigned compare: x <= y.
RValue<UInt4> CmpLE(RValue<UInt4> x, RValue<UInt4> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return RValue<UInt4>(Nucleus::createICmpULE(x.value(), y.value()));
}
3597 
// Lanewise compare: x != y.
RValue<UInt4> CmpNEQ(RValue<UInt4> x, RValue<UInt4> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return RValue<UInt4>(Nucleus::createICmpNE(x.value(), y.value()));
}
3603 
// Lanewise unsigned compare: not-less-than, i.e. x >= y.
RValue<UInt4> CmpNLT(RValue<UInt4> x, RValue<UInt4> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return RValue<UInt4>(Nucleus::createICmpUGE(x.value(), y.value()));
}
3609 
// Lanewise unsigned compare: not-less-or-equal, i.e. x > y.
RValue<UInt4> CmpNLE(RValue<UInt4> x, RValue<UInt4> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return RValue<UInt4>(Nucleus::createICmpUGT(x.value(), y.value()));
}
3615 
Max(RValue<UInt4> x,RValue<UInt4> y)3616 RValue<UInt4> Max(RValue<UInt4> x, RValue<UInt4> y)
3617 {
3618 	RR_DEBUG_INFO_UPDATE_LOC();
3619 	Ice::Variable *condition = ::function->makeVariable(Ice::IceType_v4i1);
3620 	auto cmp = Ice::InstIcmp::create(::function, Ice::InstIcmp::Ule, condition, x.value(), y.value());
3621 	::basicBlock->appendInst(cmp);
3622 
3623 	Ice::Variable *result = ::function->makeVariable(Ice::IceType_v4i32);
3624 	auto select = Ice::InstSelect::create(::function, result, condition, y.value(), x.value());
3625 	::basicBlock->appendInst(select);
3626 
3627 	return RValue<UInt4>(V(result));
3628 }
3629 
Min(RValue<UInt4> x,RValue<UInt4> y)3630 RValue<UInt4> Min(RValue<UInt4> x, RValue<UInt4> y)
3631 {
3632 	RR_DEBUG_INFO_UPDATE_LOC();
3633 	Ice::Variable *condition = ::function->makeVariable(Ice::IceType_v4i1);
3634 	auto cmp = Ice::InstIcmp::create(::function, Ice::InstIcmp::Ugt, condition, x.value(), y.value());
3635 	::basicBlock->appendInst(cmp);
3636 
3637 	Ice::Variable *result = ::function->makeVariable(Ice::IceType_v4i32);
3638 	auto select = Ice::InstSelect::create(::function, result, condition, y.value(), x.value());
3639 	::basicBlock->appendInst(select);
3640 
3641 	return RValue<UInt4>(V(result));
3642 }
3643 
// Returns the Subzero type backing UInt4: a 4 x i32 vector (signedness is
// carried by the operations, not the storage type).
Type *UInt4::type()
{
	return T(Ice::IceType_v4i32);
}
3648 
// Returns the Subzero type backing Half. Subzero has no 16-bit float type,
// so half values are presumably carried as raw i16 bit patterns.
Type *Half::type()
{
	return T(Ice::IceType_i16);
}
3653 
Sqrt(RValue<Float> x)3654 RValue<Float> Sqrt(RValue<Float> x)
3655 {
3656 	RR_DEBUG_INFO_UPDATE_LOC();
3657 	Ice::Variable *result = ::function->makeVariable(Ice::IceType_f32);
3658 	const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::Sqrt, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
3659 	auto sqrt = Ice::InstIntrinsic::create(::function, 1, result, intrinsic);
3660 	sqrt->addArg(x.value());
3661 	::basicBlock->appendInst(sqrt);
3662 
3663 	return RValue<Float>(V(result));
3664 }
3665 
// Scalar round-to-nearest: promotes to Float4, uses the vector Round, and
// extracts lane 0.
RValue<Float> Round(RValue<Float> x)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return Float4(Round(Float4(x))).x;
}
3671 
// Scalar truncation (round toward zero), implemented via the vector Trunc.
RValue<Float> Trunc(RValue<Float> x)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return Float4(Trunc(Float4(x))).x;
}
3677 
// Scalar fractional part, implemented via the vector Frac.
RValue<Float> Frac(RValue<Float> x)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return Float4(Frac(Float4(x))).x;
}
3683 
// Scalar floor (round toward -infinity), implemented via the vector Floor.
RValue<Float> Floor(RValue<Float> x)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return Float4(Floor(Float4(x))).x;
}
3689 
// Scalar ceiling (round toward +infinity), implemented via the vector Ceil.
RValue<Float> Ceil(RValue<Float> x)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return Float4(Ceil(Float4(x))).x;
}
3695 
// Returns the Subzero type backing Float: a 32-bit float.
Type *Float::type()
{
	return T(Ice::IceType_f32);
}
3700 
// Returns the type backing Float2. Type_v2f32 is a Reactor-defined emulated
// type (Subzero has no native 2 x f32 vector).
Type *Float2::type()
{
	return T(Type_v2f32);
}
3705 
Float4(RValue<Float> rhs)3706 Float4::Float4(RValue<Float> rhs)
3707     : XYZW(this)
3708 {
3709 	RR_DEBUG_INFO_UPDATE_LOC();
3710 	Value *vector = Nucleus::createBitCast(rhs.value(), Float4::type());
3711 
3712 	std::vector<int> swizzle = { 0 };
3713 	Value *replicate = Nucleus::createShuffleVector(vector, vector, swizzle);
3714 
3715 	storeValue(replicate);
3716 }
3717 
// Lanewise floating-point remainder, computed by calling fmodf per lane.
RValue<Float4> operator%(RValue<Float4> lhs, RValue<Float4> rhs)
{
	return ScalarizeCall(fmodf, lhs, rhs);
}
3722 
// Lanewise x * y + z with intermediate rounding (not fused).
RValue<Float4> MulAdd(RValue<Float4> x, RValue<Float4> y, RValue<Float4> z)
{
	// TODO(b/214591655): Use FMA when available.
	return x * y + z;
}
3728 
// Lanewise fused multiply-add (single rounding), via a per-lane fmaf call.
RValue<Float4> FMA(RValue<Float4> x, RValue<Float4> y, RValue<Float4> z)
{
	// TODO(b/214591655): Use FMA instructions when available.
	return ScalarizeCall(fmaf, x, y, z);
}
3734 
Abs(RValue<Float4> x)3735 RValue<Float4> Abs(RValue<Float4> x)
3736 {
3737 	// TODO: Optimize.
3738 	Value *vector = Nucleus::createBitCast(x.value(), Int4::type());
3739 	std::vector<int64_t> constantVector = { 0x7FFFFFFF };
3740 	Value *result = Nucleus::createAnd(vector, Nucleus::createConstantVector(constantVector, Int4::type()));
3741 
3742 	return As<Float4>(result);
3743 }
3744 
Max(RValue<Float4> x,RValue<Float4> y)3745 RValue<Float4> Max(RValue<Float4> x, RValue<Float4> y)
3746 {
3747 	RR_DEBUG_INFO_UPDATE_LOC();
3748 	Ice::Variable *condition = ::function->makeVariable(Ice::IceType_v4i1);
3749 	auto cmp = Ice::InstFcmp::create(::function, Ice::InstFcmp::Ogt, condition, x.value(), y.value());
3750 	::basicBlock->appendInst(cmp);
3751 
3752 	Ice::Variable *result = ::function->makeVariable(Ice::IceType_v4f32);
3753 	auto select = Ice::InstSelect::create(::function, result, condition, x.value(), y.value());
3754 	::basicBlock->appendInst(select);
3755 
3756 	return RValue<Float4>(V(result));
3757 }
3758 
Min(RValue<Float4> x,RValue<Float4> y)3759 RValue<Float4> Min(RValue<Float4> x, RValue<Float4> y)
3760 {
3761 	RR_DEBUG_INFO_UPDATE_LOC();
3762 	Ice::Variable *condition = ::function->makeVariable(Ice::IceType_v4i1);
3763 	auto cmp = Ice::InstFcmp::create(::function, Ice::InstFcmp::Olt, condition, x.value(), y.value());
3764 	::basicBlock->appendInst(cmp);
3765 
3766 	Ice::Variable *result = ::function->makeVariable(Ice::IceType_v4f32);
3767 	auto select = Ice::InstSelect::create(::function, result, condition, x.value(), y.value());
3768 	::basicBlock->appendInst(select);
3769 
3770 	return RValue<Float4>(V(result));
3771 }
3772 
// Reports whether a fast reciprocal approximation is available. Always false
// for Subzero until the SSE rcp intrinsics are implemented.
bool HasRcpApprox()
{
	// TODO(b/175612820): Update once we implement x86 SSE rcp_ss and rsqrt_ss intrinsics in Subzero
	return false;
}
3778 
// Unreachable stub: callers must check HasRcpApprox() first (it returns false).
RValue<Float4> RcpApprox(RValue<Float4> x, bool exactAtPow2)
{
	// TODO(b/175612820): Update once we implement x86 SSE rcp_ss and rsqrt_ss intrinsics in Subzero
	UNREACHABLE("RValue<Float4> RcpApprox()");
	return { 0.0f };
}
3785 
// Unreachable stub: callers must check HasRcpApprox() first (it returns false).
RValue<Float> RcpApprox(RValue<Float> x, bool exactAtPow2)
{
	// TODO(b/175612820): Update once we implement x86 SSE rcp_ss and rsqrt_ss intrinsics in Subzero
	UNREACHABLE("RValue<Float> RcpApprox()");
	return { 0.0f };
}
3792 
// Reports whether a fast reciprocal-square-root approximation is available.
// Always false for Subzero (see b/175612820).
bool HasRcpSqrtApprox()
{
	return false;
}
3797 
// Unreachable stub: callers must check HasRcpSqrtApprox() first (it returns false).
RValue<Float4> RcpSqrtApprox(RValue<Float4> x)
{
	// TODO(b/175612820): Update once we implement x86 SSE rcp_ss and rsqrt_ss intrinsics in Subzero
	UNREACHABLE("RValue<Float4> RcpSqrtApprox()");
	return { 0.0f };
}
3804 
// Unreachable stub: callers must check HasRcpSqrtApprox() first (it returns false).
RValue<Float> RcpSqrtApprox(RValue<Float> x)
{
	// TODO(b/175612820): Update once we implement x86 SSE rcp_ss and rsqrt_ss intrinsics in Subzero
	UNREACHABLE("RValue<Float> RcpSqrtApprox()");
	return { 0.0f };
}
3811 
Sqrt(RValue<Float4> x)3812 RValue<Float4> Sqrt(RValue<Float4> x)
3813 {
3814 	RR_DEBUG_INFO_UPDATE_LOC();
3815 	if(emulateIntrinsics || CPUID::ARM)
3816 	{
3817 		Float4 result;
3818 		result.x = Sqrt(Float(Float4(x).x));
3819 		result.y = Sqrt(Float(Float4(x).y));
3820 		result.z = Sqrt(Float(Float4(x).z));
3821 		result.w = Sqrt(Float(Float4(x).w));
3822 
3823 		return result;
3824 	}
3825 	else
3826 	{
3827 		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v4f32);
3828 		const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::Sqrt, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
3829 		auto sqrt = Ice::InstIntrinsic::create(::function, 1, result, intrinsic);
3830 		sqrt->addArg(x.value());
3831 		::basicBlock->appendInst(sqrt);
3832 
3833 		return RValue<Float4>(V(result));
3834 	}
3835 }
3836 
SignMask(RValue<Float4> x)3837 RValue<Int> SignMask(RValue<Float4> x)
3838 {
3839 	RR_DEBUG_INFO_UPDATE_LOC();
3840 	if(emulateIntrinsics || CPUID::ARM)
3841 	{
3842 		Int4 xx = (As<Int4>(x) >> 31) & Int4(0x00000001, 0x00000002, 0x00000004, 0x00000008);
3843 		return Extract(xx, 0) | Extract(xx, 1) | Extract(xx, 2) | Extract(xx, 3);
3844 	}
3845 	else
3846 	{
3847 		Ice::Variable *result = ::function->makeVariable(Ice::IceType_i32);
3848 		const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::SignMask, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
3849 		auto movmsk = Ice::InstIntrinsic::create(::function, 1, result, intrinsic);
3850 		movmsk->addArg(x.value());
3851 		::basicBlock->appendInst(movmsk);
3852 
3853 		return RValue<Int>(V(result));
3854 	}
3855 }
3856 
// Lanewise ordered compare: x == y (false when either operand is NaN).
RValue<Int4> CmpEQ(RValue<Float4> x, RValue<Float4> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return RValue<Int4>(Nucleus::createFCmpOEQ(x.value(), y.value()));
}
3862 
// Lanewise ordered compare: x < y (false when either operand is NaN).
RValue<Int4> CmpLT(RValue<Float4> x, RValue<Float4> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return RValue<Int4>(Nucleus::createFCmpOLT(x.value(), y.value()));
}
3868 
// Lanewise ordered compare: x <= y (false when either operand is NaN).
RValue<Int4> CmpLE(RValue<Float4> x, RValue<Float4> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return RValue<Int4>(Nucleus::createFCmpOLE(x.value(), y.value()));
}
3874 
// Lanewise ordered compare: x != y (false when either operand is NaN).
RValue<Int4> CmpNEQ(RValue<Float4> x, RValue<Float4> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return RValue<Int4>(Nucleus::createFCmpONE(x.value(), y.value()));
}
3880 
// Lanewise ordered compare: not-less-than, i.e. x >= y (false on NaN).
RValue<Int4> CmpNLT(RValue<Float4> x, RValue<Float4> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return RValue<Int4>(Nucleus::createFCmpOGE(x.value(), y.value()));
}
3886 
// Lanewise ordered compare: not-less-or-equal, i.e. x > y (false on NaN).
RValue<Int4> CmpNLE(RValue<Float4> x, RValue<Float4> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return RValue<Int4>(Nucleus::createFCmpOGT(x.value(), y.value()));
}
3892 
// Lanewise unordered compare: x == y, also true when either operand is NaN.
RValue<Int4> CmpUEQ(RValue<Float4> x, RValue<Float4> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return RValue<Int4>(Nucleus::createFCmpUEQ(x.value(), y.value()));
}
3898 
// Lanewise unordered compare: x < y, also true when either operand is NaN.
RValue<Int4> CmpULT(RValue<Float4> x, RValue<Float4> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return RValue<Int4>(Nucleus::createFCmpULT(x.value(), y.value()));
}
3904 
// Lanewise unordered compare: x <= y, also true when either operand is NaN.
RValue<Int4> CmpULE(RValue<Float4> x, RValue<Float4> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return RValue<Int4>(Nucleus::createFCmpULE(x.value(), y.value()));
}
3910 
// Lanewise unordered compare: x != y, also true when either operand is NaN.
RValue<Int4> CmpUNEQ(RValue<Float4> x, RValue<Float4> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return RValue<Int4>(Nucleus::createFCmpUNE(x.value(), y.value()));
}
3916 
// Lanewise unordered compare: not-less-than (x >= y), also true on NaN.
RValue<Int4> CmpUNLT(RValue<Float4> x, RValue<Float4> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return RValue<Int4>(Nucleus::createFCmpUGE(x.value(), y.value()));
}
3922 
// Lanewise unordered compare: not-less-or-equal (x > y), also true on NaN.
RValue<Int4> CmpUNLE(RValue<Float4> x, RValue<Float4> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return RValue<Int4>(Nucleus::createFCmpUGT(x.value(), y.value()));
}
3928 
Round(RValue<Float4> x)3929 RValue<Float4> Round(RValue<Float4> x)
3930 {
3931 	RR_DEBUG_INFO_UPDATE_LOC();
3932 	if(emulateIntrinsics || CPUID::ARM)
3933 	{
3934 		// Push the fractional part off the mantissa. Accurate up to +/-2^22.
3935 		return (x + Float4(0x00C00000)) - Float4(0x00C00000);
3936 	}
3937 	else if(CPUID::SSE4_1)
3938 	{
3939 		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v4f32);
3940 		const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::Round, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
3941 		auto round = Ice::InstIntrinsic::create(::function, 2, result, intrinsic);
3942 		round->addArg(x.value());
3943 		round->addArg(::context->getConstantInt32(0));
3944 		::basicBlock->appendInst(round);
3945 
3946 		return RValue<Float4>(V(result));
3947 	}
3948 	else
3949 	{
3950 		return Float4(RoundInt(x));
3951 	}
3952 }
3953 
Trunc(RValue<Float4> x)3954 RValue<Float4> Trunc(RValue<Float4> x)
3955 {
3956 	RR_DEBUG_INFO_UPDATE_LOC();
3957 	if(CPUID::SSE4_1)
3958 	{
3959 		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v4f32);
3960 		const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::Round, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
3961 		auto round = Ice::InstIntrinsic::create(::function, 2, result, intrinsic);
3962 		round->addArg(x.value());
3963 		round->addArg(::context->getConstantInt32(3));
3964 		::basicBlock->appendInst(round);
3965 
3966 		return RValue<Float4>(V(result));
3967 	}
3968 	else
3969 	{
3970 		return Float4(Int4(x));
3971 	}
3972 }
3973 
// Lanewise positive fractional part: returns a value in [0, 1) for each lane.
RValue<Float4> Frac(RValue<Float4> x)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	Float4 frc;

	if(CPUID::SSE4_1)
	{
		frc = x - Floor(x);
	}
	else
	{
		frc = x - Float4(Int4(x));  // Signed fractional part.

		frc += As<Float4>(As<Int4>(CmpNLE(Float4(0.0f), frc)) & As<Int4>(Float4(1.0f)));  // Add 1.0 if negative.
	}

	// x - floor(x) can be 1.0 for very small negative x.
	// Clamp against the value just below 1.0.
	// 0x3F7FFFFF is the largest float strictly less than 1.0f.
	return Min(frc, As<Float4>(Int4(0x3F7FFFFF)));
}
3994 
Floor(RValue<Float4> x)3995 RValue<Float4> Floor(RValue<Float4> x)
3996 {
3997 	RR_DEBUG_INFO_UPDATE_LOC();
3998 	if(CPUID::SSE4_1)
3999 	{
4000 		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v4f32);
4001 		const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::Round, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
4002 		auto round = Ice::InstIntrinsic::create(::function, 2, result, intrinsic);
4003 		round->addArg(x.value());
4004 		round->addArg(::context->getConstantInt32(1));
4005 		::basicBlock->appendInst(round);
4006 
4007 		return RValue<Float4>(V(result));
4008 	}
4009 	else
4010 	{
4011 		return x - Frac(x);
4012 	}
4013 }
4014 
Ceil(RValue<Float4> x)4015 RValue<Float4> Ceil(RValue<Float4> x)
4016 {
4017 	RR_DEBUG_INFO_UPDATE_LOC();
4018 	if(CPUID::SSE4_1)
4019 	{
4020 		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v4f32);
4021 		const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::Round, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
4022 		auto round = Ice::InstIntrinsic::create(::function, 2, result, intrinsic);
4023 		round->addArg(x.value());
4024 		round->addArg(::context->getConstantInt32(2));
4025 		::basicBlock->appendInst(round);
4026 
4027 		return RValue<Float4>(V(result));
4028 	}
4029 	else
4030 	{
4031 		return -Floor(-x);
4032 	}
4033 }
4034 
// Returns the Subzero type backing Float4: a 4 x f32 vector.
Type *Float4::type()
{
	return T(Ice::IceType_v4f32);
}
4039 
// Timestamp counter read. Not implemented for Subzero; always yields 0.
RValue<Long> Ticks()
{
	RR_DEBUG_INFO_UPDATE_LOC();
	UNIMPLEMENTED_NO_BUG("RValue<Long> Ticks()");
	return Long(Int(0));
}
4046 
// Wraps a host pointer as a Reactor constant pointer value.
RValue<Pointer<Byte>> ConstantPointer(const void *ptr)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return RValue<Pointer<Byte>>{ V(sz::getConstantPointer(::context, ptr)) };
}
4052 
// Embeds `size` bytes of `data` into the generated module (via IceConstantData)
// and returns a pointer to that copy.
RValue<Pointer<Byte>> ConstantData(const void *data, size_t size)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return RValue<Pointer<Byte>>{ V(IceConstantData(data, size)) };
}
4058 
// Emits an indirect call through fptr and returns its result value.
// Note: argTys is not used here; argument types come from the values themselves.
Value *Call(RValue<Pointer<Byte>> fptr, Type *retTy, std::initializer_list<Value *> args, std::initializer_list<Type *> argTys)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return V(sz::Call(::function, ::basicBlock, T(retTy), V(fptr.value()), V(args), false));
}
4064 
Breakpoint()4065 void Breakpoint()
4066 {
4067 	RR_DEBUG_INFO_UPDATE_LOC();
4068 	const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::Trap, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
4069 	auto trap = Ice::InstIntrinsic::create(::function, 0, nullptr, intrinsic);
4070 	::basicBlock->appendInst(trap);
4071 }
4072 
createFence(std::memory_order memoryOrder)4073 void Nucleus::createFence(std::memory_order memoryOrder)
4074 {
4075 	RR_DEBUG_INFO_UPDATE_LOC();
4076 	const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::AtomicFence, Ice::Intrinsics::SideEffects_T, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
4077 	auto inst = Ice::InstIntrinsic::create(::function, 0, nullptr, intrinsic);
4078 	auto order = ::context->getConstantInt32(stdToIceMemoryOrder(memoryOrder));
4079 	inst->addArg(order);
4080 	::basicBlock->appendInst(inst);
4081 }
4082 
// Masked vector load. Not implemented in the Subzero backend.
Value *Nucleus::createMaskedLoad(Value *ptr, Type *elTy, Value *mask, unsigned int alignment, bool zeroMaskedLanes)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	UNIMPLEMENTED("b/155867273 Subzero createMaskedLoad()");
	return nullptr;
}
4089 
// Masked vector store. Not implemented in the Subzero backend.
void Nucleus::createMaskedStore(Value *ptr, Value *val, Value *mask, unsigned int alignment)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	UNIMPLEMENTED("b/155867273 Subzero createMaskedStore()");
}
4095 
// Maps a Reactor vector type (e.g. SIMD::Int) to the Reactor scalar type of
// its elements (e.g. Int), by inspecting the return type of rr::Extract.
template<typename T>
struct UnderlyingType
{
	using Type = typename decltype(rr::Extract(std::declval<RValue<T>>(), 0))::rvalue_underlying_type;
};

// Convenience alias for UnderlyingType<T>::Type.
template<typename T>
using UnderlyingTypeT = typename UnderlyingType<T>::Type;
4104 
// Emulated masked gather: emits a runtime per-lane loop that loads each active
// lane individually. `offsets` holds byte offsets from `base`. The If/Else
// constructs are Reactor macros, so they generate runtime branches in the
// emitted code, not host-side branches.
template<typename T, typename EL = UnderlyingTypeT<T>>
static void gather(T &out, RValue<Pointer<EL>> base, RValue<SIMD::Int> offsets, RValue<SIMD::Int> mask, unsigned int alignment, bool zeroMaskedLanes)
{
	constexpr bool atomic = false;
	constexpr std::memory_order order = std::memory_order_relaxed;

	// Index in bytes, regardless of the element type.
	Pointer<Byte> baseBytePtr = base;

	// Start from all-zero so inactive lanes have a defined value.
	out = T(0);
	for(int i = 0; i < SIMD::Width; i++)
	{
		If(Extract(mask, i) != 0)
		{
			auto offset = Extract(offsets, i);
			auto el = Load(Pointer<EL>(&baseBytePtr[offset]), alignment, atomic, order);
			out = Insert(out, el, i);
		}
		Else If(zeroMaskedLanes)
		{
			out = Insert(out, EL(0), i);
		}
	}
}
4128 
// Emulated masked scatter: emits a runtime per-lane loop that stores each
// active lane individually. `offsets` holds byte offsets from `base`; lanes
// with a zero mask are skipped. If is a Reactor macro (runtime branch).
template<typename T, typename EL = UnderlyingTypeT<T>>
static void scatter(RValue<Pointer<EL>> base, RValue<T> val, RValue<SIMD::Int> offsets, RValue<SIMD::Int> mask, unsigned int alignment)
{
	constexpr bool atomic = false;
	constexpr std::memory_order order = std::memory_order_relaxed;

	// Index in bytes, regardless of the element type.
	Pointer<Byte> baseBytePtr = base;

	for(int i = 0; i < SIMD::Width; i++)
	{
		If(Extract(mask, i) != 0)
		{
			auto offset = Extract(offsets, i);
			Store(Extract(val, i), Pointer<EL>(&baseBytePtr[offset]), alignment, atomic, order);
		}
	}
}
4146 
Gather(RValue<Pointer<Float>> base,RValue<SIMD::Int> offsets,RValue<SIMD::Int> mask,unsigned int alignment,bool zeroMaskedLanes)4147 RValue<SIMD::Float> Gather(RValue<Pointer<Float>> base, RValue<SIMD::Int> offsets, RValue<SIMD::Int> mask, unsigned int alignment, bool zeroMaskedLanes /* = false */)
4148 {
4149 	RR_DEBUG_INFO_UPDATE_LOC();
4150 	SIMD::Float result{};
4151 	gather(result, base, offsets, mask, alignment, zeroMaskedLanes);
4152 	return result;
4153 }
4154 
Gather(RValue<Pointer<Int>> base,RValue<SIMD::Int> offsets,RValue<SIMD::Int> mask,unsigned int alignment,bool zeroMaskedLanes)4155 RValue<SIMD::Int> Gather(RValue<Pointer<Int>> base, RValue<SIMD::Int> offsets, RValue<SIMD::Int> mask, unsigned int alignment, bool zeroMaskedLanes /* = false */)
4156 {
4157 	RR_DEBUG_INFO_UPDATE_LOC();
4158 	SIMD::Int result{};
4159 	gather(result, base, offsets, mask, alignment, zeroMaskedLanes);
4160 	return result;
4161 }
4162 
Scatter(RValue<Pointer<Float>> base,RValue<SIMD::Float> val,RValue<SIMD::Int> offsets,RValue<SIMD::Int> mask,unsigned int alignment)4163 void Scatter(RValue<Pointer<Float>> base, RValue<SIMD::Float> val, RValue<SIMD::Int> offsets, RValue<SIMD::Int> mask, unsigned int alignment)
4164 {
4165 	RR_DEBUG_INFO_UPDATE_LOC();
4166 	scatter(base, val, offsets, mask, alignment);
4167 }
4168 
Scatter(RValue<Pointer<Int>> base,RValue<SIMD::Int> val,RValue<SIMD::Int> offsets,RValue<SIMD::Int> mask,unsigned int alignment)4169 void Scatter(RValue<Pointer<Int>> base, RValue<SIMD::Int> val, RValue<SIMD::Int> offsets, RValue<SIMD::Int> mask, unsigned int alignment)
4170 {
4171 	RR_DEBUG_INFO_UPDATE_LOC();
4172 	scatter<SIMD::Int>(base, val, offsets, mask, alignment);
4173 }
4174 
Ctlz(RValue<UInt> x,bool isZeroUndef)4175 RValue<UInt> Ctlz(RValue<UInt> x, bool isZeroUndef)
4176 {
4177 	RR_DEBUG_INFO_UPDATE_LOC();
4178 	if(emulateIntrinsics)
4179 	{
4180 		UNIMPLEMENTED_NO_BUG("Subzero Ctlz()");
4181 		return UInt(0);
4182 	}
4183 	else
4184 	{
4185 		Ice::Variable *result = ::function->makeVariable(Ice::IceType_i32);
4186 		const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::Ctlz, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
4187 		auto ctlz = Ice::InstIntrinsic::create(::function, 1, result, intrinsic);
4188 		ctlz->addArg(x.value());
4189 		::basicBlock->appendInst(ctlz);
4190 
4191 		return RValue<UInt>(V(result));
4192 	}
4193 }
4194 
Ctlz(RValue<UInt4> x,bool isZeroUndef)4195 RValue<UInt4> Ctlz(RValue<UInt4> x, bool isZeroUndef)
4196 {
4197 	RR_DEBUG_INFO_UPDATE_LOC();
4198 	if(emulateIntrinsics)
4199 	{
4200 		UNIMPLEMENTED_NO_BUG("Subzero Ctlz()");
4201 		return UInt4(0);
4202 	}
4203 	else
4204 	{
4205 		return Scalarize([isZeroUndef](auto a) { return Ctlz(a, isZeroUndef); }, x);
4206 	}
4207 }
4208 
Cttz(RValue<UInt> x,bool isZeroUndef)4209 RValue<UInt> Cttz(RValue<UInt> x, bool isZeroUndef)
4210 {
4211 	RR_DEBUG_INFO_UPDATE_LOC();
4212 	if(emulateIntrinsics)
4213 	{
4214 		UNIMPLEMENTED_NO_BUG("Subzero Cttz()");
4215 		return UInt(0);
4216 	}
4217 	else
4218 	{
4219 		Ice::Variable *result = ::function->makeVariable(Ice::IceType_i32);
4220 		const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::Cttz, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
4221 		auto cttz = Ice::InstIntrinsic::create(::function, 1, result, intrinsic);
4222 		cttz->addArg(x.value());
4223 		::basicBlock->appendInst(cttz);
4224 
4225 		return RValue<UInt>(V(result));
4226 	}
4227 }
4228 
Cttz(RValue<UInt4> x,bool isZeroUndef)4229 RValue<UInt4> Cttz(RValue<UInt4> x, bool isZeroUndef)
4230 {
4231 	RR_DEBUG_INFO_UPDATE_LOC();
4232 	if(emulateIntrinsics)
4233 	{
4234 		UNIMPLEMENTED_NO_BUG("Subzero Cttz()");
4235 		return UInt4(0);
4236 	}
4237 	else
4238 	{
4239 		return Scalarize([isZeroUndef](auto a) { return Cttz(a, isZeroUndef); }, x);
4240 	}
4241 }
4242 
4243 // TODO(b/148276653): Both atomicMin and atomicMax use a static (global) mutex that makes all min
4244 // operations for a given T mutually exclusive, rather than only the ones on the value pointed to
4245 // by ptr. Use a CAS loop, as is done for LLVMReactor's min/max atomic for Android.
4246 // TODO(b/148207274): Or, move this down into Subzero as a CAS-based operation.
// Atomically replaces *ptr with min(*ptr, value) and returns the previous
// value. Serialized through one global mutex per instantiation of T, so ALL
// atomicMin<T> calls exclude each other (see TODOs above).
template<typename T>
static T atomicMin(T *ptr, T value)
{
	static std::mutex m;

	std::lock_guard<std::mutex> lock(m);
	const T previous = *ptr;
	if(value < previous)
	{
		*ptr = value;
	}
	return previous;
}
4257 
// Atomically replaces *ptr with max(*ptr, value) and returns the previous
// value. Serialized through one global mutex per instantiation of T, so ALL
// atomicMax<T> calls exclude each other (see TODOs above).
template<typename T>
static T atomicMax(T *ptr, T value)
{
	static std::mutex m;

	std::lock_guard<std::mutex> lock(m);
	const T previous = *ptr;
	if(previous < value)
	{
		*ptr = value;
	}
	return previous;
}
4268 
// Atomic signed-min on a 32-bit int. memoryOrder is ignored: the runtime
// helper serializes through a global mutex (see atomicMin above).
RValue<Int> MinAtomic(RValue<Pointer<Int>> x, RValue<Int> y, std::memory_order memoryOrder)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return Call(atomicMin<int32_t>, x, y);
}
4274 
// Atomic unsigned-min on a 32-bit uint. memoryOrder is ignored: the runtime
// helper serializes through a global mutex (see atomicMin above).
RValue<UInt> MinAtomic(RValue<Pointer<UInt>> x, RValue<UInt> y, std::memory_order memoryOrder)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return Call(atomicMin<uint32_t>, x, y);
}
4280 
// Atomic signed-max on a 32-bit int. memoryOrder is ignored: the runtime
// helper serializes through a global mutex (see atomicMax above).
RValue<Int> MaxAtomic(RValue<Pointer<Int>> x, RValue<Int> y, std::memory_order memoryOrder)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return Call(atomicMax<int32_t>, x, y);
}
4286 
// Atomic unsigned-max on a 32-bit uint. memoryOrder is ignored: the runtime
// helper serializes through a global mutex (see atomicMax above).
RValue<UInt> MaxAtomic(RValue<Pointer<UInt>> x, RValue<UInt> y, std::memory_order memoryOrder)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return Call(atomicMax<uint32_t>, x, y);
}
4292 
// Emits the caller's source location into the routine when debug info is
// enabled; otherwise a no-op.
void EmitDebugLocation()
{
#ifdef ENABLE_RR_DEBUG_INFO
	emitPrintLocation(getCallerBacktrace());
#endif  // ENABLE_RR_DEBUG_INFO
}
// No-op in the Subzero backend: debug variables are not tracked.
void EmitDebugVariable(Value *value) {}
// No-op in the Subzero backend: there is no buffered debug state to flush.
void FlushDebug() {}
4301 
4302 namespace {
4303 namespace coro {
4304 
4305 // Instance data per generated coroutine
4306 // This is the "handle" type used for Coroutine functions
4307 // Lifetime: from yield to when CoroutineEntryDestroy generated function is called.
struct CoroutineData
{
	bool useInternalScheduler = false;  // true if stop() must unbind the fallback scheduler.
	bool done = false;        // the coroutine should stop at the next yield()
	bool terminated = false;  // the coroutine has finished.
	bool inRoutine = false;   // is the coroutine currently executing?
	marl::Scheduler::Fiber *mainFiber = nullptr;     // fiber of the caller (await/destroy side).
	marl::Scheduler::Fiber *routineFiber = nullptr;  // fiber running the coroutine body.
	void *promisePtr = nullptr;  // storage the coroutine yields values through.
};
4318 
createCoroutineData()4319 CoroutineData *createCoroutineData()
4320 {
4321 	return new CoroutineData{};
4322 }
4323 
// Frees a coroutine state record created by createCoroutineData().
void destroyCoroutineData(CoroutineData *coroData)
{
	delete coroData;
}
4328 
4329 // suspend() pauses execution of the coroutine, and resumes execution from the
4330 // caller's call to await().
4331 // Returns true if await() is called again, or false if coroutine_destroy()
4332 // is called.
bool suspend(Nucleus::CoroutineHandle handle)
{
	auto *coroData = reinterpret_cast<CoroutineData *>(handle);
	// Must run on the coroutine's own fiber, while it is the active side.
	ASSERT(marl::Scheduler::Fiber::current() == coroData->routineFiber);
	ASSERT(coroData->inRoutine);
	// Hand control back to the caller's fiber...
	coroData->inRoutine = false;
	coroData->mainFiber->notify();
	// ...and block until resume()/stop() flips inRoutine back on.
	while(!coroData->inRoutine)
	{
		coroData->routineFiber->wait();
	}
	// done is set by stop(); false tells the coroutine body to finish.
	return !coroData->done;
}
4346 
4347 // resume() is called by await(), blocking until the coroutine calls yield()
4348 // or the coroutine terminates.
void resume(Nucleus::CoroutineHandle handle)
{
	auto *coroData = reinterpret_cast<CoroutineData *>(handle);
	// Must run on the caller's fiber, while the coroutine is suspended.
	ASSERT(marl::Scheduler::Fiber::current() == coroData->mainFiber);
	ASSERT(!coroData->inRoutine);
	// Wake the coroutine's fiber...
	coroData->inRoutine = true;
	coroData->routineFiber->notify();
	// ...and block until it yields (suspend()) or terminates.
	while(coroData->inRoutine)
	{
		coroData->mainFiber->wait();
	}
}
4361 
4362 // stop() is called by coroutine_destroy(), signalling that it's done, then blocks
4363 // until the coroutine ends, and deletes the coroutine data.
void stop(Nucleus::CoroutineHandle handle)
{
	auto *coroData = reinterpret_cast<CoroutineData *>(handle);
	// Must run on the caller's fiber, while the coroutine is suspended.
	ASSERT(marl::Scheduler::Fiber::current() == coroData->mainFiber);
	ASSERT(!coroData->inRoutine);
	if(!coroData->terminated)
	{
		// Resume the coroutine one last time with done set, so its next
		// suspend() returns false and the body runs to completion.
		coroData->done = true;
		coroData->inRoutine = true;
		coroData->routineFiber->notify();
		while(!coroData->terminated)
		{
			coroData->mainFiber->wait();
		}
	}
	// Release the fallback scheduler binding if this coroutine created it.
	if(coroData->useInternalScheduler)
	{
		::getOrCreateScheduler().unbind();
	}
	coro::destroyCoroutineData(coroData);  // free the coroutine data.
}
4385 
namespace detail {
// Per-thread slot used to pass the coroutine handle from the launching thread
// into the generated coroutine entry point before any fiber switch can occur.
// Written by setHandleParam(), consumed (and cleared) by getHandleParam().
thread_local rr::Nucleus::CoroutineHandle coroHandle{};
}  // namespace detail
4389 
// Stashes the coroutine handle in thread-local storage for the about-to-run
// coroutine body to pick up via getHandleParam().
void setHandleParam(Nucleus::CoroutineHandle handle)
{
	ASSERT(!detail::coroHandle);  // no other handle may be pending on this thread
	detail::coroHandle = handle;
}
4395 
getHandleParam()4396 Nucleus::CoroutineHandle getHandleParam()
4397 {
4398 	ASSERT(detail::coroHandle);
4399 	auto handle = detail::coroHandle;
4400 	detail::coroHandle = {};
4401 	return handle;
4402 }
4403 
isDone(Nucleus::CoroutineHandle handle)4404 bool isDone(Nucleus::CoroutineHandle handle)
4405 {
4406 	auto *coroData = reinterpret_cast<CoroutineData *>(handle);
4407 	return coroData->done;
4408 }
4409 
setPromisePtr(Nucleus::CoroutineHandle handle,void * promisePtr)4410 void setPromisePtr(Nucleus::CoroutineHandle handle, void *promisePtr)
4411 {
4412 	auto *coroData = reinterpret_cast<CoroutineData *>(handle);
4413 	coroData->promisePtr = promisePtr;
4414 }
4415 
getPromisePtr(Nucleus::CoroutineHandle handle)4416 void *getPromisePtr(Nucleus::CoroutineHandle handle)
4417 {
4418 	auto *coroData = reinterpret_cast<CoroutineData *>(handle);
4419 	return coroData->promisePtr;
4420 }
4421 
4422 }  // namespace coro
4423 }  // namespace
4424 
// Used to generate coroutines.
// Rewrites the function currently under construction into the coroutine's
// begin() entry point, and synthesizes the matching await() and destroy()
// functions as separate Subzero Cfgs.
// Lifetime: from yield to acquireCoroutine
class CoroutineGenerator
{
public:
	CoroutineGenerator()
	{
	}

	// Inserts instructions at the top of the current function to make it a coroutine.
	void generateCoroutineBegin()
	{
		// Begin building the main coroutine_begin() function.
		// We insert these instructions at the top of the entry node,
		// before existing reactor-generated instructions.

		//    CoroutineHandle coroutine_begin(<Arguments>)
		//    {
		//        this->handle = coro::getHandleParam();
		//
		//        YieldType promise;
		//        coro::setPromisePtr(handle, &promise); // For await
		//
		//        ... <REACTOR CODE> ...
		//

		//        this->handle = coro::getHandleParam();
		this->handle = sz::Call(::function, ::entryBlock, coro::getHandleParam);

		//        YieldType promise;
		//        coro::setPromisePtr(handle, &promise); // For await
		this->promise = sz::allocateStackVariable(::function, T(::coroYieldType));
		sz::Call(::function, ::entryBlock, coro::setPromisePtr, this->handle, this->promise);
	}

	// Adds instructions for Yield() calls at the current location of the main coroutine function.
	void generateYield(Value *val)
	{
		//        ... <REACTOR CODE> ...
		//
		//        promise = val;
		//        if (!coro::suspend(handle)) {
		//            return false; // coroutine has been stopped by the caller.
		//        }
		//
		//        ... <REACTOR CODE> ...

		//        promise = val;
		Nucleus::createStore(val, V(this->promise), ::coroYieldType);

		//        if (!coro::suspend(handle)) {
		auto result = sz::Call(::function, ::basicBlock, coro::suspend, this->handle);
		auto doneBlock = Nucleus::createBasicBlock();
		auto resumeBlock = Nucleus::createBasicBlock();
		Nucleus::createCondBr(V(result), resumeBlock, doneBlock);

		//            return false; // coroutine has been stopped by the caller.
		::basicBlock = doneBlock;
		Nucleus::createRetVoid();  // coroutine return value is ignored.

		//        ... <REACTOR CODE> ...
		// Subsequent Reactor code is emitted into the resume block.
		::basicBlock = resumeBlock;
	}

	using FunctionUniquePtr = std::unique_ptr<Ice::Cfg>;

	// Generates the await function for the current coroutine.
	// Cannot use Nucleus functions that modify ::function and ::basicBlock.
	static FunctionUniquePtr generateAwaitFunction()
	{
		// bool coroutine_await(CoroutineHandle handle, YieldType* out)
		// {
		//     if (coro::isDone())
		//     {
		//         return false;
		//     }
		//     else // resume
		//     {
		//         YieldType* promise = coro::getPromisePtr(handle);
		//         *out = *promise;
		//         coro::resume(handle);
		//         return true;
		//     }
		// }

		// Subzero doesn't support bool types (IceType_i1) as return type
		const Ice::Type ReturnType = Ice::IceType_i32;
		const Ice::Type YieldPtrType = sz::getPointerType(T(::coroYieldType));
		const Ice::Type HandleType = sz::getPointerType(Ice::IceType_void);

		Ice::Cfg *awaitFunc = sz::createFunction(::context, ReturnType, std::vector<Ice::Type>{ HandleType, YieldPtrType });
		Ice::CfgLocalAllocatorScope scopedAlloc{ awaitFunc };

		Ice::Variable *handle = awaitFunc->getArgs()[0];
		Ice::Variable *outPtr = awaitFunc->getArgs()[1];

		auto doneBlock = awaitFunc->makeNode();
		{
			//         return false;
			Ice::InstRet *ret = Ice::InstRet::create(awaitFunc, ::context->getConstantInt32(0));
			doneBlock->appendInst(ret);
		}

		auto resumeBlock = awaitFunc->makeNode();
		{
			//         YieldType* promise = coro::getPromisePtr(handle);
			Ice::Variable *promise = sz::Call(awaitFunc, resumeBlock, coro::getPromisePtr, handle);

			//         *out = *promise;
			// Load promise value
			Ice::Variable *promiseVal = awaitFunc->makeVariable(T(::coroYieldType));
			auto load = Ice::InstLoad::create(awaitFunc, promiseVal, promise);
			resumeBlock->appendInst(load);
			// Then store it in output param
			auto store = Ice::InstStore::create(awaitFunc, promiseVal, outPtr);
			resumeBlock->appendInst(store);

			//         coro::resume(handle);
			sz::Call(awaitFunc, resumeBlock, coro::resume, handle);

			//         return true;
			Ice::InstRet *ret = Ice::InstRet::create(awaitFunc, ::context->getConstantInt32(1));
			resumeBlock->appendInst(ret);
		}

		//     if (coro::isDone())
		//     {
		//         <doneBlock>
		//     }
		//     else // resume
		//     {
		//         <resumeBlock>
		//     }
		Ice::CfgNode *bb = awaitFunc->getEntryNode();
		Ice::Variable *done = sz::Call(awaitFunc, bb, coro::isDone, handle);
		auto br = Ice::InstBr::create(awaitFunc, done, doneBlock, resumeBlock);
		bb->appendInst(br);

		return FunctionUniquePtr{ awaitFunc };
	}

	// Generates the destroy function for the current coroutine.
	// Cannot use Nucleus functions that modify ::function and ::basicBlock.
	static FunctionUniquePtr generateDestroyFunction()
	{
		// void coroutine_destroy(Nucleus::CoroutineHandle handle)
		// {
		//     coro::stop(handle); // signal and wait for coroutine to stop, and delete coroutine data
		//     return;
		// }

		const Ice::Type ReturnType = Ice::IceType_void;
		const Ice::Type HandleType = sz::getPointerType(Ice::IceType_void);

		Ice::Cfg *destroyFunc = sz::createFunction(::context, ReturnType, std::vector<Ice::Type>{ HandleType });
		Ice::CfgLocalAllocatorScope scopedAlloc{ destroyFunc };

		Ice::Variable *handle = destroyFunc->getArgs()[0];

		auto *bb = destroyFunc->getEntryNode();

		//     coro::stop(handle); // signal and wait for coroutine to stop, and delete coroutine data
		sz::Call(destroyFunc, bb, coro::stop, handle);

		//     return;
		Ice::InstRet *ret = Ice::InstRet::create(destroyFunc);
		bb->appendInst(ret);

		return FunctionUniquePtr{ destroyFunc };
	}

private:
	Ice::Variable *handle{};   // coroutine handle, fetched in the entry block
	Ice::Variable *promise{};  // stack slot that holds the most recently yielded value
};
4600 
// Launches the generated coroutine body on its own marl fiber and blocks the
// caller until the first yield (or until the body runs to completion).
// Returns the opaque handle (the CoroutineData itself) used by await/destroy.
static Nucleus::CoroutineHandle invokeCoroutineBegin(std::function<Nucleus::CoroutineHandle()> beginFunc)
{
	// This doubles up as our coroutine handle
	auto coroData = coro::createCoroutineData();

	// If no marl scheduler is bound to this thread, bind the lazily-created
	// internal one; stop() unbinds it again based on this flag.
	coroData->useInternalScheduler = (marl::Scheduler::get() == nullptr);
	if(coroData->useInternalScheduler)
	{
		::getOrCreateScheduler().bind();
	}

	// Task that runs the coroutine body; scheduled on its own fiber below.
	auto run = [=] {
		// Store handle in TLS so that the coroutine can grab it right away, before
		// any fiber switch occurs.
		coro::setHandleParam(coroData);

		ASSERT(!coroData->routineFiber);
		coroData->routineFiber = marl::Scheduler::Fiber::current();

		beginFunc();

		// The body returned: mark the coroutine finished and wake the caller.
		ASSERT(coroData->inRoutine);
		coroData->done = true;        // coroutine is done.
		coroData->terminated = true;  // signal that the coroutine data is ready for freeing.
		coroData->inRoutine = false;
		coroData->mainFiber->notify();
	};

	ASSERT(!coroData->mainFiber);
	coroData->mainFiber = marl::Scheduler::Fiber::current();

	// block until the first yield or coroutine end
	ASSERT(!coroData->inRoutine);
	coroData->inRoutine = true;
	marl::schedule(marl::Task(run, marl::Task::Flags::SameThread));
	while(coroData->inRoutine)
	{
		coroData->mainFiber->wait();
	}

	return coroData;
}
4643 
// Begins building a coroutine that yields values of 'yieldType' and takes
// 'params'. Generation starts as a plain function; it is only turned into a
// real coroutine if/when yield() is first called.
void Nucleus::createCoroutine(Type *yieldType, const std::vector<Type *> &params)
{
	// Start by creating a regular function
	createFunction(yieldType, params);

	// Save in case yield() is called
	ASSERT(::coroYieldType == nullptr);  // Only one coroutine can be generated at once
	::coroYieldType = yieldType;
}
4653 
// Emits a yield of 'val' from the coroutine being generated: stores it into
// the promise slot and suspends until the caller awaits again (or destroys
// the coroutine, in which case the generated code returns early).
void Nucleus::yield(Value *val)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	Variable::materializeAll();

	// On first yield, we start generating coroutine functions
	if(!::coroGen)
	{
		::coroGen = std::make_shared<CoroutineGenerator>();
		::coroGen->generateCoroutineBegin();
	}

	ASSERT(::coroGen);
	::coroGen->generateYield(val);
}
4669 
// Await entry point installed for routines compiled without any yield():
// such a "coroutine" produces no values, so await always reports completion.
static bool coroutineEntryAwaitStub(Nucleus::CoroutineHandle, void *yieldValue)
{
	return false;
}
4674 
// Destroy entry point installed for routines compiled without any yield():
// no coroutine state was created, so there is nothing to tear down.
static void coroutineEntryDestroyStub(Nucleus::CoroutineHandle handle)
{
}
4678 
acquireCoroutine(const char * name)4679 std::shared_ptr<Routine> Nucleus::acquireCoroutine(const char *name)
4680 {
4681 	if(::coroGen)
4682 	{
4683 		// Finish generating coroutine functions
4684 		{
4685 			Ice::CfgLocalAllocatorScope scopedAlloc{ ::function };
4686 			finalizeFunction();
4687 		}
4688 
4689 		auto awaitFunc = ::coroGen->generateAwaitFunction();
4690 		auto destroyFunc = ::coroGen->generateDestroyFunction();
4691 
4692 		// At this point, we no longer need the CoroutineGenerator.
4693 		::coroGen.reset();
4694 		::coroYieldType = nullptr;
4695 
4696 		auto routine = rr::acquireRoutine({ ::function, awaitFunc.get(), destroyFunc.get() },
4697 		                                  { name, "await", "destroy" });
4698 
4699 		return routine;
4700 	}
4701 	else
4702 	{
4703 		{
4704 			Ice::CfgLocalAllocatorScope scopedAlloc{ ::function };
4705 			finalizeFunction();
4706 		}
4707 
4708 		::coroYieldType = nullptr;
4709 
4710 		// Not an actual coroutine (no yields), so return stubs for await and destroy
4711 		auto routine = rr::acquireRoutine({ ::function }, { name });
4712 
4713 		auto routineImpl = std::static_pointer_cast<ELFMemoryStreamer>(routine);
4714 		routineImpl->setEntry(Nucleus::CoroutineEntryAwait, reinterpret_cast<const void *>(&coroutineEntryAwaitStub));
4715 		routineImpl->setEntry(Nucleus::CoroutineEntryDestroy, reinterpret_cast<const void *>(&coroutineEntryDestroyStub));
4716 		return routine;
4717 	}
4718 }
4719 
invokeCoroutineBegin(Routine & routine,std::function<Nucleus::CoroutineHandle ()> func)4720 Nucleus::CoroutineHandle Nucleus::invokeCoroutineBegin(Routine &routine, std::function<Nucleus::CoroutineHandle()> func)
4721 {
4722 	const bool isCoroutine = routine.getEntry(Nucleus::CoroutineEntryAwait) != reinterpret_cast<const void *>(&coroutineEntryAwaitStub);
4723 
4724 	if(isCoroutine)
4725 	{
4726 		return rr::invokeCoroutineBegin(func);
4727 	}
4728 	else
4729 	{
4730 		// For regular routines, just invoke the begin func directly
4731 		return func();
4732 	}
4733 }
4734 
Int(RValue<scalar::Int> rhs)4735 SIMD::Int::Int(RValue<scalar::Int> rhs)
4736     : XYZW(this)
4737 {
4738 	RR_DEBUG_INFO_UPDATE_LOC();
4739 	Value *vector = Nucleus::createBitCast(rhs.value(), SIMD::Int::type());
4740 
4741 	std::vector<int> swizzle = { 0 };
4742 	Value *replicate = Nucleus::createShuffleVector(vector, vector, swizzle);
4743 
4744 	storeValue(replicate);
4745 }
4746 
operator <<(RValue<SIMD::Int> lhs,unsigned char rhs)4747 RValue<SIMD::Int> operator<<(RValue<SIMD::Int> lhs, unsigned char rhs)
4748 {
4749 	RR_DEBUG_INFO_UPDATE_LOC();
4750 	if(emulateIntrinsics)
4751 	{
4752 		return Scalarize([rhs](auto x) { return x << rhs; }, lhs);
4753 	}
4754 	else
4755 	{
4756 		return RValue<SIMD::Int>(Nucleus::createShl(lhs.value(), V(::context->getConstantInt32(rhs))));
4757 	}
4758 }
4759 
operator >>(RValue<SIMD::Int> lhs,unsigned char rhs)4760 RValue<SIMD::Int> operator>>(RValue<SIMD::Int> lhs, unsigned char rhs)
4761 {
4762 	RR_DEBUG_INFO_UPDATE_LOC();
4763 	if(emulateIntrinsics)
4764 	{
4765 		return Scalarize([rhs](auto x) { return x >> rhs; }, lhs);
4766 	}
4767 	else
4768 	{
4769 		return RValue<SIMD::Int>(Nucleus::createAShr(lhs.value(), V(::context->getConstantInt32(rhs))));
4770 	}
4771 }
4772 
// Per-lane equality compare: x == y.
RValue<SIMD::Int> CmpEQ(RValue<SIMD::Int> x, RValue<SIMD::Int> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return RValue<SIMD::Int>(Nucleus::createICmpEQ(x.value(), y.value()));
}
4778 
// Per-lane signed compare: x < y.
RValue<SIMD::Int> CmpLT(RValue<SIMD::Int> x, RValue<SIMD::Int> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return RValue<SIMD::Int>(Nucleus::createICmpSLT(x.value(), y.value()));
}
4784 
// Per-lane signed compare: x <= y.
RValue<SIMD::Int> CmpLE(RValue<SIMD::Int> x, RValue<SIMD::Int> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return RValue<SIMD::Int>(Nucleus::createICmpSLE(x.value(), y.value()));
}
4790 
// Per-lane inequality compare: x != y.
RValue<SIMD::Int> CmpNEQ(RValue<SIMD::Int> x, RValue<SIMD::Int> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return RValue<SIMD::Int>(Nucleus::createICmpNE(x.value(), y.value()));
}
4796 
// Per-lane !(x < y), implemented as signed x >= y.
RValue<SIMD::Int> CmpNLT(RValue<SIMD::Int> x, RValue<SIMD::Int> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return RValue<SIMD::Int>(Nucleus::createICmpSGE(x.value(), y.value()));
}
4802 
// Per-lane !(x <= y), implemented as signed x > y.
RValue<SIMD::Int> CmpNLE(RValue<SIMD::Int> x, RValue<SIMD::Int> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return RValue<SIMD::Int>(Nucleus::createICmpSGT(x.value(), y.value()));
}
4808 
Abs(RValue<SIMD::Int> x)4809 RValue<SIMD::Int> Abs(RValue<SIMD::Int> x)
4810 {
4811 	// TODO: Optimize.
4812 	auto negative = x >> 31;
4813 	return (x ^ negative) - negative;
4814 }
4815 
// Per-lane signed maximum, emitted as a compare followed by a select.
RValue<SIMD::Int> Max(RValue<SIMD::Int> x, RValue<SIMD::Int> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	// condition = (x <= y), per lane
	Ice::Variable *condition = ::function->makeVariable(Ice::IceType_v4i1);
	auto cmp = Ice::InstIcmp::create(::function, Ice::InstIcmp::Sle, condition, x.value(), y.value());
	::basicBlock->appendInst(cmp);

	// result = condition ? y : x
	Ice::Variable *result = ::function->makeVariable(Ice::IceType_v4i32);
	auto select = Ice::InstSelect::create(::function, result, condition, y.value(), x.value());
	::basicBlock->appendInst(select);

	return RValue<SIMD::Int>(V(result));
}
4829 
// Per-lane signed minimum, emitted as a compare followed by a select.
RValue<SIMD::Int> Min(RValue<SIMD::Int> x, RValue<SIMD::Int> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	// condition = (x > y), per lane
	Ice::Variable *condition = ::function->makeVariable(Ice::IceType_v4i1);
	auto cmp = Ice::InstIcmp::create(::function, Ice::InstIcmp::Sgt, condition, x.value(), y.value());
	::basicBlock->appendInst(cmp);

	// result = condition ? y : x
	Ice::Variable *result = ::function->makeVariable(Ice::IceType_v4i32);
	auto select = Ice::InstSelect::create(::function, result, condition, y.value(), x.value());
	::basicBlock->appendInst(select);

	return RValue<SIMD::Int>(V(result));
}
4843 
// Converts each float lane to the nearest integer, via the nearbyint
// intrinsic where available (rounds per the current FP rounding mode).
RValue<SIMD::Int> RoundInt(RValue<SIMD::Float> cast)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	if(emulateIntrinsics || CPUID::ARM)
	{
		// Push the fractional part off the mantissa. Accurate up to +/-2^22.
		return SIMD::Int((cast + SIMD::Float(0x00C00000)) - SIMD::Float(0x00C00000));
	}
	else
	{
		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v4i32);
		const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::Nearbyint, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
		auto nearbyint = Ice::InstIntrinsic::create(::function, 1, result, intrinsic);
		nearbyint->addArg(cast.value());
		::basicBlock->appendInst(nearbyint);

		return RValue<SIMD::Int>(V(result));
	}
}
4863 
// Like RoundInt(), but clamps the input so that very large positive floats
// don't wrap to a negative integer result.
RValue<SIMD::Int> RoundIntClamped(RValue<SIMD::Float> cast)
{
	RR_DEBUG_INFO_UPDATE_LOC();

	// cvtps2dq produces 0x80000000, a negative value, for input larger than
	// 2147483520.0, so clamp to 2147483520. Values less than -2147483520.0
	// saturate to 0x80000000.
	RValue<SIMD::Float> clamped = Min(cast, SIMD::Float(0x7FFFFF80));

	if(emulateIntrinsics || CPUID::ARM)
	{
		// Push the fractional part off the mantissa. Accurate up to +/-2^22.
		return SIMD::Int((clamped + SIMD::Float(0x00C00000)) - SIMD::Float(0x00C00000));
	}
	else
	{
		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v4i32);
		const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::Nearbyint, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
		auto nearbyint = Ice::InstIntrinsic::create(::function, 1, result, intrinsic);
		nearbyint->addArg(clamped.value());
		::basicBlock->appendInst(nearbyint);

		return RValue<SIMD::Int>(V(result));
	}
}
4889 
// Extracts 128-bit slice i of a SIMD vector. With SIMD::Width == 4 the whole
// vector is exactly one 128-bit slice, so only i == 0 is valid.
RValue<Int4> Extract128(RValue<SIMD::Int> val, int i)
{
	ASSERT(SIMD::Width == 4);
	ASSERT(i == 0);

	return As<Int4>(val);
}
4897 
// Replaces 128-bit slice i of 'val' with 'element'. With SIMD::Width == 4 the
// slice covers the whole vector, so the result is just 'element' and 'val'
// is intentionally unused.
RValue<SIMD::Int> Insert128(RValue<SIMD::Int> val, RValue<Int4> element, int i)
{
	ASSERT(SIMD::Width == 4);
	ASSERT(i == 0);

	return As<SIMD::Int>(element);
}
4905 
// The Subzero type backing SIMD::Int: a 4-lane 32-bit integer vector.
Type *SIMD::Int::type()
{
	return T(Ice::IceType_v4i32);
}
4910 
// Per-lane float-to-unsigned conversion. Values >= 2^31 don't survive a signed
// conversion, so they are biased down by 2^31 before converting and the bias
// is re-added afterwards; negative inputs clamp to 0.
SIMD::UInt::UInt(RValue<SIMD::Float> cast)
    : XYZW(this)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	// Smallest positive value representable in UInt, but not in Int
	const unsigned int ustart = 0x80000000u;
	const float ustartf = float(ustart);

	// Check if the value can be represented as an Int
	SIMD::Int uiValue = CmpNLT(cast, SIMD::Float(ustartf));
	// If the value is too large, subtract ustart and re-add it after conversion.
	uiValue = (uiValue & As<SIMD::Int>(As<SIMD::UInt>(SIMD::Int(cast - SIMD::Float(ustartf))) + SIMD::UInt(ustart))) |
	          // Otherwise, just convert normally
	          (~uiValue & SIMD::Int(cast));
	// If the value is negative, store 0, otherwise store the result of the conversion
	storeValue((~(As<SIMD::Int>(cast) >> 31) & uiValue).value());
}
4928 
UInt(RValue<scalar::UInt> rhs)4929 SIMD::UInt::UInt(RValue<scalar::UInt> rhs)
4930     : XYZW(this)
4931 {
4932 	RR_DEBUG_INFO_UPDATE_LOC();
4933 	Value *vector = Nucleus::createBitCast(rhs.value(), SIMD::UInt::type());
4934 
4935 	std::vector<int> swizzle = { 0 };
4936 	Value *replicate = Nucleus::createShuffleVector(vector, vector, swizzle);
4937 
4938 	storeValue(replicate);
4939 }
4940 
operator <<(RValue<SIMD::UInt> lhs,unsigned char rhs)4941 RValue<SIMD::UInt> operator<<(RValue<SIMD::UInt> lhs, unsigned char rhs)
4942 {
4943 	RR_DEBUG_INFO_UPDATE_LOC();
4944 	if(emulateIntrinsics)
4945 	{
4946 		return Scalarize([rhs](auto x) { return x << rhs; }, lhs);
4947 	}
4948 	else
4949 	{
4950 		return RValue<SIMD::UInt>(Nucleus::createShl(lhs.value(), V(::context->getConstantInt32(rhs))));
4951 	}
4952 }
4953 
operator >>(RValue<SIMD::UInt> lhs,unsigned char rhs)4954 RValue<SIMD::UInt> operator>>(RValue<SIMD::UInt> lhs, unsigned char rhs)
4955 {
4956 	RR_DEBUG_INFO_UPDATE_LOC();
4957 	if(emulateIntrinsics)
4958 	{
4959 		return Scalarize([rhs](auto x) { return x >> rhs; }, lhs);
4960 	}
4961 	else
4962 	{
4963 		return RValue<SIMD::UInt>(Nucleus::createLShr(lhs.value(), V(::context->getConstantInt32(rhs))));
4964 	}
4965 }
4966 
// Per-lane equality compare: x == y.
RValue<SIMD::UInt> CmpEQ(RValue<SIMD::UInt> x, RValue<SIMD::UInt> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return RValue<SIMD::UInt>(Nucleus::createICmpEQ(x.value(), y.value()));
}
4972 
// Per-lane unsigned compare: x < y.
RValue<SIMD::UInt> CmpLT(RValue<SIMD::UInt> x, RValue<SIMD::UInt> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return RValue<SIMD::UInt>(Nucleus::createICmpULT(x.value(), y.value()));
}
4978 
// Per-lane unsigned compare: x <= y.
RValue<SIMD::UInt> CmpLE(RValue<SIMD::UInt> x, RValue<SIMD::UInt> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return RValue<SIMD::UInt>(Nucleus::createICmpULE(x.value(), y.value()));
}
4984 
// Per-lane inequality compare: x != y.
RValue<SIMD::UInt> CmpNEQ(RValue<SIMD::UInt> x, RValue<SIMD::UInt> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return RValue<SIMD::UInt>(Nucleus::createICmpNE(x.value(), y.value()));
}
4990 
// Per-lane !(x < y), implemented as unsigned x >= y.
RValue<SIMD::UInt> CmpNLT(RValue<SIMD::UInt> x, RValue<SIMD::UInt> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return RValue<SIMD::UInt>(Nucleus::createICmpUGE(x.value(), y.value()));
}
4996 
// Per-lane !(x <= y), implemented as unsigned x > y.
RValue<SIMD::UInt> CmpNLE(RValue<SIMD::UInt> x, RValue<SIMD::UInt> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return RValue<SIMD::UInt>(Nucleus::createICmpUGT(x.value(), y.value()));
}
5002 
// Per-lane unsigned maximum, emitted as a compare followed by a select.
RValue<SIMD::UInt> Max(RValue<SIMD::UInt> x, RValue<SIMD::UInt> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	// condition = (x <= y), per lane, unsigned
	Ice::Variable *condition = ::function->makeVariable(Ice::IceType_v4i1);
	auto cmp = Ice::InstIcmp::create(::function, Ice::InstIcmp::Ule, condition, x.value(), y.value());
	::basicBlock->appendInst(cmp);

	// result = condition ? y : x
	Ice::Variable *result = ::function->makeVariable(Ice::IceType_v4i32);
	auto select = Ice::InstSelect::create(::function, result, condition, y.value(), x.value());
	::basicBlock->appendInst(select);

	return RValue<SIMD::UInt>(V(result));
}
5016 
// Per-lane unsigned minimum, emitted as a compare followed by a select.
RValue<SIMD::UInt> Min(RValue<SIMD::UInt> x, RValue<SIMD::UInt> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	// condition = (x > y), per lane, unsigned
	Ice::Variable *condition = ::function->makeVariable(Ice::IceType_v4i1);
	auto cmp = Ice::InstIcmp::create(::function, Ice::InstIcmp::Ugt, condition, x.value(), y.value());
	::basicBlock->appendInst(cmp);

	// result = condition ? y : x
	Ice::Variable *result = ::function->makeVariable(Ice::IceType_v4i32);
	auto select = Ice::InstSelect::create(::function, result, condition, y.value(), x.value());
	::basicBlock->appendInst(select);

	return RValue<SIMD::UInt>(V(result));
}
5030 
// Extracts 128-bit slice i of a SIMD vector. With SIMD::Width == 4 the whole
// vector is exactly one 128-bit slice, so only i == 0 is valid.
RValue<UInt4> Extract128(RValue<SIMD::UInt> val, int i)
{
	ASSERT(SIMD::Width == 4);
	ASSERT(i == 0);

	return As<UInt4>(val);
}
5038 
// Replaces 128-bit slice i of 'val' with 'element'. With SIMD::Width == 4 the
// slice covers the whole vector, so the result is just 'element' and 'val'
// is intentionally unused.
RValue<SIMD::UInt> Insert128(RValue<SIMD::UInt> val, RValue<UInt4> element, int i)
{
	ASSERT(SIMD::Width == 4);
	ASSERT(i == 0);

	return As<SIMD::UInt>(element);
}
5046 
// The Subzero type backing SIMD::UInt: a 4-lane 32-bit integer vector
// (signedness is carried by the operations, not the storage type).
Type *SIMD::UInt::type()
{
	return T(Ice::IceType_v4i32);
}
5051 
Float(RValue<scalar::Float> rhs)5052 SIMD::Float::Float(RValue<scalar::Float> rhs)
5053     : XYZW(this)
5054 {
5055 	RR_DEBUG_INFO_UPDATE_LOC();
5056 	Value *vector = Nucleus::createBitCast(rhs.value(), SIMD::Float::type());
5057 
5058 	std::vector<int> swizzle = { 0 };
5059 	Value *replicate = Nucleus::createShuffleVector(vector, vector, swizzle);
5060 
5061 	storeValue(replicate);
5062 }
5063 
// Per-lane floating-point remainder, computed lane-by-lane via fmodf.
RValue<SIMD::Float> operator%(RValue<SIMD::Float> lhs, RValue<SIMD::Float> rhs)
{
	return ScalarizeCall(fmodf, lhs, rhs);
}
5068 
// x * y + z per lane, computed as a separate multiply and add (the
// intermediate product is rounded, unlike FMA below).
RValue<SIMD::Float> MulAdd(RValue<SIMD::Float> x, RValue<SIMD::Float> y, RValue<SIMD::Float> z)
{
	// TODO(b/214591655): Use FMA when available.
	return x * y + z;
}
5074 
// Fused multiply-add per lane: x * y + z with a single rounding, computed
// lane-by-lane via the C library's fmaf.
RValue<SIMD::Float> FMA(RValue<SIMD::Float> x, RValue<SIMD::Float> y, RValue<SIMD::Float> z)
{
	// TODO(b/214591655): Use FMA instructions when available.
	return ScalarizeCall(fmaf, x, y, z);
}
5080 
Abs(RValue<SIMD::Float> x)5081 RValue<SIMD::Float> Abs(RValue<SIMD::Float> x)
5082 {
5083 	// TODO: Optimize.
5084 	Value *vector = Nucleus::createBitCast(x.value(), SIMD::Int::type());
5085 	std::vector<int64_t> constantVector = { 0x7FFFFFFF };
5086 	Value *result = Nucleus::createAnd(vector, Nucleus::createConstantVector(constantVector, SIMD::Int::type()));
5087 
5088 	return As<SIMD::Float>(result);
5089 }
5090 
// Per-lane floating-point maximum via ordered compare + select.
// If a lane compare is false (including when either input is NaN), y wins.
RValue<SIMD::Float> Max(RValue<SIMD::Float> x, RValue<SIMD::Float> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	// condition = (x > y), ordered, per lane
	Ice::Variable *condition = ::function->makeVariable(Ice::IceType_v4i1);
	auto cmp = Ice::InstFcmp::create(::function, Ice::InstFcmp::Ogt, condition, x.value(), y.value());
	::basicBlock->appendInst(cmp);

	// result = condition ? x : y
	Ice::Variable *result = ::function->makeVariable(Ice::IceType_v4f32);
	auto select = Ice::InstSelect::create(::function, result, condition, x.value(), y.value());
	::basicBlock->appendInst(select);

	return RValue<SIMD::Float>(V(result));
}
5104 
// Per-lane floating-point minimum via ordered compare + select.
// If a lane compare is false (including when either input is NaN), y wins.
RValue<SIMD::Float> Min(RValue<SIMD::Float> x, RValue<SIMD::Float> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	// condition = (x < y), ordered, per lane
	Ice::Variable *condition = ::function->makeVariable(Ice::IceType_v4i1);
	auto cmp = Ice::InstFcmp::create(::function, Ice::InstFcmp::Olt, condition, x.value(), y.value());
	::basicBlock->appendInst(cmp);

	// result = condition ? x : y
	Ice::Variable *result = ::function->makeVariable(Ice::IceType_v4f32);
	auto select = Ice::InstSelect::create(::function, result, condition, x.value(), y.value());
	::basicBlock->appendInst(select);

	return RValue<SIMD::Float>(V(result));
}
5118 
// Per-lane square root, via the Sqrt intrinsic where available.
RValue<SIMD::Float> Sqrt(RValue<SIMD::Float> x)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	if(emulateIntrinsics || CPUID::ARM)
	{
		// Fall back to the scalar Sqrt on each lane.
		return Scalarize([](auto a) { return Sqrt(a); }, x);
	}
	else
	{
		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v4f32);
		const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::Sqrt, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
		auto sqrt = Ice::InstIntrinsic::create(::function, 1, result, intrinsic);
		sqrt->addArg(x.value());
		::basicBlock->appendInst(sqrt);

		return RValue<SIMD::Float>(V(result));
	}
}
5137 
// Per-lane ordered compare: x == y (false if either operand is NaN).
RValue<SIMD::Int> CmpEQ(RValue<SIMD::Float> x, RValue<SIMD::Float> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return RValue<SIMD::Int>(Nucleus::createFCmpOEQ(x.value(), y.value()));
}
5143 
// Per-lane ordered compare: x < y (false if either operand is NaN).
RValue<SIMD::Int> CmpLT(RValue<SIMD::Float> x, RValue<SIMD::Float> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return RValue<SIMD::Int>(Nucleus::createFCmpOLT(x.value(), y.value()));
}
5149 
// Per-lane ordered compare: x <= y (false if either operand is NaN).
RValue<SIMD::Int> CmpLE(RValue<SIMD::Float> x, RValue<SIMD::Float> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return RValue<SIMD::Int>(Nucleus::createFCmpOLE(x.value(), y.value()));
}
5155 
// Per-lane ordered compare: x != y (false if either operand is NaN).
RValue<SIMD::Int> CmpNEQ(RValue<SIMD::Float> x, RValue<SIMD::Float> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return RValue<SIMD::Int>(Nucleus::createFCmpONE(x.value(), y.value()));
}
5161 
// Per-lane !(x < y) as an ordered compare: x >= y (false if either is NaN).
RValue<SIMD::Int> CmpNLT(RValue<SIMD::Float> x, RValue<SIMD::Float> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return RValue<SIMD::Int>(Nucleus::createFCmpOGE(x.value(), y.value()));
}
5167 
// Per-lane !(x <= y) as an ordered compare: x > y (false if either is NaN).
RValue<SIMD::Int> CmpNLE(RValue<SIMD::Float> x, RValue<SIMD::Float> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return RValue<SIMD::Int>(Nucleus::createFCmpOGT(x.value(), y.value()));
}
5173 
// Per-lane unordered compare: x == y (true if either operand is NaN).
RValue<SIMD::Int> CmpUEQ(RValue<SIMD::Float> x, RValue<SIMD::Float> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return RValue<SIMD::Int>(Nucleus::createFCmpUEQ(x.value(), y.value()));
}
5179 
// Per-lane unordered compare: x < y (true if either operand is NaN).
RValue<SIMD::Int> CmpULT(RValue<SIMD::Float> x, RValue<SIMD::Float> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return RValue<SIMD::Int>(Nucleus::createFCmpULT(x.value(), y.value()));
}
5185 
// Per-lane unordered compare: x <= y (true if either operand is NaN).
RValue<SIMD::Int> CmpULE(RValue<SIMD::Float> x, RValue<SIMD::Float> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return RValue<SIMD::Int>(Nucleus::createFCmpULE(x.value(), y.value()));
}
5191 
// Per-lane unordered compare: x != y (true if either operand is NaN).
RValue<SIMD::Int> CmpUNEQ(RValue<SIMD::Float> x, RValue<SIMD::Float> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return RValue<SIMD::Int>(Nucleus::createFCmpUNE(x.value(), y.value()));
}
5197 
// Per-lane unordered !(x < y): x >= y (true if either operand is NaN).
RValue<SIMD::Int> CmpUNLT(RValue<SIMD::Float> x, RValue<SIMD::Float> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return RValue<SIMD::Int>(Nucleus::createFCmpUGE(x.value(), y.value()));
}
5203 
CmpUNLE(RValue<SIMD::Float> x,RValue<SIMD::Float> y)5204 RValue<SIMD::Int> CmpUNLE(RValue<SIMD::Float> x, RValue<SIMD::Float> y)
5205 {
5206 	RR_DEBUG_INFO_UPDATE_LOC();
5207 	return RValue<SIMD::Int>(Nucleus::createFCmpUGT(x.value(), y.value()));
5208 }
5209 
Round(RValue<SIMD::Float> x)5210 RValue<SIMD::Float> Round(RValue<SIMD::Float> x)
5211 {
5212 	RR_DEBUG_INFO_UPDATE_LOC();
5213 	if(emulateIntrinsics || CPUID::ARM)
5214 	{
5215 		// Push the fractional part off the mantissa. Accurate up to +/-2^22.
5216 		return (x + SIMD::Float(0x00C00000)) - SIMD::Float(0x00C00000);
5217 	}
5218 	else if(CPUID::SSE4_1)
5219 	{
5220 		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v4f32);
5221 		const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::Round, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
5222 		auto round = Ice::InstIntrinsic::create(::function, 2, result, intrinsic);
5223 		round->addArg(x.value());
5224 		round->addArg(::context->getConstantInt32(0));
5225 		::basicBlock->appendInst(round);
5226 
5227 		return RValue<SIMD::Float>(V(result));
5228 	}
5229 	else
5230 	{
5231 		return SIMD::Float(RoundInt(x));
5232 	}
5233 }
5234 
Trunc(RValue<SIMD::Float> x)5235 RValue<SIMD::Float> Trunc(RValue<SIMD::Float> x)
5236 {
5237 	RR_DEBUG_INFO_UPDATE_LOC();
5238 	if(CPUID::SSE4_1)
5239 	{
5240 		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v4f32);
5241 		const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::Round, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
5242 		auto round = Ice::InstIntrinsic::create(::function, 2, result, intrinsic);
5243 		round->addArg(x.value());
5244 		round->addArg(::context->getConstantInt32(3));
5245 		::basicBlock->appendInst(round);
5246 
5247 		return RValue<SIMD::Float>(V(result));
5248 	}
5249 	else
5250 	{
5251 		return SIMD::Float(SIMD::Int(x));
5252 	}
5253 }
5254 
Frac(RValue<SIMD::Float> x)5255 RValue<SIMD::Float> Frac(RValue<SIMD::Float> x)
5256 {
5257 	RR_DEBUG_INFO_UPDATE_LOC();
5258 	SIMD::Float frc;
5259 
5260 	if(CPUID::SSE4_1)
5261 	{
5262 		frc = x - Floor(x);
5263 	}
5264 	else
5265 	{
5266 		frc = x - SIMD::Float(SIMD::Int(x));  // Signed fractional part.
5267 
5268 		frc += As<SIMD::Float>(As<SIMD::Int>(CmpNLE(SIMD::Float(0.0f), frc)) & As<SIMD::Int>(SIMD::Float(1.0f)));  // Add 1.0 if negative.
5269 	}
5270 
5271 	// x - floor(x) can be 1.0 for very small negative x.
5272 	// Clamp against the value just below 1.0.
5273 	return Min(frc, As<SIMD::Float>(SIMD::Int(0x3F7FFFFF)));
5274 }
5275 
Floor(RValue<SIMD::Float> x)5276 RValue<SIMD::Float> Floor(RValue<SIMD::Float> x)
5277 {
5278 	RR_DEBUG_INFO_UPDATE_LOC();
5279 	if(CPUID::SSE4_1)
5280 	{
5281 		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v4f32);
5282 		const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::Round, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
5283 		auto round = Ice::InstIntrinsic::create(::function, 2, result, intrinsic);
5284 		round->addArg(x.value());
5285 		round->addArg(::context->getConstantInt32(1));
5286 		::basicBlock->appendInst(round);
5287 
5288 		return RValue<SIMD::Float>(V(result));
5289 	}
5290 	else
5291 	{
5292 		return x - Frac(x);
5293 	}
5294 }
5295 
Ceil(RValue<SIMD::Float> x)5296 RValue<SIMD::Float> Ceil(RValue<SIMD::Float> x)
5297 {
5298 	RR_DEBUG_INFO_UPDATE_LOC();
5299 	if(CPUID::SSE4_1)
5300 	{
5301 		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v4f32);
5302 		const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::Round, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
5303 		auto round = Ice::InstIntrinsic::create(::function, 2, result, intrinsic);
5304 		round->addArg(x.value());
5305 		round->addArg(::context->getConstantInt32(2));
5306 		::basicBlock->appendInst(round);
5307 
5308 		return RValue<SIMD::Float>(V(result));
5309 	}
5310 	else
5311 	{
5312 		return -Floor(-x);
5313 	}
5314 }
5315 
// Extracts the i'th 128-bit (4-lane) chunk of a SIMD float vector.
// This backend only supports SIMD::Width == 4, so the vector is exactly
// one 128-bit chunk and the only valid index is 0; the "extraction" is a
// bit-preserving reinterpretation of the whole vector.
RValue<Float4> Extract128(RValue<SIMD::Float> val, int i)
{
	ASSERT(SIMD::Width == 4);
	ASSERT(i == 0);

	return As<Float4>(val);
}
5323 
// Replaces the i'th 128-bit (4-lane) chunk of a SIMD float vector with `element`.
// This backend only supports SIMD::Width == 4, so the only valid index is 0 and
// the result is simply `element` reinterpreted as the full vector; `val` is
// intentionally unused because it is replaced wholesale.
RValue<SIMD::Float> Insert128(RValue<SIMD::Float> val, RValue<Float4> element, int i)
{
	ASSERT(SIMD::Width == 4);
	ASSERT(i == 0);

	return As<SIMD::Float>(element);
}
5331 
// Returns the Subzero IR type backing SIMD::Float: a 4-lane f32 vector.
Type *SIMD::Float::type()
{
	return T(Ice::IceType_v4f32);
}
5336 
5337 }  // namespace rr
5338