/*-------------------------------------------------------------------------
 * drawElements Quality Program OpenGL ES 3.1 Module
 * -------------------------------------------------
 *
 * Copyright 2014 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *//*!
 * \file
 * \brief Basic Compute Shader Tests.
 *//*--------------------------------------------------------------------*/

#include "es31fBasicComputeShaderTests.hpp"
#include "gluShaderProgram.hpp"
#include "gluObjectWrapper.hpp"
#include "gluRenderContext.hpp"
#include "gluProgramInterfaceQuery.hpp"
#include "gluContextInfo.hpp"
#include "glwFunctions.hpp"
#include "glwEnums.hpp"
#include "tcuTestLog.hpp"
#include "deRandom.hpp"
#include "deStringUtil.hpp"
#include "deMemory.h"

namespace deqp
{
namespace gles31
{
namespace Functional
{

using std::string;
using std::vector;
using tcu::TestLog;
using namespace glu;

//! Scoped buffer mapping helper: maps a range of the buffer bound to a target when constructed
//! and unmaps that target again when destroyed.
class BufferMemMap
{
public:
    //! Calls glMapBufferRange() on \p target with the given \p offset, \p size and \p access
    //! flags. Both the GL error state and the returned pointer are checked afterwards.
    BufferMemMap(const glw::Functions &gl, uint32_t target, int offset, int size, uint32_t access)
        : m_gl(gl)
        , m_target(target)
        , m_ptr(gl.mapBufferRange(target, offset, size, access))
    {
        GLU_EXPECT_NO_ERROR(gl.getError(), "glMapBufferRange()");
        TCU_CHECK(m_ptr);
    }

    //! Unmaps the target that was mapped in the constructor.
    ~BufferMemMap(void)
    {
        m_gl.unmapBuffer(m_target);
    }

    //! Returns the pointer obtained from glMapBufferRange().
    void *getPtr(void) const
    {
        return m_ptr;
    }

    //! Shorthand for getPtr().
    void *operator*(void) const
    {
        return getPtr();
    }

private:
    // Copying is disabled: declared private and left without definitions.
    BufferMemMap(const BufferMemMap &other);
    BufferMemMap &operator=(const BufferMemMap &other);

    const glw::Functions &m_gl; //!< GL entry points used for mapping and unmapping.
    const uint32_t m_target;    //!< Buffer binding target passed to map/unmap.
    void *m_ptr;                //!< Mapped pointer returned by glMapBufferRange().
};

namespace
{

//! Builds a compute program whose main() is empty and dispatches a single work group with it.
class EmptyComputeShaderCase : public TestCase
{
public:
    EmptyComputeShaderCase(Context &context) : TestCase(context, "empty", "Empty shader")
    {
    }

    //! Compiles and logs the program, fails if the build did not succeed, then issues a 1x1x1
    //! dispatch and checks the GL error state before reporting a pass.
    IterateResult iterate(void)
    {
        glu::RenderContext &renderCtx = m_context.getRenderContext();
        const glw::Functions &gl      = renderCtx.getFunctions();

        // Source text: version declaration followed by a one-invocation, no-op kernel.
        std::string shaderSource(getGLSLVersionDeclaration(glu::getContextTypeGLSLVersion(renderCtx.getType())));
        shaderSource += "\n";
        shaderSource += "layout (local_size_x = 1) in;\n";
        shaderSource += "void main (void) {}\n";

        const ShaderProgram program(renderCtx, ProgramSources() << ShaderSource(SHADERTYPE_COMPUTE, shaderSource));

        m_testCtx.getLog() << program;
        if (!program.isOk())
        {
            TCU_FAIL("Compile failed");
        }

        gl.useProgram(program.getProgram());
        gl.dispatchCompute(1, 1, 1);
        GLU_EXPECT_NO_ERROR(gl.getError(), "glDispatchCompute()");

        m_testCtx.setTestResult(QP_TEST_RESULT_PASS, "Pass");
        return STOP;
    }
};

//! Reads uints from a uniform block, writes their bitwise complement into a shader storage
//! block, and verifies the output on the host.
class UBOToSSBOInvertCase : public TestCase
{
public:
    //! \param numValues Number of uint elements in both the input and the output array.
    //! \param localSize Work group size written into the shader's local_size_x/y/z layout.
    //! \param workSize  Work group counts passed to glDispatchCompute().
    //!
    //! numValues is asserted to be divisible by the total invocation count so that every
    //! invocation processes the same number of elements.
    UBOToSSBOInvertCase(Context &context, const char *name, const char *description, int numValues,
                        const tcu::IVec3 &localSize, const tcu::IVec3 &workSize)
        : TestCase(context, name, description)
        , m_numValues(numValues)
        , m_localSize(localSize)
        , m_workSize(workSize)
    {
        DE_ASSERT(m_numValues % (m_workSize[0] * m_workSize[1] * m_workSize[2] * m_localSize[0] * m_localSize[1] *
                                 m_localSize[2]) ==
                  0);
    }

    //! Builds the shader, uploads the input values, dispatches the workload and compares every
    //! output element against the complement of the corresponding input.
    IterateResult iterate(void)
    {
        const GLSLVersion glslVersion = glu::getContextTypeGLSLVersion(m_context.getRenderContext().getType());
        std::ostringstream src;

        // Each invocation inverts a contiguous slice of numValuesPerInv elements, selected by
        // its flattened global invocation index.
        src << getGLSLVersionDeclaration(glslVersion) << "\n"
            << "layout (local_size_x = " << m_localSize[0] << ", local_size_y = " << m_localSize[1]
            << ", local_size_z = " << m_localSize[2] << ") in;\n"
            << "uniform Input {\n"
            << "    uint values[" << m_numValues << "];\n"
            << "} ub_in;\n"
            << "layout(binding = 1) buffer Output {\n"
            << "    uint values[" << m_numValues << "];\n"
            << "} sb_out;\n"
            << "void main (void) {\n"
            << "    uvec3 size           = gl_NumWorkGroups * gl_WorkGroupSize;\n"
            << "    uint numValuesPerInv = uint(ub_in.values.length()) / (size.x*size.y*size.z);\n"
            << "    uint groupNdx        = size.x*size.y*gl_GlobalInvocationID.z + size.x*gl_GlobalInvocationID.y + "
               "gl_GlobalInvocationID.x;\n"
            << "    uint offset          = numValuesPerInv*groupNdx;\n"
            << "\n"
            << "    for (uint ndx = 0u; ndx < numValuesPerInv; ndx++)\n"
            << "        sb_out.values[offset + ndx] = ~ub_in.values[offset + ndx];\n"
            << "}\n";

        const glw::Functions &gl = m_context.getRenderContext().getFunctions();
        const ShaderProgram program(m_context.getRenderContext(), ProgramSources()
                                                                      << ShaderSource(SHADERTYPE_COMPUTE, src.str()));
        const Buffer inputBuffer(m_context.getRenderContext());
        const Buffer outputBuffer(m_context.getRenderContext());
        std::vector<uint32_t> inputValues(m_numValues);

        // Compute input values (de::Random constructed with a fixed constant).
        {
            de::Random rnd(0x111223f);
            for (int ndx = 0; ndx < (int)inputValues.size(); ndx++)
                inputValues[ndx] = rnd.getUint32();
        }

        m_testCtx.getLog() << program;
        if (!program.isOk())
            TCU_FAIL("Compile failed");

        m_testCtx.getLog() << TestLog::Message << "Work groups: " << m_workSize << TestLog::EndMessage;

        gl.useProgram(program.getProgram());

        // Input buffer setup
        {
            const uint32_t blockIndex = gl.getProgramResourceIndex(program.getProgram(), GL_UNIFORM_BLOCK, "Input");
            const InterfaceBlockInfo blockInfo =
                getProgramInterfaceBlockInfo(gl, program.getProgram(), GL_UNIFORM_BLOCK, blockIndex);
            const uint32_t valueIndex = gl.getProgramResourceIndex(program.getProgram(), GL_UNIFORM, "Input.values");
            const InterfaceVariableInfo valueInfo =
                getProgramInterfaceVariableInfo(gl, program.getProgram(), GL_UNIFORM, valueIndex);

            gl.bindBuffer(GL_UNIFORM_BUFFER, *inputBuffer);
            gl.bufferData(GL_UNIFORM_BUFFER, (glw::GLsizeiptr)blockInfo.dataSize, DE_NULL, GL_STATIC_DRAW);

            {
                const BufferMemMap bufMap(gl, GL_UNIFORM_BUFFER, 0, (int)blockInfo.dataSize, GL_MAP_WRITE_BIT);

                // Never write past the array size the implementation reports for the block member.
                const uint32_t numToWrite = de::min(valueInfo.arraySize, (uint32_t)inputValues.size());

                // Elements are placed using the queried offset and array stride of Input.values.
                for (uint32_t ndx = 0; ndx < numToWrite; ndx++)
                    *(uint32_t *)((uint8_t *)bufMap.getPtr() + valueInfo.offset + ndx * valueInfo.arrayStride) =
                        inputValues[ndx];
            }

            gl.uniformBlockBinding(program.getProgram(), blockIndex, 0);
            gl.bindBufferBase(GL_UNIFORM_BUFFER, 0, *inputBuffer);
            GLU_EXPECT_NO_ERROR(gl.getError(), "Input buffer setup failed");
        }

        // Output buffer setup
        {
            const uint32_t blockIndex =
                gl.getProgramResourceIndex(program.getProgram(), GL_SHADER_STORAGE_BLOCK, "Output");
            const int blockSize = getProgramResourceInt(gl, program.getProgram(), GL_SHADER_STORAGE_BLOCK, blockIndex,
                                                        GL_BUFFER_DATA_SIZE);

            gl.bindBuffer(GL_SHADER_STORAGE_BUFFER, *outputBuffer);
            gl.bufferData(GL_SHADER_STORAGE_BUFFER, blockSize, DE_NULL, GL_STREAM_READ);
            gl.bindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, *outputBuffer);
            GLU_EXPECT_NO_ERROR(gl.getError(), "Output buffer setup failed");
        }

        // Dispatch compute workload
        gl.dispatchCompute(m_workSize[0], m_workSize[1], m_workSize[2]);
        GLU_EXPECT_NO_ERROR(gl.getError(), "glDispatchCompute()");

        // Shader writes to buffer memory are only guaranteed to be visible through memory mapped
        // with glMapBufferRange() after a GL_BUFFER_UPDATE_BARRIER_BIT barrier.
        gl.memoryBarrier(GL_BUFFER_UPDATE_BARRIER_BIT);
        GLU_EXPECT_NO_ERROR(gl.getError(), "glMemoryBarrier()");

        // Read back and compare
        {
            const uint32_t blockIndex =
                gl.getProgramResourceIndex(program.getProgram(), GL_SHADER_STORAGE_BLOCK, "Output");
            const int blockSize = getProgramResourceInt(gl, program.getProgram(), GL_SHADER_STORAGE_BLOCK, blockIndex,
                                                        GL_BUFFER_DATA_SIZE);
            const uint32_t valueIndex =
                gl.getProgramResourceIndex(program.getProgram(), GL_BUFFER_VARIABLE, "Output.values");
            const InterfaceVariableInfo valueInfo =
                getProgramInterfaceVariableInfo(gl, program.getProgram(), GL_BUFFER_VARIABLE, valueIndex);
            const BufferMemMap bufMap(gl, GL_SHADER_STORAGE_BUFFER, 0, blockSize, GL_MAP_READ_BIT);

            TCU_CHECK(valueInfo.arraySize == (uint32_t)inputValues.size());
            for (uint32_t ndx = 0; ndx < valueInfo.arraySize; ndx++)
            {
                const uint32_t res = *((const uint32_t *)((const uint8_t *)bufMap.getPtr() + valueInfo.offset +
                                                          valueInfo.arrayStride * ndx));
                const uint32_t ref = ~inputValues[ndx];

                if (res != ref)
                    throw tcu::TestError(string("Comparison failed for Output.values[") + de::toString(ndx) + "]");
            }
        }

        m_testCtx.setTestResult(QP_TEST_RESULT_PASS, "Pass");
        return STOP;
    }

private:
    const int m_numValues;        //!< Element count of the input and output arrays.
    const tcu::IVec3 m_localSize; //!< Work group size declared in the shader.
    const tcu::IVec3 m_workSize;  //!< Work group counts used for the dispatch.
};

//! Reads uints from one shader storage block, writes their bitwise complement into a second
//! shader storage block, and verifies the output on the host.
class CopyInvertSSBOCase : public TestCase
{
public:
    //! \param numValues Number of uint elements in both the input and the output array.
    //! \param localSize Work group size written into the shader's local_size_x/y/z layout.
    //! \param workSize  Work group counts passed to glDispatchCompute().
    //!
    //! numValues is asserted to be divisible by the total invocation count so that every
    //! invocation processes the same number of elements.
    CopyInvertSSBOCase(Context &context, const char *name, const char *description, int numValues,
                       const tcu::IVec3 &localSize, const tcu::IVec3 &workSize)
        : TestCase(context, name, description)
        , m_numValues(numValues)
        , m_localSize(localSize)
        , m_workSize(workSize)
    {
        DE_ASSERT(m_numValues % (m_workSize[0] * m_workSize[1] * m_workSize[2] * m_localSize[0] * m_localSize[1] *
                                 m_localSize[2]) ==
                  0);
    }

    //! Builds the shader, uploads the input values, dispatches the workload and compares every
    //! output element against the complement of the corresponding input.
    IterateResult iterate(void)
    {
        const GLSLVersion glslVersion = glu::getContextTypeGLSLVersion(m_context.getRenderContext().getType());
        std::ostringstream src;

        // Each invocation inverts a contiguous slice of numValuesPerInv elements, selected by
        // its flattened global invocation index.
        src << getGLSLVersionDeclaration(glslVersion) << "\n"
            << "layout (local_size_x = " << m_localSize[0] << ", local_size_y = " << m_localSize[1]
            << ", local_size_z = " << m_localSize[2] << ") in;\n"
            << "layout(binding = 0) buffer Input {\n"
            << "    uint values[" << m_numValues << "];\n"
            << "} sb_in;\n"
            << "layout (binding = 1) buffer Output {\n"
            << "    uint values[" << m_numValues << "];\n"
            << "} sb_out;\n"
            << "void main (void) {\n"
            << "    uvec3 size           = gl_NumWorkGroups * gl_WorkGroupSize;\n"
            << "    uint numValuesPerInv = uint(sb_in.values.length()) / (size.x*size.y*size.z);\n"
            << "    uint groupNdx        = size.x*size.y*gl_GlobalInvocationID.z + size.x*gl_GlobalInvocationID.y + "
               "gl_GlobalInvocationID.x;\n"
            << "    uint offset          = numValuesPerInv*groupNdx;\n"
            << "\n"
            << "    for (uint ndx = 0u; ndx < numValuesPerInv; ndx++)\n"
            << "        sb_out.values[offset + ndx] = ~sb_in.values[offset + ndx];\n"
            << "}\n";

        const glw::Functions &gl = m_context.getRenderContext().getFunctions();
        const ShaderProgram program(m_context.getRenderContext(), ProgramSources()
                                                                      << ShaderSource(SHADERTYPE_COMPUTE, src.str()));
        const Buffer inputBuffer(m_context.getRenderContext());
        const Buffer outputBuffer(m_context.getRenderContext());
        std::vector<uint32_t> inputValues(m_numValues);

        // Compute input values (de::Random constructed with a fixed constant).
        {
            de::Random rnd(0x124fef);
            for (int ndx = 0; ndx < (int)inputValues.size(); ndx++)
                inputValues[ndx] = rnd.getUint32();
        }

        m_testCtx.getLog() << program;
        if (!program.isOk())
            TCU_FAIL("Compile failed");

        m_testCtx.getLog() << TestLog::Message << "Work groups: " << m_workSize << TestLog::EndMessage;

        gl.useProgram(program.getProgram());

        // Input buffer setup
        {
            const uint32_t blockIndex =
                gl.getProgramResourceIndex(program.getProgram(), GL_SHADER_STORAGE_BLOCK, "Input");
            const InterfaceBlockInfo blockInfo =
                getProgramInterfaceBlockInfo(gl, program.getProgram(), GL_SHADER_STORAGE_BLOCK, blockIndex);
            const uint32_t valueIndex =
                gl.getProgramResourceIndex(program.getProgram(), GL_BUFFER_VARIABLE, "Input.values");
            const InterfaceVariableInfo valueInfo =
                getProgramInterfaceVariableInfo(gl, program.getProgram(), GL_BUFFER_VARIABLE, valueIndex);

            gl.bindBuffer(GL_SHADER_STORAGE_BUFFER, *inputBuffer);
            gl.bufferData(GL_SHADER_STORAGE_BUFFER, (glw::GLsizeiptr)blockInfo.dataSize, DE_NULL, GL_STATIC_DRAW);

            TCU_CHECK(valueInfo.arraySize == (uint32_t)inputValues.size());

            {
                const BufferMemMap bufMap(gl, GL_SHADER_STORAGE_BUFFER, 0, (int)blockInfo.dataSize, GL_MAP_WRITE_BIT);

                // Elements are placed using the queried offset and array stride of Input.values.
                for (uint32_t ndx = 0; ndx < (uint32_t)inputValues.size(); ndx++)
                    *(uint32_t *)((uint8_t *)bufMap.getPtr() + valueInfo.offset + ndx * valueInfo.arrayStride) =
                        inputValues[ndx];
            }

            // Bind to whatever binding point the program reports for the block.
            gl.bindBufferBase(GL_SHADER_STORAGE_BUFFER, blockInfo.bufferBinding, *inputBuffer);
            GLU_EXPECT_NO_ERROR(gl.getError(), "Input buffer setup failed");
        }

        // Output buffer setup
        {
            const uint32_t blockIndex =
                gl.getProgramResourceIndex(program.getProgram(), GL_SHADER_STORAGE_BLOCK, "Output");
            const InterfaceBlockInfo blockInfo =
                getProgramInterfaceBlockInfo(gl, program.getProgram(), GL_SHADER_STORAGE_BLOCK, blockIndex);

            gl.bindBuffer(GL_SHADER_STORAGE_BUFFER, *outputBuffer);
            gl.bufferData(GL_SHADER_STORAGE_BUFFER, blockInfo.dataSize, DE_NULL, GL_STREAM_READ);
            gl.bindBufferBase(GL_SHADER_STORAGE_BUFFER, blockInfo.bufferBinding, *outputBuffer);
            GLU_EXPECT_NO_ERROR(gl.getError(), "Output buffer setup failed");
        }

        // Dispatch compute workload
        gl.dispatchCompute(m_workSize[0], m_workSize[1], m_workSize[2]);
        GLU_EXPECT_NO_ERROR(gl.getError(), "glDispatchCompute()");

        // Shader writes to buffer memory are only guaranteed to be visible through memory mapped
        // with glMapBufferRange() after a GL_BUFFER_UPDATE_BARRIER_BIT barrier.
        gl.memoryBarrier(GL_BUFFER_UPDATE_BARRIER_BIT);
        GLU_EXPECT_NO_ERROR(gl.getError(), "glMemoryBarrier()");

        // Read back and compare
        {
            const uint32_t blockIndex =
                gl.getProgramResourceIndex(program.getProgram(), GL_SHADER_STORAGE_BLOCK, "Output");
            const int blockSize = getProgramResourceInt(gl, program.getProgram(), GL_SHADER_STORAGE_BLOCK, blockIndex,
                                                        GL_BUFFER_DATA_SIZE);
            const uint32_t valueIndex =
                gl.getProgramResourceIndex(program.getProgram(), GL_BUFFER_VARIABLE, "Output.values");
            const InterfaceVariableInfo valueInfo =
                getProgramInterfaceVariableInfo(gl, program.getProgram(), GL_BUFFER_VARIABLE, valueIndex);
            const BufferMemMap bufMap(gl, GL_SHADER_STORAGE_BUFFER, 0, blockSize, GL_MAP_READ_BIT);

            TCU_CHECK(valueInfo.arraySize == (uint32_t)inputValues.size());
            for (uint32_t ndx = 0; ndx < valueInfo.arraySize; ndx++)
            {
                const uint32_t res = *((const uint32_t *)((const uint8_t *)bufMap.getPtr() + valueInfo.offset +
                                                          valueInfo.arrayStride * ndx));
                const uint32_t ref = ~inputValues[ndx];

                if (res != ref)
                    throw tcu::TestError(string("Comparison failed for Output.values[") + de::toString(ndx) + "]");
            }
        }

        m_testCtx.setTestResult(QP_TEST_RESULT_PASS, "Pass");
        return STOP;
    }

private:
    const int m_numValues;        //!< Element count of the input and output arrays.
    const tcu::IVec3 m_localSize; //!< Work group size declared in the shader.
    const tcu::IVec3 m_workSize;  //!< Work group counts used for the dispatch.
};

//! Inverts the contents of a single shader storage block in place and verifies the result on
//! the host. The block's array is declared either with an explicit size or unsized.
class InvertSSBOInPlaceCase : public TestCase
{
public:
    //! \param numValues Number of uint elements stored in the buffer.
    //! \param isSized   If true the shader declares values[numValues], otherwise values[].
    //! \param localSize Work group size written into the shader's local_size_x/y/z layout.
    //! \param workSize  Work group counts passed to glDispatchCompute().
    //!
    //! numValues is asserted to be divisible by the total invocation count so that every
    //! invocation processes the same number of elements.
    InvertSSBOInPlaceCase(Context &context, const char *name, const char *description, int numValues, bool isSized,
                          const tcu::IVec3 &localSize, const tcu::IVec3 &workSize)
        : TestCase(context, name, description)
        , m_numValues(numValues)
        , m_isSized(isSized)
        , m_localSize(localSize)
        , m_workSize(workSize)
    {
        DE_ASSERT(m_numValues % (m_workSize[0] * m_workSize[1] * m_workSize[2] * m_localSize[0] * m_localSize[1] *
                                 m_localSize[2]) ==
                  0);
    }

    //! Builds the shader, fills the buffer with input values, dispatches the workload and
    //! compares every element against the complement of its original value.
    IterateResult iterate(void)
    {
        const GLSLVersion glslVersion = glu::getContextTypeGLSLVersion(m_context.getRenderContext().getType());
        std::ostringstream src;

        // Each invocation inverts a contiguous slice of numValuesPerInv elements, selected by
        // its flattened global invocation index.
        src << getGLSLVersionDeclaration(glslVersion) << "\n"
            << "layout (local_size_x = " << m_localSize[0] << ", local_size_y = " << m_localSize[1]
            << ", local_size_z = " << m_localSize[2] << ") in;\n"
            << "layout(binding = 0) buffer InOut {\n"
            << "    uint values[" << (m_isSized ? de::toString(m_numValues) : string("")) << "];\n"
            << "} sb_inout;\n"
            << "void main (void) {\n"
            << "    uvec3 size           = gl_NumWorkGroups * gl_WorkGroupSize;\n"
            << "    uint numValuesPerInv = uint(sb_inout.values.length()) / (size.x*size.y*size.z);\n"
            << "    uint groupNdx        = size.x*size.y*gl_GlobalInvocationID.z + size.x*gl_GlobalInvocationID.y + "
               "gl_GlobalInvocationID.x;\n"
            << "    uint offset          = numValuesPerInv*groupNdx;\n"
            << "\n"
            << "    for (uint ndx = 0u; ndx < numValuesPerInv; ndx++)\n"
            << "        sb_inout.values[offset + ndx] = ~sb_inout.values[offset + ndx];\n"
            << "}\n";

        const glw::Functions &gl = m_context.getRenderContext().getFunctions();
        const ShaderProgram program(m_context.getRenderContext(), ProgramSources()
                                                                      << ShaderSource(SHADERTYPE_COMPUTE, src.str()));

        m_testCtx.getLog() << program;
        if (!program.isOk())
            TCU_FAIL("Compile failed");

        const Buffer outputBuffer(m_context.getRenderContext());
        const uint32_t valueIndex =
            gl.getProgramResourceIndex(program.getProgram(), GL_BUFFER_VARIABLE, "InOut.values");
        const InterfaceVariableInfo valueInfo =
            getProgramInterfaceVariableInfo(gl, program.getProgram(), GL_BUFFER_VARIABLE, valueIndex);
        // Buffer size is derived from the queried array stride rather than the block's reported
        // data size, so the same computation serves both the sized and the unsized declaration.
        const uint32_t blockSize = valueInfo.arrayStride * (uint32_t)m_numValues;
        std::vector<uint32_t> inputValues(m_numValues);

        // Compute input values (de::Random constructed with a fixed constant).
        {
            de::Random rnd(0x82ce7f);
            for (int ndx = 0; ndx < (int)inputValues.size(); ndx++)
                inputValues[ndx] = rnd.getUint32();
        }

        // The reported array size is expected to be 0 when the array is declared unsized.
        TCU_CHECK(valueInfo.arraySize == (uint32_t)(m_isSized ? m_numValues : 0));

        m_testCtx.getLog() << TestLog::Message << "Work groups: " << m_workSize << TestLog::EndMessage;

        gl.useProgram(program.getProgram());

        // Output buffer setup
        {
            gl.bindBuffer(GL_SHADER_STORAGE_BUFFER, *outputBuffer);
            gl.bufferData(GL_SHADER_STORAGE_BUFFER, blockSize, DE_NULL, GL_STREAM_DRAW);

            {
                const BufferMemMap bufMap(gl, GL_SHADER_STORAGE_BUFFER, 0, (int)blockSize, GL_MAP_WRITE_BIT);

                for (uint32_t ndx = 0; ndx < (uint32_t)inputValues.size(); ndx++)
                    *(uint32_t *)((uint8_t *)bufMap.getPtr() + valueInfo.offset + ndx * valueInfo.arrayStride) =
                        inputValues[ndx];
            }

            gl.bindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, *outputBuffer);
            GLU_EXPECT_NO_ERROR(gl.getError(), "Buffer setup failed");
        }

        // Dispatch compute workload
        gl.dispatchCompute(m_workSize[0], m_workSize[1], m_workSize[2]);
        GLU_EXPECT_NO_ERROR(gl.getError(), "glDispatchCompute()");

        // Shader writes to buffer memory are only guaranteed to be visible through memory mapped
        // with glMapBufferRange() after a GL_BUFFER_UPDATE_BARRIER_BIT barrier.
        gl.memoryBarrier(GL_BUFFER_UPDATE_BARRIER_BIT);
        GLU_EXPECT_NO_ERROR(gl.getError(), "glMemoryBarrier()");

        // Read back and compare
        {
            const BufferMemMap bufMap(gl, GL_SHADER_STORAGE_BUFFER, 0, blockSize, GL_MAP_READ_BIT);

            for (uint32_t ndx = 0; ndx < (uint32_t)inputValues.size(); ndx++)
            {
                const uint32_t res = *((const uint32_t *)((const uint8_t *)bufMap.getPtr() + valueInfo.offset +
                                                          valueInfo.arrayStride * ndx));
                const uint32_t ref = ~inputValues[ndx];

                if (res != ref)
                    throw tcu::TestError(string("Comparison failed for InOut.values[") + de::toString(ndx) + "]");
            }
        }

        m_testCtx.setTestResult(QP_TEST_RESULT_PASS, "Pass");
        return STOP;
    }

private:
    const int m_numValues;        //!< Element count stored in the buffer.
    const bool m_isSized;         //!< Whether the shader declares the array with an explicit size.
    const tcu::IVec3 m_localSize; //!< Work group size declared in the shader.
    const tcu::IVec3 m_workSize;  //!< Work group counts used for the dispatch.
};

//! Writes index-derived values into two separate shader storage blocks from one dispatch and
//! verifies both buffers on the host. The arrays are declared either sized or unsized.
class WriteToMultipleSSBOCase : public TestCase
{
public:
    //! \param numValues Number of uint elements in each of the two output buffers.
    //! \param isSized   If true the shader declares values[numValues], otherwise values[].
    //! \param localSize Work group size written into the shader's local_size_x/y/z layout.
    //! \param workSize  Work group counts passed to glDispatchCompute().
    //!
    //! numValues is asserted to be divisible by the total invocation count so that every
    //! invocation processes the same number of elements.
    WriteToMultipleSSBOCase(Context &context, const char *name, const char *description, int numValues, bool isSized,
                            const tcu::IVec3 &localSize, const tcu::IVec3 &workSize)
        : TestCase(context, name, description)
        , m_numValues(numValues)
        , m_isSized(isSized)
        , m_localSize(localSize)
        , m_workSize(workSize)
    {
        DE_ASSERT(m_numValues % (m_workSize[0] * m_workSize[1] * m_workSize[2] * m_localSize[0] * m_localSize[1] *
                                 m_localSize[2]) ==
                  0);
    }

    //! Builds the shader, allocates both output buffers, dispatches the workload and checks
    //! that Out0.values[i] == i and Out1.values[i] == numValues - i.
    IterateResult iterate(void)
    {
        const GLSLVersion glslVersion = glu::getContextTypeGLSLVersion(m_context.getRenderContext().getType());
        std::ostringstream src;

        // Each invocation fills its own slice of both arrays: ascending indices into Out0 and
        // (length - index) into Out1.
        src << getGLSLVersionDeclaration(glslVersion) << "\n"
            << "layout (local_size_x = " << m_localSize[0] << ", local_size_y = " << m_localSize[1]
            << ", local_size_z = " << m_localSize[2] << ") in;\n"
            << "layout(binding = 0) buffer Out0 {\n"
            << "    uint values[" << (m_isSized ? de::toString(m_numValues) : string("")) << "];\n"
            << "} sb_out0;\n"
            << "layout(binding = 1) buffer Out1 {\n"
            << "    uint values[" << (m_isSized ? de::toString(m_numValues) : string("")) << "];\n"
            << "} sb_out1;\n"
            << "void main (void) {\n"
            << "    uvec3 size      = gl_NumWorkGroups * gl_WorkGroupSize;\n"
            << "    uint groupNdx   = size.x*size.y*gl_GlobalInvocationID.z + size.x*gl_GlobalInvocationID.y + "
               "gl_GlobalInvocationID.x;\n"
            << "\n"
            << "    {\n"
            << "        uint numValuesPerInv = uint(sb_out0.values.length()) / (size.x*size.y*size.z);\n"
            << "        uint offset          = numValuesPerInv*groupNdx;\n"
            << "\n"
            << "        for (uint ndx = 0u; ndx < numValuesPerInv; ndx++)\n"
            << "            sb_out0.values[offset + ndx] = offset + ndx;\n"
            << "    }\n"
            << "    {\n"
            << "        uint numValuesPerInv = uint(sb_out1.values.length()) / (size.x*size.y*size.z);\n"
            << "        uint offset          = numValuesPerInv*groupNdx;\n"
            << "\n"
            << "        for (uint ndx = 0u; ndx < numValuesPerInv; ndx++)\n"
            << "            sb_out1.values[offset + ndx] = uint(sb_out1.values.length()) - offset - ndx;\n"
            << "    }\n"
            << "}\n";

        const glw::Functions &gl = m_context.getRenderContext().getFunctions();
        const ShaderProgram program(m_context.getRenderContext(), ProgramSources()
                                                                      << ShaderSource(SHADERTYPE_COMPUTE, src.str()));

        m_testCtx.getLog() << program;
        if (!program.isOk())
            TCU_FAIL("Compile failed");

        // Buffer sizes are derived from the queried array strides so the same computation
        // serves both the sized and the unsized declaration.
        const Buffer outputBuffer0(m_context.getRenderContext());
        const uint32_t value0Index =
            gl.getProgramResourceIndex(program.getProgram(), GL_BUFFER_VARIABLE, "Out0.values");
        const InterfaceVariableInfo value0Info =
            getProgramInterfaceVariableInfo(gl, program.getProgram(), GL_BUFFER_VARIABLE, value0Index);
        const uint32_t block0Size = value0Info.arrayStride * (uint32_t)m_numValues;

        const Buffer outputBuffer1(m_context.getRenderContext());
        const uint32_t value1Index =
            gl.getProgramResourceIndex(program.getProgram(), GL_BUFFER_VARIABLE, "Out1.values");
        const InterfaceVariableInfo value1Info =
            getProgramInterfaceVariableInfo(gl, program.getProgram(), GL_BUFFER_VARIABLE, value1Index);
        const uint32_t block1Size = value1Info.arrayStride * (uint32_t)m_numValues;

        // The reported array size is expected to be 0 when the array is declared unsized.
        TCU_CHECK(value0Info.arraySize == (uint32_t)(m_isSized ? m_numValues : 0));
        TCU_CHECK(value1Info.arraySize == (uint32_t)(m_isSized ? m_numValues : 0));

        m_testCtx.getLog() << TestLog::Message << "Work groups: " << m_workSize << TestLog::EndMessage;

        gl.useProgram(program.getProgram());

        // Output buffer setup
        {
            gl.bindBuffer(GL_SHADER_STORAGE_BUFFER, *outputBuffer0);
            gl.bufferData(GL_SHADER_STORAGE_BUFFER, block0Size, DE_NULL, GL_STREAM_DRAW);

            gl.bindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, *outputBuffer0);
            GLU_EXPECT_NO_ERROR(gl.getError(), "Buffer setup failed");
        }
        {
            gl.bindBuffer(GL_SHADER_STORAGE_BUFFER, *outputBuffer1);
            gl.bufferData(GL_SHADER_STORAGE_BUFFER, block1Size, DE_NULL, GL_STREAM_DRAW);

            gl.bindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, *outputBuffer1);
            GLU_EXPECT_NO_ERROR(gl.getError(), "Buffer setup failed");
        }

        // Dispatch compute workload
        gl.dispatchCompute(m_workSize[0], m_workSize[1], m_workSize[2]);
        GLU_EXPECT_NO_ERROR(gl.getError(), "glDispatchCompute()");

        // Shader writes to buffer memory are only guaranteed to be visible through memory mapped
        // with glMapBufferRange() after a GL_BUFFER_UPDATE_BARRIER_BIT barrier. One barrier
        // covers both output buffers.
        gl.memoryBarrier(GL_BUFFER_UPDATE_BARRIER_BIT);
        GLU_EXPECT_NO_ERROR(gl.getError(), "glMemoryBarrier()");

        // Read back and compare
        // BufferMemMap maps whatever is bound to the target, so each buffer is re-bound first.
        gl.bindBuffer(GL_SHADER_STORAGE_BUFFER, *outputBuffer0);
        {
            const BufferMemMap bufMap(gl, GL_SHADER_STORAGE_BUFFER, 0, block0Size, GL_MAP_READ_BIT);

            for (uint32_t ndx = 0; ndx < (uint32_t)m_numValues; ndx++)
            {
                const uint32_t res = *((const uint32_t *)((const uint8_t *)bufMap.getPtr() + value0Info.offset +
                                                          value0Info.arrayStride * ndx));
                const uint32_t ref = ndx;

                if (res != ref)
                    throw tcu::TestError(string("Comparison failed for Out0.values[") + de::toString(ndx) +
                                         "] res=" + de::toString(res) + " ref=" + de::toString(ref));
            }
        }
        gl.bindBuffer(GL_SHADER_STORAGE_BUFFER, *outputBuffer1);
        {
            const BufferMemMap bufMap(gl, GL_SHADER_STORAGE_BUFFER, 0, block1Size, GL_MAP_READ_BIT);

            for (uint32_t ndx = 0; ndx < (uint32_t)m_numValues; ndx++)
            {
                const uint32_t res = *((const uint32_t *)((const uint8_t *)bufMap.getPtr() + value1Info.offset +
                                                          value1Info.arrayStride * ndx));
                const uint32_t ref = m_numValues - ndx;

                if (res != ref)
                    throw tcu::TestError(string("Comparison failed for Out1.values[") + de::toString(ndx) +
                                         "] res=" + de::toString(res) + " ref=" + de::toString(ref));
            }
        }
        m_testCtx.setTestResult(QP_TEST_RESULT_PASS, "Pass");
        return STOP;
    }

private:
    const int m_numValues;        //!< Element count of each output buffer.
    const bool m_isSized;         //!< Whether the shader declares the arrays with an explicit size.
    const tcu::IVec3 m_localSize; //!< Work group size declared in the shader.
    const tcu::IVec3 m_workSize;  //!< Work group counts used for the dispatch.
};

//! Exercises barrier() together with memoryBarrierBuffer() inside a work group: invocations
//! write to their own slot, then accumulate into neighbouring slots of the same group, with a
//! barrier between each step. The final buffer contents are verified on the host.
class SSBOLocalBarrierCase : public TestCase
{
public:
    //! \param localSize Work group size written into the shader's local_size_x/y/z layout.
    //! \param workSize  Work group counts passed to glDispatchCompute().
    SSBOLocalBarrierCase(Context &context, const char *name, const char *description, const tcu::IVec3 &localSize,
                         const tcu::IVec3 &workSize)
        : TestCase(context, name, description)
        , m_localSize(localSize)
        , m_workSize(workSize)
    {
    }

    //! Builds the shader, allocates the output buffer, dispatches the workload and compares
    //! each element against the value expected from the three write steps.
    IterateResult iterate(void)
    {
        const glw::Functions &gl = m_context.getRenderContext().getFunctions();
        const Buffer outputBuffer(m_context.getRenderContext());
        const int workGroupSize  = m_localSize[0] * m_localSize[1] * m_localSize[2];
        const int workGroupCount = m_workSize[0] * m_workSize[1] * m_workSize[2];
        const int numValues      = workGroupSize * workGroupCount; // one element per invocation

        const GLSLVersion glslVersion = glu::getContextTypeGLSLVersion(m_context.getRenderContext().getType());
        std::ostringstream src;

        // Step 1: slot[local]            = globalOffs
        // Step 2: slot[(local+1) % size] += local
        // Step 3: slot[(local+2) % size] += local
        src << getGLSLVersionDeclaration(glslVersion) << "\n"
            << "layout (local_size_x = " << m_localSize[0] << ", local_size_y = " << m_localSize[1]
            << ", local_size_z = " << m_localSize[2] << ") in;\n"
            << "layout(binding = 0) buffer Output {\n"
            << "    coherent uint values[" << numValues << "];\n"
            << "} sb_out;\n\n"
            << "shared uint offsets[" << workGroupSize << "];\n\n"
            << "void main (void) {\n"
            << "    uint localSize  = gl_WorkGroupSize.x*gl_WorkGroupSize.y*gl_WorkGroupSize.z;\n"
            << "    uint globalNdx  = gl_NumWorkGroups.x*gl_NumWorkGroups.y*gl_WorkGroupID.z + "
               "gl_NumWorkGroups.x*gl_WorkGroupID.y + gl_WorkGroupID.x;\n"
            << "    uint globalOffs = localSize*globalNdx;\n"
            << "    uint localOffs  = gl_WorkGroupSize.x*gl_WorkGroupSize.y*gl_LocalInvocationID.z + "
               "gl_WorkGroupSize.x*gl_LocalInvocationID.y + gl_LocalInvocationID.x;\n"
            << "\n"
            << "    sb_out.values[globalOffs + localOffs] = globalOffs;\n"
            << "    memoryBarrierBuffer();\n"
            << "    barrier();\n"
            << "    sb_out.values[globalOffs + ((localOffs+1u)%localSize)] += localOffs;\n"
            << "    memoryBarrierBuffer();\n"
            << "    barrier();\n"
            << "    sb_out.values[globalOffs + ((localOffs+2u)%localSize)] += localOffs;\n"
            << "}\n";

        const ShaderProgram program(m_context.getRenderContext(), ProgramSources() << ComputeSource(src.str()));

        m_testCtx.getLog() << program;
        if (!program.isOk())
            TCU_FAIL("Compile failed");

        m_testCtx.getLog() << TestLog::Message << "Work groups: " << m_workSize << TestLog::EndMessage;

        gl.useProgram(program.getProgram());

        // Output buffer setup
        {
            const uint32_t blockIndex =
                gl.getProgramResourceIndex(program.getProgram(), GL_SHADER_STORAGE_BLOCK, "Output");
            const int blockSize = getProgramResourceInt(gl, program.getProgram(), GL_SHADER_STORAGE_BLOCK, blockIndex,
                                                        GL_BUFFER_DATA_SIZE);

            gl.bindBuffer(GL_SHADER_STORAGE_BUFFER, *outputBuffer);
            gl.bufferData(GL_SHADER_STORAGE_BUFFER, blockSize, DE_NULL, GL_STREAM_READ);
            gl.bindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, *outputBuffer);
            GLU_EXPECT_NO_ERROR(gl.getError(), "Output buffer setup failed");
        }

        // Dispatch compute workload
        gl.dispatchCompute(m_workSize[0], m_workSize[1], m_workSize[2]);
        GLU_EXPECT_NO_ERROR(gl.getError(), "glDispatchCompute()");

        // The in-shader barriers only order accesses between invocations. For the host, shader
        // writes are only guaranteed to be visible through memory mapped with glMapBufferRange()
        // after a GL_BUFFER_UPDATE_BARRIER_BIT barrier.
        gl.memoryBarrier(GL_BUFFER_UPDATE_BARRIER_BIT);
        GLU_EXPECT_NO_ERROR(gl.getError(), "glMemoryBarrier()");

        // Read back and compare
        {
            const uint32_t blockIndex =
                gl.getProgramResourceIndex(program.getProgram(), GL_SHADER_STORAGE_BLOCK, "Output");
            const int blockSize = getProgramResourceInt(gl, program.getProgram(), GL_SHADER_STORAGE_BLOCK, blockIndex,
                                                        GL_BUFFER_DATA_SIZE);
            const uint32_t valueIndex =
                gl.getProgramResourceIndex(program.getProgram(), GL_BUFFER_VARIABLE, "Output.values");
            const InterfaceVariableInfo valueInfo =
                getProgramInterfaceVariableInfo(gl, program.getProgram(), GL_BUFFER_VARIABLE, valueIndex);
            const BufferMemMap bufMap(gl, GL_SHADER_STORAGE_BUFFER, 0, blockSize, GL_MAP_READ_BIT);

            for (int groupNdx = 0; groupNdx < workGroupCount; groupNdx++)
            {
                for (int localOffs = 0; localOffs < workGroupSize; localOffs++)
                {
                    const int globalOffs = groupNdx * workGroupSize;
                    const uint32_t res   = *((const uint32_t *)((const uint8_t *)bufMap.getPtr() + valueInfo.offset +
                                                              valueInfo.arrayStride * (globalOffs + localOffs)));
                    // offs0/offs1 are the local indices of the invocations that added into this
                    // slot in steps 2 and 3, i.e. (localOffs - 1) and (localOffs - 2) wrapped
                    // into [0, workGroupSize).
                    const int offs0      = localOffs - 1 < 0 ? ((localOffs + workGroupSize - 1) % workGroupSize) :
                                                               ((localOffs - 1) % workGroupSize);
                    const int offs1      = localOffs - 2 < 0 ? ((localOffs + workGroupSize - 2) % workGroupSize) :
                                                               ((localOffs - 2) % workGroupSize);
                    const uint32_t ref   = (uint32_t)(globalOffs + offs0 + offs1);

                    if (res != ref)
                        throw tcu::TestError(string("Comparison failed for Output.values[") +
                                             de::toString(globalOffs + localOffs) + "]");
                }
            }
        }

        m_testCtx.setTestResult(QP_TEST_RESULT_PASS, "Pass");
        return STOP;
    }

private:
    const tcu::IVec3 m_localSize; //!< Work group size declared in the shader.
    const tcu::IVec3 m_workSize;  //!< Work group counts used for the dispatch.
};

//! Two-pass SSBO test with a shader storage memory barrier between the dispatches.
//!
//! Pass 0 (program0) runs one invocation per work group and stores
//! u_baseVal + <flattened work group index> into values[] of the buffer bound
//! to binding point 1. After glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT),
//! pass 1 (program1) reads the same values[] back and accumulates them with
//! atomicAdd() into the single uint 'sum' of the buffer bound to binding point 0.
//! The host then compares 'sum' against the sum it computes itself.
class SSBOBarrierCase : public TestCase
{
public:
    //! \param workSize Number of work groups dispatched in x, y and z for both passes.
    SSBOBarrierCase(Context &context, const char *name, const char *description, const tcu::IVec3 &workSize)
        : TestCase(context, name, description)
        , m_workSize(workSize)
    {
    }

    //! Runs both passes and verifies the accumulated sum.
    //! Throws tcu::TestError on compile failure, GL error or result mismatch.
    IterateResult iterate(void)
    {
        const GLSLVersion glslVersion = glu::getContextTypeGLSLVersion(m_context.getRenderContext().getType());
        const char *const glslVersionDeclaration = getGLSLVersionDeclaration(glslVersion);

        // Pass 0: each work group writes one element of the temporary buffer.
        std::ostringstream src0;
        src0 << glslVersionDeclaration << "\n"
             << "layout (local_size_x = 1) in;\n"
                "uniform uint u_baseVal;\n"
                "layout(binding = 1) buffer Output {\n"
                "    uint values[];\n"
                "};\n"
                "void main (void) {\n"
                "    uint offset = gl_NumWorkGroups.x*gl_NumWorkGroups.y*gl_WorkGroupID.z + "
                "gl_NumWorkGroups.x*gl_WorkGroupID.y + gl_WorkGroupID.x;\n"
                "    values[offset] = u_baseVal+offset;\n"
                "}\n";

        // Pass 1: each work group reads its element back and adds it to the shared sum.
        std::ostringstream src1;
        src1 << glslVersionDeclaration << "\n"
             << "layout (local_size_x = 1) in;\n"
                "uniform uint u_baseVal;\n"
                "layout(binding = 1) buffer Input {\n"
                "    uint values[];\n"
                "};\n"
                "layout(binding = 0) buffer Output {\n"
                "    coherent uint sum;\n"
                "};\n"
                "void main (void) {\n"
                "    uint offset = gl_NumWorkGroups.x*gl_NumWorkGroups.y*gl_WorkGroupID.z + "
                "gl_NumWorkGroups.x*gl_WorkGroupID.y + gl_WorkGroupID.x;\n"
                "    uint value  = values[offset];\n"
                "    atomicAdd(sum, value);\n"
                "}\n";

        const ShaderProgram program0(m_context.getRenderContext(), ProgramSources() << ComputeSource(src0.str()));
        const ShaderProgram program1(m_context.getRenderContext(), ProgramSources() << ComputeSource(src1.str()));

        const glw::Functions &gl = m_context.getRenderContext().getFunctions();
        const Buffer tempBuffer(m_context.getRenderContext());
        const Buffer outputBuffer(m_context.getRenderContext());
        const uint32_t baseValue = 127;

        m_testCtx.getLog() << program0 << program1;
        if (!program0.isOk() || !program1.isOk())
            TCU_FAIL("Compile failed");

        m_testCtx.getLog() << TestLog::Message << "Work groups: " << m_workSize << TestLog::EndMessage;

        // Temp buffer setup
        {
            const uint32_t valueIndex =
                gl.getProgramResourceIndex(program0.getProgram(), GL_BUFFER_VARIABLE, "values[0]");
            const InterfaceVariableInfo valueInfo =
                getProgramInterfaceVariableInfo(gl, program0.getProgram(), GL_BUFFER_VARIABLE, valueIndex);
            // values[] is unsized in the shader, so the storage size is derived from the
            // queried array stride and the total number of work groups.
            const uint32_t bufferSize = valueInfo.arrayStride * m_workSize[0] * m_workSize[1] * m_workSize[2];

            gl.bindBuffer(GL_SHADER_STORAGE_BUFFER, *tempBuffer);
            gl.bufferData(GL_SHADER_STORAGE_BUFFER, (glw::GLsizeiptr)bufferSize, DE_NULL, GL_STATIC_DRAW);
            gl.bindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, *tempBuffer);
            GLU_EXPECT_NO_ERROR(gl.getError(), "Temp buffer setup failed");
        }

        // Output buffer setup
        {
            const uint32_t blockIndex =
                gl.getProgramResourceIndex(program1.getProgram(), GL_SHADER_STORAGE_BLOCK, "Output");
            const int blockSize = getProgramResourceInt(gl, program1.getProgram(), GL_SHADER_STORAGE_BLOCK, blockIndex,
                                                        GL_BUFFER_DATA_SIZE);

            gl.bindBuffer(GL_SHADER_STORAGE_BUFFER, *outputBuffer);
            gl.bufferData(GL_SHADER_STORAGE_BUFFER, blockSize, DE_NULL, GL_STREAM_READ);

            // The sum is accumulated with atomicAdd(), so it has to start from zero.
            {
                const BufferMemMap bufMap(gl, GL_SHADER_STORAGE_BUFFER, 0, blockSize, GL_MAP_WRITE_BIT);
                deMemset(bufMap.getPtr(), 0, blockSize);
            }

            gl.bindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, *outputBuffer);
            GLU_EXPECT_NO_ERROR(gl.getError(), "Output buffer setup failed");
        }

        // Dispatch compute workload
        gl.useProgram(program0.getProgram());
        gl.uniform1ui(gl.getUniformLocation(program0.getProgram(), "u_baseVal"), baseValue);
        gl.dispatchCompute(m_workSize[0], m_workSize[1], m_workSize[2]);
        gl.memoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);
        gl.useProgram(program1.getProgram());
        gl.dispatchCompute(m_workSize[0], m_workSize[1], m_workSize[2]);
        GLU_EXPECT_NO_ERROR(gl.getError(), "Failed to dispatch commands");

        // Shader writes are only guaranteed to be visible through a buffer mapping
        // after a GL_BUFFER_UPDATE_BARRIER_BIT barrier.
        gl.memoryBarrier(GL_BUFFER_UPDATE_BARRIER_BIT);
        GLU_EXPECT_NO_ERROR(gl.getError(), "glMemoryBarrier()");

        // Read back and compare
        {
            const uint32_t blockIndex =
                gl.getProgramResourceIndex(program1.getProgram(), GL_SHADER_STORAGE_BLOCK, "Output");
            const int blockSize = getProgramResourceInt(gl, program1.getProgram(), GL_SHADER_STORAGE_BLOCK, blockIndex,
                                                        GL_BUFFER_DATA_SIZE);
            const uint32_t valueIndex = gl.getProgramResourceIndex(program1.getProgram(), GL_BUFFER_VARIABLE, "sum");
            const InterfaceVariableInfo valueInfo =
                getProgramInterfaceVariableInfo(gl, program1.getProgram(), GL_BUFFER_VARIABLE, valueIndex);
            // The output buffer is still bound to the generic GL_SHADER_STORAGE_BUFFER target.
            const BufferMemMap bufMap(gl, GL_SHADER_STORAGE_BUFFER, 0, blockSize, GL_MAP_READ_BIT);

            const uint32_t res = *((const uint32_t *)((const uint8_t *)bufMap.getPtr() + valueInfo.offset));
            uint32_t ref       = 0;

            // Reference: every work group contributes baseValue + its flattened index.
            for (int ndx = 0; ndx < m_workSize[0] * m_workSize[1] * m_workSize[2]; ndx++)
                ref += baseValue + (uint32_t)ndx;

            if (res != ref)
            {
                m_testCtx.getLog() << TestLog::Message << "ERROR: comparison failed, expected " << ref << ", got "
                                   << res << TestLog::EndMessage;
                throw tcu::TestError("Comparison failed");
            }
        }

        m_testCtx.setTestResult(QP_TEST_RESULT_PASS, "Pass");
        return STOP;
    }

private:
    const tcu::IVec3 m_workSize; //!< Work group counts used for both dispatches.
};

//! Exchanges data between the invocations of a work group through a shared array.
//!
//! Every invocation stores globalOffs + localOffs*localOffs into the shared
//! array slot localSize-localOffs-1, executes barrier(), and then copies the
//! shared slot at its own localOffs into the output SSBO. The value read is
//! therefore the one written by the "mirrored" invocation of the same group.
class BasicSharedVarCase : public TestCase
{
public:
    //! \param localSize Work group size in x, y and z.
    //! \param workSize  Number of work groups dispatched in x, y and z.
    BasicSharedVarCase(Context &context, const char *name, const char *description, const tcu::IVec3 &localSize,
                       const tcu::IVec3 &workSize)
        : TestCase(context, name, description)
        , m_localSize(localSize)
        , m_workSize(workSize)
    {
    }

    //! Builds the shader, dispatches it and verifies every output element.
    //! Throws tcu::TestError on compile failure, GL error or result mismatch.
    IterateResult iterate(void)
    {
        const glw::Functions &gl = m_context.getRenderContext().getFunctions();
        const Buffer outputBuffer(m_context.getRenderContext());
        const int workGroupSize  = m_localSize[0] * m_localSize[1] * m_localSize[2];
        const int workGroupCount = m_workSize[0] * m_workSize[1] * m_workSize[2];
        const int numValues      = workGroupSize * workGroupCount;

        const GLSLVersion glslVersion = glu::getContextTypeGLSLVersion(m_context.getRenderContext().getType());
        std::ostringstream src;

        src << getGLSLVersionDeclaration(glslVersion) << "\n"
            << "layout (local_size_x = " << m_localSize[0] << ", local_size_y = " << m_localSize[1]
            << ", local_size_z = " << m_localSize[2] << ") in;\n"
            << "layout(binding = 0) buffer Output {\n"
            << "    uint values[" << numValues << "];\n"
            << "} sb_out;\n\n"
            << "shared uint offsets[" << workGroupSize << "];\n\n"
            << "void main (void) {\n"
            << "    uint localSize  = gl_WorkGroupSize.x*gl_WorkGroupSize.y*gl_WorkGroupSize.z;\n"
            << "    uint globalNdx  = gl_NumWorkGroups.x*gl_NumWorkGroups.y*gl_WorkGroupID.z + "
               "gl_NumWorkGroups.x*gl_WorkGroupID.y + gl_WorkGroupID.x;\n"
            << "    uint globalOffs = localSize*globalNdx;\n"
            << "    uint localOffs  = gl_WorkGroupSize.x*gl_WorkGroupSize.y*gl_LocalInvocationID.z + "
               "gl_WorkGroupSize.x*gl_LocalInvocationID.y + gl_LocalInvocationID.x;\n"
            << "\n"
            << "    offsets[localSize-localOffs-1u] = globalOffs + localOffs*localOffs;\n"
            << "    barrier();\n"
            << "    sb_out.values[globalOffs + localOffs] = offsets[localOffs];\n"
            << "}\n";

        const ShaderProgram program(m_context.getRenderContext(), ProgramSources()
                                                                      << ShaderSource(SHADERTYPE_COMPUTE, src.str()));

        m_testCtx.getLog() << program;
        if (!program.isOk())
            TCU_FAIL("Compile failed");

        m_testCtx.getLog() << TestLog::Message << "Work groups: " << m_workSize << TestLog::EndMessage;

        gl.useProgram(program.getProgram());

        // Output buffer setup
        {
            const uint32_t blockIndex =
                gl.getProgramResourceIndex(program.getProgram(), GL_SHADER_STORAGE_BLOCK, "Output");
            const int blockSize = getProgramResourceInt(gl, program.getProgram(), GL_SHADER_STORAGE_BLOCK, blockIndex,
                                                        GL_BUFFER_DATA_SIZE);

            gl.bindBuffer(GL_SHADER_STORAGE_BUFFER, *outputBuffer);
            gl.bufferData(GL_SHADER_STORAGE_BUFFER, blockSize, DE_NULL, GL_STREAM_READ);
            gl.bindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, *outputBuffer);
            GLU_EXPECT_NO_ERROR(gl.getError(), "Output buffer setup failed");
        }

        // Dispatch compute workload
        gl.dispatchCompute(m_workSize[0], m_workSize[1], m_workSize[2]);
        GLU_EXPECT_NO_ERROR(gl.getError(), "glDispatchCompute()");

        // Shader writes are only guaranteed to be visible through a buffer mapping
        // after a GL_BUFFER_UPDATE_BARRIER_BIT barrier.
        gl.memoryBarrier(GL_BUFFER_UPDATE_BARRIER_BIT);
        GLU_EXPECT_NO_ERROR(gl.getError(), "glMemoryBarrier()");

        // Read back and compare
        {
            const uint32_t blockIndex =
                gl.getProgramResourceIndex(program.getProgram(), GL_SHADER_STORAGE_BLOCK, "Output");
            const int blockSize = getProgramResourceInt(gl, program.getProgram(), GL_SHADER_STORAGE_BLOCK, blockIndex,
                                                        GL_BUFFER_DATA_SIZE);
            const uint32_t valueIndex =
                gl.getProgramResourceIndex(program.getProgram(), GL_BUFFER_VARIABLE, "Output.values");
            const InterfaceVariableInfo valueInfo =
                getProgramInterfaceVariableInfo(gl, program.getProgram(), GL_BUFFER_VARIABLE, valueIndex);
            const BufferMemMap bufMap(gl, GL_SHADER_STORAGE_BUFFER, 0, blockSize, GL_MAP_READ_BIT);

            for (int groupNdx = 0; groupNdx < workGroupCount; groupNdx++)
            {
                for (int localOffs = 0; localOffs < workGroupSize; localOffs++)
                {
                    const int globalOffs = groupNdx * workGroupSize;
                    // Elements are addressed with the queried offset and stride rather than
                    // assuming a tightly packed array.
                    const uint32_t res   = *((const uint32_t *)((const uint8_t *)bufMap.getPtr() + valueInfo.offset +
                                                              valueInfo.arrayStride * (globalOffs + localOffs)));
                    // Slot localOffs of the shared array was written by the invocation whose
                    // local offset is workGroupSize - localOffs - 1.
                    const uint32_t ref =
                        (uint32_t)(globalOffs + (workGroupSize - localOffs - 1) * (workGroupSize - localOffs - 1));

                    if (res != ref)
                        throw tcu::TestError(string("Comparison failed for Output.values[") +
                                             de::toString(globalOffs + localOffs) + "]");
                }
            }
        }

        m_testCtx.setTestResult(QP_TEST_RESULT_PASS, "Pass");
        return STOP;
    }

private:
    const tcu::IVec3 m_localSize; //!< Work group size.
    const tcu::IVec3 m_workSize;  //!< Number of work groups.
};

//! Exercises atomicAdd() on a shared variable.
//!
//! Every invocation zeroes the shared counter, executes barrier(), and then
//! increments the counter with atomicAdd(). The value returned by atomicAdd()
//! selects the output slot inside the work group's range, and oldVal+1 is
//! stored there, so each group's range is expected to contain 1..workGroupSize.
class SharedVarAtomicOpCase : public TestCase
{
public:
    //! \param localSize Work group size in x, y and z.
    //! \param workSize  Number of work groups dispatched in x, y and z.
    SharedVarAtomicOpCase(Context &context, const char *name, const char *description, const tcu::IVec3 &localSize,
                          const tcu::IVec3 &workSize)
        : TestCase(context, name, description)
        , m_localSize(localSize)
        , m_workSize(workSize)
    {
    }

    //! Builds the shader, dispatches it and verifies every output element.
    //! Throws tcu::TestError on compile failure, GL error or result mismatch.
    IterateResult iterate(void)
    {
        const glw::Functions &gl = m_context.getRenderContext().getFunctions();
        const Buffer outputBuffer(m_context.getRenderContext());
        const int workGroupSize  = m_localSize[0] * m_localSize[1] * m_localSize[2];
        const int workGroupCount = m_workSize[0] * m_workSize[1] * m_workSize[2];
        const int numValues      = workGroupSize * workGroupCount;

        const GLSLVersion glslVersion = glu::getContextTypeGLSLVersion(m_context.getRenderContext().getType());
        std::ostringstream src;

        src << getGLSLVersionDeclaration(glslVersion) << "\n"
            << "layout (local_size_x = " << m_localSize[0] << ", local_size_y = " << m_localSize[1]
            << ", local_size_z = " << m_localSize[2] << ") in;\n"
            << "layout(binding = 0) buffer Output {\n"
            << "    uint values[" << numValues << "];\n"
            << "} sb_out;\n\n"
            << "shared uint count;\n\n"
            << "void main (void) {\n"
            << "    uint localSize  = gl_WorkGroupSize.x*gl_WorkGroupSize.y*gl_WorkGroupSize.z;\n"
            << "    uint globalNdx  = gl_NumWorkGroups.x*gl_NumWorkGroups.y*gl_WorkGroupID.z + "
               "gl_NumWorkGroups.x*gl_WorkGroupID.y + gl_WorkGroupID.x;\n"
            << "    uint globalOffs = localSize*globalNdx;\n"
            << "\n"
            << "    count = 0u;\n"
            << "    barrier();\n"
            << "    uint oldVal = atomicAdd(count, 1u);\n"
            << "    sb_out.values[globalOffs+oldVal] = oldVal+1u;\n"
            << "}\n";

        const ShaderProgram program(m_context.getRenderContext(), ProgramSources()
                                                                      << ShaderSource(SHADERTYPE_COMPUTE, src.str()));

        m_testCtx.getLog() << program;
        if (!program.isOk())
            TCU_FAIL("Compile failed");

        m_testCtx.getLog() << TestLog::Message << "Work groups: " << m_workSize << TestLog::EndMessage;

        gl.useProgram(program.getProgram());

        // Output buffer setup
        {
            const uint32_t blockIndex =
                gl.getProgramResourceIndex(program.getProgram(), GL_SHADER_STORAGE_BLOCK, "Output");
            const int blockSize = getProgramResourceInt(gl, program.getProgram(), GL_SHADER_STORAGE_BLOCK, blockIndex,
                                                        GL_BUFFER_DATA_SIZE);

            gl.bindBuffer(GL_SHADER_STORAGE_BUFFER, *outputBuffer);
            gl.bufferData(GL_SHADER_STORAGE_BUFFER, blockSize, DE_NULL, GL_STREAM_READ);
            gl.bindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, *outputBuffer);
            GLU_EXPECT_NO_ERROR(gl.getError(), "Output buffer setup failed");
        }

        // Dispatch compute workload
        gl.dispatchCompute(m_workSize[0], m_workSize[1], m_workSize[2]);
        GLU_EXPECT_NO_ERROR(gl.getError(), "glDispatchCompute()");

        // Shader writes are only guaranteed to be visible through a buffer mapping
        // after a GL_BUFFER_UPDATE_BARRIER_BIT barrier.
        gl.memoryBarrier(GL_BUFFER_UPDATE_BARRIER_BIT);
        GLU_EXPECT_NO_ERROR(gl.getError(), "glMemoryBarrier()");

        // Read back and compare
        {
            const uint32_t blockIndex =
                gl.getProgramResourceIndex(program.getProgram(), GL_SHADER_STORAGE_BLOCK, "Output");
            const int blockSize = getProgramResourceInt(gl, program.getProgram(), GL_SHADER_STORAGE_BLOCK, blockIndex,
                                                        GL_BUFFER_DATA_SIZE);
            const uint32_t valueIndex =
                gl.getProgramResourceIndex(program.getProgram(), GL_BUFFER_VARIABLE, "Output.values");
            const InterfaceVariableInfo valueInfo =
                getProgramInterfaceVariableInfo(gl, program.getProgram(), GL_BUFFER_VARIABLE, valueIndex);
            const BufferMemMap bufMap(gl, GL_SHADER_STORAGE_BUFFER, 0, blockSize, GL_MAP_READ_BIT);

            for (int groupNdx = 0; groupNdx < workGroupCount; groupNdx++)
            {
                for (int localOffs = 0; localOffs < workGroupSize; localOffs++)
                {
                    const int globalOffs = groupNdx * workGroupSize;
                    // Elements are addressed with the queried offset and stride rather than
                    // assuming a tightly packed array.
                    const uint32_t res   = *((const uint32_t *)((const uint8_t *)bufMap.getPtr() + valueInfo.offset +
                                                              valueInfo.arrayStride * (globalOffs + localOffs)));
                    // Slot N of a group is written by whichever invocation got N back from
                    // atomicAdd(), and it stores N+1.
                    const uint32_t ref   = (uint32_t)(localOffs + 1);

                    if (res != ref)
                        throw tcu::TestError(string("Comparison failed for Output.values[") +
                                             de::toString(globalOffs + localOffs) + "]");
                }
            }
        }

        m_testCtx.setTestResult(QP_TEST_RESULT_PASS, "Pass");
        return STOP;
    }

private:
    const tcu::IVec3 m_localSize; //!< Work group size.
    const tcu::IVec3 m_workSize;  //!< Number of work groups.
};

//! Copies the contents of an r32ui image into an SSBO with a compute shader.
//!
//! The image is filled with pseudo-random values from a fixed seed, bound to
//! image unit 1 and read with imageLoad(); one invocation per texel stores the
//! value into the output buffer at binding point 0 in row-major order. The
//! host compares the buffer against the uploaded values.
class CopyImageToSSBOCase : public TestCase
{
public:
    //! \param localSize Work group size in x and y.
    //! \param imageSize Image width and height; must be a multiple of localSize in each dimension.
    CopyImageToSSBOCase(Context &context, const char *name, const char *description, const tcu::IVec2 &localSize,
                        const tcu::IVec2 &imageSize)
        : TestCase(context, name, description)
        , m_localSize(localSize)
        , m_imageSize(imageSize)
    {
        DE_ASSERT(m_imageSize[0] % m_localSize[0] == 0);
        DE_ASSERT(m_imageSize[1] % m_localSize[1] == 0);
    }

    //! Uploads the image, runs the copy shader and verifies the output buffer.
    //! Throws tcu::TestError on compile failure, GL error or result mismatch.
    IterateResult iterate(void)
    {
        const GLSLVersion glslVersion = glu::getContextTypeGLSLVersion(m_context.getRenderContext().getType());
        std::ostringstream src;

        src << getGLSLVersionDeclaration(glslVersion) << "\n"
            << "layout (local_size_x = " << m_localSize[0] << ", local_size_y = " << m_localSize[1] << ") in;\n"
            << "layout(r32ui, binding = 1) readonly uniform highp uimage2D u_srcImg;\n"
            << "layout(binding = 0) buffer Output {\n"
            << "    uint values[" << (m_imageSize[0] * m_imageSize[1]) << "];\n"
            << "} sb_out;\n\n"
            << "void main (void) {\n"
            << "    uint stride = gl_NumWorkGroups.x*gl_WorkGroupSize.x;\n"
            << "    uint value  = imageLoad(u_srcImg, ivec2(gl_GlobalInvocationID.xy)).x;\n"
            << "    sb_out.values[gl_GlobalInvocationID.y*stride + gl_GlobalInvocationID.x] = value;\n"
            << "}\n";

        const glw::Functions &gl = m_context.getRenderContext().getFunctions();
        const Buffer outputBuffer(m_context.getRenderContext());
        const Texture inputTexture(m_context.getRenderContext());
        const ShaderProgram program(m_context.getRenderContext(), ProgramSources()
                                                                      << ShaderSource(SHADERTYPE_COMPUTE, src.str()));
        // One invocation per texel: the dispatch covers the image exactly.
        const tcu::IVec2 workSize = m_imageSize / m_localSize;
        de::Random rnd(0xab2c7);
        vector<uint32_t> inputValues(m_imageSize[0] * m_imageSize[1]);

        m_testCtx.getLog() << program;
        if (!program.isOk())
            TCU_FAIL("Compile failed");

        m_testCtx.getLog() << TestLog::Message << "Work groups: " << workSize << TestLog::EndMessage;

        gl.useProgram(program.getProgram());

        // Input values
        for (vector<uint32_t>::iterator i = inputValues.begin(); i != inputValues.end(); ++i)
            *i = rnd.getUint32();

        // Input image setup
        gl.bindTexture(GL_TEXTURE_2D, *inputTexture);
        gl.texStorage2D(GL_TEXTURE_2D, 1, GL_R32UI, m_imageSize[0], m_imageSize[1]);
        gl.texSubImage2D(GL_TEXTURE_2D, 0, 0, 0, m_imageSize[0], m_imageSize[1], GL_RED_INTEGER, GL_UNSIGNED_INT,
                         &inputValues[0]);
        gl.texParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
        gl.texParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
        GLU_EXPECT_NO_ERROR(gl.getError(), "Uploading image data failed");

        // Bind to unit 1
        gl.bindImageTexture(1, *inputTexture, 0, GL_FALSE, 0, GL_READ_ONLY, GL_R32UI);
        GLU_EXPECT_NO_ERROR(gl.getError(), "Image setup failed");

        // Output buffer setup
        {
            const uint32_t blockIndex =
                gl.getProgramResourceIndex(program.getProgram(), GL_SHADER_STORAGE_BLOCK, "Output");
            const int blockSize = getProgramResourceInt(gl, program.getProgram(), GL_SHADER_STORAGE_BLOCK, blockIndex,
                                                        GL_BUFFER_DATA_SIZE);

            gl.bindBuffer(GL_SHADER_STORAGE_BUFFER, *outputBuffer);
            gl.bufferData(GL_SHADER_STORAGE_BUFFER, blockSize, DE_NULL, GL_STREAM_READ);
            gl.bindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, *outputBuffer);
            GLU_EXPECT_NO_ERROR(gl.getError(), "Output buffer setup failed");
        }

        // Dispatch compute workload
        gl.dispatchCompute(workSize[0], workSize[1], 1);
        GLU_EXPECT_NO_ERROR(gl.getError(), "glDispatchCompute()");

        // Shader writes are only guaranteed to be visible through a buffer mapping
        // after a GL_BUFFER_UPDATE_BARRIER_BIT barrier.
        gl.memoryBarrier(GL_BUFFER_UPDATE_BARRIER_BIT);
        GLU_EXPECT_NO_ERROR(gl.getError(), "glMemoryBarrier()");

        // Read back and compare
        {
            const uint32_t blockIndex =
                gl.getProgramResourceIndex(program.getProgram(), GL_SHADER_STORAGE_BLOCK, "Output");
            const int blockSize = getProgramResourceInt(gl, program.getProgram(), GL_SHADER_STORAGE_BLOCK, blockIndex,
                                                        GL_BUFFER_DATA_SIZE);
            const uint32_t valueIndex =
                gl.getProgramResourceIndex(program.getProgram(), GL_BUFFER_VARIABLE, "Output.values");
            const InterfaceVariableInfo valueInfo =
                getProgramInterfaceVariableInfo(gl, program.getProgram(), GL_BUFFER_VARIABLE, valueIndex);
            const BufferMemMap bufMap(gl, GL_SHADER_STORAGE_BUFFER, 0, blockSize, GL_MAP_READ_BIT);

            // The reported array size must match the number of texels, otherwise the
            // loop below would index past inputValues.
            TCU_CHECK(valueInfo.arraySize == (uint32_t)inputValues.size());

            for (uint32_t ndx = 0; ndx < valueInfo.arraySize; ndx++)
            {
                const uint32_t res = *((const uint32_t *)((const uint8_t *)bufMap.getPtr() + valueInfo.offset +
                                                          valueInfo.arrayStride * ndx));
                const uint32_t ref = inputValues[ndx];

                if (res != ref)
                    throw tcu::TestError(string("Comparison failed for Output.values[") + de::toString(ndx) + "]");
            }
        }

        m_testCtx.setTestResult(QP_TEST_RESULT_PASS, "Pass");
        return STOP;
    }

private:
    const tcu::IVec2 m_localSize; //!< Work group size.
    const tcu::IVec2 m_imageSize; //!< Source image dimensions.
};

//! Copies the contents of an SSBO into an r32ui image with a compute shader.
//!
//! The input buffer is filled with pseudo-random values from a fixed seed; one
//! invocation per texel reads its value and writes it with imageStore() to the
//! image bound to image unit 1. The image is then attached to a framebuffer
//! object and read back with glReadPixels() for comparison.
class CopySSBOToImageCase : public TestCase
{
public:
    //! \param localSize Work group size in x and y.
    //! \param imageSize Image width and height; must be a multiple of localSize in each dimension.
    CopySSBOToImageCase(Context &context, const char *name, const char *description, const tcu::IVec2 &localSize,
                        const tcu::IVec2 &imageSize)
        : TestCase(context, name, description)
        , m_localSize(localSize)
        , m_imageSize(imageSize)
    {
        DE_ASSERT(m_imageSize[0] % m_localSize[0] == 0);
        DE_ASSERT(m_imageSize[1] % m_localSize[1] == 0);
    }

    //! Fills the input buffer, runs the copy shader and verifies the image.
    //! Throws tcu::TestError on compile failure, GL error or result mismatch.
    IterateResult iterate(void)
    {
        const GLSLVersion glslVersion = glu::getContextTypeGLSLVersion(m_context.getRenderContext().getType());
        std::ostringstream src;

        src << getGLSLVersionDeclaration(glslVersion) << "\n"
            << "layout (local_size_x = " << m_localSize[0] << ", local_size_y = " << m_localSize[1] << ") in;\n"
            << "layout(r32ui, binding = 1) writeonly uniform highp uimage2D u_dstImg;\n"
            << "buffer Input {\n"
            << "    uint values[" << (m_imageSize[0] * m_imageSize[1]) << "];\n"
            << "} sb_in;\n\n"
            << "void main (void) {\n"
            << "    uint stride = gl_NumWorkGroups.x*gl_WorkGroupSize.x;\n"
            << "    uint value  = sb_in.values[gl_GlobalInvocationID.y*stride + gl_GlobalInvocationID.x];\n"
            << "    imageStore(u_dstImg, ivec2(gl_GlobalInvocationID.xy), uvec4(value, 0, 0, 0));\n"
            << "}\n";

        const glw::Functions &gl = m_context.getRenderContext().getFunctions();
        const Buffer inputBuffer(m_context.getRenderContext());
        const Texture outputTexture(m_context.getRenderContext());
        const ShaderProgram program(m_context.getRenderContext(), ProgramSources()
                                                                      << ShaderSource(SHADERTYPE_COMPUTE, src.str()));
        // One invocation per texel: the dispatch covers the image exactly.
        const tcu::IVec2 workSize = m_imageSize / m_localSize;
        de::Random rnd(0x77238ac2);
        vector<uint32_t> inputValues(m_imageSize[0] * m_imageSize[1]);

        m_testCtx.getLog() << program;
        if (!program.isOk())
            TCU_FAIL("Compile failed");

        m_testCtx.getLog() << TestLog::Message << "Work groups: " << workSize << TestLog::EndMessage;

        gl.useProgram(program.getProgram());

        // Input values
        for (vector<uint32_t>::iterator i = inputValues.begin(); i != inputValues.end(); ++i)
            *i = rnd.getUint32();

        // Input buffer setup
        {
            const uint32_t blockIndex =
                gl.getProgramResourceIndex(program.getProgram(), GL_SHADER_STORAGE_BLOCK, "Input");
            const InterfaceBlockInfo blockInfo =
                getProgramInterfaceBlockInfo(gl, program.getProgram(), GL_SHADER_STORAGE_BLOCK, blockIndex);
            const uint32_t valueIndex =
                gl.getProgramResourceIndex(program.getProgram(), GL_BUFFER_VARIABLE, "Input.values");
            const InterfaceVariableInfo valueInfo =
                getProgramInterfaceVariableInfo(gl, program.getProgram(), GL_BUFFER_VARIABLE, valueIndex);

            gl.bindBuffer(GL_SHADER_STORAGE_BUFFER, *inputBuffer);
            gl.bufferData(GL_SHADER_STORAGE_BUFFER, (glw::GLsizeiptr)blockInfo.dataSize, DE_NULL, GL_STATIC_DRAW);

            TCU_CHECK(valueInfo.arraySize == (uint32_t)inputValues.size());

            // Write the values using the queried offset and stride rather than
            // assuming a tightly packed array.
            {
                const BufferMemMap bufMap(gl, GL_SHADER_STORAGE_BUFFER, 0, (int)blockInfo.dataSize, GL_MAP_WRITE_BIT);

                for (uint32_t ndx = 0; ndx < (uint32_t)inputValues.size(); ndx++)
                    *(uint32_t *)((uint8_t *)bufMap.getPtr() + valueInfo.offset + ndx * valueInfo.arrayStride) =
                        inputValues[ndx];
            }

            // The block has no explicit binding in the shader, so the queried binding is used.
            gl.bindBufferBase(GL_SHADER_STORAGE_BUFFER, blockInfo.bufferBinding, *inputBuffer);
            GLU_EXPECT_NO_ERROR(gl.getError(), "Input buffer setup failed");
        }

        // Output image setup
        gl.bindTexture(GL_TEXTURE_2D, *outputTexture);
        gl.texStorage2D(GL_TEXTURE_2D, 1, GL_R32UI, m_imageSize[0], m_imageSize[1]);
        gl.texParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
        gl.texParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
        GLU_EXPECT_NO_ERROR(gl.getError(), "Uploading image data failed");

        // Bind to unit 1
        gl.bindImageTexture(1, *outputTexture, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_R32UI);
        GLU_EXPECT_NO_ERROR(gl.getError(), "Image setup failed");

        // Dispatch compute workload
        gl.dispatchCompute(workSize[0], workSize[1], 1);
        GLU_EXPECT_NO_ERROR(gl.getError(), "glDispatchCompute()");

        // Image stores are only guaranteed to be visible to reads through a framebuffer
        // attachment after a GL_FRAMEBUFFER_BARRIER_BIT barrier.
        gl.memoryBarrier(GL_FRAMEBUFFER_BARRIER_BIT);
        GLU_EXPECT_NO_ERROR(gl.getError(), "glMemoryBarrier()");

        // Read back and compare
        {
            Framebuffer fbo(m_context.getRenderContext());
            // Four components per pixel because the read-back format is RGBA.
            vector<uint32_t> pixels(inputValues.size() * 4);

            gl.bindFramebuffer(GL_FRAMEBUFFER, *fbo);
            gl.framebufferTexture2D(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, *outputTexture, 0);
            TCU_CHECK(gl.checkFramebufferStatus(GL_FRAMEBUFFER) == GL_FRAMEBUFFER_COMPLETE);

            // \note In ES3 we have to use GL_RGBA_INTEGER
            gl.readBuffer(GL_COLOR_ATTACHMENT0);
            gl.readPixels(0, 0, m_imageSize[0], m_imageSize[1], GL_RGBA_INTEGER, GL_UNSIGNED_INT, &pixels[0]);
            GLU_EXPECT_NO_ERROR(gl.getError(), "Reading pixels failed");

            for (uint32_t ndx = 0; ndx < (uint32_t)inputValues.size(); ndx++)
            {
                // Only the red component carries data for an R32UI image.
                const uint32_t res = pixels[ndx * 4];
                const uint32_t ref = inputValues[ndx];

                if (res != ref)
                    throw tcu::TestError(string("Comparison failed for pixel ") + de::toString(ndx));
            }
        }

        m_testCtx.setTestResult(QP_TEST_RESULT_PASS, "Pass");
        return STOP;
    }

private:
    const tcu::IVec2 m_localSize; //!< Work group size.
    const tcu::IVec2 m_imageSize; //!< Destination image dimensions.
};

//! Exercises imageAtomicAdd() on an r32ui image.
//!
//! One work group is dispatched per image pixel. Invocation 0 of each group
//! clears the group's pixel with imageStore(), all invocations execute
//! barrier(), and then every invocation adds its own input value to the pixel
//! with imageAtomicAdd(). The host expects each pixel to hold the sum of the
//! m_localSize consecutive input values belonging to that work group.
class ImageAtomicOpCase : public TestCase
{
public:
    //! \param localSize Work group size in x (number of values summed per pixel).
    //! \param imageSize Image width and height; also used as the work group count.
    ImageAtomicOpCase(Context &context, const char *name, const char *description, int localSize,
                      const tcu::IVec2 &imageSize)
        : TestCase(context, name, description)
        , m_localSize(localSize)
        , m_imageSize(imageSize)
    {
    }

    //! Throws NotSupportedError unless the context is ES 3.2 / GL 4.5 capable or
    //! exposes GL_OES_shader_image_atomic.
    void init(void)
    {
        auto contextType = m_context.getRenderContext().getType();
        if (!glu::contextSupports(contextType, glu::ApiType::es(3, 2)) &&
            !glu::contextSupports(contextType, glu::ApiType::core(4, 5)) &&
            !m_context.getContextInfo().isExtensionSupported("GL_OES_shader_image_atomic"))
            TCU_THROW(NotSupportedError, "Test requires OES_shader_image_atomic extension");
    }

    //! Fills the input buffer, runs the shader and verifies every pixel.
    //! Throws tcu::TestError on compile failure, GL error or result mismatch.
    IterateResult iterate(void)
    {
        glu::ContextType contextType  = m_context.getRenderContext().getType();
        const GLSLVersion glslVersion = glu::getContextTypeGLSLVersion(contextType);
        const bool supportsES32orGL45 = glu::contextSupports(contextType, glu::ApiType::es(3, 2)) ||
                                        glu::contextSupports(contextType, glu::ApiType::core(4, 5));
        std::ostringstream src;

        // The extension directive is only emitted when the context does not support ES 3.2 / GL 4.5.
        src << getGLSLVersionDeclaration(glslVersion) << "\n"
            << (supportsES32orGL45 ? "\n" : "#extension GL_OES_shader_image_atomic : require\n")
            << "layout (local_size_x = " << m_localSize << ") in;\n"
            << "layout(r32ui, binding = 1) uniform highp uimage2D u_dstImg;\n"
            << "buffer Input {\n"
            << "    uint values[" << (m_imageSize[0] * m_imageSize[1] * m_localSize) << "];\n"
            << "} sb_in;\n\n"
            << "void main (void) {\n"
            << "    uint stride = gl_NumWorkGroups.x*gl_WorkGroupSize.x;\n"
            << "    uint value  = sb_in.values[gl_GlobalInvocationID.y*stride + gl_GlobalInvocationID.x];\n"
            << "\n"
            << "    if (gl_LocalInvocationIndex == 0u)\n"
            << "        imageStore(u_dstImg, ivec2(gl_WorkGroupID.xy), uvec4(0));\n"
            << "    barrier();\n"
            << "    imageAtomicAdd(u_dstImg, ivec2(gl_WorkGroupID.xy), value);\n"
            << "}\n";

        const glw::Functions &gl = m_context.getRenderContext().getFunctions();
        const Buffer inputBuffer(m_context.getRenderContext());
        const Texture outputTexture(m_context.getRenderContext());
        const ShaderProgram program(m_context.getRenderContext(), ProgramSources()
                                                                      << ShaderSource(SHADERTYPE_COMPUTE, src.str()));
        de::Random rnd(0x77238ac2);
        // m_localSize input values per pixel.
        vector<uint32_t> inputValues(m_imageSize[0] * m_imageSize[1] * m_localSize);

        m_testCtx.getLog() << program;
        if (!program.isOk())
            TCU_FAIL("Compile failed");

        m_testCtx.getLog() << TestLog::Message << "Work groups: " << m_imageSize << TestLog::EndMessage;

        gl.useProgram(program.getProgram());

        // Input values
        for (vector<uint32_t>::iterator i = inputValues.begin(); i != inputValues.end(); ++i)
            *i = rnd.getUint32();

        // Input buffer setup
        {
            const uint32_t blockIndex =
                gl.getProgramResourceIndex(program.getProgram(), GL_SHADER_STORAGE_BLOCK, "Input");
            const InterfaceBlockInfo blockInfo =
                getProgramInterfaceBlockInfo(gl, program.getProgram(), GL_SHADER_STORAGE_BLOCK, blockIndex);
            const uint32_t valueIndex =
                gl.getProgramResourceIndex(program.getProgram(), GL_BUFFER_VARIABLE, "Input.values");
            const InterfaceVariableInfo valueInfo =
                getProgramInterfaceVariableInfo(gl, program.getProgram(), GL_BUFFER_VARIABLE, valueIndex);

            gl.bindBuffer(GL_SHADER_STORAGE_BUFFER, *inputBuffer);
            gl.bufferData(GL_SHADER_STORAGE_BUFFER, (glw::GLsizeiptr)blockInfo.dataSize, DE_NULL, GL_STATIC_DRAW);

            TCU_CHECK(valueInfo.arraySize == (uint32_t)inputValues.size());

            // Write the values using the queried offset and stride rather than
            // assuming a tightly packed array.
            {
                const BufferMemMap bufMap(gl, GL_SHADER_STORAGE_BUFFER, 0, (int)blockInfo.dataSize, GL_MAP_WRITE_BIT);

                for (uint32_t ndx = 0; ndx < (uint32_t)inputValues.size(); ndx++)
                    *(uint32_t *)((uint8_t *)bufMap.getPtr() + valueInfo.offset + ndx * valueInfo.arrayStride) =
                        inputValues[ndx];
            }

            // The block has no explicit binding in the shader, so the queried binding is used.
            gl.bindBufferBase(GL_SHADER_STORAGE_BUFFER, blockInfo.bufferBinding, *inputBuffer);
            GLU_EXPECT_NO_ERROR(gl.getError(), "Input buffer setup failed");
        }

        // Output image setup
        gl.bindTexture(GL_TEXTURE_2D, *outputTexture);
        gl.texStorage2D(GL_TEXTURE_2D, 1, GL_R32UI, m_imageSize[0], m_imageSize[1]);
        gl.texParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
        gl.texParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
        GLU_EXPECT_NO_ERROR(gl.getError(), "Uploading image data failed");

        // Bind to unit 1
        gl.bindImageTexture(1, *outputTexture, 0, GL_FALSE, 0, GL_READ_WRITE, GL_R32UI);
        GLU_EXPECT_NO_ERROR(gl.getError(), "Image setup failed");

        // Dispatch compute workload
        gl.dispatchCompute(m_imageSize[0], m_imageSize[1], 1);
        GLU_EXPECT_NO_ERROR(gl.getError(), "glDispatchCompute()");

        // Image writes are only guaranteed to be visible to reads through a framebuffer
        // attachment after a GL_FRAMEBUFFER_BARRIER_BIT barrier.
        gl.memoryBarrier(GL_FRAMEBUFFER_BARRIER_BIT);
        GLU_EXPECT_NO_ERROR(gl.getError(), "glMemoryBarrier()");

        // Read back and compare
        {
            Framebuffer fbo(m_context.getRenderContext());
            // Four components per pixel because the read-back format is RGBA.
            vector<uint32_t> pixels(m_imageSize[0] * m_imageSize[1] * 4);

            gl.bindFramebuffer(GL_FRAMEBUFFER, *fbo);
            gl.framebufferTexture2D(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, *outputTexture, 0);
            TCU_CHECK(gl.checkFramebufferStatus(GL_FRAMEBUFFER) == GL_FRAMEBUFFER_COMPLETE);

            // \note In ES3 we have to use GL_RGBA_INTEGER
            gl.readBuffer(GL_COLOR_ATTACHMENT0);
            gl.readPixels(0, 0, m_imageSize[0], m_imageSize[1], GL_RGBA_INTEGER, GL_UNSIGNED_INT, &pixels[0]);
            GLU_EXPECT_NO_ERROR(gl.getError(), "Reading pixels failed");

            for (int pixelNdx = 0; pixelNdx < (int)inputValues.size() / m_localSize; pixelNdx++)
            {
                // Only the red component carries data for an R32UI image.
                const uint32_t res = pixels[pixelNdx * 4];
                uint32_t ref       = 0;

                // Reference: sum of the m_localSize consecutive inputs of this pixel's work
                // group, accumulated in uint32_t like the shader's atomic add.
                for (int offs = 0; offs < m_localSize; offs++)
                    ref += inputValues[pixelNdx * m_localSize + offs];

                if (res != ref)
                    throw tcu::TestError(string("Comparison failed for pixel ") + de::toString(pixelNdx));
            }
        }

        m_testCtx.setTestResult(QP_TEST_RESULT_PASS, "Pass");
        return STOP;
    }

private:
    const int m_localSize;        //!< Work group size in x.
    const tcu::IVec2 m_imageSize; //!< Image dimensions and work group count.
};

//! Checks that image stores made by one compute dispatch are visible to image loads in a following
//! dispatch when the two are separated by glMemoryBarrier(GL_SHADER_IMAGE_ACCESS_BARRIER_BIT).
//!
//! The first program stores (u_baseVal + linear work group index) into the texel addressed by the
//! work group ID. The second program loads the texel of each work group and accumulates the values
//! into a single SSBO variable with atomicAdd(). The accumulated value is compared against a
//! reference computed on the CPU.
class ImageBarrierCase : public TestCase
{
public:
    //! \param workSize Work group counts in X and Y; also used as the size of the intermediate image.
    ImageBarrierCase(Context &context, const char *name, const char *description, const tcu::IVec2 &workSize)
        : TestCase(context, name, description)
        , m_workSize(workSize)
    {
    }

    //! Runs both dispatches and validates the sum. A result mismatch throws tcu::TestError;
    //! otherwise the result is set to pass and STOP is returned.
    IterateResult iterate(void)
    {
        const GLSLVersion glslVersion = glu::getContextTypeGLSLVersion(m_context.getRenderContext().getType());
        const char *const glslVersionDeclaration = getGLSLVersionDeclaration(glslVersion);

        // Producer: one texel per work group, value = u_baseVal + linear work group index.
        std::ostringstream src0;
        src0 << glslVersionDeclaration << "\n"
             << "layout (local_size_x = 1) in;\n"
                "uniform uint u_baseVal;\n"
                "layout(r32ui, binding = 2) writeonly uniform highp uimage2D u_img;\n"
                "void main (void) {\n"
                "    uint offset = gl_NumWorkGroups.x*gl_NumWorkGroups.y*gl_WorkGroupID.z + "
                "gl_NumWorkGroups.x*gl_WorkGroupID.y + gl_WorkGroupID.x;\n"
                "    imageStore(u_img, ivec2(gl_WorkGroupID.xy), uvec4(offset+u_baseVal, 0, 0, 0));\n"
                "}\n";

        // Consumer: reads the texel back and adds it to a shared sum.
        std::ostringstream src1;
        src1 << glslVersionDeclaration << "\n"
             << "layout (local_size_x = 1) in;\n"
                "layout(r32ui, binding = 2) readonly uniform highp uimage2D u_img;\n"
                "layout(binding = 0) buffer Output {\n"
                "    coherent uint sum;\n"
                "};\n"
                "void main (void) {\n"
                "    uint value = imageLoad(u_img, ivec2(gl_WorkGroupID.xy)).x;\n"
                "    atomicAdd(sum, value);\n"
                "}\n";

        const ShaderProgram program0(m_context.getRenderContext(), ProgramSources() << ComputeSource(src0.str()));
        const ShaderProgram program1(m_context.getRenderContext(), ProgramSources() << ComputeSource(src1.str()));

        const glw::Functions &gl = m_context.getRenderContext().getFunctions();
        const Texture tempTexture(m_context.getRenderContext());
        const Buffer outputBuffer(m_context.getRenderContext());
        const uint32_t baseValue = 127;

        m_testCtx.getLog() << program0 << program1;
        if (!program0.isOk() || !program1.isOk())
            TCU_FAIL("Compile failed");

        m_testCtx.getLog() << TestLog::Message << "Work groups: " << m_workSize << TestLog::EndMessage;

        // Temp texture setup
        gl.bindTexture(GL_TEXTURE_2D, *tempTexture);
        gl.texStorage2D(GL_TEXTURE_2D, 1, GL_R32UI, m_workSize[0], m_workSize[1]);
        gl.texParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
        gl.texParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
        GLU_EXPECT_NO_ERROR(gl.getError(), "Uploading image data failed");

        // Bind to unit 2 (matches "binding = 2" in both shaders)
        gl.bindImageTexture(2, *tempTexture, 0, GL_FALSE, 0, GL_READ_WRITE, GL_R32UI);
        GLU_EXPECT_NO_ERROR(gl.getError(), "Image setup failed");

        // Size of the "Output" block; queried once and reused for buffer setup and readback.
        const uint32_t blockIndex =
            gl.getProgramResourceIndex(program1.getProgram(), GL_SHADER_STORAGE_BLOCK, "Output");
        const int blockSize = getProgramResourceInt(gl, program1.getProgram(), GL_SHADER_STORAGE_BLOCK, blockIndex,
                                                    GL_BUFFER_DATA_SIZE);

        // Output buffer setup
        gl.bindBuffer(GL_SHADER_STORAGE_BUFFER, *outputBuffer);
        gl.bufferData(GL_SHADER_STORAGE_BUFFER, blockSize, DE_NULL, GL_STREAM_READ);

        {
            // Zero the accumulator; the mapping is released at the end of this scope.
            const BufferMemMap bufMap(gl, GL_SHADER_STORAGE_BUFFER, 0, blockSize, GL_MAP_WRITE_BIT);
            deMemset(bufMap.getPtr(), 0, blockSize);
        }

        gl.bindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, *outputBuffer);
        GLU_EXPECT_NO_ERROR(gl.getError(), "Output buffer setup failed");

        // Dispatch compute workload
        gl.useProgram(program0.getProgram());
        gl.uniform1ui(gl.getUniformLocation(program0.getProgram(), "u_baseVal"), baseValue);
        gl.dispatchCompute(m_workSize[0], m_workSize[1], 1);
        // Barrier under test: image stores above must be visible to the image loads below.
        gl.memoryBarrier(GL_SHADER_IMAGE_ACCESS_BARRIER_BIT);
        gl.useProgram(program1.getProgram());
        gl.dispatchCompute(m_workSize[0], m_workSize[1], 1);
        // Shader writes to the SSBO are only guaranteed to be reflected in a subsequent
        // glMapBufferRange() after a GL_BUFFER_UPDATE_BARRIER_BIT barrier.
        gl.memoryBarrier(GL_BUFFER_UPDATE_BARRIER_BIT);
        GLU_EXPECT_NO_ERROR(gl.getError(), "Failed to dispatch commands");

        // Read back and compare
        {
            const uint32_t valueIndex = gl.getProgramResourceIndex(program1.getProgram(), GL_BUFFER_VARIABLE, "sum");
            const InterfaceVariableInfo valueInfo =
                getProgramInterfaceVariableInfo(gl, program1.getProgram(), GL_BUFFER_VARIABLE, valueIndex);
            const BufferMemMap bufMap(gl, GL_SHADER_STORAGE_BUFFER, 0, blockSize, GL_MAP_READ_BIT);

            const uint32_t res = *((const uint32_t *)((const uint8_t *)bufMap.getPtr() + valueInfo.offset));
            uint32_t ref       = 0;

            // Reference: every work group contributes baseValue plus its linear index.
            for (int ndx = 0; ndx < m_workSize[0] * m_workSize[1]; ndx++)
                ref += baseValue + (uint32_t)ndx;

            if (res != ref)
            {
                m_testCtx.getLog() << TestLog::Message << "ERROR: comparison failed, expected " << ref << ", got "
                                   << res << TestLog::EndMessage;
                throw tcu::TestError("Comparison failed");
            }
        }

        m_testCtx.setTestResult(QP_TEST_RESULT_PASS, "Pass");
        return STOP;
    }

private:
    const tcu::IVec2 m_workSize; //!< Work group counts (X, Y) and intermediate image size.
};

//! Exercises atomicCounterIncrement() from a compute shader.
//!
//! Every invocation increments one atomic counter and stores the value returned by the increment
//! into its own element of an SSBO array. After the dispatch the test checks that
//!  - the counter equals the total number of invocations,
//!  - every stored value lies in [0, numValues), and
//!  - the stored values add up to 0 + 1 + ... + (numValues - 1).
class AtomicCounterCase : public TestCase
{
public:
    //! \param localSize Work group size (local_size_x/y/z of the shader).
    //! \param workSize  Number of work groups dispatched in each dimension.
    AtomicCounterCase(Context &context, const char *name, const char *description, const tcu::IVec3 &localSize,
                      const tcu::IVec3 &workSize)
        : TestCase(context, name, description)
        , m_localSize(localSize)
        , m_workSize(workSize)
    {
    }

    //! Builds the shader, runs one dispatch and validates counter and SSBO contents.
    //! A validation failure throws tcu::TestError; otherwise the result is set to pass and STOP is returned.
    IterateResult iterate(void)
    {
        const glw::Functions &gl = m_context.getRenderContext().getFunctions();
        const Buffer outputBuffer(m_context.getRenderContext());
        const Buffer counterBuffer(m_context.getRenderContext());
        const int workGroupSize  = m_localSize[0] * m_localSize[1] * m_localSize[2];
        const int workGroupCount = m_workSize[0] * m_workSize[1] * m_workSize[2];
        const int numValues      = workGroupSize * workGroupCount; // total number of invocations

        const GLSLVersion glslVersion = glu::getContextTypeGLSLVersion(m_context.getRenderContext().getType());
        std::ostringstream src;

        src << getGLSLVersionDeclaration(glslVersion) << "\n"
            << "layout (local_size_x = " << m_localSize[0] << ", local_size_y = " << m_localSize[1]
            << ", local_size_z = " << m_localSize[2] << ") in;\n"
            << "layout(binding = 0) buffer Output {\n"
            << "    uint values[" << numValues << "];\n"
            << "} sb_out;\n\n"
            << "layout(binding = 0, offset = 0) uniform atomic_uint u_count;\n\n"
            << "void main (void) {\n"
            << "    uint localSize  = gl_WorkGroupSize.x*gl_WorkGroupSize.y*gl_WorkGroupSize.z;\n"
            << "    uint globalNdx  = gl_NumWorkGroups.x*gl_NumWorkGroups.y*gl_WorkGroupID.z + "
               "gl_NumWorkGroups.x*gl_WorkGroupID.y + gl_WorkGroupID.x;\n"
            << "    uint globalOffs = localSize*globalNdx;\n"
            << "    uint localOffs  = gl_WorkGroupSize.x*gl_WorkGroupSize.y*gl_LocalInvocationID.z + "
               "gl_WorkGroupSize.x*gl_LocalInvocationID.y + gl_LocalInvocationID.x;\n"
            << "\n"
            << "    uint oldVal = atomicCounterIncrement(u_count);\n"
            << "    sb_out.values[globalOffs+localOffs] = oldVal;\n"
            << "}\n";

        const ShaderProgram program(m_context.getRenderContext(), ProgramSources() << ComputeSource(src.str()));

        m_testCtx.getLog() << program;
        if (!program.isOk())
            TCU_FAIL("Compile failed");

        m_testCtx.getLog() << TestLog::Message << "Work groups: " << m_workSize << TestLog::EndMessage;

        gl.useProgram(program.getProgram());

        // Atomic counter buffer layout; queried once and reused for setup and readback.
        const uint32_t uniformIndex = gl.getProgramResourceIndex(program.getProgram(), GL_UNIFORM, "u_count");
        const uint32_t bufferIndex  = getProgramResourceUint(gl, program.getProgram(), GL_UNIFORM, uniformIndex,
                                                             GL_ATOMIC_COUNTER_BUFFER_INDEX);
        const uint32_t bufferSize   = getProgramResourceUint(gl, program.getProgram(), GL_ATOMIC_COUNTER_BUFFER,
                                                             bufferIndex, GL_BUFFER_DATA_SIZE);

        // Atomic counter buffer setup
        gl.bindBuffer(GL_ATOMIC_COUNTER_BUFFER, *counterBuffer);
        gl.bufferData(GL_ATOMIC_COUNTER_BUFFER, bufferSize, DE_NULL, GL_STREAM_READ);

        {
            // Start the counter from zero; the mapping is released at the end of this scope.
            const BufferMemMap memMap(gl, GL_ATOMIC_COUNTER_BUFFER, 0, bufferSize, GL_MAP_WRITE_BIT);
            deMemset(memMap.getPtr(), 0, (int)bufferSize);
        }

        gl.bindBufferBase(GL_ATOMIC_COUNTER_BUFFER, 0, *counterBuffer);
        GLU_EXPECT_NO_ERROR(gl.getError(), "Atomic counter buffer setup failed");

        // Size of the "Output" block; queried once and reused for setup and readback.
        const uint32_t blockIndex =
            gl.getProgramResourceIndex(program.getProgram(), GL_SHADER_STORAGE_BLOCK, "Output");
        const int blockSize = getProgramResourceInt(gl, program.getProgram(), GL_SHADER_STORAGE_BLOCK, blockIndex,
                                                    GL_BUFFER_DATA_SIZE);

        // Output buffer setup
        gl.bindBuffer(GL_SHADER_STORAGE_BUFFER, *outputBuffer);
        gl.bufferData(GL_SHADER_STORAGE_BUFFER, blockSize, DE_NULL, GL_STREAM_READ);
        gl.bindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, *outputBuffer);
        GLU_EXPECT_NO_ERROR(gl.getError(), "Output buffer setup failed");

        // Dispatch compute workload
        gl.dispatchCompute(m_workSize[0], m_workSize[1], m_workSize[2]);
        // Shader writes to buffer objects are only guaranteed to be reflected in a subsequent
        // glMapBufferRange() after a GL_BUFFER_UPDATE_BARRIER_BIT barrier.
        gl.memoryBarrier(GL_BUFFER_UPDATE_BARRIER_BIT);
        GLU_EXPECT_NO_ERROR(gl.getError(), "glDispatchCompute()");

        // Read back and compare atomic counter
        {
            const uint32_t uniformOffset =
                getProgramResourceUint(gl, program.getProgram(), GL_UNIFORM, uniformIndex, GL_OFFSET);
            const BufferMemMap bufMap(gl, GL_ATOMIC_COUNTER_BUFFER, 0, bufferSize, GL_MAP_READ_BIT);

            const uint32_t resVal = *((const uint32_t *)((const uint8_t *)bufMap.getPtr() + uniformOffset));

            // One increment per invocation.
            if (resVal != (uint32_t)numValues)
                throw tcu::TestError("Invalid atomic counter value");
        }

        // Read back and compare SSBO
        {
            const uint32_t valueIndex =
                gl.getProgramResourceIndex(program.getProgram(), GL_BUFFER_VARIABLE, "Output.values");
            const InterfaceVariableInfo valueInfo =
                getProgramInterfaceVariableInfo(gl, program.getProgram(), GL_BUFFER_VARIABLE, valueIndex);
            const BufferMemMap bufMap(gl, GL_SHADER_STORAGE_BUFFER, 0, blockSize, GL_MAP_READ_BIT);
            uint32_t valSum = 0;
            uint32_t refSum = 0;

            for (int valNdx = 0; valNdx < numValues; valNdx++)
            {
                const uint32_t res = *((const uint32_t *)((const uint8_t *)bufMap.getPtr() + valueInfo.offset +
                                                          valueInfo.arrayStride * valNdx));

                valSum += res;
                refSum += (uint32_t)valNdx;

                // The order of increments is unspecified, so only the range of each value is checked here.
                if (!de::inBounds<uint32_t>(res, 0, (uint32_t)numValues))
                    throw tcu::TestError(string("Comparison failed for Output.values[") + de::toString(valNdx) + "]");
            }

            if (valSum != refSum)
                throw tcu::TestError("Total sum of values in Output.values doesn't match");
        }

        m_testCtx.setTestResult(QP_TEST_RESULT_PASS, "Pass");
        return STOP;
    }

private:
    const tcu::IVec3 m_localSize; //!< Work group size.
    const tcu::IVec3 m_workSize;  //!< Number of work groups per dimension.
};

} // namespace

//! Constructs the test group with name "basic" and description "Basic Compute Shader Tests".
//! No child cases are created here; init() registers them.
BasicComputeShaderTests::BasicComputeShaderTests(Context &context)
    : TestCaseGroup(context, "basic", "Basic Compute Shader Tests")
{
}

//! Empty destructor body; this class releases nothing itself.
BasicComputeShaderTests::~BasicComputeShaderTests(void)
{
}

void BasicComputeShaderTests::init(void)
{
    addChild(new EmptyComputeShaderCase(m_context));

    addChild(new UBOToSSBOInvertCase(m_context, "ubo_to_ssbo_single_invocation",
                                     "Copy from UBO to SSBO, inverting bits", 256, tcu::IVec3(1, 1, 1),
                                     tcu::IVec3(1, 1, 1)));
    addChild(new UBOToSSBOInvertCase(m_context, "ubo_to_ssbo_single_group", "Copy from UBO to SSBO, inverting bits",
                                     1024, tcu::IVec3(2, 1, 4), tcu::IVec3(1, 1, 1)));
    addChild(new UBOToSSBOInvertCase(m_context, "ubo_to_ssbo_multiple_invocations",
                                     "Copy from UBO to SSBO, inverting bits", 1024, tcu::IVec3(1, 1, 1),
                                     tcu::IVec3(2, 4, 1)));
    addChild(new UBOToSSBOInvertCase(m_context, "ubo_to_ssbo_multiple_groups", "Copy from UBO to SSBO, inverting bits",
                                     1024, tcu::IVec3(1, 4, 2), tcu::IVec3(2, 2, 4)));

    addChild(new CopyInvertSSBOCase(m_context, "copy_ssbo_single_invocation", "Copy between SSBOs, inverting bits", 256,
                                    tcu::IVec3(1, 1, 1), tcu::IVec3(1, 1, 1)));
    addChild(new CopyInvertSSBOCase(m_context, "copy_ssbo_multiple_invocations", "Copy between SSBOs, inverting bits",
                                    1024, tcu::IVec3(1, 1, 1), tcu::IVec3(2, 4, 1)));
    addChild(new CopyInvertSSBOCase(m_context, "copy_ssbo_multiple_groups", "Copy between SSBOs, inverting bits", 1024,
                                    tcu::IVec3(1, 4, 2), tcu::IVec3(2, 2, 4)));

    addChild(new InvertSSBOInPlaceCase(m_context, "ssbo_rw_single_invocation", "Read and write same SSBO", 256, true,
                                       tcu::IVec3(1, 1, 1), tcu::IVec3(1, 1, 1)));
    addChild(new InvertSSBOInPlaceCase(m_context, "ssbo_rw_multiple_groups", "Read and write same SSBO", 1024, true,
                                       tcu::IVec3(1, 4, 2), tcu::IVec3(2, 2, 4)));

    addChild(new InvertSSBOInPlaceCase(m_context, "ssbo_unsized_arr_single_invocation", "Read and write same SSBO", 256,
                                       false, tcu::IVec3(1, 1, 1), tcu::IVec3(1, 1, 1)));
    addChild(new InvertSSBOInPlaceCase(m_context, "ssbo_unsized_arr_multiple_groups", "Read and write same SSBO", 1024,
                                       false, tcu::IVec3(1, 4, 2), tcu::IVec3(2, 2, 4)));

    addChild(new WriteToMultipleSSBOCase(m_context, "write_multiple_arr_single_invocation", "Write to multiple SSBOs",
                                         256, true, tcu::IVec3(1, 1, 1), tcu::IVec3(1, 1, 1)));
    addChild(new WriteToMultipleSSBOCase(m_context, "write_multiple_arr_multiple_groups", "Write to multiple SSBOs",
                                         1024, true, tcu::IVec3(1, 4, 2), tcu::IVec3(2, 2, 4)));

    addChild(new WriteToMultipleSSBOCase(m_context, "write_multiple_unsized_arr_single_invocation",
                                         "Write to multiple SSBOs", 256, false, tcu::IVec3(1, 1, 1),
                                         tcu::IVec3(1, 1, 1)));
    addChild(new WriteToMultipleSSBOCase(m_context, "write_multiple_unsized_arr_multiple_groups",
                                         "Write to multiple SSBOs", 1024, false, tcu::IVec3(1, 4, 2),
                                         tcu::IVec3(2, 2, 4)));

    addChild(new SSBOLocalBarrierCase(m_context, "ssbo_local_barrier_single_invocation", "SSBO local barrier usage",
                                      tcu::IVec3(1, 1, 1), tcu::IVec3(1, 1, 1)));
    addChild(new SSBOLocalBarrierCase(m_context, "ssbo_local_barrier_single_group", "SSBO local barrier usage",
                                      tcu::IVec3(3, 2, 5), tcu::IVec3(1, 1, 1)));
    addChild(new SSBOLocalBarrierCase(m_context, "ssbo_local_barrier_multiple_groups", "SSBO local barrier usage",
                                      tcu::IVec3(3, 4, 1), tcu::IVec3(2, 7, 3)));

    addChild(
        new SSBOBarrierCase(m_context, "ssbo_cmd_barrier_single", "SSBO memory barrier usage", tcu::IVec3(1, 1, 1)));
    addChild(
        new SSBOBarrierCase(m_context, "ssbo_cmd_barrier_multiple", "SSBO memory barrier usage", tcu::IVec3(11, 5, 7)));

    addChild(new BasicSharedVarCase(m_context, "shared_var_single_invocation", "Basic shared variable usage",
                                    tcu::IVec3(1, 1, 1), tcu::IVec3(1, 1, 1)));
    addChild(new BasicSharedVarCase(m_context, "shared_var_single_group", "Basic shared variable usage",
                                    tcu::IVec3(3, 2, 5), tcu::IVec3(1, 1, 1)));
    addChild(new BasicSharedVarCase(m_context, "shared_var_multiple_invocations", "Basic shared variable usage",
                                    tcu::IVec3(1, 1, 1), tcu::IVec3(2, 5, 4)));
    addChild(new BasicSharedVarCase(m_context, "shared_var_multiple_groups", "Basic shared variable usage",
                                    tcu::IVec3(3, 4, 1), tcu::IVec3(2, 7, 3)));

    addChild(new SharedVarAtomicOpCase(m_context, "shared_atomic_op_single_invocation",
                                       "Atomic operation with shared var", tcu::IVec3(1, 1, 1), tcu::IVec3(1, 1, 1)));
    addChild(new SharedVarAtomicOpCase(m_context, "shared_atomic_op_single_group", "Atomic operation with shared var",
                                       tcu::IVec3(3, 2, 5), tcu::IVec3(1, 1, 1)));
    addChild(new SharedVarAtomicOpCase(m_context, "shared_atomic_op_multiple_invocations",
                                       "Atomic operation with shared var", tcu::IVec3(1, 1, 1), tcu::IVec3(2, 5, 4)));
    addChild(new SharedVarAtomicOpCase(m_context, "shared_atomic_op_multiple_groups",
                                       "Atomic operation with shared var", tcu::IVec3(3, 4, 1), tcu::IVec3(2, 7, 3)));

    addChild(new CopyImageToSSBOCase(m_context, "copy_image_to_ssbo_small", "Image to SSBO copy", tcu::IVec2(1, 1),
                                     tcu::IVec2(64, 64)));
    addChild(new CopyImageToSSBOCase(m_context, "copy_image_to_ssbo_large", "Image to SSBO copy", tcu::IVec2(2, 4),
                                     tcu::IVec2(512, 512)));

    addChild(new CopySSBOToImageCase(m_context, "copy_ssbo_to_image_small", "SSBO to image copy", tcu::IVec2(1, 1),
                                     tcu::IVec2(64, 64)));
    addChild(new CopySSBOToImageCase(m_context, "copy_ssbo_to_image_large", "SSBO to image copy", tcu::IVec2(2, 4),
                                     tcu::IVec2(512, 512)));

    addChild(new ImageAtomicOpCase(m_context, "image_atomic_op_local_size_1", "Atomic operation with image", 1,
                                   tcu::IVec2(64, 64)));
    addChild(new ImageAtomicOpCase(m_context, "image_atomic_op_local_size_8", "Atomic operation with image", 8,
                                   tcu::IVec2(64, 64)));

    addChild(new ImageBarrierCase(m_context, "image_barrier_single", "Image barrier", tcu::IVec2(1, 1)));
    addChild(new ImageBarrierCase(m_context, "image_barrier_multiple", "Image barrier", tcu::IVec2(64, 64)));

    addChild(new AtomicCounterCase(m_context, "atomic_counter_single_invocation", "Basic atomic counter test",
                                   tcu::IVec3(1, 1, 1), tcu::IVec3(1, 1, 1)));
    addChild(new AtomicCounterCase(m_context, "atomic_counter_single_group", "Basic atomic counter test",
                                   tcu::IVec3(3, 2, 5), tcu::IVec3(1, 1, 1)));
    addChild(new AtomicCounterCase(m_context, "atomic_counter_multiple_invocations", "Basic atomic counter test",
                                   tcu::IVec3(1, 1, 1), tcu::IVec3(2, 5, 4)));
    addChild(new AtomicCounterCase(m_context, "atomic_counter_multiple_groups", "Basic atomic counter test",
                                   tcu::IVec3(3, 4, 1), tcu::IVec3(2, 7, 3)));
}

} // namespace Functional
} // namespace gles31
} // namespace deqp