#pragma once

#include <array>
#include <deque>
#include <memory>
#include <queue>
#include <span>
#include <variant>
#include <vector>

#include "GCM.h"
#include "rsx_cache.h"
#include "RSXFIFO.h"
#include "RSXOffload.h"
#include "RSXZCULL.h"
#include "rsx_utils.h"
#include "Common/bitfield.hpp"
#include "Common/profiling_timer.hpp"
#include "Common/texture_cache_types.h"
#include "Program/RSXVertexProgram.h"
#include "Program/RSXFragmentProgram.h"
#include "Utilities/Thread.h"
#include "Utilities/geometry.h"
#include "Capture/rsx_trace.h"
#include "Capture/rsx_replay.h"
#include "Emu/Cell/lv2/sys_rsx.h"
#include "Emu/IdManager.h"
#include "Emu/system_config.h"

extern atomic_t<bool> g_user_asked_for_frame_capture;
extern atomic_t<bool> g_disable_frame_limit;
extern rsx::frame_trace_data frame_debug;
extern rsx::frame_capture_data frame_capture;

namespace rsx
{
	namespace overlays
	{
		class display_manager;
	}

	struct rsx_iomap_table
	{
		std::array<atomic_t<u32>, 4096> ea;
		std::array<atomic_t<u32>, 4096> io;
		std::array<shared_mutex, 4096> rs{};

		rsx_iomap_table() noexcept
			: ea(fill_array(-1))
			, io(fill_array(-1))
		{
		}

		// Try to get the real address given a mapped address
		// Returns -1 on failure
		u32 get_addr(u32 offs) const noexcept
		{
			return this->ea[offs >> 20] | (offs & 0xFFFFF);
		}

		template <bool IsFullLock>
		bool lock(u32 addr, u32 len, cpu_thread* self = nullptr) noexcept
		{
			if (len <= 1)
				return false;

			const u32 end = addr + len - 1;
			for (u32 block = (addr >> 20); block <= (end >> 20); ++block)
			{
				auto& mutex_ = rs[block];

				if constexpr (IsFullLock)
				{
					if (self) [[likely]]
					{
						while (!mutex_.try_lock())
						{
							self->cpu_wait({});
						}
					}
					else
					{
						mutex_.lock();
					}
				}
				else
				{
					if (!self) [[likely]]
					{
						mutex_.lock_shared();
					}
					else
					{
						while (!mutex_.try_lock_shared())
						{
							self->cpu_wait({});
						}
					}
				}
			}

			return true;
		}

		template <bool IsFullLock>
		void unlock(u32 addr, u32 len) noexcept
		{
			ensure(len >= 1);

			const u32 end = addr + len - 1;
			for (u32 block = (addr >> 20); block <= (end >> 20); ++block)
			{
				if constexpr (IsFullLock)
				{
					rs[block].unlock();
				}
				else
				{
					rs[block].unlock_shared();
				}
			}
		}
	};
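	// Illustrative sketch (not part of the interface): the table stripes the RSX
	// IO window into 4096 blocks of 1 MiB each (addr >> 20 selects the block), so
	// a lock over [addr, addr + len) takes every per-block mutex the range touches.
	// A hypothetical caller protecting a transfer might do:
	//
	//   if (iomap_table.lock<true>(dst_addr, dst_len, cpu_thread::get_current()))
	//   {
	//       // ... copy data through the IO window ...
	//       iomap_table.unlock<true>(dst_addr, dst_len);
	//   }
	//
	// Passing the current cpu_thread lets the spin loops above service pending
	// CPU work via cpu_wait() instead of blocking outright.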
	enum framebuffer_creation_context : u8
	{
		context_draw = 0,
		context_clear_color = 1,
		context_clear_depth = 2,
		context_clear_all = context_clear_color | context_clear_depth
	};

	enum pipeline_state : u32
	{
		fragment_program_ucode_dirty = 0x1,     // Fragment program ucode changed
		vertex_program_ucode_dirty = 0x2,       // Vertex program ucode changed
		fragment_program_state_dirty = 0x4,     // Fragment program state changed
		vertex_program_state_dirty = 0x8,       // Vertex program state changed
		fragment_state_dirty = 0x10,            // Fragment state changed (alpha test, etc)
		vertex_state_dirty = 0x20,              // Vertex state changed (scale_offset, clip planes, etc)
		transform_constants_dirty = 0x40,       // Transform constants changed
		fragment_constants_dirty = 0x80,        // Fragment constants changed
		framebuffer_reads_dirty = 0x100,        // Framebuffer contents changed
		fragment_texture_state_dirty = 0x200,   // Fragment texture parameters changed
		vertex_texture_state_dirty = 0x400,     // Vertex texture parameters changed
		scissor_config_state_dirty = 0x800,     // Scissor region changed
		zclip_config_state_dirty = 0x1000,      // Viewport Z clip changed

		scissor_setup_invalid = 0x2000,         // Scissor configuration is broken
		scissor_setup_clipped = 0x4000,         // Scissor region is cropped by viewport constraint

		polygon_stipple_pattern_dirty = 0x8000, // Rasterizer stippling pattern changed
		line_stipple_pattern_dirty = 0x10000,   // Line stippling pattern changed

		push_buffer_arrays_dirty = 0x20000,     // Push buffers have data written to them (immediate mode vertex buffers)

		fragment_program_dirty = fragment_program_ucode_dirty | fragment_program_state_dirty,
		vertex_program_dirty = vertex_program_ucode_dirty | vertex_program_state_dirty,
		invalidate_pipeline_bits = fragment_program_dirty | vertex_program_dirty,
		invalidate_zclip_bits = vertex_state_dirty | zclip_config_state_dirty,
		memory_barrier_bits = framebuffer_reads_dirty,
		all_dirty = ~0u
	};
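	// Illustrative sketch (assumed consumer, not a contract): backends accumulate
	// pipeline_state bits in thread::m_graphics_state as registers are written,
	// then test and clear them when building a draw:
	//
	//   if (m_graphics_state & rsx::pipeline_state::invalidate_pipeline_bits)
	//   {
	//       analyse_current_rsx_pipeline(); // re-resolve program/pipeline status
	//   }
	//   m_graphics_state &= ~rsx::pipeline_state::fragment_constants_dirty;
	//
	// The compound masks (invalidate_pipeline_bits, invalidate_zclip_bits, ...)
	// exist so that related bits can be tested or cleared in one operation.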
	enum eng_interrupt_reason : u32
	{
		backend_interrupt = 0x0001,       // Backend-related interrupt
		memory_config_interrupt = 0x0002, // Memory configuration changed
		display_interrupt = 0x0004,       // Display handling
		pipe_flush_interrupt = 0x0008,    // Flush pipelines

		all_interrupt_bits = memory_config_interrupt | backend_interrupt | display_interrupt | pipe_flush_interrupt
	};

	enum FIFO_state : u8
	{
		running = 0,
		empty = 1,    // PUT == GET
		spinning = 2, // Puller continuously jumps to self addr (synchronization technique)
		nop = 3,      // Puller is processing a NOP command
		lock_wait = 4 // Puller is processing a lock acquire
	};

	enum FIFO_hint : u8
	{
		hint_conditional_render_eval = 1,
		hint_zcull_sync = 2
	};

	enum result_flags : u8
	{
		result_none = 0,
		result_error = 1,
		result_zcull_intr = 2
	};

	enum ROP_control : u32
	{
		alpha_test_enable = (1u << 0),
		framebuffer_srgb_enable = (1u << 1),
		csaa_enable = (1u << 4),
		msaa_mask_enable = (1u << 5),
		msaa_config_mask = (3u << 6),
		polygon_stipple_enable = (1u << 9),
		alpha_func_mask = (7u << 16)
	};

	u32 get_vertex_type_size_on_host(vertex_base_type type, u32 size);

	u32 get_address(u32 offset, u32 location, u32 size_to_check = 0,
		u32 line = __builtin_LINE(),
		u32 col = __builtin_COLUMN(),
		const char* file = __builtin_FILE(),
		const char* func = __builtin_FUNCTION());

	struct tiled_region
	{
		u32 address;
		u32 base;
		GcmTileInfo* tile;
		u8* ptr;

		void write(const void* src, u32 width, u32 height, u32 pitch);
		void read(void* dst, u32 width, u32 height, u32 pitch);
	};

	struct vertex_array_buffer
	{
		rsx::vertex_base_type type;
		u8 attribute_size;
		u8 stride;
		std::span<const std::byte> data;
		u8 index;
		bool is_be;
	};

	struct vertex_array_register
	{
		rsx::vertex_base_type type;
		u8 attribute_size;
		std::array<u32, 4> data;
		u8 index;
	};

	struct empty_vertex_array
	{
		u8 index;
	};

	struct draw_array_command
	{
		u32 __dummy;
	};

	struct draw_indexed_array_command
	{
		std::span<const std::byte> raw_index_buffer;
	};

	struct draw_inlined_array
	{
		u32 __dummy;
		u32 __dummy2;
	};

	struct interleaved_attribute_t
	{
		u8 index;
		bool modulo;
		u16 frequency;
	};

	struct interleaved_range_info
	{
		bool interleaved = false;
		bool single_vertex = false;
		u32 base_offset = 0;
		u32 real_offset_address = 0;
		u8 memory_location = 0;
		u8 attribute_stride = 0;

		rsx::simple_array<interleaved_attribute_t> locations;

		// Check if we need to upload a full unoptimized range, i.e [0-max_index]
		std::pair<u32, u32> calculate_required_range(u32 first, u32 count) const;
	};

	enum attribute_buffer_placement : u8
	{
		none = 0,
		persistent = 1,
		transient = 2
	};

	struct vertex_input_layout
	{
		std::vector<interleaved_range_info> interleaved_blocks{}; // Interleaved blocks to be uploaded as-is
		std::vector<std::pair<u8, u32>> volatile_blocks{};        // Volatile data blocks (immediate draw vertex data for example)
		rsx::simple_array<u8> referenced_registers{};             // Volatile register data

		std::array<attribute_buffer_placement, limits::vertex_count> attribute_placement = fill_array(attribute_buffer_placement::none);

		vertex_input_layout() = default;

		void clear()
		{
			interleaved_blocks.clear();
			volatile_blocks.clear();
			referenced_registers.clear();
		}

		bool validate() const
		{
			// Criteria: At least one array stream has to be defined to feed vertex positions
			// This stream cannot be a const register as the vertices cannot create a zero-area primitive

			if (!interleaved_blocks.empty() && interleaved_blocks.front().attribute_stride != 0)
				return true;

			if (!volatile_blocks.empty())
				return true;

			for (u8 index = 0; index < limits::vertex_count; ++index)
			{
				switch (attribute_placement[index])
				{
				case attribute_buffer_placement::transient:
				{
					// Ignore register reference
					if (std::find(referenced_registers.begin(), referenced_registers.end(), index) != referenced_registers.end())
						continue;

					// The source is inline array or immediate draw push buffer
					return true;
				}
				case attribute_buffer_placement::persistent:
				{
					return true;
				}
				case attribute_buffer_placement::none:
				{
					continue;
				}
				default:
				{
					fmt::throw_exception("Unreachable");
				}
				}
			}

			return false;
		}

		u32 calculate_interleaved_memory_requirements(u32 first_vertex, u32 vertex_count) const
		{
			u32 mem = 0;
			for (auto& block : interleaved_blocks)
			{
				const auto range = block.calculate_required_range(first_vertex, vertex_count);
				mem += range.second * block.attribute_stride;
			}

			return mem;
		}
	};

	struct framebuffer_layout
	{
		u16 width;
		u16 height;
		std::array<u32, 4> color_addresses;
		std::array<u32, 4> color_pitch;
		std::array<u32, 4> actual_color_pitch;
		std::array<bool, 4> color_write_enabled;
		u32 zeta_address;
		u32 zeta_pitch;
		u32 actual_zeta_pitch;
		bool zeta_write_enabled;
		rsx::surface_target target;
		rsx::surface_color_format color_format;
		rsx::surface_depth_format2 depth_format;
		rsx::surface_antialiasing aa_mode;
		rsx::surface_raster_type raster_type;
		u32 aa_factors[2];
		bool ignore_change;
	};

	struct frame_statistics_t
	{
		u32 draw_calls;
		u32 submit_count;
		s64 setup_time;
		s64 vertex_upload_time;
		s64 textures_upload_time;
		s64 draw_exec_time;
		s64 flip_time;
	};

	struct display_flip_info_t
	{
		std::deque<u32> buffer_queue;
		u32 buffer;
		bool skip_frame;
		bool emu_flip;
		bool in_progress;
		frame_statistics_t stats;

		inline void push(u32 _buffer)
		{
			buffer_queue.push_back(_buffer);
		}

		inline bool pop(u32 _buffer)
		{
			if (buffer_queue.empty())
			{
				return false;
			}

			do
			{
				const auto index = buffer_queue.front();
				buffer_queue.pop_front();

				if (index == _buffer)
				{
					buffer = _buffer;
					return true;
				}
			}
			while (!buffer_queue.empty());

			// Need to observe this happening in the wild
			rsx_log.error("Display queue was discarded while not empty!");
			return false;
		}
	};
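	// Illustrative note on display_flip_info_t::pop() above: the queue drains
	// from the front until the requested buffer is found, discarding any older
	// queued flips along the way. For example, with buffer_queue = {0, 1, 2}:
	//
	//   pop(2); // drops 0 and 1, sets buffer = 2, returns true
	//   pop(3); // empties the queue, logs an error, returns false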
	struct backend_configuration
	{
		bool supports_multidraw;             // Draw call batching
		bool supports_hw_a2c;                // Alpha to coverage
		bool supports_hw_renormalization;    // Should be true on NV hardware which matches PS3 texture renormalization behaviour
		bool supports_hw_msaa;               // MSAA support
		bool supports_hw_a2one;              // Alpha to one
		bool supports_hw_conditional_render; // Conditional render
		bool supports_passthrough_dma;       // DMA passthrough
		bool supports_asynchronous_compute;  // Async compute
		bool supports_host_gpu_labels;       // Advanced host synchronization
	};

	struct sampled_image_descriptor_base;

	class thread : public cpu_thread
	{
		u64 timestamp_ctrl = 0;
		u64 timestamp_subvalue = 0;

		u64 m_cycles_counter = 0;

		display_flip_info_t m_queued_flip{};

		void cpu_task() override;

	protected:
		atomic_t<bool> m_rsx_thread_exiting{ true };

		std::array<push_buffer_vertex_info, 16> vertex_push_buffers;
		std::vector<u32> element_push_buffer;

		s32 m_skip_frame_ctr = 0;
		bool skip_current_frame = false;

		backend_configuration backend_config{};

		// FIFO
	public:
		std::unique_ptr<FIFO::FIFO_control> fifo_ctrl;
		std::vector<std::pair<u32, u32>> dump_callstack_list() const override;

	protected:
		FIFO::flattening_helper m_flattener;
		u32 fifo_ret_addr = RSX_CALL_STACK_EMPTY;
		u32 saved_fifo_ret = RSX_CALL_STACK_EMPTY;

		// Occlusion query
		bool zcull_surface_active = false;
		std::unique_ptr<reports::ZCULL_control> zcull_ctrl;

		// Framebuffer setup
		rsx::gcm_framebuffer_info m_surface_info[rsx::limits::color_buffers_count];
		rsx::gcm_framebuffer_info m_depth_surface_info;
		framebuffer_layout m_framebuffer_layout{};
		bool framebuffer_status_valid = false;

		// Overlays
		rsx::overlays::display_manager* m_overlay_manager = nullptr;

		// Invalidated memory range
		address_range m_invalidated_memory_range;

		// Profiler
		rsx::profiling_timer m_profiler;
		frame_statistics_t m_frame_stats;

	public:
		RsxDmaControl* ctrl = nullptr;
		u32 dma_address{0};
		rsx_iomap_table iomap_table;
		u32 restore_point = 0;
		u32 dbg_step_pc = 0;
		u32 last_known_code_start = 0;
		atomic_t<u64> external_interrupt_lock{ 0 };
		atomic_t<bool> external_interrupt_ack{ false };
		atomic_t<bool> is_inited{ false };

		bool is_fifo_idle() const;
		void flush_fifo();

		// Returns [count of found commands, PC of their start]
		std::pair<u32, u32> try_get_pc_of_x_cmds_backwards(u32 count, u32 get) const;

		void recover_fifo(u32 line = __builtin_LINE(),
			u32 col = __builtin_COLUMN(),
			const char* file = __builtin_FILE(),
			const char* func = __builtin_FUNCTION());

		static void fifo_wake_delay(u64 div = 1);
		u32 get_fifo_cmd() const;

		std::string dump_regs() const override;
		void cpu_wait(bs_t<cpu_flag> old) override;

		static constexpr u32 id_base = 0x5555'5555; // See get_current_cpu_thread()

		// Performance approximation counters
		struct
		{
			atomic_t<u64> idle_time{ 0 };  // Time spent idling in microseconds
			u64 last_update_timestamp = 0; // Timestamp of last load update
			u64 FIFO_idle_timestamp = 0;   // Timestamp of when FIFO queue becomes idle
			FIFO_state state = FIFO_state::running;
			u32 approximate_load = 0;
			u32 sampled_frames = 0;
		} performance_counters;

		enum class flip_request : u32
		{
			emu_requested = 1,
			native_ui = 2,

			any = emu_requested | native_ui
		};

		atomic_bitmask_t<flip_request> async_flip_requested{};
		u8 async_flip_buffer{ 0 };

		GcmTileInfo tiles[limits::tiles_count];
		GcmZcullInfo zculls[limits::zculls_count];

		void capture_frame(const std::string& name);
		const backend_configuration& get_backend_config() const { return backend_config; }

	public:
		std::shared_ptr<named_thread<ppu_thread>> intr_thread;

		// I hate this flag, but until hle is closer to lle, it's needed
		bool isHLE{ false };

		u32 flip_status;
		int debug_level;

		atomic_t<bool> requested_vsync{ false };
		atomic_t<bool> enable_second_vhandler{ false };

		RsxDisplayInfo display_buffers[8];
		u32 display_buffers_count{0};
		u32 current_display_buffer{0};

		shared_mutex sys_rsx_mtx;
		u32 device_addr{0};
		u32 label_addr{0};
		u32 main_mem_size{0};
		u32 local_mem_size{0};
		u32 rsx_event_port{0};
		u32 driver_info{0};

		void send_event(u64, u64, u64) const;

		bool m_rtts_dirty = true;
		std::array<bool, 16> m_textures_dirty;
		std::array<bool, 4> m_vertex_textures_dirty;
		bool m_framebuffer_state_contested = false;
		rsx::framebuffer_creation_context m_current_framebuffer_context = rsx::framebuffer_creation_context::context_draw;

		rsx::atomic_bitmask_t<rsx::eng_interrupt_reason> m_eng_interrupt_mask;
		u32 m_graphics_state = 0;
		u64 ROP_sync_timestamp = 0;

		program_hash_util::fragment_program_utils::fragment_program_metadata current_fp_metadata = {};
		program_hash_util::vertex_program_utils::vertex_program_metadata current_vp_metadata = {};

	protected:
		std::array<u32, 4> get_color_surface_addresses() const;
		u32 get_zeta_surface_address() const;

		void get_framebuffer_layout(rsx::framebuffer_creation_context context, framebuffer_layout& layout);
		bool get_scissor(areau& region, bool clip_viewport);

		/**
		 * Analyze vertex inputs and group all interleaved blocks
		 */
		void analyse_inputs_interleaved(vertex_input_layout&);

		RSXVertexProgram current_vertex_program = {};
		RSXFragmentProgram current_fragment_program = {};

		vertex_program_texture_state current_vp_texture_state = {};
		fragment_program_texture_state current_fp_texture_state = {};

		// Runs shader prefetch and resolves pipeline status flags
		void analyse_current_rsx_pipeline();
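		// Illustrative call order (assumed from the declarations here, not a
		// contract): a backend draw typically resolves program state as:
		//
		//   analyse_current_rsx_pipeline();            // prefetch ucode, settle dirty bits
		//   get_current_vertex_program(vp_samplers);   // build RSXVertexProgram + texture state
		//   get_current_fragment_program(fp_samplers); // build RSXFragmentProgram + texture state
		//
		// where vp_samplers/fp_samplers stand for the backend's sampler
		// descriptor arrays (hypothetical names).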
		// Prefetch and analyze the currently active fragment program ucode
		void prefetch_fragment_program();

		// Prefetch and analyze the currently active vertex program ucode
		void prefetch_vertex_program();

		void get_current_vertex_program(const std::array<std::unique_ptr<rsx::sampled_image_descriptor_base>, rsx::limits::vertex_textures_count>& sampler_descriptors);

		/**
		 * Gets current fragment program and associated fragment state
		 */
		void get_current_fragment_program(const std::array<std::unique_ptr<rsx::sampled_image_descriptor_base>, rsx::limits::fragment_textures_count>& sampler_descriptors);

	public:
		bool invalidate_fragment_program(u32 dst_dma, u32 dst_offset, u32 size);
		void on_framebuffer_options_changed(u32 opt);

	public:
		u64 target_rsx_flip_time = 0;
		u64 int_flip_index = 0;
		u64 last_guest_flip_timestamp = 0;
		u64 last_host_flip_timestamp = 0;

		vm::ptr<void(u32)> flip_handler = vm::null;
		vm::ptr<void(u32)> user_handler = vm::null;
		vm::ptr<void(u32)> vblank_handler = vm::null;
		atomic_t<u64> vblank_count{0};
		bool capture_current_frame = false;

		bool wait_for_flip_sema = false;
		u32 flip_sema_wait_val = 0;

	public:
		atomic_t<bool> sync_point_request = false;
		bool in_begin_end = false;

		struct desync_fifo_cmd_info
		{
			u32 cmd;
			u64 timestamp;
		};

		std::queue<desync_fifo_cmd_info> recovered_fifo_cmds_history;

		atomic_t<s32> async_tasks_pending{ 0 };

		bool zcull_stats_enabled = false;
		bool zcull_rendering_enabled = false;
		bool zcull_pixel_cnt_enabled = false;

		reports::conditional_render_eval cond_render_ctrl;

		virtual u64 get_cycles() = 0;
		virtual ~thread();

		static constexpr auto thread_name = "rsx::thread"sv;

	protected:
		thread();
		virtual void on_task();
		virtual void on_exit();

		/**
		 * Execute a backend local task queue
		 */
		virtual void do_local_task(FIFO_state state);

		virtual void emit_geometry(u32) {}

		void run_FIFO();

	public:
		thread(const thread&) = delete;
		thread& operator=(const thread&) = delete;

		virtual void clear_surface(u32 /*arg*/) {}
		virtual void begin();
		virtual void end();
		virtual void execute_nop_draw();

		virtual void on_init_thread() = 0;
		virtual void on_frame_end(u32 buffer, bool forced = false);
		virtual void flip(const display_flip_info_t& info) = 0;
		virtual u64 timestamp();
		virtual bool on_access_violation(u32 /*address*/, bool /*is_writing*/) { return false; }
		virtual void on_invalidate_memory_range(const address_range& /*range*/, rsx::invalidation_cause) {}
		virtual void notify_tile_unbound(u32 /*tile*/) {}

		// control
		virtual void renderctl(u32 /*request_code*/, void* /*args*/) {}

		// zcull
		void notify_zcull_info_changed();
		void clear_zcull_stats(u32 type);
		void check_zcull_status(bool framebuffer_swap);
		void get_zcull_stats(u32 type, vm::addr_t sink);
		u32 copy_zcull_stats(u32 memory_range_start, u32 memory_range, u32 destination);

		void enable_conditional_rendering(vm::addr_t ref);
		void disable_conditional_rendering();
		virtual void begin_conditional_rendering(const std::vector<reports::occlusion_query_info*>& sources);
		virtual void end_conditional_rendering();

		// sync
		void sync();
		flags32_t read_barrier(u32 memory_address, u32 memory_range, bool unconditional);
		virtual void sync_hint(FIFO_hint hint, reports::sync_hint_payload_t payload);
		virtual bool release_GCM_label(u32 /*address*/, u32 /*value*/) { return false; }

		std::span<const std::byte> get_raw_index_array(const draw_clause& draw_indexed_clause) const;

		std::variant<draw_array_command, draw_indexed_array_command, draw_inlined_array> get_draw_command(const rsx::rsx_state& state) const;

		/**
		 * Immediate mode rendering requires a temp push buffer to hold attrib values
		 * Appends a value to the push buffer (currently only supports 32-wide types)
		 */
		void append_to_push_buffer(u32 attribute, u32 size, u32 subreg_index, vertex_base_type type, u32 value);
		u32 get_push_buffer_vertex_count() const;

		void append_array_element(u32 index);
		u32 get_push_buffer_index_count() const;
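		// Illustrative sketch: immediate-mode attribute writes arrive one 32-bit
		// subregister at a time. A hypothetical vec4 write for attribute 0 would
		// push its components before the element index is recorded:
		//
		//   append_to_push_buffer(0, 4, 0, vertex_base_type::f, x_bits);
		//   append_to_push_buffer(0, 4, 1, vertex_base_type::f, y_bits);
		//   append_to_push_buffer(0, 4, 2, vertex_base_type::f, z_bits);
		//   append_to_push_buffer(0, 4, 3, vertex_base_type::f, w_bits);
		//   append_array_element(index); // commit one vertex index
		//
		// (x_bits..w_bits are hypothetical raw 32-bit float payloads.)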
	protected:
		/**
		 * Computes VRAM requirements needed to upload raw vertex streams
		 * result.first contains persistent memory requirements
		 * result.second contains volatile memory requirements
		 */
		std::pair<u32, u32> calculate_memory_requirements(const vertex_input_layout& layout, u32 first_vertex, u32 vertex_count);

		/**
		 * Generates vertex input descriptors as an array of 16x4 s32s
		 */
		void fill_vertex_layout_state(const vertex_input_layout& layout, u32 first_vertex, u32 vertex_count, s32* buffer, u32 persistent_offset = 0, u32 volatile_offset = 0);

		/**
		 * Uploads vertex data described in the layout descriptor
		 * Copies from local memory to the write-only output buffers provided in a sequential manner
		 */
		void write_vertex_data_to_memory(const vertex_input_layout& layout, u32 first_vertex, u32 vertex_count, void* persistent_data, void* volatile_data);

	private:
		shared_mutex m_mtx_task;

		void handle_emu_flip(u32 buffer);
		void handle_invalidated_memory_range();

	public:
		/**
		 * Fill buffer with 4x4 scale offset matrix.
		 * Vertex shader's position is to be multiplied by this matrix.
		 * if flip_y is set, the matrix is modified to use d3d convention.
		 */
		void fill_scale_offset_data(void* buffer, bool flip_y) const;

		/**
		 * Fill buffer with user clip information
		 */
		void fill_user_clip_data(void* buffer) const;

		/**
		 * Fill buffer with vertex program constants.
		 * Relocation table allows to do a partial fill with only selected registers.
		 */
		void fill_vertex_program_constants_data(void* buffer, const std::vector<u16>& reloc_table);

		/**
		 * Fill buffer with fragment rasterization state.
		 * Fills current fog values, alpha test parameters and texture scaling parameters
		 */
		void fill_fragment_state_buffer(void* buffer, const RSXFragmentProgram& fragment_program);

		/**
		 * Notify that a section of memory has been mapped
		 * If there is a notify_memory_unmapped request on this range yet to be handled,
		 * handles it immediately.
		 */
		void on_notify_memory_mapped(u32 address_base, u32 size);

		/**
		 * Notify that a section of memory has been unmapped
		 * Any data held in the defined range is discarded
		 */
		void on_notify_memory_unmapped(u32 address_base, u32 size);

		/**
		 * Notify to check internal state during semaphore wait
		 */
		virtual void on_semaphore_acquire_wait() {}

		virtual std::pair<std::string, std::string> get_programs() const { return std::make_pair("", ""); }

		virtual bool scaled_image_from_memory(blit_src_info& /*src_info*/, blit_dst_info& /*dst_info*/, bool /*interpolate*/) { return false; }

	public:
		void reset();
		void init(u32 ctrlAddress);

		// Emu App/Game flip, only immediately flips when called from rsxthread
		void request_emu_flip(u32 buffer);

		void pause();
		void unpause();
		void wait_pause();

		// Get RSX approximate load in %
		u32 get_load();

		// Get stats object
		frame_statistics_t& get_stats() { return m_frame_stats; }

		// Returns true if the current thread is the active RSX thread
		inline bool is_current_thread() const
		{
			return !!cpu_thread::get_current<rsx::thread>();
		}
	};

	inline thread* get_current_renderer()
	{
		return g_fxo->try_get<rsx::thread>();
	}
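	// Illustrative usage of reservation_lock below (assumed typical call site,
	// not taken from the implementation): DMA-style transfers construct one
	// around the affected range(s) and rely on RAII for release:
	//
	//   {
	//       rsx::reservation_lock<true> lock(dst_addr, dst_len, src_addr, src_len);
	//       // ... perform the transfer; the range is only guarded when it falls
	//       // in main memory and rsx_accurate_res_access is enabled ...
	//   } // unlocked here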
	template <bool IsFullLock = false>
	class reservation_lock
	{
		u32 addr = 0, length = 0;
		bool locked = false;

		inline void lock_range(u32 addr, u32 length)
		{
			this->addr = addr;
			this->length = length;

			auto renderer = get_current_renderer();
			cpu_thread* lock_owner = renderer->is_current_thread() ? renderer : nullptr;
			this->locked = renderer->iomap_table.lock<IsFullLock>(addr, length, lock_owner);
		}

	public:
		reservation_lock(u32 addr, u32 length)
		{
			if (g_cfg.core.rsx_accurate_res_access &&
				addr < constants::local_mem_base)
			{
				lock_range(addr, length);
			}
		}

		// Multi-range lock. If ranges overlap, the combined range will be acquired.
		// If ranges do not overlap, the first range that is in main memory will be acquired.
		reservation_lock(u32 dst_addr, u32 dst_length, u32 src_addr, u32 src_length)
		{
			if (g_cfg.core.rsx_accurate_res_access)
			{
				const auto range1 = utils::address_range::start_length(dst_addr, dst_length);
				const auto range2 = utils::address_range::start_length(src_addr, src_length);
				utils::address_range target_range;

				if (!range1.overlaps(range2)) [[likely]]
				{
					target_range = (dst_addr < constants::local_mem_base) ? range1 : range2;
				}
				else
				{
					// Very unlikely
					target_range = range1.get_min_max(range2);
				}

				if (target_range.start < constants::local_mem_base)
				{
					lock_range(target_range.start, target_range.length());
				}
			}
		}

		~reservation_lock()
		{
			if (locked)
			{
				get_current_renderer()->iomap_table.unlock<IsFullLock>(addr, length);
			}
		}
	};

	class eng_lock
	{
		rsx::thread* pthr;

	public:
		eng_lock(rsx::thread* target)
			: pthr(target)
		{
			if (pthr->is_current_thread())
			{
				pthr = nullptr;
			}
			else
			{
				pthr->pause();
			}
		}

		~eng_lock()
		{
			if (pthr) pthr->unpause();
		}
	};
}