`timescale 1ns/1ps
`define GOWIN_BRAM

// video_double_buffer
//
// Ping-pong (double) buffer between a 16-bit pixel stream clocked by WRCLK and
// a byte-wide reader clocked by RDCLK. While the writer fills one buffer, the
// reader drains the other through RDADDR/RDDATA. Buffer hand-over uses a
// level-based DATA_READY / DATA_COMPLETE handshake that crosses the two clock
// domains through two-flop synchronizers.
//
// NOTE(review): BUFFER_LENGTH, LAST, OVERRUN and the read-side buffer select
// are driven from WRCLK-domain logic without synchronizers; the reader is
// presumably expected to sample them only while DATA_READY is high - confirm.
module video_double_buffer 
#(
    // Width of the byte address on the read port (RDADDR); the write pointers
    // are counted in 16-bit words.
    parameter READ_ADDRESS_WIDTH = 11, // size = 2^width
    // Buffer capacity in bytes; it is halved (>> 1) before being compared with
    // the word-counting write pointers.
    // NOTE(review): 2040 = 4 * 510, presumably four maximum-size packets - confirm.
    parameter MAX_BUFFER_SIZE = 2040 // maximum packet size is 512-2 = 510 bytes
)
(
    // Asynchronous, active-high reset; re-timed internally for each clock domain.
    input ARESET,
    
    // Writing interface:
    input WRCLK, // clock for write (video source) interface
    input [15:0] WRDATA, // pixel data
    input PIXEL_DE, // data enable, high when WRDATA signal has pixel data
    // PIXEL_VS is sampled directly on WRCLK for edge detection (no synchronizer).
    input PIXEL_VS, // vertical sync, high when in the middle of a frame. used to determine frame boundary

    // Reading interface:
    input RDCLK, // clock for read interface
    output DATA_READY, // high when a buffer is filled, goes low only after DATA_COMPLETE is set high by reader
    // Byte count: the word-based write pointer of the readable buffer with a
    // zero appended as LSB (i.e. multiplied by two).
    output [READ_ADDRESS_WIDTH:0] BUFFER_LENGTH, // size of this buffer, the number of valid data in it
    input DATA_COMPLETE, // set high by reader when all data in the buffer has been sent

    output LAST, // set when this is the last chunk of data for the frame
    output OVERRUN, // set high if a second buffer is filled before DATA_COMPLETE on the first one
    // Byte address into the buffer currently presented to the reader.
    input [READ_ADDRESS_WIDTH-1:0] RDADDR, // requested data to read
    output [7:0] RDDATA
);

    // ------------------------------------------------------------------
    // Derived geometry
    // ------------------------------------------------------------------
    // The write side stores 16-bit words and the read side fetches bytes, so
    // the word address is one bit narrower than the byte address.
    localparam READ_DATA_WIDTH     = 8;
    localparam WRITE_DATA_WIDTH    = 16;
    localparam WRITE_ADDRESS_WIDTH = READ_ADDRESS_WIDTH - 1;

    // ------------------------------------------------------------------
    // Handshake overview
    // ------------------------------------------------------------------
    //  * DATA_READY originates in the WRCLK domain and is synchronized to RDCLK.
    //  * DATA_COMPLETE originates in the RDCLK domain and is synchronized to WRCLK.
    //  * DATA_READY is dropped only after DATA_COMPLETE has been observed high
    //    on the WRCLK side.
    //  * The reader should keep DATA_COMPLETE high until it has observed
    //    DATA_READY low again on the RDCLK side.

    // Per-domain resets (asynchronous assertion, released on the domain clock)
    reg reset_wrclk;                    // write-domain reset
    reg reset_rdclk;                    // read-domain reset

    // Vertical sync edge detection (WRCLK domain)
    reg  pixel_vs_r;                    // PIXEL_VS delayed by one WRCLK
    wire vs_frame_start;                // one-cycle pulse on the rising edge of PIXEL_VS
    wire vs_frame_end;                  // one-cycle pulse on the falling edge of PIXEL_VS

    // Buffer-control state machine (WRCLK domain)
    reg [2:0] vdb_state;                // current state
    reg [2:0] vdb_next_state;           // combinational next state

    // READY / COMPLETE handshake state machine (WRCLK domain)
    reg [1:0] vdb_ready_state;          // current state
    reg [1:0] vdb_next_ready_state;     // combinational next state

    // Write pointers, counted in 16-bit words. One bit wider than the word
    // address so that the "pointer is past the last valid slot" value is
    // representable.
    reg [WRITE_ADDRESS_WIDTH:0] buf0_write_address;
    reg [WRITE_ADDRESS_WIDTH:0] buf1_write_address;
    wire buffer0_full;                  // buf0 pointer has reached the capacity limit
    wire buffer1_full;                  // buf1 pointer has reached the capacity limit

    // Write steering (combinational, derived from vdb_state and the full flags)
    reg writing_to_buf0;                // 1: writes go to buffer 0, 0: writes go to buffer 1
    reg global_wren;                    // 0 blocks writes to both buffers

    // DATA_COMPLETE, RDCLK -> WRCLK
    reg data_complete_r;                // first synchronizer stage (may be metastable)
    reg data_complete_wrclk;            // synchronized value used by WRCLK logic

    // DATA_READY, WRCLK -> RDCLK
    wire data_ready_wrclk;              // raised by WRCLK logic when a buffer may be read
    reg  data_ready_r;                  // first synchronizer stage (may be metastable)
    reg  data_ready_rdclk;              // synchronized value, drives DATA_READY

    // End-of-frame bookkeeping
    reg frame_end_out;                  // LAST is currently asserted
    reg frame_end_in_next_buf;          // LAST must be asserted with the next buffer handed over

    // Reset synchronizers: asynchronous assertion, synchronous release.
    //
    // ARESET asserts both domain resets immediately (active high). Release is
    // re-timed to the destination clock through TWO flops per domain. The
    // first flop may go metastable if ARESET is released close to a clock
    // edge (recovery/removal violation); the second flop gives that a full
    // clock period to settle before the reset reaches downstream logic.
    reg reset_wrclk_meta; // first release stage, WRCLK domain (may be metastable)
    reg reset_rdclk_meta; // first release stage, RDCLK domain (may be metastable)

    always @(posedge WRCLK or posedge ARESET) begin
        if(ARESET) begin
            reset_wrclk_meta <= 1'b1;
            reset_wrclk      <= 1'b1;
        end else begin
            reset_wrclk_meta <= 1'b0;
            reset_wrclk      <= reset_wrclk_meta;
        end
    end

    always @(posedge RDCLK or posedge ARESET) begin
        if(ARESET) begin
            reset_rdclk_meta <= 1'b1;
            reset_rdclk      <= 1'b1;
        end else begin
            reset_rdclk_meta <= 1'b0;
            reset_rdclk      <= reset_rdclk_meta;
        end
    end

    // Clock domain crossing for the READY / COMPLETE handshake.
    //
    // DATA_COMPLETE (RDCLK -> WRCLK): plain two-flop synchronizer.
    //
    // DATA_READY (WRCLK -> RDCLK): data_ready_wrclk is a combinational decode
    // of the handshake state, so it can glitch while several state bits toggle
    // in the same cycle. A glitch sampled by the RDCLK synchronizer would show
    // up as a spurious DATA_READY. The level is therefore registered once in
    // the WRCLK domain first, so that only a clean flop output crosses. This
    // delays both edges of DATA_READY by one WRCLK cycle, which the
    // level-based handshake tolerates.
    reg data_ready_wrclk_q; // glitch-free copy of data_ready_wrclk (WRCLK domain)

    always @(posedge WRCLK or posedge reset_wrclk) begin
        if(reset_wrclk) begin
            data_complete_r     <= 1'b0;
            data_complete_wrclk <= 1'b0;
            data_ready_wrclk_q  <= 1'b0;
        end else begin
            data_complete_r     <= DATA_COMPLETE;
            data_complete_wrclk <= data_complete_r;
            data_ready_wrclk_q  <= data_ready_wrclk;
        end
    end

    always @(posedge RDCLK or posedge reset_rdclk) begin
        if(reset_rdclk) begin
            data_ready_r     <= 1'b0;
            data_ready_rdclk <= 1'b0;
        end else begin
            data_ready_r     <= data_ready_wrclk_q;
            data_ready_rdclk <= data_ready_r;
        end
    end

    assign DATA_READY = data_ready_rdclk;

    // Vertical-sync edge detection.
    // pixel_vs_r holds the PIXEL_VS value from the previous WRCLK edge;
    // comparing it with the live input yields a one-cycle strobe per edge.
    always @(posedge WRCLK or posedge reset_wrclk) begin
        if(reset_wrclk)
            pixel_vs_r <= 1'b0;
        else
            pixel_vs_r <= PIXEL_VS;
    end

    // 0 -> 1 on PIXEL_VS: a frame begins
    assign vs_frame_start = ~pixel_vs_r & PIXEL_VS;
    // 1 -> 0 on PIXEL_VS: the frame is over
    assign vs_frame_end   = pixel_vs_r & ~PIXEL_VS;

    // Buffer-control state machine - encoding
    //
    // Intended behaviour:
    //  * Start in INITIAL so that a partial image is never forwarded; the first
    //    vertical-sync rising edge starts filling buffer 0.
    //  * When the active buffer fills up, DATA_READY is raised together with
    //    the amount of data in it, and new pixels go to the other buffer.
    //  * A handed-over buffer is not written again until the reader has
    //    answered with DATA_COMPLETE.
    //  * If the active buffer fills before DATA_COMPLETE arrived for the other
    //    one, OVERRUN is asserted and the machine halts until reset or until
    //    the pending handshake completes.
    //
    // Naming: FILL_BUFx = buffer being written, OTHER_EMPTY / OTHER_FULL =
    // whether the opposite buffer is still owned by the reader. The *_VS
    // variants mean the frame ended while the other buffer was still being
    // drained.
    localparam VDB_STATE_INITIAL                 = 3'd0;
    localparam VDB_STATE_FILL_BUF0_OTHER_EMPTY   = 3'd1;
    localparam VDB_STATE_FILL_BUF0_OTHER_FULL    = 3'd2;
    localparam VDB_STATE_FILL_BUF0_OTHER_FULL_VS = 3'd3;
    localparam VDB_STATE_FILL_BUF1_OTHER_EMPTY   = 3'd4;
    localparam VDB_STATE_FILL_BUF1_OTHER_FULL    = 3'd5;
    localparam VDB_STATE_FILL_BUF1_OTHER_FULL_VS = 3'd6;
    localparam VDB_STATE_OVERRUN                 = 3'd7;

    // Handshake state machine - encoding
    // Drives READY and detects when the reader has released a buffer.
    localparam VDB_READY_IDLE          = 2'd0; // no buffer pending, waiting for one to fill
    localparam VDB_READY_BUF_FILLED    = 2'd1; // buffer offered, waiting for complete to go high
    localparam VDB_READY_WAIT_COMPLETE = 2'd2; // waiting for complete to return low
    localparam VDB_READY_DONE          = 2'd3; // handshake finished

    // Next-state logic of the buffer-control state machine (combinational).
    // Inputs: current state, the two buffer-full flags, the VS edge strobes
    // and the handshake state machine (vdb_ready_state).
    // In every OTHER_FULL* state the branches are tested in priority order, so
    // a finished handshake (VDB_READY_DONE) wins over a simultaneous
    // buffer-full, frame-end or frame-start event.
    always @(*) begin
        // default: hold the current state
        vdb_next_state = vdb_state;
        case(vdb_state)
        VDB_STATE_INITIAL: begin
            // idle until the start of a frame, so only whole frames are captured
            if(vs_frame_start) begin
                vdb_next_state = VDB_STATE_FILL_BUF0_OTHER_EMPTY;
            end
        end

        VDB_STATE_FILL_BUF0_OTHER_EMPTY: begin
            // waiting for any of:
            // (a) buffer0 full -> swap buffers
            // (b) frame end -> swap buffers, and assert end of frame
            if(buffer0_full || vs_frame_end) begin
                vdb_next_state = VDB_STATE_FILL_BUF1_OTHER_FULL;
            end
        end
        VDB_STATE_FILL_BUF0_OTHER_FULL: begin
            // waiting on:
            // (a) finished the complete handshake -> stay in this buffer, but allow the next buffer swap
            // (b) buffer0 full -> overrun!
            // (c) frame end -> don't swap yet. wait for buffer empty, then swap and assert end of frame
            if(vdb_ready_state == VDB_READY_DONE) begin
                // finished the complete handshake, so other buffer was emptied
                vdb_next_state = VDB_STATE_FILL_BUF0_OTHER_EMPTY;
            end else if(buffer0_full) begin
                vdb_next_state = VDB_STATE_OVERRUN;
            end else if(vs_frame_end) begin
                vdb_next_state = VDB_STATE_FILL_BUF0_OTHER_FULL_VS;
            end
        end
        VDB_STATE_FILL_BUF0_OTHER_FULL_VS: begin
            // We get to this case if a frame ended while the other
            // buffer was still being emptied by the reader
            // This will probably happen if the last buffer of the
            // frame is pretty short.
            // Weird case as technically both buffers are filled. Not an overrun, unless
            // a new frame starts before the previous buffer gets emptied.
            // Wait in this state until we finish a data complete handshake
            // then we proceed to swap the active buffer
            // waiting on:
            // (a) data complete handshake is done -> swap buffers, assert end of frame
            // (b) new frame started -> overrun!
            if(vdb_ready_state == VDB_READY_DONE) begin
                vdb_next_state = VDB_STATE_FILL_BUF1_OTHER_FULL;
            end else if(vs_frame_start) begin
                vdb_next_state = VDB_STATE_OVERRUN;
            end
        end

        // same states as above, but for the other buffer
        VDB_STATE_FILL_BUF1_OTHER_EMPTY: begin
            // buffer1 full or frame end -> swap to buffer 0
            if(buffer1_full || vs_frame_end) begin
                vdb_next_state = VDB_STATE_FILL_BUF0_OTHER_FULL;
            end
        end
        VDB_STATE_FILL_BUF1_OTHER_FULL: begin
            // handshake done -> other buffer free; buffer1 full -> overrun;
            // frame end -> park in the _VS state until the handshake is done
            if(vdb_ready_state == VDB_READY_DONE) begin
                vdb_next_state = VDB_STATE_FILL_BUF1_OTHER_EMPTY;
            end else if(buffer1_full) begin
                vdb_next_state = VDB_STATE_OVERRUN;
            end else if(vs_frame_end) begin
                vdb_next_state = VDB_STATE_FILL_BUF1_OTHER_FULL_VS;
            end
        end
        VDB_STATE_FILL_BUF1_OTHER_FULL_VS: begin
            // handshake done -> swap to buffer 0; new frame first -> overrun
            if(vdb_ready_state == VDB_READY_DONE) begin
                vdb_next_state = VDB_STATE_FILL_BUF0_OTHER_FULL;
            end else if(vs_frame_start) begin
                vdb_next_state = VDB_STATE_OVERRUN;
            end
        end

        VDB_STATE_OVERRUN: begin
            // stay here until complete is done.
            // otherwise requires a reset to exit this fault state.
            if(vdb_ready_state == VDB_READY_DONE) begin
                vdb_next_state = VDB_STATE_INITIAL;
            end
        end
        endcase
    end

    // State register of the buffer-control state machine (WRCLK domain).
    always @(posedge WRCLK or posedge reset_wrclk) begin
        if(reset_wrclk) begin
            vdb_state <= VDB_STATE_INITIAL;
        end else begin
            vdb_state <= vdb_next_state;
        end
    end

    // Ready / Complete handshake state machine (WRCLK domain)
    //
    // Four-phase, level-based handshake with the reader:
    //   IDLE          -> BUF_FILLED    : main state machine reports a filled buffer
    //   BUF_FILLED    -> WAIT_COMPLETE : synchronized DATA_COMPLETE seen high
    //                                    (READY is asserted only in BUF_FILLED)
    //   WAIT_COMPLETE -> DONE          : synchronized DATA_COMPLETE seen low again
    //   DONE          -> IDLE          : main state machine has reacted to DONE
    //
    // WAIT_COMPLETE must wait for DATA_COMPLETE to be RELEASED. If it advanced
    // while the signal was still high, a reader that is still holding
    // DATA_COMPLETE would acknowledge the next buffer immediately, before
    // ever seeing READY for it.
    always @(*) begin
        // default: hold the current state
        vdb_next_ready_state = vdb_ready_state;
        case(vdb_ready_state)
        VDB_READY_IDLE: begin
            // a buffer has just been handed over by the main state machine
            if((vdb_state == VDB_STATE_FILL_BUF0_OTHER_FULL) || (vdb_state == VDB_STATE_FILL_BUF1_OTHER_FULL)) begin
                vdb_next_ready_state = VDB_READY_BUF_FILLED;
            end
        end
        VDB_READY_BUF_FILLED: begin
            // reader signals that it has consumed the buffer
            if(data_complete_wrclk) begin
                vdb_next_ready_state = VDB_READY_WAIT_COMPLETE;
            end
        end
        VDB_READY_WAIT_COMPLETE: begin
            // READY is low in this state; wait for the reader to drop
            // DATA_COMPLETE in response before closing the handshake
            if(!data_complete_wrclk) begin
                vdb_next_ready_state = VDB_READY_DONE;
            end
        end
        VDB_READY_DONE: begin
            // go back to idle when the main state machine accepts this signal
            if(vdb_state != vdb_next_state) begin
                vdb_next_ready_state = VDB_READY_IDLE;
            end
        end
        endcase
    end

    // State register of the handshake state machine.
    always @(posedge WRCLK or posedge reset_wrclk) begin
        if(reset_wrclk) begin
            vdb_ready_state <= VDB_READY_IDLE;
        end else begin
            vdb_ready_state <= vdb_next_ready_state;
        end
    end

    // Write steering: which buffer receives the current pixel, and whether
    // writing is allowed at all.
    //
    // A buffer counts as "full" once its write pointer has moved past the last
    // usable word. The main state machine only reacts to that flag on the
    // following clock edge, which would be one pixel too late when PIXEL_DE
    // stays high. The steering logic therefore looks at the full flags
    // directly and redirects the write in the same cycle, so the first word
    // of the next buffer is not lost.
    localparam MAX_WORDS_PER_BUFFER = MAX_BUFFER_SIZE >> 1; // capacity in 16-bit words

    assign buffer0_full = (buf0_write_address >= MAX_WORDS_PER_BUFFER);
    assign buffer1_full = (buf1_write_address >= MAX_WORDS_PER_BUFFER);

    always @(*) begin
        // Defaults cover INITIAL, OVERRUN and FILL_BUF0_OTHER_FULL_VS:
        // buffer 0 is nominally selected but nothing may be written.
        writing_to_buf0 = 1'b1;
        global_wren     = 1'b0;

        case(vdb_state)
        VDB_STATE_FILL_BUF0_OTHER_EMPTY: begin
            // fill buffer 0; the moment it is full, redirect to buffer 1
            global_wren = 1'b1;
            if(buffer0_full)    writing_to_buf0 = 1'b0;
            else                writing_to_buf0 = 1'b1;
        end

        VDB_STATE_FILL_BUF0_OTHER_FULL: begin
            // fill buffer 0; buffer 1 is still owned by the reader, so once
            // buffer 0 is full there is nowhere to write (overrun follows)
            writing_to_buf0 = 1'b1;
            if(buffer0_full)    global_wren = 1'b0;
            else                global_wren = 1'b1;
        end

        VDB_STATE_FILL_BUF1_OTHER_EMPTY: begin
            // fill buffer 1; the moment it is full, redirect to buffer 0
            global_wren = 1'b1;
            if(buffer1_full)    writing_to_buf0 = 1'b1;
            else                writing_to_buf0 = 1'b0;
        end

        VDB_STATE_FILL_BUF1_OTHER_FULL: begin
            // fill buffer 1; buffer 0 is still owned by the reader, so once
            // buffer 1 is full there is nowhere to write (overrun follows)
            writing_to_buf0 = 1'b0;
            if(buffer1_full)    global_wren = 1'b0;
            else                global_wren = 1'b1;
        end

        VDB_STATE_FILL_BUF1_OTHER_FULL_VS: begin
            // both buffers hold data: buffer 1 stays selected, writes blocked
            writing_to_buf0 = 1'b0;
            global_wren     = 1'b0;
        end
        endcase
    end

    // Write pointers (WRCLK domain), counted in 16-bit words.
    //
    //  * The pointer of the buffer selected by writing_to_buf0 advances on
    //    every accepted pixel (global_wren && PIXEL_DE).
    //  * The pointer of the opposite buffer is cleared when the handshake
    //    reports that the reader is done with it (VDB_READY_DONE).
    //  * Both pointers are cleared while the main state machine sits in
    //    INITIAL. After reset they are already zero, but INITIAL is also
    //    re-entered from OVERRUN, where only one pointer gets cleared by the
    //    closing handshake. Without this extra clear the other pointer keeps
    //    its stale value, and the first buffer of the next frame would either
    //    report old contents or appear full immediately.
    always @(posedge WRCLK or posedge reset_wrclk) begin
        if(reset_wrclk) begin
            buf0_write_address <= {(WRITE_ADDRESS_WIDTH+1){1'b0}};
            buf1_write_address <= {(WRITE_ADDRESS_WIDTH+1){1'b0}};
        end else if(vdb_state == VDB_STATE_INITIAL) begin
            // nothing is being written or read: start the next frame clean
            buf0_write_address <= {(WRITE_ADDRESS_WIDTH+1){1'b0}};
            buf1_write_address <= {(WRITE_ADDRESS_WIDTH+1){1'b0}};
        end else begin
            // Who's writing?
            if(writing_to_buf0) begin
                if(global_wren && PIXEL_DE) begin
                    buf0_write_address <= buf0_write_address + {{(WRITE_ADDRESS_WIDTH){1'b0}}, 1'b1};
                end
                if(vdb_ready_state == VDB_READY_DONE) begin
                    // reset other buffer when it is finished being read
                    buf1_write_address <= {(WRITE_ADDRESS_WIDTH+1){1'b0}};
                end
            end else begin
                if(global_wren && PIXEL_DE) begin
                    buf1_write_address <= buf1_write_address + {{(WRITE_ADDRESS_WIDTH){1'b0}}, 1'b1};
                end
                if(vdb_ready_state == VDB_READY_DONE) begin
                    // reset other buffer when it is finished being read
                    buf0_write_address <= {(WRITE_ADDRESS_WIDTH+1){1'b0}};
                end
            end
        end
    end

    // generating the end of frame signal
    // it should be high throughout the last buffer filled up (or was in the process of filling)
    // when the PIXEL_VS signal falling edge was detected
    // but... we can't just propagate it through. since the reader could still be on the previous
    // buffer and shouldn't be told that this is an end of frame
    // We need to hold onto the info about the end of frame and set it during the next buffer
    //
    // Summary of the two flags:
    //   frame_end_out         - drives LAST. Set on frame end (immediately or
    //                           deferred), cleared when the handshake reaches
    //                           VDB_READY_DONE while it is set.
    //   frame_end_in_next_buf - frame ended while the reader still owned the
    //                           other buffer; converted into frame_end_out at
    //                           the next VDB_READY_DONE.
    // The statements below are non-blocking, so when two of them target the
    // same flag in one cycle the later one in source order takes effect.
    always @(posedge WRCLK or posedge reset_wrclk) begin
        if(reset_wrclk) begin
            frame_end_out <= 1'b0;
            frame_end_in_next_buf <= 1'b0;
        end else begin
            if(!frame_end_out) begin
                // LAST is not asserted yet: look for a frame end
                if((vdb_state == VDB_STATE_FILL_BUF0_OTHER_EMPTY) || (vdb_state == VDB_STATE_FILL_BUF1_OTHER_EMPTY)) begin
                    if(vs_frame_end) begin
                        // set frame end out immediately, as the
                        // next buffer will also be sent immediately
                        frame_end_out <= 1'b1;
                    end
                end 
                else if((vdb_state == VDB_STATE_FILL_BUF0_OTHER_FULL) || (vdb_state == VDB_STATE_FILL_BUF1_OTHER_FULL)) begin
                    if(vs_frame_end) begin
                        // not ready to set frame end out right now
                        // need to wait until the next buffer is started
                        frame_end_in_next_buf <= 1'b1;
                    end
                end

                if(frame_end_in_next_buf) begin
                    // set frame end out only when we eventually swap to the buffer when frame end occurred
                    // (disabled restriction to the *_FULL_VS states, kept for reference:)
                    // if((vdb_state == VDB_STATE_FILL_BUF0_OTHER_FULL_VS) || (vdb_state == VDB_STATE_FILL_BUF1_OTHER_FULL_VS)) begin
                        if(vdb_ready_state == VDB_READY_DONE) begin
                            // previous buffer released: the deferred frame end becomes active
                            frame_end_in_next_buf <= 1'b0;
                            frame_end_out <= 1'b1;
                        end
                    // end
                end
            end
            else begin
                // frame_end_out is true
                frame_end_in_next_buf <= 1'b0; // never gonna have the pending frame out if we have the active assertion going
                if(vdb_ready_state == VDB_READY_DONE) begin
                    // the buffer flagged as LAST has been consumed by the reader
                    frame_end_out <= 1'b0;
                end
            end
        end
    end

    `ifndef GOWIN_BRAM

    // Behavioural storage, used when GOWIN_BRAM is not defined:
    // two arrays of 16-bit words, one per ping-pong buffer.
    reg [WRITE_DATA_WIDTH-1:0] buffer0 [((1<<WRITE_ADDRESS_WIDTH)-1):0];
    reg [WRITE_DATA_WIDTH-1:0] buffer1 [((1<<WRITE_ADDRESS_WIDTH)-1):0];

    // Read port (RDCLK): registered byte output.
    // RDADDR is a byte address: the upper bits pick the 16-bit word and
    // bit 0 picks the byte within it (1 = upper byte, 0 = lower byte).
    reg  [READ_DATA_WIDTH-1:0]     data_out_buf;
    wire [WRITE_ADDRESS_WIDTH-1:0] read_addr = RDADDR[READ_ADDRESS_WIDTH-1:1];

    assign RDDATA = data_out_buf;

    // The reader always sees the buffer that is NOT selected for writing.
    always @(posedge RDCLK) begin
        case({writing_to_buf0, RDADDR[0]})
        2'b11: data_out_buf <= buffer1[read_addr][WRITE_DATA_WIDTH-1:READ_DATA_WIDTH]; // buffer 1, upper byte
        2'b10: data_out_buf <= buffer1[read_addr][READ_DATA_WIDTH-1:0];                // buffer 1, lower byte
        2'b01: data_out_buf <= buffer0[read_addr][WRITE_DATA_WIDTH-1:READ_DATA_WIDTH]; // buffer 0, upper byte
        2'b00: data_out_buf <= buffer0[read_addr][READ_DATA_WIDTH-1:0];                // buffer 0, lower byte
        endcase
    end

    // Write port (WRCLK): store one pixel word into the selected buffer at its
    // current write pointer, but only while writing is enabled and the
    // source flags valid pixel data.
    always @(posedge WRCLK) begin
        if(global_wren && PIXEL_DE) begin
            if(writing_to_buf0)
                buffer0[buf0_write_address] <= WRDATA;
            else
                buffer1[buf1_write_address] <= WRDATA;
        end
    end

    `else
    // Vendor block-RAM storage (GOWIN_BRAM defined): one Gowin_Diff_Wid
    // instance per ping-pong buffer. Port A is the 16-bit write port on
    // WRCLK, port B the 8-bit read port on RDCLK.
    wire [READ_DATA_WIDTH-1:0] data_out_buf0;
    wire [READ_DATA_WIDTH-1:0] data_out_buf1;

    // Per-buffer write enables. A word is stored only when the source flags
    // valid pixel data, writing is globally allowed, and this buffer is the
    // one selected for writing - the same condition the behavioural memory
    // model uses. Enabling port A on PIXEL_DE alone would write into both
    // RAMs on every pixel, including the buffer the reader currently owns.
    wire buf0_wren = PIXEL_DE && global_wren && writing_to_buf0;
    wire buf1_wren = PIXEL_DE && global_wren && !writing_to_buf0;

    // The reader always sees the buffer that is NOT selected for writing.
    // NOTE(review): writing_to_buf0 is WRCLK-domain logic used here on the
    // read data path without synchronization - confirm the reader only
    // samples RDDATA while the selection is stable.
    assign RDDATA = (writing_to_buf0 ? data_out_buf1 : data_out_buf0);

    // Port A - write only, Port B - read only
    // The write pointers carry one extra "past the end" bit; only the word
    // address bits are wired to the RAM, which makes the truncation explicit.
    Gowin_Diff_Wid buffer0(
        .reset(ARESET), //input reset
            // Port A
        .clka(WRCLK), //input clka
        .cea(buf0_wren), //input cea
        .ada(buf0_write_address[WRITE_ADDRESS_WIDTH-1:0]), //input [9:0] ada
        .din(WRDATA), //input [15:0] din
            // Port B
        .clkb(RDCLK), //input clkb
        .ceb(1'b1), //input ceb
        .adb(RDADDR), //input [10:0] adb
        .dout(data_out_buf0), //output [7:0] dout
        .oce(1'b1) //input oce - not used in bypass mode (that's the mode we're in)
    );
    Gowin_Diff_Wid buffer1(
        .reset(ARESET), //input reset
            // Port A
        .clka(WRCLK), //input clka
        .cea(buf1_wren), //input cea
        .ada(buf1_write_address[WRITE_ADDRESS_WIDTH-1:0]), //input [9:0] ada
        .din(WRDATA), //input [15:0] din
            // Port B
        .clkb(RDCLK), //input clkb
        .ceb(1'b1), //input ceb
        .adb(RDADDR), //input [10:0] adb
        .dout(data_out_buf1), //output [7:0] dout
        .oce(1'b1) //input oce - not used in bypass mode (that's the mode we're in)
    );

    `endif

    // Module outputs
    //
    // The reader is always shown the buffer that is not selected for writing,
    // so the length reported is that buffer's write pointer. The pointer
    // counts 16-bit words while the reader fetches bytes, hence the appended
    // zero bit (multiply by two).
    wire [WRITE_ADDRESS_WIDTH:0] read_side_word_count =
        writing_to_buf0 ? buf1_write_address : buf0_write_address;

    assign BUFFER_LENGTH    = {read_side_word_count, 1'b0};
    // READY is offered only while the handshake waits for the reader's complete
    assign data_ready_wrclk = (vdb_ready_state == VDB_READY_BUF_FILLED);
    assign OVERRUN          = (vdb_state == VDB_STATE_OVERRUN);
    assign LAST             = frame_end_out;

    endmodule
