#include <soc.h>
#include <misc/misc_setting.h>
#include <cg/cg.h>

#include "./bspchip.h"
#include "./memctl.h"
#include "./memctl_func.h"

#define MEMCTL_CALI_RETRY_LIMILT 		(5)
#define MEMCTL_CALI_MIN_READ_WINDOW 		(7)
#define MEMCTL_CALI_MIN_WRITE_WINDOW 		(7)

#define CPU_DCACHE_SIZE 			(0x8000)
#define MEMCTL_CALI_TARGET_LEN                  (CPU_DCACHE_SIZE * 2)
#define MEMCTL_CALI_FULL_SCAN_RESOLUTION        (2)
#define MEMCTL_CALI_WRITE_DELAY_START_TAP       (0)
#define MEMCTL_CALI_READ_DELAY_START_TAP        (0)
#define MEMCTL_CALI_TARGET_ADDR 		(0x80b00000)

/* DRAM patch from efuse */
extern u8_t EFPH_patch_num;
extern u8_t EFPH_DQR_delay_en;
extern u8_t EFPH_DQW_delay_en;
extern u8_t EFPH_DM_delay_en;
extern u8_t EFPH_DQR_delay;
extern u8_t EFPH_DQW_delay;
extern u8_t EFPH_DM_delay;

extern unsigned int ft_result;
void sys_watchdog_enable(unsigned int ph1, unsigned int ph2);

/* Capacity, in 32-bit words, of the pattern[] table in struct ary_info_str. */
#define pat_ary_num 32
/*
 * Table of 32-bit test words consumed by _write_pattern_1() and
 * _verify_pattern_1(), which walk pattern[] cyclically.
 */
struct ary_info_str {
	/* Test words; storage is fixed at pat_ary_num entries. */
	const unsigned int pattern[pat_ary_num];
	/* Number of leading entries of pattern[] to cycle through (used as a modulus, so it must be non-zero). */
	unsigned int pat_num;
};

/*
 * Fill a memory range with the words of a test-pattern table.
 *
 * start_addr: address of the first 32-bit word of the range.
 * len:        size of the range in bytes; the range is walked in
 *             sizeof(uint32) steps while the byte offset is below len.
 * ptr_ary:    pattern table; its first pat_num words are written in order
 *             and repeated cyclically (pat_num is used as a modulus and
 *             therefore must be non-zero).
 *
 * Every word of the range is read once before anything is written, and
 * _memctl_DCache_flush_invalidate() is called after the last write.
 */
void _write_pattern_1(uint32 start_addr, uint32 len, struct ary_info_str *ptr_ary)
{
    volatile uint32 *words = (volatile uint32 *)start_addr;
    uint32 offset;
    uint32 pat_idx = 0;

    /*
     * Pre-read pass. The original note says this is for a write-through
     * D-cache, so that the data is present in the D-cache before writing.
     */
    for (offset = 0; offset < len; offset += sizeof(uint32)) {
        uint32 discard __attribute__((unused)) = words[offset / sizeof(uint32)];
    }

    /* Write pass: one pattern word per memory word, wrapping at pat_num. */
    for (offset = 0; offset < len; offset += sizeof(uint32)) {
        words[offset / sizeof(uint32)] = ptr_ary->pattern[pat_idx];
        pat_idx = (pat_idx + 1) % (ptr_ary->pat_num);
    }

    _memctl_DCache_flush_invalidate();
}


/*
 * Compare a memory range against a test-pattern table.
 *
 * start_addr: address of the first 32-bit word of the range.
 * len:        size of the range in bytes, walked in sizeof(uint32) steps.
 * ptr_ary:    pattern table; its first pat_num words are the expected data,
 *             repeated cyclically (pat_num must be non-zero).
 *
 * Returns the bitwise OR, over every word in the range, of the bits that
 * differ between the word read back and the expected pattern word. A result
 * of 0 means every word matched; a set bit means that bit position differed
 * in at least one word.
 *
 * _memctl_DCache_flush_invalidate() is called before the range is read.
 */
uint32 _verify_pattern_1(uint32 start_addr, uint32 len, struct ary_info_str *ptr_ary)
{
    volatile uint32 *words = (volatile uint32 *)start_addr;
    uint32 mismatch_bits = 0;
    uint32 pat_idx = 0;
    uint32 offset;

    _memctl_DCache_flush_invalidate();

    for (offset = 0; offset < len; offset += sizeof(uint32)) {
        uint32 readback = words[offset / sizeof(uint32)];
        uint32 expected = ptr_ary->pattern[pat_idx];

        pat_idx = (pat_idx + 1) % (ptr_ary->pat_num);
        /* XOR leaves a 1 in every bit position where the two words differ. */
        mismatch_bits |= (expected ^ readback);
    }

    return mismatch_bits;
}

/*
 * Write 0x80000000 to the register at 0xB8001038, then busy-wait until
 * bit 31 of that register reads back as 0.
 * NOTE(review): judging by the function name this drains the memory
 * controller write buffer - confirm against the register documentation.
 */
void memctl_sync_write_buf(void)
{
	volatile unsigned int *const sync_reg = (volatile unsigned int *)0xB8001038;

	*sync_reg = 0x80000000;
	while ((*sync_reg & 0x80000000) != 0) {
		/* spin until the bit clears */
	}
}

/*
 * Program the same write/read delay into all 32 consecutive 32-bit
 * registers starting at DACDQR, then call _memctl_update_phy_param().
 *
 * w_delay: value placed at bit 24 and above of each register.
 * r_delay: value placed at bit 8 and above of each register.
 * Neither value is masked here, so callers must pass values that fit
 * their register fields.
 */
void _memctl_set_phy_delay_all(uint32 w_delay, uint32 r_delay)
{
	volatile uint32 *dq_regs = (uint32 *)DACDQR;
	const uint32 reg_value = (w_delay << 24) | (r_delay << 8);
	uint32 dq;

	for (dq = 0; dq < 32; dq++) {
		dq_regs[dq] = reg_value;
	}

	_memctl_update_phy_param();
}

void  _memctl_set_phy_delay_dqrf(uint32 bit_loc,uint32 max_w_seq_start,uint32 max_w_len,uint32 max_r_seq_start,uint32 max_r_len)
{
	volatile uint32 *ddcrdqr_base,*dwdqor_base,*dwdqor,mem_clk_mhz;
	unsigned char r_delay_tap, w_delay_tap;
	unsigned char r_holdtime, w_holdtime;
	static uint32 dm0_delay_min=16,dm1_delay_min=16,dm0_delay_max=0,dm1_delay_max=0,dm0_delay=0,dm1_delay=0;
	ddcrdqr_base = (volatile uint32 *)DACDQR;
	dwdqor_base = (volatile uint32 *)DWDQOR;

	ddcrdqr_base += bit_loc;

	mem_clk_mhz = GET_MEM_MHZ();

	if ((_soc.sid == PLR_SID_APRO) || (_soc.sid == PLR_SID_APRO_GEN2)) {	//For Apollo pro
		switch (mem_clk_mhz){
			case 666:
				r_holdtime = 12;
				w_holdtime = 16;
				break;
			case 600:
				r_holdtime = 12;
				w_holdtime = 18;
				break;
			default:
				r_holdtime = 14;
				w_holdtime = 18;
		}
	}else{		//For RTL9603C-vd
		switch (mem_clk_mhz){
			case 600:
				r_holdtime = 16;
				w_holdtime = 17;
			break;
			case 525:
				r_holdtime = 16;
				w_holdtime = 18;
			break;
			default:
			r_holdtime = w_holdtime =(((((1000*1000)/mem_clk_mhz)/2-150))/2)/20;
		}
	}

	/* For DDR2, DDR3 read delay tap*/
	if(max_r_seq_start==0 && max_r_len>=r_holdtime){
		r_delay_tap = max_r_len - r_holdtime;
	}else if(max_r_seq_start==0 && max_r_len<r_holdtime){
		r_delay_tap = 0;
	}else{
		r_delay_tap = max_r_len/2;
	}

	if ((_soc.sid == PLR_SID_APRO) || (_soc.sid == PLR_SID_APRO_GEN2)) {	//For Apollo pro
		if(_soc_cid_series==ST_RTL9603CE){
			/* For DDR2, DDR3 write delay tap*/
			if(max_w_seq_start==0 && max_w_len>=w_holdtime){
				w_delay_tap = max_w_seq_start + (max_w_len - w_holdtime);
			}else if(max_w_seq_start==0 && max_w_len<w_holdtime){
				w_delay_tap = max_w_seq_start + 0;
			}else{
				w_delay_tap = max_w_seq_start + max_w_len/2;
			}
		}else if((_soc.sid == PLR_SID_APRO) && xlat_chip_mode()==ST_RTL9607C && EFVD_hex==0x3332 && EFBD_spec==1){	//patch for RTL9607C-VAJ
		    w_delay_tap = max_w_seq_start + max_w_len/2;
		}else{
			/* For DDR2, DDR3 write delay tap*/
			if(max_w_seq_start==0){
				w_delay_tap = max_w_seq_start + max_w_len/3;
			}else{
				w_delay_tap = max_w_seq_start + max_w_len/2;
			}
		}
	}else{		//For RTL9603C-vd
		/* For DDR2, DDR3 write delay tap*/
		if(max_w_seq_start==0 && max_w_len>=w_holdtime){
			w_delay_tap = max_w_seq_start + (max_w_len - w_holdtime);
		}else if(max_w_seq_start==0 && max_w_len<w_holdtime){
			w_delay_tap = max_w_seq_start + 0;
		}else{
			w_delay_tap = max_w_seq_start + max_w_len/2;
		}
	}
	// DRAM patch from efuse
	if (EFPH_patch_num!=0 && EFPH_DQR_delay_en==1){
		//printf("DQ read delay enable\n");
		if(EFPH_DQR_delay<8){
			if(r_delay_tap>=(EFPH_DQR_delay+1))
				r_delay_tap = r_delay_tap - (EFPH_DQR_delay+1);
			else
				r_delay_tap = 0;
		}else{
				r_delay_tap = r_delay_tap + (EFPH_DQR_delay+1-8);
		}
	}
	if (EFPH_patch_num!=0 && EFPH_DQW_delay_en==1){
		//printf("DQ write delay enable\n");
		if(EFPH_DQW_delay<8){
			if(w_delay_tap>=(EFPH_DQW_delay+1))
				w_delay_tap = w_delay_tap - (EFPH_DQW_delay+1);
			else
				w_delay_tap = 0;
		}else{
			w_delay_tap = w_delay_tap + (EFPH_DQW_delay+1-8);
		}
	}

	/* write delay tap into controller */
	*ddcrdqr_base = ((w_delay_tap & 0x1f) << 24) | \
					(((max_r_seq_start + max_r_len) & 0x1f) << 16) | \
					(((max_r_seq_start + r_delay_tap) & 0x1f) << 8) | \
					((max_r_seq_start & 0x1f) << 0);

	/* Enable delay tap sync with write delay tap*/
	if (bit_loc < 16){
		dwdqor = dwdqor_base + (bit_loc / 4);
		*dwdqor = *dwdqor | (w_delay_tap << ((bit_loc % 4) * 8));
	}

	if ((_soc.sid == PLR_SID_APRO) || (_soc.sid == PLR_SID_APRO_GEN2)) {	//For Apollo pro
		// Fixed value
		// DM efuse patch at digital delay line funcion
	}else{
		// Set DQM delay witch is equal to the median value of DQ delay
		if(bit_loc<8){
			if(w_delay_tap<dm0_delay_min)	//search DM0 min.
				dm0_delay_min=w_delay_tap;
			if(w_delay_tap>dm0_delay_max)	//search DM0 max.
				dm0_delay_max=w_delay_tap;
		}else if(bit_loc>=8 && bit_loc<16){
			if(w_delay_tap<dm1_delay_min)	//search DM1 min.
				dm1_delay_min=w_delay_tap;
			if(w_delay_tap>dm1_delay_max)	//search DM1 max.
				dm1_delay_max=w_delay_tap;
		}else if(bit_loc==31){
			dm0_delay = (dm0_delay_min+dm0_delay_max)/2;	//median value
			dm1_delay = (dm1_delay_min+dm1_delay_max)/2;
			// DRAM patch from efuse for RTL9603C-vd
			if (EFPH_patch_num!=0 && EFPH_DM_delay_en==1){
				if(EFPH_DM_delay<8){
					if(dm0_delay>=(EFPH_DM_delay+1))
						dm0_delay = dm0_delay - (EFPH_DM_delay+1);
					else
						dm0_delay = 0;
					if(dm1_delay>=(EFPH_DM_delay+1))
						dm1_delay = dm1_delay - (EFPH_DM_delay+1);
					else
						dm1_delay = 0;
				}else{
					dm0_delay = dm0_delay + (EFPH_DM_delay+1-8);
					dm1_delay = dm1_delay + (EFPH_DM_delay+1-8);
				}
			}
			REG32(DCDQMR) = (dm1_delay&0x1f)<<16 | (dm0_delay&0x1f)<<24;
			REG32(DWDMOR) = (dm0_delay&0x1f)<<16 | (dm1_delay&0x1f)<<24;
		}
	}

	_memctl_update_phy_param();

	return;
}

/*
 * Scan write/read delay taps, find the widest passing window for every DQ
 * bit, and program the per-bit delays.
 *
 * target_addr: start address of the memory range used for the write/verify test.
 * len:         size of that range in bytes.
 * resolution:  step between scanned taps (must be non-zero, otherwise the
 *              scan loops never advance).
 * w_start:     first write delay tap to scan; scanning stops below tap 32.
 * r_start:     first read delay tap to scan; scanning stops below tap 32.
 *
 * Phase 1 scans every (write tap, read tap) pair and records, per bit, the
 * longest run of passing read taps. Phase 2 scans write taps for a range of
 * read taps and records, per bit, the longest run of passing write taps.
 * Each bit is then programmed through _memctl_set_phy_delay_dqrf().
 * Finally the DQS group delay fields of DACCR (bits 20:16 and 12:8) are
 * compared with the values derived from the read-window start taps; if they
 * differ they are rewritten and the whole scan restarts at scan_start.
 *
 * Returns 0 on success. Returns 1, after setting bit 1 of ft_result, as soon
 * as a bit is found whose read or write window is shorter than 8 taps
 * (16 or fewer taps counts as too short when FT_TEST is defined).
 */
uint32 DDR_Calibration_Full_Scan_Reduce(uint32 target_addr, uint32 len, uint32 resolution, uint32 w_start, uint32 r_start)
{

	/* Test pattern written to and verified against the target range. */
	struct ary_info_str rand_ary={
		.pattern={
			0x00010000, 0x01234567, 0x00000000, 0x76543210,
			0xFFFFFFFF, 0x89abcdef, 0x0000FFFF, 0xfedcba98,
			0xFFFF0000, 0x00FF00FF, 0xFF00FF00, 0xF0F0F0F0,
			0x0F0F0F0F, 0x5A5AA5A5, 0xA5A55A5A, 0x5A5AA5A5,
			0xA5A55A5A, 0xA5A55A5A, 0x5A5AA5A5, 0xA5A55A5A,
			0x5A5AA5A5, 0x5555AAAA, 0xAAAA5555, 0x5555AAAA,
			0xAAAA5555, 0xAAAA5555, 0x5555AAAA, 0xAAAA5555,
			0x5555AAAA, 0xCC3333CC, 0x33CCCC33, 0xCCCC3333},
		.pat_num=pat_ary_num,
    };
	struct ary_info_str *ary_info_sel;
	ary_info_sel = &rand_ary;

    /* Per-bit best window found so far: start tap and length, for read and write. */
    uint8 max_r_seq_start_array[32], max_r_len_array[32];
    uint8 max_w_seq_start_array[32], max_w_len_array[32];
    /* Per-bit state of the run currently being tracked within one inner scan. */
    uint8 seq_start_array[32];
    uint8 search_seq_start_array[32];
    uint8 w_delay, r_delay, bit_loc;
    uint8 is_this_bit_correct;
    uint8 mode_16bit;
    uint8 max_r_seq_start = 0, max_r_len = 0;
    uint8 max_w_seq_start, max_w_len;
    uint8 r_seq_start, r_len;
    uint8 w_seq_start, w_len;
    uint8 search_seq_start;
    uint32 WR_Result;
	/*
	 * DQS group delay bookkeeping. These are initialised once, above the
	 * scan_start label, so their values carry over when the scan restarts.
	 */
	uint32 r_start_shift_L=0, r_start_shift_U=0, last_value_L=0, last_value_U=0;

    /* Any bit set in DCR[27:24] selects the 16-bit handling below; otherwise 8-bit. */
    if(REG32(DCR) & 0x0F000000){
        mode_16bit = 1;
    }else{
        mode_16bit = 0;
    }

	scan_start:

    /**************************************************************
     ******* Searching for the max. sequential read window.********
     **************************************************************/
    for(bit_loc = 0; bit_loc < 32; bit_loc++){
        max_r_seq_start_array[bit_loc] = 0;
        max_r_len_array[bit_loc]       = 0;
    }

    for(w_delay = w_start; w_delay < 32; w_delay += resolution){
        // Each w_delay starts a new search, so reset the per-bit run state.
        for(bit_loc = 0; bit_loc < 32; bit_loc++){
            seq_start_array[bit_loc]       = 0;
            search_seq_start_array[bit_loc]= 1;
        }

        for(r_delay = r_start; r_delay < 32; r_delay += resolution){
            _memctl_set_phy_delay_all(w_delay, r_delay);
            _write_pattern_1(target_addr, len, ary_info_sel);
            memctl_sync_write_buf();

			/* Invert the mismatch mask: a 1 now marks a bit that verified correctly. */
            WR_Result = ~(_verify_pattern_1(target_addr, len, ary_info_sel));
            /*
             * Rearrange the result bytes into bit_loc order.
             * 8-bit mode: result byte 0 = source byte 3 OR source byte 1,
             * result byte 2 = source byte 2 OR source byte 0.
             * NOTE(review): these are "correct" flags, so OR-ing means a bit
             * counts as correct if either source byte was clean - confirm
             * this is intended.
             * 16-bit mode: the two half-words are swapped.
             */
            if( mode_16bit == 0){ /* 8bit mode */
                WR_Result = ((WR_Result & 0xFF000000) >> 24) | ((WR_Result & 0x00FF0000)) | ((WR_Result & 0x0000FF00) >> 8) | ((WR_Result & 0x000000FF) << 16);
            }else{ /* 16bit mode */
                WR_Result = ((WR_Result & 0xFFFF0000) >> 16) | ((WR_Result & 0x0000FFFF) << 16);
            }

            for(bit_loc = 0; bit_loc < 32; bit_loc++){
                /* In 8-bit mode, bits 8-15 and 24-31 are not evaluated. */
                if(mode_16bit == 0){
                    if((bit_loc > 7) && (bit_loc < 16))
                        continue;
                    if((bit_loc > 23) && (bit_loc < 32))
                        continue;
                }

                /* Load this bit's state into the working scalars. */
                max_r_seq_start  = max_r_seq_start_array[bit_loc];
                max_r_len        = max_r_len_array[bit_loc];
                r_seq_start      = seq_start_array[bit_loc];
                search_seq_start = search_seq_start_array[bit_loc];
                is_this_bit_correct = ((WR_Result>>bit_loc) & 0x1);
																		// if correct bit 0->0,    path is "Step A"
																		// if correct bit 0->1->0, path is "Step 1 -> Step 2 -> Step A"
																		// if correct bit 1->0,    path is "Step 1 -> Step 2 -> Step A"
																		// if correct bit 1->1,    path is "Step 1 -> Step B"
                if(search_seq_start == 1){								//Step 1: Search start
                    if(is_this_bit_correct == 1){						//if bit is correct
                        r_seq_start = r_delay;							//this is "the start"!
                        search_seq_start = 0;							//stop search start
                    }
                    if( (r_delay+resolution) >= 31 ){					//Step A: Step 2 jumps to here OR there is no correct bit
                        r_len = 1;										//No more correct bit, set len=1
                        if(r_len > max_r_len){							//If we came here from Step 2, this "if" is always false.
                            max_r_len = r_len;							//If there is no correct bit, max len is set to 1.
                            max_r_seq_start = r_seq_start;				//at the same time, update the max start, so the max start belongs to the max len.
                            r_len = 0;
                            r_seq_start = r_delay + resolution;
                        }
                    }
                }else{													//Step 2: Search end
                    if(is_this_bit_correct == 0){						//if bit is not correct
                        r_len = r_delay - r_seq_start - resolution + 1;	//this is "the end", calculate len
                        if(r_len > max_r_len){							//if the len is larger than previous len
                            max_r_len = r_len;							//update the max len
                            max_r_seq_start = r_seq_start;				//at the same time, update the max start, so the max start belongs to the max len.
                            r_len = 0;
                            r_seq_start = r_delay + resolution;
                        }
		                    search_seq_start = 1;						//Jump to Step A
                    }else{
                        if((r_delay+resolution) >= 31){					//Step B: the bit is still correct at the last scanned tap
                            r_len = r_delay - r_seq_start + 1;			//this is "the end", calculate len
                            if(r_len > max_r_len){						//if the len is larger than previous len
                                max_r_len = r_len;						//update the max len
                                max_r_seq_start = r_seq_start;			//at the same time, update the max start, so the max start belongs to the max len.
                                r_len = 0;
                                r_seq_start = r_delay + resolution;
                            }
                        }
                    }
                }

               /* Store the working scalars back into this bit's state. */
               if(max_r_len > max_r_len_array[bit_loc]){
                    max_r_seq_start_array[bit_loc] = max_r_seq_start;
                    max_r_len_array[bit_loc]       = max_r_len;
                }
                seq_start_array[bit_loc]     = r_seq_start;
                search_seq_start_array[bit_loc]= search_seq_start;

			}
        }
    }


    /*********************************************************************************
         Searching for the max. write delay window based on the max. read delay window.
     *********************************************************************************/
    for(bit_loc = 0; bit_loc < 32; bit_loc++){
        max_w_seq_start_array[bit_loc] = 0;
        max_w_len_array[bit_loc]       = 0;
    }

    /*
     * NOTE(review): the read-tap range scanned here comes from the scalars
     * max_r_seq_start / max_r_len, which hold whatever the last bit processed
     * in phase 1 left in them, not from the per-bit arrays - confirm that a
     * single shared range is intended.
     */
    for(r_delay = max_r_seq_start ; r_delay < (max_r_seq_start + max_r_len) ; r_delay += resolution){
        // Each r_delay starts a new search, so reset the per-bit run state.
        for(bit_loc = 0; bit_loc < 32; bit_loc++){
            seq_start_array[bit_loc]       = 0;
            search_seq_start_array[bit_loc]= 1;
        }

         for(w_delay = w_start; w_delay < 32; w_delay+=resolution){
            _memctl_set_phy_delay_all(w_delay, r_delay);
            _write_pattern_1(target_addr, len, ary_info_sel);
            memctl_sync_write_buf();

            /* Invert the mismatch mask: a 1 now marks a bit that verified correctly. */
            WR_Result = ~(_verify_pattern_1(target_addr, len, ary_info_sel));
            /* Same byte rearrangement as in the read-window search. */
            if( mode_16bit == 0){ /* 8bit mode */
                WR_Result = ((WR_Result & 0xFF000000) >> 24) | ((WR_Result & 0x00FF0000)) | ((WR_Result & 0x0000FF00) >> 8) | ((WR_Result & 0x000000FF) << 16);
            }else{ /* 16bit mode */
                WR_Result = ((WR_Result & 0xFFFF0000) >> 16) | ((WR_Result & 0x0000FFFF) << 16);
            }

            for(bit_loc = 0; bit_loc < 32; bit_loc++){
                /* In 8-bit mode, bits 8-15 and 24-31 are not evaluated. */
                if(mode_16bit == 0){
                    if((bit_loc > 7) && (bit_loc < 16))
                        continue;
                    if((bit_loc > 23) && (bit_loc < 32))
                        continue;
                }

                /* Load this bit's state into the working scalars. */
                max_w_seq_start  = max_w_seq_start_array[bit_loc];
                max_w_len        = max_w_len_array[bit_loc];
                w_seq_start      = seq_start_array[bit_loc];
                search_seq_start = search_seq_start_array[bit_loc];
                is_this_bit_correct = ((WR_Result>>bit_loc) & 0x1);

                /* Same run-tracking state machine as the read search, applied to write taps. */
                if(search_seq_start == 1){
                    if(is_this_bit_correct == 1){
                        w_seq_start = w_delay;
                        search_seq_start = 0;
                    }
                    if( (w_delay+resolution) >= 31 ){
                        w_len = 1;
                        if(w_len > max_w_len){
                            max_w_len = w_len;
                            max_w_seq_start = w_seq_start;
                            w_len = 0;
                            w_seq_start = w_delay + resolution;
                        }
                    }
                }else{
                    if(is_this_bit_correct == 0){
                        w_len = w_delay - w_seq_start - resolution + 1;
                        if(w_len > max_w_len){
                            max_w_len = w_len;
                            max_w_seq_start = w_seq_start;
                            w_len = 0;
                            w_seq_start = w_delay + resolution;
                        }
                        search_seq_start = 1;
                    }else{
                        if((w_delay+resolution) >= 31){
                            w_len = w_delay - w_seq_start + 1;
                            if(w_len > max_w_len){
                                max_w_len = w_len;
                                max_w_seq_start = w_seq_start;
                                w_len = 0;
                                w_seq_start = w_delay + resolution;
                            }
                        }
                    }
                }

                /* Store the working scalars back into this bit's state. */
                if(max_w_len > max_w_len_array[bit_loc]){
                    max_w_seq_start_array[bit_loc] = max_w_seq_start;
                    max_w_len_array[bit_loc]       = max_w_len;
                }
                seq_start_array[bit_loc]     = w_seq_start;
                search_seq_start_array[bit_loc]= search_seq_start;

			}
        }
    }

    /*
     * Validate and program every bit.
     * NOTE(review): all 32 bits are checked here, but in 8-bit mode the
     * windows of bits 8-15 and 24-31 are never updated from 0, so this loop
     * appears to return 1 at bit 8 at the latest in that mode - confirm.
     */
    for(bit_loc = 0; bit_loc < 32; bit_loc++){
        #if 0//Move to below
		puts("AK: Bit:"); printf("%2d", bit_loc); puts(" ");
		puts("max_r_s:"); puthex(max_r_seq_start_array[bit_loc]); puts(" ");
		puts("max_r_l:"); puthex(max_r_len_array[bit_loc]); puts(" ");
		puts("max_w_s:"); puthex(max_w_seq_start_array[bit_loc]); puts(" ");
		puts("max_w_l:"); puthex(max_w_len_array[bit_loc]); puts(" \n\r");
        #endif
		/* Reject the calibration if either window of this bit is too short. */
#ifdef FT_TEST
		if((max_r_len_array[bit_loc]<=0xf) | (max_w_len_array[bit_loc]<=0xf)){
#else
		if((max_r_len_array[bit_loc]<8) | (max_w_len_array[bit_loc]<8)){
#endif
			puts("AK: Window is too small, watchdog enable\n");
			ft_result |= (1 << 1);
            return 1;
		}

		_memctl_set_phy_delay_dqrf(bit_loc, max_w_seq_start_array[bit_loc], max_w_len_array[bit_loc], max_r_seq_start_array[bit_loc], max_r_len_array[bit_loc]);

		// Read DQS group delay auto K: search the max_r_start of all bit_loc

        /* Bits 0-7 and 16-23 feed the "L" group value. */
        if((bit_loc < 8) || ((bit_loc >= 16)&&(bit_loc < 24))){
			if(max_r_seq_start_array[bit_loc] > r_start_shift_L){		//search the MAX "max_r_seq_start_array" in LDQ
				r_start_shift_L = max_r_seq_start_array[bit_loc] + last_value_L;	//group delay is the start value plus the last group delay
			}
        }
        /* Bits 8-15 and 24-31 feed the "U" group value. */
        if(((bit_loc >= 8)&&(bit_loc < 16)) || (bit_loc >= 24)){
			if(max_r_seq_start_array[bit_loc] > r_start_shift_U){		//search the MAX "max_r_seq_start_array" in HDQ
				r_start_shift_U = max_r_seq_start_array[bit_loc] + last_value_U;	//group delay is the start value plus the last group delay
			}
        }

    }

    /* Print the per-bit results, two bits (n and n+16) per line. */
    puts("AK: Bit/max_r_s/max_r_l/max_w_s/max_w_l    Bit/max_r_s/max_r_l/max_w_s/max_w_l\n");
    for(bit_loc = 0; bit_loc < 16; bit_loc++){
		printf("    [%2d]", bit_loc);
		printf("%7x", max_r_seq_start_array[bit_loc]);
		printf("%8x", max_r_len_array[bit_loc]);
		printf("%8x", max_w_seq_start_array[bit_loc]);
		printf("%8x", max_w_len_array[bit_loc]);
        u32_t _loc_2 = bit_loc+16;
		printf("    [%2d]", _loc_2);
		printf("%7x", max_r_seq_start_array[_loc_2]);
		printf("%8x", max_r_len_array[_loc_2]);
		printf("%8x", max_w_seq_start_array[_loc_2]);
		printf("%8x", max_w_len_array[_loc_2]);
        puts("\n");
    }

	//Read DQS group delay auto K: Save the DQS group value
	if(r_start_shift_L == 0){		//r_start_shift_L == 0 means the group delay is good now,
		r_start_shift_L = last_value_L;		//so the group delay is the last group delay
	}else{							//r_start_shift_L != 0 means the group delay is not enough,
		last_value_L = r_start_shift_L;		//so save the value in "last_value", write "r_start_shift" and calibrate again.
	}
	if(r_start_shift_U == 0){
		r_start_shift_U = last_value_U;
	}else{
		last_value_U = r_start_shift_U;
	}

	//Read DQS group delay auto K: Write the group delay
	/*
	 * NOTE(review): the 5-bit DACCR fields are compared with the unmasked
	 * r_start_shift values, but the values are masked with 0x1F when written.
	 * If a shift value ever exceeds 31 this comparison can never match and the
	 * scan would restart indefinitely - confirm the value range.
	 */
	if((((REG32(DACCR) >> 16) & 0x1F) != r_start_shift_L) || (((REG32(DACCR) >> 8) & 0x1F) != r_start_shift_U)) {
		REG32(DACCR) &= ~((0x1F << 16) | (0x1F << 8));		//clear DQS group delay
		#ifndef SKIP_LARGE_DDR_DEBUG_LOG_PRINT
		puts("AK: Let's calibrate DRAM again, it'll be better!\n\r");
        #if 0
		puts("AK: DQS0 group delay: "); puthex(r_start_shift_L); puts("\n\r");
		puts("AK: DQS1 group delay: "); puthex(r_start_shift_U); puts("\n\r");
        #else
		printf("AK: Group delay: DQS0=%d, DQS1=%d\n", r_start_shift_L, r_start_shift_U);
        #endif
        #endif
		REG32(DACCR) = REG32(DACCR) | ((r_start_shift_L & 0x1F) << 16) | ((r_start_shift_U & 0x1F) << 8);
		/* DMCR is read and written back with the same value. NOTE(review): presumably the write triggers the new setting to take effect - confirm. */
		REG32(DMCR) = REG32(DMCR);
		goto scan_start;	//Read group delay auto calibration loop
	}

    return 0;
}

/*
 * Top-level DDR calibration entry point.
 *
 * Sets bit 5 (0x20) of DACCR, then runs the full delay scan with the
 * MEMCTL_CALI_* defaults. If the scan reports failure, the memory controller
 * reset procedure and the DLL reset matching the DRAM type (DDR3 or DDR2) are
 * run and the scan is tried once more. If that second scan also fails,
 * sys_watchdog_enable(0,0) is called and the function spins forever.
 *
 * Returns 0 once a scan succeeds; it does not return on final failure.
 */
uint32 DDR_Calibration()
{
    int retries_left = 1;

    /* Enable dynamic PHY FIFO Reset */
    REG32(DACCR)= REG32(DACCR) | 0x20;

    /* Do a full scan to choose a proper point. */
    for (;;) {
        uint32 scan_status = DDR_Calibration_Full_Scan_Reduce(MEMCTL_CALI_TARGET_ADDR,
                                                              MEMCTL_CALI_TARGET_LEN,
                                                              MEMCTL_CALI_FULL_SCAN_RESOLUTION,
                                                              MEMCTL_CALI_WRITE_DELAY_START_TAP,
                                                              MEMCTL_CALI_READ_DELAY_START_TAP);

        if (scan_status == 0) {
            break;
        }

        if (retries_left <= 0) {
            /* Out of retries: arm the watchdog and wait here. */
            sys_watchdog_enable(0,0);
            while(1);
        }

        /* Reset the controller and the DLL before scanning again. */
        memctlc_reset_procedure();
        if (memctlc_DDR_Type() == IS_DDR3_SDRAM) {
            memctlc_ddr3_dll_reset();
        } else {
            memctlc_ddr2_dll_reset();
        }
        retries_left--;
    }

    return 0;
}

