Question about understanding the implementation of the OpenCV matchTemplate cross-correlation (CV_TM_CCORR) formula

我尝试自己实现 OpenCV matchTemplate 的 CV_TM_CCORR 方法，但是测试发现两者的结果存在微小差别，不知道是我对 CV_TM_CCORR 公式的理解有误，还是我的编码有问题。第一次在这里发帖寻求帮助，谢谢。
I tried to implement the OpenCV matchTemplate CV_TM_CCORR method myself, but testing shows a slight difference between my result and OpenCV's. I don't know whether my understanding of the CV_TM_CCORR formula is wrong or whether there is a bug in my code. This is my first time posting here for help, thank you.
This code is modified from the GitHub repository Fastest_Image_Pattern_Matching.

// Computes the CV_TM_CCORR response of the template pyramid level `iLayer`
// against `matSrc`, and cross-checks OpenCV's result against a hand-written
// reference implementation.
//
//   matSrc      source image; the hand-written path reads it through
//               ptr<uchar>, i.e. it assumes a single-channel 8-bit image
//               (NOTE(review): not checked here - confirm against callers).
//   pTemplData  template data; vecPyramid[iLayer] is used as the template and
//               is read the same way (single-channel 8-bit assumed).
//   matResult_  output: receives OpenCV's matchTemplate result and is then
//               passed to CCOEFF_Denominator.
//   iLayer      index into pTemplData->vecPyramid.
//
// The time spent in the hand-written path is added to m_hs (in clock() ticks).
void CMatchToolDlg::MatchTemplate (cv::Mat& matSrc, s_TemplData* pTemplData, cv::Mat& matResult_, int iLayer)
{
	// OpenCV result; this is what the caller receives.
	matchTemplate(matSrc, pTemplData->vecPyramid[iLayer], matResult_, CV_TM_CCORR);
	double hs = clock();
	// Declared before the #if so that either branch, and the comparison at the
	// end of the function, can use it.
	cv::Mat matResult;
#if 0
	matchTemplate(matSrc, pTemplData->vecPyramid[iLayer], matResult, CV_TM_CCORR);
#else
	cv::Mat& matTemplate = pTemplData->vecPyramid[iLayer];
	// One output cell per placement of the template fully inside the source.
	// No setTo(0) is needed: every cell is assigned exactly once below.
	matResult.create(matSrc.rows - matTemplate.rows + 1,
		matSrc.cols - matTemplate.cols + 1, CV_32FC1);
	const int t_r_end = matTemplate.rows;
	const int t_c_count = matTemplate.cols;
	for (int r = 0; r < matResult.rows; r++)
	{
		float* r_matResult = matResult.ptr<float>(r);
		for (int c = 0; c < matResult.cols; ++c)
		{
			// Accumulate in a 64-bit integer. Adding each product directly into
			// the float cell loses low-order bits as soon as the running sum
			// exceeds 2^24, which shows up as small deviations in the result.
			long long sum = 0;
			for (int t_r = 0; t_r < t_r_end; ++t_r)
			{
				// Fetch every row through ptr() so the real row stride is
				// honoured; stepping by `cols` is wrong for padded or ROI
				// (non-continuous) matrices.
				const uchar* r_sub_source = matSrc.ptr<uchar>(r + t_r) + c;
				const uchar* r_template = matTemplate.ptr<uchar>(t_r);
				int t_c = 0;
#if 0
				// Optional SSE path: 8 pixels per iteration.
				for (; t_c + 8 <= t_c_count; t_c += 8)
				{
					// Load 8 bytes (64 bits) into the low half of each register.
					__m128i source_v = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(r_sub_source + t_c));
					__m128i template_v = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(r_template + t_c));
					// Zero-extend the 8 bytes to eight 16-bit lanes.
					__m128i source_v_16 = _mm_cvtepu8_epi16(source_v);
					__m128i template_v_16 = _mm_cvtepu8_epi16(template_v);
					// Multiply lane-wise and add adjacent pairs -> four 32-bit sums.
					__m128i res_v = _mm_madd_epi16(source_v_16, template_v_16);
					// Two horizontal adds reduce the four sums into lane 0.
					res_v = _mm_hadd_epi32(res_v, res_v);
					res_v = _mm_hadd_epi32(res_v, res_v);
					sum += _mm_extract_epi32(res_v, 0);
				}
#endif
				// Scalar path (also handles the tail left over by the SSE path).
				for (; t_c < t_c_count; t_c++)
				{
					sum += static_cast<int>(r_sub_source[t_c]) * static_cast<int>(r_template[t_c]);
				}
			}
			// Single rounding step: exact integer sum -> nearest float.
			r_matResult[c] = static_cast<float>(sum);
		}
	}

#endif
	m_hs += clock() - hs;
	// Debug aid: largest absolute deviation between OpenCV's result and the
	// hand-written one; inspect dMaxValue in a debugger.
	// NOTE(review): OpenCV's matchTemplate is understood to evaluate the
	// correlation via a floating-point DFT, so a small non-zero deviation from
	// the exact sum is expected even when this code is correct - confirm
	// against the OpenCV sources.
	cv::Mat diff;
	absdiff(matResult_, matResult, diff);
	double dMaxValue;
	minMaxLoc(diff, 0, &dMaxValue, 0,0);
	CCOEFF_Denominator (matSrc, pTemplData, matResult_, iLayer);
}