使用二分查找查询文本文件

前言

   在大文本查找数据,本身是件麻烦的事情,前面也简单介绍了几种方式,都不是很好的处理方案,本文将介绍通过给文本文件,通过排序,建立索引,及二分查找组合的方式,来实现快速查找.
使用这种也不是非常完美,在大数据大文本中,排序也是件耗时的事情.比起前几种方式,这种是极好的方案.
  1. 将文件读取,进行排序,写入新的文件
  2. 根据新文件,创建新文件的索引
  3. 根据二分查找中间数 获取索引
  4. 根据索引获取字符串,进行对比

1.读文件,排序,写入新文件

//字符串对比函数
int compare(const void *p1, const void *p2)
{
	char **pc1 = p1;
	char **pc2 = p2;
	return strcmp(*pc1, *pc2);
}

//将文件读取,进行排序,写入新的文件
void readAndWriteFile(const char *filePath, const char *sortPath)
{
	//1.1读取文件

	FILE *pReader = fopen(filePath, "rb");
	if (pReader == NULL)
	{
		printf("无法读取%s文件!\n", filePath);
		return;
	}
	p_lineContents = calloc(LINES, sizeof(char *));
	if (p_lineContents == NULL)
	{
		printf("给p_lineContents分配内存失败!\n");
		return;
	}
	//读取一行并加载到内存中
	for (int i = 0; i < LINES; i++)
	{
		char tmpStr[64] = { 0 };
		fgets(tmpStr, 64, pReader);
		int strLength = strlen(tmpStr);
		p_lineContents[i] = calloc(strLength, sizeof(char));
		char *p = strcpy(p_lineContents[i], tmpStr);
		if (p == NULL)
		{
			printf("在%d行,拷贝%s 出现问题!\n", i, tmpStr);
			break;
		}
	}
	fclose(pReader);

	//1.2 进行排序

	qsort(p_lineContents, LINES, sizeof(char *), compare);

	//1.3将排序好数据写入到文件中

	FILE *pWriter = fopen(sortPath, "wb");
	if (pWriter == NULL)
	{
		printf("无法创建%s文件!\n", sortPath);
		return;
	}
	for (int i = 0; i < LINES; i++)
	{
		fputs(p_lineContents[i], pWriter);
	}
	fclose(pWriter);
}

2.根据排序好的文件,创建索引

//创建文件索引
void createIndex(char *filePath, char *indexPath)
{
	if (filePath == NULL)
	{
		return;
	}
	printf("开始分配内存\n");
	const int lines = LINES;						
	IndexInfo.pIndex = calloc(lines, sizeof(int));
	IndexInfo.length = lines;

	FILE *pReader = fopen(filePath, "rb");
	if (pReader == NULL)
	{
		return;
	}

	printf("开始读取文件\n");
	int allLength = 0;
	for (int i = 0; i < lines; i++)
	{
		char str[64] = { 0 };
		fgets(str, 64, pReader);
		IndexInfo.pIndex[i] = allLength;
		int currentLine = strlen(str);
		allLength += currentLine;
	}
	fclose(pReader);

	if (indexPath == NULL)
	{
		return;
	}
	FILE *pWriter = fopen(indexPath, "wb");
	if (pWriter == NULL)
	{
		return;
	}
	printf("开始写入索引文件\n");
	fwrite(IndexInfo.pIndex, sizeof(int), IndexInfo.length, pWriter);
	fclose(pWriter);
	//如果结构体的索引没有释放,则释放索引内存
	if (IndexInfo.pIndex != NULL)
	{
		free(IndexInfo.pIndex);
		printf("在内存中索引释放完成完成\n");
	}
}

3.根据二分查找中间数 获取索引及获取字符串,进行对比

//将\n替换\0
void replaceN(char *str)
{
	while (*str != '\0')
	{
		if (*str == '\r' || *str == '\n')
		{
			*str = '\0';
		}
		str++;
	}
}

//将-替换\0
void replaceMidLine(char *str)
{
	while (*str != '\0')
	{
		if (*str == '-')
		{
			*str = '\0';
		}
		str++;
	}
}

//二分查找,根据索引文件的
void binarySearch(const char *filePath, const char *indexPath, const char *str)
{
	int header = 0;
	int footer = LINES - 1;
	int flag = 0;
	while (header <= footer)
	{
		int mid = (header + footer) / 2;
		char midStr[128] = { 0 };
		{
			FILE *pf1 = fopen(filePath, "rb");	//sortaesc
			FILE *pf2 = fopen(indexPath, "rb");	//indexsort

			int indexNum = -1;
			//计算中间数,让文件指针移动该地址,获取到索引的值
			fseek(pf2, mid*sizeof(int), SEEK_SET);
			fread(&indexNum, sizeof(int), 1, pf2);
			
			//根据索引值,让文件指针移动到索引的地址,获取字符串
			fseek(pf1, indexNum, SEEK_SET);
			fgets(midStr, 128, pf1);

			fclose(pf1);
			fclose(pf2);
		}
		replaceN(midStr);
		char tmpstr[128] = { 0 };
		sprintf(tmpstr, midStr);
		replaceMidLine(tmpstr);

		//进行对比
		int result = strcmp(tmpstr, str); 
		if (result == 0)
		{
			flag = 1;
			printf("%s", midStr);
			break;
		}
		else if (result == 1)
		{
			footer = mid - 1;
		}
		else
		{
			header = mid + 1;
		}
	}

	if (!flag)
	{
		printf("没有找到%s\n", str);
	}
}

测试并运行

//文件的行数,单独获取
#define LINES 101							

struct FileIndex
{
	int *pIndex;						//索引
	int length;						//长度
}IndexInfo;

int main(int argc, char *argv[])
{

	readAndWriteFile("qq", "qq_bin_sort");

	createIndex("qq_bin_sort", "qq_bin_index");

	while (1)
	{
		printf("请输入要查找的字符串:");
		char str[128] = { 0 };
		scanf("%s", str);
		binarySearch("qq", "qq_bin_index", str);
		printf("\n");
	}

	system("pause");
	return 0;
}

运行效果图

根据索引进行二分查找

秋风 2016-07-17