使用二分查找查询文本文件
前言
在大文本查找数据,本身是件麻烦的事情,前面也简单介绍了几种方式,都不是很好的处理方案,本文将介绍通过给文本文件,通过排序,建立索引,及二分查找组合的方式,来实现快速查找.使用这种也不是非常完美,在大数据大文本中,排序也是件耗时的事情.比起前几种方式,这种是极好的方案.
- 将文件读取,进行排序,写入新的文件
- 根据新文件,创建新文件的索引
- 根据二分查找中间数 获取索引
- 根据索引获取字符串,进行对比
1.读文件,排序,写入新文件
//字符串对比函数
int compare(const void *p1, const void *p2)
{
char **pc1 = p1;
char **pc2 = p2;
return strcmp(*pc1, *pc2);
}
//将文件读取,进行排序,写入新的文件
void readAndWriteFile(const char *filePath, const char *sortPath)
{
//1.1读取文件
FILE *pReader = fopen(filePath, "rb");
if (pReader == NULL)
{
printf("无法读取%s文件!\n", filePath);
return;
}
p_lineContents = calloc(LINES, sizeof(char *));
if (p_lineContents == NULL)
{
printf("给p_lineContents分配内存失败!\n");
return;
}
//读取一行并加载到内存中
for (int i = 0; i < LINES; i++)
{
char tmpStr[64] = { 0 };
fgets(tmpStr, 64, pReader);
int strLength = strlen(tmpStr);
p_lineContents[i] = calloc(strLength, sizeof(char));
char *p = strcpy(p_lineContents[i], tmpStr);
if (p == NULL)
{
printf("在%d行,拷贝%s 出现问题!\n", i, tmpStr);
break;
}
}
fclose(pReader);
//1.2 进行排序
qsort(p_lineContents, LINES, sizeof(char *), compare);
//1.3将排序好数据写入到文件中
FILE *pWriter = fopen(sortPath, "wb");
if (pWriter == NULL)
{
printf("无法创建%s文件!\n", sortPath);
return;
}
for (int i = 0; i < LINES; i++)
{
fputs(p_lineContents[i], pWriter);
}
fclose(pWriter);
}
2.根据排序好的文件,创建索引
//创建文件索引
void createIndex(char *filePath, char *indexPath)
{
if (filePath == NULL)
{
return;
}
printf("开始分配内存\n");
const int lines = LINES;
IndexInfo.pIndex = calloc(lines, sizeof(int));
IndexInfo.length = lines;
FILE *pReader = fopen(filePath, "rb");
if (pReader == NULL)
{
return;
}
printf("开始读取文件\n");
int allLength = 0;
for (int i = 0; i < lines; i++)
{
char str[64] = { 0 };
fgets(str, 64, pReader);
IndexInfo.pIndex[i] = allLength;
int currentLine = strlen(str);
allLength += currentLine;
}
fclose(pReader);
if (indexPath == NULL)
{
return;
}
FILE *pWriter = fopen(indexPath, "wb");
if (pWriter == NULL)
{
return;
}
printf("开始写入索引文件\n");
fwrite(IndexInfo.pIndex, sizeof(int), IndexInfo.length, pWriter);
fclose(pWriter);
//如果结构体的索引没有释放,则释放索引内存
if (IndexInfo.pIndex != NULL)
{
free(IndexInfo.pIndex);
printf("在内存中索引释放完成完成\n");
}
}
3.根据二分查找中间数 获取索引及获取字符串,进行对比
//将\n替换\0
void replaceN(char *str)
{
while (*str != '\0')
{
if (*str == '\r' || *str == '\n')
{
*str = '\0';
}
str++;
}
}
//将-替换\0
void replaceMidLine(char *str)
{
while (*str != '\0')
{
if (*str == '-')
{
*str = '\0';
}
str++;
}
}
//二分查找,根据索引文件的
void binarySearch(const char *filePath, const char *indexPath, const char *str)
{
int header = 0;
int footer = LINES - 1;
int flag = 0;
while (header <= footer)
{
int mid = (header + footer) / 2;
char midStr[128] = { 0 };
{
FILE *pf1 = fopen(filePath, "rb"); //sortaesc
FILE *pf2 = fopen(indexPath, "rb"); //indexsort
int indexNum = -1;
//计算中间数,让文件指针移动该地址,获取到索引的值
fseek(pf2, mid*sizeof(int), SEEK_SET);
fread(&indexNum, sizeof(int), 1, pf2);
//根据索引值,让文件指针移动到索引的地址,获取字符串
fseek(pf1, indexNum, SEEK_SET);
fgets(midStr, 128, pf1);
fclose(pf1);
fclose(pf2);
}
replaceN(midStr);
char tmpstr[128] = { 0 };
sprintf(tmpstr, midStr);
replaceMidLine(tmpstr);
//进行对比
int result = strcmp(tmpstr, str);
if (result == 0)
{
flag = 1;
printf("%s", midStr);
break;
}
else if (result == 1)
{
footer = mid - 1;
}
else
{
header = mid + 1;
}
}
if (!flag)
{
printf("没有找到%s\n", str);
}
}
测试并运行
//文件的行数,单独获取
#define LINES 101
struct FileIndex
{
int *pIndex; //索引
int length; //长度
}IndexInfo;
int main(int argc, char *argv[])
{
readAndWriteFile("qq", "qq_bin_sort");
createIndex("qq_bin_sort", "qq_bin_index");
while (1)
{
printf("请输入要查找的字符串:");
char str[128] = { 0 };
scanf("%s", str);
binarySearch("qq", "qq_bin_index", str);
printf("\n");
}
system("pause");
return 0;
}
运行效果图

秋风
2016-07-17