发布
2012年2月17日 (最后更新: 2012年2月17日)

输入纠错

评分: 3.7/5 (81票)
*****
注意: 我不确定我用于比较字符串的算法(请参阅下面的“similarity()”函数)是否正确;如果有人有任何改进意见,请随时发站内信给我。

我编写了一个函数,可以对无效的用户输入提出纠错建议。

例如,在使用 git(1) 时,如果您输入“git comit”,它会显示“您是指这个吗?/ commit”。它检测到参数(“comit”)无效,然后建议一个更正(“commit”)。这就是这个函数所做的(或试图做的)。

它是通过比较两个字符串并生成一个表示它们相似度的“分数”(通过对应用于两个字符串的 strcmp(3) 的返回值取绝对值并求和得到;零表示最相似,理论上不相似的字符串没有限制)。它使用这些分数来构建一个包含分数和相应字符串的关联数组。然后对该数组进行排序(使用 qsort(3)),以便键按正确的顺序排列。

源代码
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
/* Copyright (C) 2012 Chrisname.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#define MIN(a, b)	(((a) < (b)) ?  (a) : (b))
#define MAX(a, b)	(((a) > (b)) ?  (a) : (b))
#define ABS(x)		(((x) < 0)   ? -(x) : (x))

/**
 * A simple associative array.
 */
struct assoc_array {
	int		key;	/**< The key value. */
	const char*	value;	/**< The string value. */
};

/* Comparator for qsort(3) */
static int qsort_compare_keys(const void* p1, const void* p2)
{
	struct assoc_array* node1 = (struct assoc_array*)p1;
	struct assoc_array* node2 = (struct assoc_array*)p2;
	/* Return the difference between 'node1->key' and 'node2->key'
	 */
	return node1->key - node2->key;
}

/** Returns the absolute similarity score for 's1' and 's2' */
static int similarity(const char* s1, const char* s2)
{
	int score = 0;
	unsigned long len1 = strlen(s1), len2 = strlen(s2), len = MAX(len1, len2);
	int i = 0;
	/* Sum the absolute values of the differences between each character of
	 * s1 and s2 given by strcmp(3)
	 */
	while (len --> 0) {
		int abs;
		/* If one string has ended, compare the rest of the other string
		 * with zero. Otherwise, compare the two strings as normal.
		 */
		if (len1 >= i && len2 < i) {
			int x = 0 - s1[i];
			score += ABS(x);
		} else if (len2 >= i && len1 < i) {
			int x = 0 - s2[i];
			score += ABS(x);
		} else {
			abs = ABS(strcmp(s1 + i, s2 + i));
			score += abs;
			/* Skip past the non-matching char so we don't compare it again
			 */
			while (s1[i] == s2[i] && len > 0) {
				++i;
				--len;
			}
			++i;
		}
	}
	return score;
}

/**
 * \brief	Suggests corrections to user input.
 * \param key	The key string.
 * \param values An array of strings to compare with the key string.
 * \param count	The number of elements in array 'values'.
 * \param n	The number of outputs to print.
 * \param reverse Whether to print output in reverse.
 * \param detail Whether to print detailed information.
 */
void correct(const char* key, char* values[], int count, long n, int reverse, int detail)
{
	int i;
	struct assoc_array* aa = malloc(sizeof(struct assoc_array) * count);
	/* Build an associative array out of values
	 */
	for (i = 0; i < count; ++i) {
		aa[i].key   = similarity(key, values[i]);
		aa[i].value = values[i];
	}
	/* Sort 'aa' by key
	 */
	qsort(aa, count, sizeof(struct assoc_array), qsort_compare_keys);
	/* Print the values in key order
	 */
	if (n == 0)
		n = count;
	if (reverse) {
		if (n)
			n = MAX(n, count - n);
		while (n --> 0) {
			if (detail)
				printf("aa[%d] = { %d, %s }\n", n, aa[n].key,
								aa[n].value);
			else
				printf("%s\n", aa[n].value);
		}
	} else {
		if (n)
			n = MIN(n, count);
		for (i = 0; i < n; ++i) {
			if (detail)
				printf("aa[%d] = { %d, %s }\n", i, aa[i].key,
								aa[i].value);
			else
				printf("%s\n", aa[i].value);
		}
	}
	free(aa);
}


用法示例
这是我为该函数创建的程序的源代码,我将其用于 shell 脚本。上面的代码已省略;只需将其粘贴到下面的代码中的“在此处粘贴”标记处即可。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
/*
 * correct - finds which parameter/s is/are most similar to the key parameter.
 *
 * Copyright (C) 2012 Chrisname.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include <getopt.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define PROGRAM_NAME	"correct"
#define VERSION_MAJOR	1
#define VERSION_MINOR	0
#define VERSION_PATCH	0

/*
 * PASTE HERE.
 */

/** Shows the usage message */
static void show_usage(const char* argv0)
{
	printf("correct - finds which parameter/s is/are most similar to the key parameter.\n"
		"Usage: %s [options] <key> <value 1> [value 2 [value ... [value n ]]]\n"
		"Options:\n"
		"\t-h,--help\tShow this help message\n"
		"\t-v,--version\tShow version information\n"
		"\t-n,--number\tSet the number of parameters to print. 0 = all. Default: 1\n"
		"\t-r,--reverse\tPrint output in reverse\n"
		"\t-d,--detail\tPrint more detailed information\n", argv0);
}

/** Shows the version message */
static void show_version()
{
	printf(PROGRAM_NAME " version %d.%d%d\n", VERSION_MAJOR, VERSION_MINOR,
								VERSION_PATCH);
}

int main(int argc, char* argv[])
{
	if (argc < 2) {
		show_usage(argv[0]);
		exit(EXIT_FAILURE);
	}
	int reverse = 0, detail = 0;
	long n = 1;
	for (;;) {
		static const char* optstring = "hvn:rd";
		static struct option longopts[] = {
			{ "help",	0, NULL, 'h' },
			{ "version",	0, NULL, 'v' },
			{ "number",	1, NULL, 'n' },
			{ "reverse",	0, NULL, 'r' },
			{ "detail",	1, NULL, 'd' }
		};
		int c = getopt_long(argc, argv, optstring, longopts, NULL);
		if (c < 0)
			break;
		switch (c) {
			case 'h':
				show_usage(argv[0]);
				break;
			case 'v':
				show_version();
				break;
			case 'n':
				n = strtol(optarg, NULL, 10);
				break;
			case 'r':
				reverse = 1;
				break;
			case 'd':
				detail = 1;
				break;
			default:
				break;
		}
	}
	if ((argc - optind) < 2) {
		show_usage(argv[0]);
		exit(EXIT_FAILURE);
	}
	correct(argv[optind], &argv[optind + 1], argc - optind - 1, n, reverse, detail);
	return 0;
}


输出
$ ./correct -n 0 hello hallo hail help hello
hello                    <-- This string is equal to the key string
hallo                    <-- This string has a similarity of 4 to the key string
help                     <-- This string has a similarity of 115 to the key string
hail                     <-- This string has a similarity of 118 to the key string