Skip to content

Commit 32a9c11

Browse files
committed
Add script to aid generation of perfect hash table
1 parent 7af8ee0 commit 32a9c11

File tree

1 file changed

+112
-0
lines changed

1 file changed

+112
-0
lines changed
Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,112 @@
1+
<?php
2+
3+
/**
 * Read the struct names listed in the mbfl_encoding_ptr_list array of a C source file.
 *
 * The array initializer is located with a regular expression, split on commas,
 * and every entry is stripped of its leading `&` and surrounding whitespace.
 * The terminating NULL entry and empty fragments are left out of the result.
 *
 * @param string $file_path C source file to parse; defaults to mbfl_encoding.c next to this script.
 * @return array List of struct names in the order they appear in the array.
 * @throws RuntimeException if the file cannot be read or the array cannot be found.
 */
function read_encoding_pointer_array(string $file_path = __DIR__ . '/mbfl_encoding.c'): array {
    $file_content = file_get_contents($file_path);
    if ($file_content === false) {
        throw new RuntimeException("Unable to read $file_path");
    }

    $pattern = '/static const mbfl_encoding \*mbfl_encoding_ptr_list\[\][\s\S]*?\{([^}]*)\};/';
    // Checked explicitly instead of via assert(), which is skipped when assertions are disabled.
    if (preg_match($pattern, $file_content, $matches) !== 1) {
        throw new RuntimeException("Could not find mbfl_encoding_ptr_list in $file_path");
    }

    $names = [];
    // Split on the comma alone so that CRLF line endings or trailing blanks do not glue entries together.
    foreach (explode(',', $matches[1]) as $item) {
        $name = trim($item, "& \t\r\n");
        // Drop the NULL terminator by value rather than by position.
        if ($name !== '' && $name !== 'NULL') {
            $names[] = $name;
        }
    }
    return $names;
}
16+
17+
/**
 * Collect the encoding names of the given mbfl_encoding structs defined in one C source file.
 *
 * Scans the file for definitions of the form `const mbfl_encoding <struct> = { ... }`
 * where <struct> is one of $struct_names, and stores the second initializer field
 * (with quotes and whitespace stripped) in $result under the struct name.
 *
 * @param array  $result       Map of struct name => encoding name; matches are added in place.
 * @param array  $struct_names Struct identifiers to look for.
 * @param string $file_path    Path of the C source file to scan.
 * @throws RuntimeException if the file cannot be read.
 */
function search_struct_in_file(array &$result, $struct_names, $file_path)
{
    // An empty alternation would build a pattern that matches an empty struct name.
    if (empty($struct_names)) {
        return;
    }

    $fileContent = file_get_contents($file_path);
    if ($fileContent === false) {
        throw new RuntimeException("Unable to read $file_path");
    }

    // Quote the names so they are always matched literally inside the pattern.
    $alternatives = array_map(function ($struct_name) {
        return preg_quote($struct_name, '/');
    }, $struct_names);
    $pattern = '/const mbfl_encoding\s+(' . implode('|', $alternatives) . ')\s*=\s*\{([^}]*)\}/';

    preg_match_all($pattern, $fileContent, $matches, PREG_SET_ORDER);
    foreach ($matches as $match) {
        $current_struct_name = $match[1];
        $fields = explode(',', $match[2]);
        // Note: name is the second field; skip initializers that do not have one.
        if (!isset($fields[1])) {
            continue;
        }
        $result[$current_struct_name] = trim($fields[1], " \r\n\t\"");
    }
}
30+
31+
/**
 * Look up the encoding names of the given structs in the C sources around this script.
 *
 * Every file matched by the glob pattern below is passed to search_struct_in_file(),
 * and the matches from all files are accumulated into one map.
 *
 * @param array $struct_names Struct identifiers to look for.
 * @return array Map of struct name => encoding name for every struct that was found.
 */
function search_struct_in_dir($struct_names): array
{
    $result = [];
    // NOTE(review): PHP's glob() has no recursive globstar, so the double star in this
    // pattern should behave like a single one and only match .c files exactly one
    // directory below the parent directory; confirm that this is the intended scope.
    $files = glob(__DIR__ . "/../**/*.c");
    // glob() returns false on error (and, on some systems, for an empty match).
    if ($files === false) {
        $files = [];
    }
    foreach ($files as $file) {
        search_struct_in_file($result, $struct_names, $file);
    }
    return $result;
}
39+
40+
// Resolve every entry of the encoding pointer array to its encoding name.
$encoding_pointer_array = read_encoding_pointer_array();
$encoding_pointer_array_name_mapping = search_struct_in_dir($encoding_pointer_array);

// The single byte encodings are generated and cannot be found in dedicated generated structs
$fixed_encodings = [
    'mbfl_encoding_cp1251' => 'Windows-1251',
    'mbfl_encoding_cp1252' => 'Windows-1252',
    'mbfl_encoding_cp1254' => 'Windows-1254',
    'mbfl_encoding_8859_1' => 'ISO-8859-1',
    'mbfl_encoding_8859_2' => 'ISO-8859-2',
    'mbfl_encoding_8859_3' => 'ISO-8859-3',
    'mbfl_encoding_8859_4' => 'ISO-8859-4',
    'mbfl_encoding_8859_5' => 'ISO-8859-5',
    'mbfl_encoding_8859_6' => 'ISO-8859-6',
    'mbfl_encoding_8859_7' => 'ISO-8859-7',
    'mbfl_encoding_8859_8' => 'ISO-8859-8',
    'mbfl_encoding_8859_9' => 'ISO-8859-9',
    'mbfl_encoding_8859_10' => 'ISO-8859-10',
    'mbfl_encoding_8859_13' => 'ISO-8859-13',
    'mbfl_encoding_8859_14' => 'ISO-8859-14',
    'mbfl_encoding_8859_15' => 'ISO-8859-15',
    'mbfl_encoding_8859_16' => 'ISO-8859-16',
    'mbfl_encoding_cp866' => 'CP866',
    'mbfl_encoding_cp850' => 'CP850',
    'mbfl_encoding_koi8r' => 'KOI8-R',
    'mbfl_encoding_koi8u' => 'KOI8-U',
    'mbfl_encoding_armscii8' => 'ArmSCII-8',
    'mbfl_encoding_ascii' => 'ASCII',
];

// Add the fixed encodings
foreach ($fixed_encodings as $encoding_pointer => $encoding_name) {
    $encoding_pointer_array_name_mapping[$encoding_pointer] = $encoding_name;
}

// Consistency check: all of the encoding pointer array entries should be found.
// Raised as an exception rather than via assert() so the check also runs when
// assertions are disabled in the PHP configuration.
foreach ($encoding_pointer_array as $encoding_pointer) {
    if (!isset($encoding_pointer_array_name_mapping[$encoding_pointer])) {
        throw new RuntimeException("Missing entry for $encoding_pointer");
    }
}

$ordered_name_list = array_map(function ($encoding_pointer) use ($encoding_pointer_array_name_mapping) {
    return $encoding_pointer_array_name_mapping[$encoding_pointer];
}, $encoding_pointer_array);

// Write out ordered name list, and invoke gperf for computing the perfect hash table
$encodings_file = __DIR__ . '/encodings.txt';
if (file_put_contents($encodings_file, implode("\n", $ordered_name_list)) === false) {
    throw new RuntimeException("Unable to write $encodings_file");
}
ob_start();
passthru('gperf ' . escapeshellarg($encodings_file) . ' --readonly-tables --null-strings --ignore-case -m 1000', $gperf_exit_code);
$output = ob_get_clean();
// Remove the temporary input file before any failure is reported.
if (file_exists($encodings_file)) {
    unlink($encodings_file);
}
if ($gperf_exit_code !== 0) {
    throw new RuntimeException("gperf failed with exit code $gperf_exit_code");
}

// Find asso_values array in $output
$pattern = '/static const unsigned char asso_values\[\] =([^}]*)\};/';
if (preg_match($pattern, $output, $matches) !== 1) {
    throw new RuntimeException('Could not find the asso_values array in the gperf output');
}
$asso_values = trim($matches[1], "\t \n{");
echo "===--- Copy and paste the following values in the asso_values array in mbfl_encoding.c ---===\n";
echo $asso_values, "\n";

// Find word_list array in $output
$pattern = '/static const char \* const wordlist\[\] =([^}]*)\};/';
if (preg_match($pattern, $output, $matches) !== 1) {
    throw new RuntimeException('Could not find the wordlist array in the gperf output');
}
$word_list = trim($matches[1], "\t \n{");
// Empty slots are printed as (char*)0 by gperf; they become -1 in the index table.
$word_list = str_replace('(char*)0', '-1', $word_list);
// Replace every quoted encoding name by the position of its struct in the pointer array.
// Walking the pointer array directly gives the index without a search per name and
// skips mapping entries that have no position in the array.
foreach ($encoding_pointer_array as $index => $encoding_pointer) {
    $encoding_name = $encoding_pointer_array_name_mapping[$encoding_pointer];
    $word_list = str_replace("\"$encoding_name\"", (string) $index, $word_list);
}

echo "===--- Copy and paste the following values in the mbfl_encoding_ptr_list_after_hashing array in mbfl_encoding.c ---===\n";
echo $word_list, "\n";

0 commit comments

Comments
 (0)