Skip to content

Commit b4af1f4

Browse files
authored
[DE-83] Bugfix/stopwords (#414)
* reverted default test docker images repository to docker.io
* fixed handling of hex and verbatim strings in stopwords analyzer properties
1 parent 357b1a3 commit b4af1f4

File tree

3 files changed

+83
-13
lines changed

3 files changed

+83
-13
lines changed

docker/start_db.sh

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
# Configuration environment variables:
44
# STARTER_MODE: (single|cluster|activefailover), default single
5-
# DOCKER_IMAGE: ArangoDB docker image, default gcr.io/gcr-for-testing/arangodb/arangodb:latest
5+
# DOCKER_IMAGE: ArangoDB docker image, default docker.io/arangodb/arangodb:latest
66
# SSL: (true|false), default false
77
# DATABASE_EXTENDED_NAMES: (true|false), default false
88
# ARANGO_LICENSE_KEY: only required for ArangoDB Enterprise
@@ -11,11 +11,11 @@
1111
# STARTER_MODE=cluster SSL=true ./start_db.sh
1212

1313
STARTER_MODE=${STARTER_MODE:=single}
14-
DOCKER_IMAGE=${DOCKER_IMAGE:=gcr.io/gcr-for-testing/arangodb/arangodb:latest}
14+
DOCKER_IMAGE=${DOCKER_IMAGE:=docker.io/arangodb/arangodb:latest}
1515
SSL=${SSL:=false}
1616
DATABASE_EXTENDED_NAMES=${DATABASE_EXTENDED_NAMES:=false}
1717

18-
STARTER_DOCKER_IMAGE=gcr.io/gcr-for-testing/arangodb/arangodb-starter:latest
18+
STARTER_DOCKER_IMAGE=docker.io/arangodb/arangodb-starter:latest
1919

2020
# exit when any command fails
2121
set -e

src/main/java/com/arangodb/entity/arangosearch/analyzer/StopwordsAnalyzerProperties.java

Lines changed: 71 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
import java.util.ArrayList;
2525
import java.util.List;
2626
import java.util.Objects;
27+
import java.util.stream.Collectors;
2728

2829
/**
2930
* @author Michele Rastelli
@@ -38,26 +39,89 @@ private static String stringToHex(String str) {
3839
return hex.toString();
3940
}
4041

42+
private static String hexToString(String hex) {
43+
final StringBuilder result = new StringBuilder();
44+
for (int i = 0; i < hex.length() - 1; i += 2) {
45+
String tempInHex = hex.substring(i, (i + 2));
46+
int decimal = Integer.parseInt(tempInHex, 16);
47+
result.append((char) decimal);
48+
}
49+
return result.toString();
50+
}
51+
4152
public StopwordsAnalyzerProperties() {
4253
stopwords = new ArrayList<>();
54+
hex = true;
4355
}
4456

45-
private List<String> stopwords;
57+
private final List<String> stopwords;
58+
private final boolean hex;
4659

4760
/**
48-
* @return array of hex-encoded strings that describe the tokens to be discarded.
61+
* @return list of hex-encoded strings that describe the tokens to be discarded.
62+
* @deprecated use {@link #getStopwordsAsHexList()} instead
4963
*/
64+
@Deprecated
5065
public List<String> getStopwords() {
51-
return stopwords;
66+
return getStopwordsAsHexList();
67+
}
68+
69+
/**
70+
* @return list of verbatim strings that describe the tokens to be discarded.
71+
*/
72+
public List<String> getStopwordsAsStringList() {
73+
if (hex) {
74+
return stopwords.stream()
75+
.map(StopwordsAnalyzerProperties::hexToString)
76+
.collect(Collectors.toList());
77+
} else {
78+
return stopwords;
79+
}
5280
}
5381

82+
/**
83+
* @return list of hex-encoded strings that describe the tokens to be discarded.
84+
*/
85+
public List<String> getStopwordsAsHexList() {
86+
if (hex) {
87+
return stopwords;
88+
} else {
89+
return stopwords.stream()
90+
.map(StopwordsAnalyzerProperties::stringToHex)
91+
.collect(Collectors.toList());
92+
}
93+
}
94+
95+
/**
96+
* @return {@code true} if each string in {@link #stopwords} is hex-encoded, {@code false} if it is used verbatim.
97+
*/
98+
public boolean getHex() {
99+
return hex;
100+
}
101+
102+
/**
103+
* @param value stopword as verbatim string
104+
* @return this
105+
*/
54106
public StopwordsAnalyzerProperties addStopwordAsString(final String value) {
55-
stopwords.add(stringToHex(value));
107+
if (hex) {
108+
stopwords.add(stringToHex(value));
109+
} else {
110+
stopwords.add(value);
111+
}
56112
return this;
57113
}
58114

115+
/**
116+
* @param value stopword as hex string
117+
* @return this
118+
*/
59119
public StopwordsAnalyzerProperties addStopwordAsHex(final String value) {
60-
stopwords.add(value);
120+
if (hex) {
121+
stopwords.add(value);
122+
} else {
123+
stopwords.add(hexToString(value));
124+
}
61125
return this;
62126
}
63127

@@ -66,11 +130,11 @@ public boolean equals(Object o) {
66130
if (this == o) return true;
67131
if (o == null || getClass() != o.getClass()) return false;
68132
StopwordsAnalyzerProperties that = (StopwordsAnalyzerProperties) o;
69-
return Objects.equals(stopwords, that.stopwords);
133+
return hex == that.hex && Objects.equals(stopwords, that.stopwords);
70134
}
71135

72136
@Override
73137
public int hashCode() {
74-
return Objects.hash(stopwords);
138+
return Objects.hash(stopwords, hex);
75139
}
76140
}

src/test/java/com/arangodb/ArangoSearchTest.java

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -870,15 +870,21 @@ public void stopwordsAnalyzer() {
870870
.addStopwordAsHex("616e64")
871871
.addStopwordAsString("the");
872872

873-
assertThat(properties.getStopwords(), hasItem("616e64"));
874-
assertThat(properties.getStopwords(), hasItem("746865"));
873+
assertThat(properties.getStopwordsAsStringList(), hasItem("and"));
874+
assertThat(properties.getStopwordsAsHexList(), hasItem("746865"));
875875

876876
StopwordsAnalyzer analyzer = new StopwordsAnalyzer();
877-
analyzer.setName("test-" + UUID.randomUUID().toString());
877+
String name = "test-" + UUID.randomUUID().toString();
878+
analyzer.setName(name);
878879
analyzer.setProperties(properties);
879880
analyzer.setFeatures(features);
880881

881882
createGetAndDeleteTypedAnalyzer(analyzer);
883+
db.createSearchAnalyzer(analyzer);
884+
String res = db.query("RETURN FLATTEN(TOKENS(SPLIT('the fox and the dog and a theater', ' '), @aName))",
885+
Collections.singletonMap("aName", name), String.class).next();
886+
assertThat(res, is("[\"fox\",\"dog\",\"a\",\"theater\"]"));
887+
db.deleteSearchAnalyzer(name);
882888
}
883889

884890
@Test

0 commit comments

Comments
 (0)