Skip to content

Commit 8e57e46

Browse files
committed
Adding tests for invalid Stata names and repeated value labels
1 parent ae4dca7 commit 8e57e46

File tree

1 file changed

+82
-3
lines changed

1 file changed

+82
-3
lines changed

pandas/tests/io/test_stata.py

Lines changed: 82 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -2054,14 +2054,18 @@ def test_stata_compression(compression_only, read_infer, to_infer):
20542054
def test_non_categorical_value_labels():
20552055
data = DataFrame(
20562056
{
2057-
"X": [1, 2, 3, 4, 1],
2057+
"fully_labelled": [1, 2, 3, 3, 1],
2058+
"partially_labelled": [1.0, 2.0, np.nan, 9.0, np.nan],
20582059
"Y": [7, 7, 9, 8, 10],
20592060
"Z": pd.Categorical(["j", "k", "l", "k", "j"]),
20602061
}
20612062
)
20622063

20632064
with tm.ensure_clean() as path:
2064-
value_labels = {"X": {1: "one", 2: "two", 4: "four"}}
2065+
value_labels = {
2066+
"fully_labelled": {1: "one", 2: "two", 3: "three"},
2067+
"partially_labelled": {1.0: "one", 2.0: "two"},
2068+
}
20652069
expected = {**value_labels, "Z": {0: "j", 1: "k", 2: "l"}}
20662070

20672071
writer = StataWriter(path, data, value_labels=value_labels)
@@ -2072,7 +2076,7 @@ def test_non_categorical_value_labels():
20722076
assert reader_value_labels == expected
20732077

20742078
msg = "Can't create value labels for notY, it wasn't found in the dataset."
2075-
with pytest.raises(ValueError, match=msg):
2079+
with pytest.raises(KeyError, match=msg):
20762080
value_labels = {"notY": {7: "label1", 8: "label2"}}
20772081
writer = StataWriter(path, data, value_labels=value_labels)
20782082

@@ -2083,3 +2087,78 @@ def test_non_categorical_value_labels():
20832087
with pytest.raises(ValueError, match=msg):
20842088
value_labels = {"Z": {1: "a", 2: "k", 3: "j", 4: "i"}}
20852089
writer = StataWriter(path, data, value_labels=value_labels)
2090+
2091+
2092+
def test_non_categorical_value_label_name_conversion():
    """Value labels must be stored under the converted (Stata-valid) names.

    Writes a frame whose column names are all invalid as Stata variable
    names, attaches value labels keyed by the *original* names, and checks
    that the labels read back from the file are keyed by the converted
    names. Writing is expected to emit ``InvalidColumnName``.
    """
    # Check conversion of invalid variable names
    data = DataFrame(
        {
            "invalid~!": [1, 1, 2, 3, 5, 8],  # Only alphanumeric and _
            "6_invalid": [1, 1, 2, 3, 5, 8],  # Must start with letter or _
            "invalid_name_longer_than_32_characters": [8, 8, 9, 9, 8, 8],  # Too long
            "aggregate": [2, 5, 5, 6, 6, 9],  # Reserved words
            (1, 2): [1, 2, 3, 4, 5, 6],  # Hashable non-string
        }
    )

    # Labels are supplied under the names used in the DataFrame ...
    value_labels = {
        "invalid~!": {1: "label1", 2: "label2"},
        "6_invalid": {1: "label1", 2: "label2"},
        "invalid_name_longer_than_32_characters": {8: "eight", 9: "nine"},
        "aggregate": {5: "five"},
        (1, 2): {3: "three"},
    }

    # ... and are expected back under the names actually written to the file.
    expected = {
        "invalid__": {1: "label1", 2: "label2"},
        "_6_invalid": {1: "label1", 2: "label2"},
        "invalid_name_longer_than_32_char": {8: "eight", 9: "nine"},
        "_aggregate": {5: "five"},
        "_1__2_": {3: "three"},
    }

    with tm.ensure_clean() as path:
        with tm.assert_produces_warning(InvalidColumnName):
            data.to_stata(path, value_labels=value_labels)

        # Use the reader as a context manager so the underlying file handle
        # is closed before the temporary file is cleaned up.
        with StataReader(path) as reader:
            reader_value_labels = reader.value_labels()

        assert reader_value_labels == expected
2127+
2128+
2129+
def test_non_categorical_value_label_convert_categoricals_error():
    """Repeated value labels round-trip raw but fail categorical conversion.

    Writes a column whose value labels map two different values to the same
    label, checks that the labels can be read back unchanged with
    ``convert_categoricals=False``, and checks that ``read_stata`` with
    ``convert_categoricals=True`` raises ``ValueError`` with the expected
    message.
    """
    import re

    # Mapping more than one value to the same label is valid for Stata
    # labels, but can't be read with convert_categoricals=True
    value_labels = {
        "repeated_labels": {10: "Ten", 20: "More than ten", 40: "More than ten"}
    }

    data = DataFrame(
        {
            "repeated_labels": [10, 10, 20, 20, 40, 40],
        }
    )

    with tm.ensure_clean() as path:
        data.to_stata(path, value_labels=value_labels)

        # Use the reader as a context manager so the underlying file handle
        # is closed before the file is read again below.
        with StataReader(path, convert_categoricals=False) as reader:
            reader_value_labels = reader.value_labels()

        assert reader_value_labels == value_labels

        col = "repeated_labels"
        repeats = "-" * 80 + "\n" + "\n".join(["More than ten"])

        # The message body starts at column 0 on purpose: leading whitespace
        # would become part of the pattern being matched.
        msg = f"""
Value labels for column {col} are not unique. These cannot be converted to
pandas categoricals.

Either read the file with `convert_categoricals` set to False or use the
low level interface in `StataReader` to separately read the values and the
value_labels.

The repeated labels are:
{repeats}
"""
        # ``match`` is applied as a regular expression; escape the message so
        # that "." and other metacharacters are matched literally.
        with pytest.raises(ValueError, match=re.escape(msg)):
            read_stata(path, convert_categoricals=True)

0 commit comments

Comments
 (0)