@@ -2054,14 +2054,18 @@ def test_stata_compression(compression_only, read_infer, to_infer):
2054
2054
def test_non_categorical_value_labels ():
2055
2055
data = DataFrame (
2056
2056
{
2057
- "X" : [1 , 2 , 3 , 4 , 1 ],
2057
+ "fully_labelled" : [1 , 2 , 3 , 3 , 1 ],
2058
+ "partially_labelled" : [1.0 , 2.0 , np .nan , 9.0 , np .nan ],
2058
2059
"Y" : [7 , 7 , 9 , 8 , 10 ],
2059
2060
"Z" : pd .Categorical (["j" , "k" , "l" , "k" , "j" ]),
2060
2061
}
2061
2062
)
2062
2063
2063
2064
with tm .ensure_clean () as path :
2064
- value_labels = {"X" : {1 : "one" , 2 : "two" , 4 : "four" }}
2065
+ value_labels = {
2066
+ "fully_labelled" : {1 : "one" , 2 : "two" , 3 : "three" },
2067
+ "partially_labelled" : {1.0 : "one" , 2.0 : "two" },
2068
+ }
2065
2069
expected = {** value_labels , "Z" : {0 : "j" , 1 : "k" , 2 : "l" }}
2066
2070
2067
2071
writer = StataWriter (path , data , value_labels = value_labels )
@@ -2072,7 +2076,7 @@ def test_non_categorical_value_labels():
2072
2076
assert reader_value_labels == expected
2073
2077
2074
2078
msg = "Can't create value labels for notY, it wasn't found in the dataset."
2075
- with pytest .raises (ValueError , match = msg ):
2079
+ with pytest .raises (KeyError , match = msg ):
2076
2080
value_labels = {"notY" : {7 : "label1" , 8 : "label2" }}
2077
2081
writer = StataWriter (path , data , value_labels = value_labels )
2078
2082
@@ -2083,3 +2087,78 @@ def test_non_categorical_value_labels():
2083
2087
with pytest .raises (ValueError , match = msg ):
2084
2088
value_labels = {"Z" : {1 : "a" , 2 : "k" , 3 : "j" , 4 : "i" }}
2085
2089
writer = StataWriter (path , data , value_labels = value_labels )
2090
+
2091
+
2092
def test_non_categorical_value_label_name_conversion():
    """Check that value labels are stored under the converted column names.

    Each column name below is invalid as a Stata variable name for a
    different reason.  Writing the frame must emit ``InvalidColumnName`` and
    the value labels read back from the file must be keyed by the converted
    names listed in ``expected``, not by the names passed in.
    """
    # Check conversion of invalid variable names
    data = DataFrame(
        {
            "invalid~!": [1, 1, 2, 3, 5, 8],  # Only alphanumeric and _
            "6_invalid": [1, 1, 2, 3, 5, 8],  # Must start with letter or _
            "invalid_name_longer_than_32_characters": [8, 8, 9, 9, 8, 8],  # Too long
            "aggregate": [2, 5, 5, 6, 6, 9],  # Reserved words
            (1, 2): [1, 2, 3, 4, 5, 6],  # Hashable non-string
        }
    )

    # Labels are supplied under the original (invalid) column names ...
    value_labels = {
        "invalid~!": {1: "label1", 2: "label2"},
        "6_invalid": {1: "label1", 2: "label2"},
        "invalid_name_longer_than_32_characters": {8: "eight", 9: "nine"},
        "aggregate": {5: "five"},
        (1, 2): {3: "three"},
    }

    # ... and are expected back under the converted names.
    expected = {
        "invalid__": {1: "label1", 2: "label2"},
        "_6_invalid": {1: "label1", 2: "label2"},
        "invalid_name_longer_than_32_char": {8: "eight", 9: "nine"},
        "_aggregate": {5: "five"},
        "_1__2_": {3: "three"},
    }

    with tm.ensure_clean() as path:
        with tm.assert_produces_warning(InvalidColumnName):
            data.to_stata(path, value_labels=value_labels)

        # Close the reader explicitly (via the context manager) so it is
        # released before ensure_clean removes the temporary file.
        with StataReader(path) as reader:
            reader_value_labels = reader.value_labels()
        assert reader_value_labels == expected
2127
+
2128
+
2129
def test_non_categorical_value_label_convert_categoricals_error():
    """Check that repeated value labels round-trip but cannot become categoricals.

    The labels are written with ``to_stata`` and must read back unchanged
    through ``StataReader.value_labels()`` when ``convert_categoricals`` is
    False.  Reading the same file with ``convert_categoricals=True`` must
    raise ``ValueError`` naming the column and the repeated label.
    """
    # Mapping more than one value to the same label is valid for Stata
    # labels, but can't be read with convert_categoricals=True
    value_labels = {
        "repeated_labels": {10: "Ten", 20: "More than ten", 40: "More than ten"}
    }

    data = DataFrame(
        {
            "repeated_labels": [10, 10, 20, 20, 40, 40],
        }
    )

    with tm.ensure_clean() as path:
        data.to_stata(path, value_labels=value_labels)

        # Close the reader explicitly (via the context manager) so it is
        # released before ensure_clean removes the temporary file.
        with StataReader(path, convert_categoricals=False) as reader:
            reader_value_labels = reader.value_labels()
        assert reader_value_labels == value_labels

        col = "repeated_labels"
        repeats = "-" * 80 + "\n" + "\n".join(["More than ten"])

        # The template is deliberately flush-left: pytest.raises(match=...)
        # searches for this text in the raised message, so any indentation
        # inside the literal would become part of the pattern.
        msg = f"""
Value labels for column {col} are not unique. These cannot be converted to
pandas categoricals.

Either read the file with `convert_categoricals` set to False or use the
low level interface in `StataReader` to separately read the values and the
value_labels.

The repeated labels are:
{repeats}
"""
        with pytest.raises(ValueError, match=msg):
            read_stata(path, convert_categoricals=True)
0 commit comments