Skip to content

Commit 2796d1f

Browse files
committed
Moving label encoding to method
1 parent: 70dc88b · commit: 2796d1f

File tree

1 file changed

+14
-32
lines changed

1 file changed

+14
-32
lines changed

pandas/io/stata.py

Lines changed: 14 additions & 32 deletions
Original file line number | Diff line number | Diff line change
@@ -665,10 +665,18 @@ def __init__(self, catarray: Series, encoding: str = "latin-1"):
665665
categories = catarray.cat.categories
666666
self.value_labels = list(zip(np.arange(len(categories)), categories))
667667
self.value_labels.sort(key=lambda x: x[0])
668+
668669
self.text_len = 0
669670
self.txt: list[bytes] = []
670671
self.n = 0
672+
self.off = np.array([])
673+
self.val = np.array([])
674+
self.len = 0
675+
676+
self._prepare_value_labels()
671677

678+
def _prepare_value_labels(self):
679+
""" Encode value labels. """
672680
# Compute lengths and setup lists of offsets and labels
673681
offsets: list[int] = []
674682
values: list[int] = []
@@ -677,10 +685,10 @@ def __init__(self, catarray: Series, encoding: str = "latin-1"):
677685
if not isinstance(category, str):
678686
category = str(category)
679687
warnings.warn(
680-
value_label_mismatch_doc.format(catarray.name),
688+
value_label_mismatch_doc.format(self.labname),
681689
ValueLabelTypeMismatch,
682690
)
683-
category = category.encode(encoding)
691+
category = category.encode(self._encoding)
684692
offsets.append(self.text_len)
685693
self.text_len += len(category) + 1 # +1 for the padding
686694
values.append(vl[0])
@@ -785,37 +793,11 @@ def __init__(
785793
self.text_len = 0
786794
self.txt: list[bytes] = []
787795
self.n = 0
796+
self.off = np.array([])
797+
self.val = np.array([])
798+
self.len = 0
788799

789-
# Compute lengths and setup lists of offsets and labels
790-
offsets: list[int] = []
791-
values: list[int] = []
792-
for vl in self.value_labels:
793-
category = vl[1]
794-
if not isinstance(category, str):
795-
category = str(category)
796-
warnings.warn(
797-
value_label_mismatch_doc.format(labname),
798-
ValueLabelTypeMismatch,
799-
)
800-
category = category.encode(encoding)
801-
offsets.append(self.text_len)
802-
self.text_len += len(category) + 1 # +1 for the padding
803-
values.append(vl[0])
804-
self.txt.append(category)
805-
self.n += 1
806-
807-
if self.text_len > 32000:
808-
raise ValueError(
809-
"Stata value labels for a single variable must "
810-
"have a combined length less than 32,000 characters."
811-
)
812-
813-
# Ensure int32
814-
self.off = np.array(offsets, dtype=np.int32)
815-
self.val = np.array(values, dtype=np.int32)
816-
817-
# Total length
818-
self.len = 4 + 4 + 4 * self.n + 4 * self.n + self.text_len
800+
self._prepare_value_labels()
819801

820802

821803
class StataMissingValue:

0 commit comments

Comments
 (0)