Skip to content

Commit 08f8f29

Browse files
author
Alex Kleeman
committed
Obtained approval from Climate Corporation to incorporate
methods from their core.Data objects. Those methods are included here, but merging is not entirely complete. Renamed the package to polyglot. Added a bunch of tests, also not entirely complete.
1 parent 25c3ae4 commit 08f8f29

File tree

6 files changed

+1720
-195
lines changed

6 files changed

+1720
-195
lines changed

src/__init__.py

Whitespace-only changes.

src/data.py

Lines changed: 0 additions & 195 deletions
This file was deleted.

src/polyglot/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
from data import Dataset, Variable

src/polyglot/conventions.py

Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,129 @@
1+
import numpy as np
2+
import unicodedata
3+
4+
# Binary tags used by the netCDF-3 file format. NULL is a single zero
# byte; each NC_* constant is the 4-byte big-endian type code.
NULL = '\x00'
NC_BYTE = '\x00\x00\x00\x01'
NC_CHAR = '\x00\x00\x00\x02'
NC_SHORT = '\x00\x00\x00\x03'
# netCDF-3 only supports 32-bit integers
NC_INT = '\x00\x00\x00\x04'
NC_FLOAT = '\x00\x00\x00\x05'
NC_DOUBLE = '\x00\x00\x00\x06'

# Map between netCDF type and numpy dtype and vice versa. Due to a bug
# in the __hash__() method of numpy dtype objects (fixed in development
# release of numpy), we need to explicitly match byteorder for dict
# lookups to succeed. Here we normalize to native byte order.
#
# NC_CHAR is a special case because netCDF represents strings as
# character arrays. When NC_CHAR is encountered as the type of an
# attribute value, this TYPEMAP is not consulted and the data is read
# as a string. However, when NC_CHAR is encountered as the type of a
# variable, then the data is read as a numpy array of 1-char elements
# (equivalently, length-1 raw "strings"). There is no support for numpy
# arrays of multi-character strings.
TYPEMAP = {
    # we could use np.dtype's as key/values except __hash__ comparison of
    # numpy.dtype is broken in older versions of numpy. If you must compare
    # and cannot upgrade, use __eq__. This bug is known to be fixed in
    # numpy version 1.3
    NC_BYTE: 'int8',
    NC_CHAR: '|S1',
    NC_SHORT: 'int16',
    NC_INT: 'int32',
    NC_FLOAT: 'float32',
    NC_DOUBLE: 'float64',
}
# Add the reverse (dtype string -> netCDF type) entries. Iterate over a
# snapshot of the keys: the loop body inserts into TYPEMAP, and mutating a
# dict while iterating a live view of its keys raises RuntimeError.
for k in list(TYPEMAP):
    TYPEMAP[TYPEMAP[k]] = k
39+
40+
# Special characters that are permitted in netCDF names except in the
41+
# 0th position of the string
42+
_specialchars = '_.@+- !"#$%&\()*,:;<=>?[]^`{|}~'
43+
44+
# The following are reserved names in CDL and may not be used as names of
45+
# variables, dimension, attributes
46+
_reserved_names = set([
47+
'byte',
48+
'char',
49+
'short',
50+
'ushort',
51+
'int',
52+
'uint',
53+
'int64',
54+
'uint64',
55+
'float'
56+
'real',
57+
'double',
58+
'bool',
59+
'string',
60+
])
61+
62+
def coerce_type(arr):
    """Coerce a numeric data type to a type that is compatible with
    netCDF-3

    netCDF-3 can not handle 64-bit integers, but on most platforms
    Python integers are int64. To work around this discrepancy, this
    helper function coerces int64 arrays to int32. An exception is
    raised if this coercion is not safe.

    netCDF-3 can not handle booleans, but booleans can be trivially
    (albeit wastefully) represented as bytes. To work around this
    discrepancy, this helper function coerces bool arrays to int8.

    Parameters
    ----------
    arr : numpy.ndarray
        Array whose dtype should be made netCDF-3 compatible.

    Returns
    -------
    numpy.ndarray
        An int32 copy for 64-bit signed integer input, an int8 copy for
        boolean input, and ``arr`` itself (no copy) for any other dtype.
        The byte order of the input dtype is carried over to the result.

    Raises
    ------
    ValueError
        If a 64-bit integer array holds values that do not survive the
        round trip through int32.
    """
    dtype = arr.dtype
    # Identify 64-bit signed integers by kind and item size rather than by
    # dtype.char: the char code for an 8-byte integer is 'l' on some
    # platforms and 'q' on others, whereas kind/itemsize are the same
    # everywhere and are also invariant to endianness.
    if dtype.kind == 'i' and dtype.itemsize == 8:
        cast_arr = arr.astype(
            np.dtype('int32').newbyteorder(dtype.byteorder))
        # astype wraps silently on overflow, so compare against the
        # original values to detect any loss
        if not (cast_arr == arr).all():
            raise ValueError("array contains integer values that " +
                             "are not representable as 32-bit signed integers")
        return cast_arr
    if dtype.kind == 'b':
        # bool
        return arr.astype(
            np.dtype('int8').newbyteorder(dtype.byteorder))
    return arr
91+
92+
def _isalnumMUTF8(c):
93+
"""Return True if the given UTF-8 encoded character is alphanumeric
94+
or multibyte.
95+
96+
Input is not checked!
97+
"""
98+
return (c.isalnum() or (len(c.encode('utf-8')) > 1))
99+
100+
def is_valid_name(s):
    """Test whether an object can be validly converted to a netCDF
    dimension, variable or attribute name

    Earlier versions of the netCDF C-library reference implementation
    enforced a more restricted set of characters in creating new names,
    but permitted reading names containing arbitrary bytes. This
    specification extends the permitted characters in names to include
    multi-byte UTF-8 encoded Unicode and additional printing characters
    from the US-ASCII alphabet. The first character of a name must be
    alphanumeric, a multi-byte UTF-8 character, or '_' (reserved for
    special names with meaning to implementations, such as the
    "_FillValue" attribute). Subsequent characters may also include
    printing special characters, except for '/' which is not allowed in
    names. Names that have trailing space characters are also not
    permitted.

    Returns True when ``s`` is a string satisfying all of the rules above
    and False otherwise. Non-string objects, empty strings and byte
    strings that are not valid UTF-8 are reported as invalid rather than
    raising.
    """
    if not isinstance(s, basestring):
        return False
    if not isinstance(s, unicode):
        try:
            s = unicode(s, 'utf-8')
        except UnicodeDecodeError:
            # bytes that cannot be decoded as UTF-8 cannot form a name
            return False
    if not s:
        # an empty name is never valid; bailing out here also keeps the
        # s[0] / s[-1] lookups below from raising IndexError
        return False
    return ((unicodedata.normalize('NFC', s) == s) and
            (s not in _reserved_names) and
            ('/' not in s) and
            (s[-1] != ' ') and
            (_isalnumMUTF8(s[0]) or (s[0] == '_')) and
            all((_isalnumMUTF8(c) or c in _specialchars for c in s))
            )

0 commit comments

Comments
 (0)