Skip to content

Added percentage rank to DataFrame.rank #6728

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Mar 28, 2014
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/release.rst
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,7 @@ Improvements to existing features
- implement joining a single-level indexed DataFrame on a matching column of a multi-indexed DataFrame (:issue:`3662`)
- Performance improvement in indexing into a multi-indexed Series (:issue:`5567`)
- Testing statements updated to use specialized asserts (:issue:`6175`)
- ``DataFrame.rank()`` now has a percentage rank option (:issue:`5971`)
- ``Series.rank()`` now has a percentage rank option (:issue:`5971`)
- ``Series.rank()`` and ``DataFrame.rank()`` now accept ``method='dense'`` for ranks without gaps (:issue:`6514`)
- ``quotechar``, ``doublequote``, and ``escapechar`` can now be specified when
Expand Down
29 changes: 21 additions & 8 deletions pandas/algos.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -283,7 +283,7 @@ def rank_1d_int64(object in_arr, ties_method='average', ascending=True,


def rank_2d_float64(object in_arr, axis=0, ties_method='average',
ascending=True, na_option='keep'):
ascending=True, na_option='keep', pct=False):
"""
Fast NaN-friendly version of scipy.stats.rankdata
"""
Expand All @@ -296,6 +296,7 @@ def rank_2d_float64(object in_arr, axis=0, ties_method='average',
float64_t sum_ranks = 0
int tiebreak = 0
bint keep_na = 0
float count = 0.0

tiebreak = tiebreakers[ties_method]

Expand Down Expand Up @@ -335,13 +336,15 @@ def rank_2d_float64(object in_arr, axis=0, ties_method='average',
for i in range(n):
dups = sum_ranks = 0
total_tie_count = 0
count = 0.0
for j in range(k):
sum_ranks += j + 1
dups += 1
val = values[i, j]
if val == nan_value and keep_na:
ranks[i, argsorted[i, j]] = nan
continue
count += 1.0
if j == k - 1 or fabs(values[i, j + 1] - val) > FP_ERR:
if tiebreak == TIEBREAK_AVERAGE:
for z in range(j - dups + 1, j + 1):
Expand All @@ -363,15 +366,16 @@ def rank_2d_float64(object in_arr, axis=0, ties_method='average',
for z in range(j - dups + 1, j + 1):
ranks[i, argsorted[i, z]] = total_tie_count
sum_ranks = dups = 0

if pct:
ranks[i, :] /= count
if axis == 0:
return ranks.T
else:
return ranks


def rank_2d_int64(object in_arr, axis=0, ties_method='average',
ascending=True, na_option='keep'):
ascending=True, na_option='keep', pct=False):
"""
Fast NaN-friendly version of scipy.stats.rankdata
"""
Expand All @@ -384,6 +388,7 @@ def rank_2d_int64(object in_arr, axis=0, ties_method='average',
int64_t val
float64_t sum_ranks = 0
int tiebreak = 0
float count = 0.0
tiebreak = tiebreakers[ties_method]

if axis == 0:
Expand Down Expand Up @@ -411,10 +416,12 @@ def rank_2d_int64(object in_arr, axis=0, ties_method='average',
for i in range(n):
dups = sum_ranks = 0
total_tie_count = 0
count = 0.0
for j in range(k):
sum_ranks += j + 1
dups += 1
val = values[i, j]
count += 1.0
if j == k - 1 or fabs(values[i, j + 1] - val) > FP_ERR:
if tiebreak == TIEBREAK_AVERAGE:
for z in range(j - dups + 1, j + 1):
Expand All @@ -436,7 +443,8 @@ def rank_2d_int64(object in_arr, axis=0, ties_method='average',
for z in range(j - dups + 1, j + 1):
ranks[i, argsorted[i, z]] = total_tie_count
sum_ranks = dups = 0

if pct:
ranks[i, :] /= count
if axis == 0:
return ranks.T
else:
Expand Down Expand Up @@ -528,7 +536,7 @@ def rank_1d_generic(object in_arr, bint retry=1, ties_method='average',
ranks[argsorted[j]] = total_tie_count
sum_ranks = dups = 0
if pct:
ranks / count
return ranks / count
else:
return ranks

Expand Down Expand Up @@ -562,7 +570,7 @@ class NegInfinity(object):
__cmp__ = _return_true

def rank_2d_generic(object in_arr, axis=0, ties_method='average',
ascending=True, na_option='keep'):
ascending=True, na_option='keep', pct=False):
"""
Fast NaN-friendly version of scipy.stats.rankdata
"""
Expand All @@ -577,6 +585,7 @@ def rank_2d_generic(object in_arr, axis=0, ties_method='average',
float64_t sum_ranks = 0
int tiebreak = 0
bint keep_na = 0
float count = 0.0

tiebreak = tiebreakers[ties_method]

Expand Down Expand Up @@ -611,7 +620,8 @@ def rank_2d_generic(object in_arr, axis=0, ties_method='average',
for i in range(len(values)):
ranks[i] = rank_1d_generic(in_arr[i],
ties_method=ties_method,
ascending=ascending)
ascending=ascending,
pct=pct)
if axis == 0:
return ranks.T
else:
Expand All @@ -626,12 +636,14 @@ def rank_2d_generic(object in_arr, axis=0, ties_method='average',
for i in range(n):
dups = sum_ranks = infs = 0
total_tie_count = 0
count = 0.0
for j in range(k):
val = values[i, j]
if val is nan_value and keep_na:
ranks[i, argsorted[i, j]] = nan
infs += 1
continue
count += 1.0
sum_ranks += (j - infs) + 1
dups += 1
if j == k - 1 or are_diff(values[i, j + 1], val):
Expand All @@ -652,7 +664,8 @@ def rank_2d_generic(object in_arr, axis=0, ties_method='average',
for z in range(j - dups + 1, j + 1):
ranks[i, argsorted[i, z]] = total_tie_count
sum_ranks = dups = 0

if pct:
ranks[i, :] /= count
if axis == 0:
return ranks.T
else:
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -277,7 +277,7 @@ def rank(values, axis=0, method='average', na_option='keep',
elif values.ndim == 2:
f, values = _get_data_algo(values, _rank2d_functions)
ranks = f(values, axis=axis, ties_method=method,
ascending=ascending, na_option=na_option)
ascending=ascending, na_option=na_option, pct=pct)

return ranks

Expand Down
10 changes: 6 additions & 4 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -4182,7 +4182,7 @@ def f(arr):
return data.apply(f, axis=axis)

def rank(self, axis=0, numeric_only=None, method='average',
na_option='keep', ascending=True):
na_option='keep', ascending=True, pct=False):
"""
Compute numerical data ranks (1 through n) along axis. Equal values are
assigned a rank that is the average of the ranks of those values
Expand All @@ -4205,6 +4205,8 @@ def rank(self, axis=0, numeric_only=None, method='average',
* bottom: smallest rank if descending
ascending : boolean, default True
False for ranks by high (1) to low (N)
pct : boolean, default False
Computes percentage rank of data

Returns
-------
Expand All @@ -4214,18 +4216,18 @@ def rank(self, axis=0, numeric_only=None, method='average',
if numeric_only is None:
try:
ranks = algos.rank(self.values, axis=axis, method=method,
ascending=ascending, na_option=na_option)
ascending=ascending, na_option=na_option,
pct=pct)
return self._constructor(ranks, index=self.index,
columns=self.columns)
except TypeError:
numeric_only = True

if numeric_only:
data = self._get_numeric_data()
else:
data = self
ranks = algos.rank(data.values, axis=axis, method=method,
ascending=ascending, na_option=na_option)
ascending=ascending, na_option=na_option, pct=pct)
return self._constructor(ranks, index=data.index, columns=data.columns)

def to_timestamp(self, freq=None, how='start', axis=0, copy=True):
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1732,7 +1732,7 @@ def rank(self, method='average', na_option='keep', ascending=True,
keep: leave NA values where they are
ascending : boolean, default True
False for ranks by high (1) to low (N)
pct : boolean, defeault False
pct : boolean, default False
Computes percentage rank of data

Returns
Expand Down
12 changes: 12 additions & 0 deletions pandas/tests/test_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -10933,12 +10933,24 @@ def test_rank(self):

def test_rank2(self):
from datetime import datetime
df = DataFrame([[1, 3, 2], [1, 2, 3]])
expected = DataFrame([[1.0, 3.0, 2.0], [1, 2, 3]]) / 3.0
result = df.rank(1, pct=True)
assert_frame_equal(result, expected)

df = DataFrame([[1, 3, 2], [1, 2, 3]])
expected = df.rank(0) / 2.0
result = df.rank(0, pct=True)
assert_frame_equal(result, expected)



df = DataFrame([['b', 'c', 'a'], ['a', 'c', 'b']])
expected = DataFrame([[2.0, 3.0, 1.0], [1, 3, 2]])
result = df.rank(1, numeric_only=False)
assert_frame_equal(result, expected)


expected = DataFrame([[2.0, 1.5, 1.0], [1, 1.5, 2]])
result = df.rank(0, numeric_only=False)
assert_frame_equal(result, expected)
Expand Down