
ValueError: Found array with 0 sample(s) (shape=(0, 19)) while a minimum of 1 is required. #912


Closed
balalaunicorn opened this issue Jul 5, 2022 · 2 comments


balalaunicorn commented Jul 5, 2022

I'm new to programming and machine learning, and I'm using code I found in a journal for spam detection. When I run it, it fails with an error even though I prepared the data correctly. The error message is 'ValueError: Found array with 0 sample(s) (shape=(0, 19)) while a minimum of 1 is required.'
Can anyone please help me with this issue?
The complete code is here: https://github.com/ijdutse/spd

#!/usr/bin/env python3
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import defaultdict, Counter
from datetime import datetime
import preprocessor as p
import random, os, utils, smart_open, json, codecs, pickle, time
import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.feature_extraction.text import HashingVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.fftpack import fft

data_sources = ['phone.json']

def main():
    spd = Spd(data_sources) # class instantiation
    start = time.process_time()
    relevant_tweets = spd.detector(data_sources)
    stop = time.process_time()
    print('Elapsed: {:.2f}s'.format(stop - start)) # report the processing time
    return relevant_tweets




class Spd:
    """ Some functions to accept raw files, extract relevant fields and filter out irrelevant content."""
    def __init__(self, data_sources):
        self.data_sources = data_sources
        
    # first function in the class:
    def extractor(self, data_source): # accept one file of raw tweets in the form of JSON objects
        data_extracts = {'TweetID':[],'ScreenName':[],'RawTweets':[],'CreatedAt':[],'RetweetCount':[],\
                         'FollowersCount':[],'FriendsCount':[], 'StatusesCount':[],'FavouritesCount':[],\
                         'UserName':[],'Location':[],'AccountCreated':[],'Language':[],'Description':[],\
                         'UserURL':[],'VerifiedAccount':[],'CleanTweets':[],'UserID':[], 'TimeZone':[],'TweetFavouriteCount':[]}
        non_english_tweets = 0 # keep track of the non-English tweets
        with codecs.open(data_source, 'r') as f: # data_source is passed in from detector()
            for line in f.readlines():
                try:
                    line = json.loads(line)
                    if line['lang'] in ['en','en-gb','en-GB','en-AU','en-IN','en_US']:
                        data_extracts['Language'].append(line['Language'])
                        data_extracts['TweetID'].append(line['TweetID'])
                        data_extracts['RawTweets'].append(line['RawTweets'])
                        data_extracts['CleanTweets'].append(p.clean(line['RawTweets']))
                        data_extracts['CreatedAt'].append(line['CreatedAt'])
                        data_extracts['AccountCreated'].append(line['AccountCreated'])                       
                        data_extracts['ScreenName'].append(line['ScreenName'])                          
                        data_extracts['RetweetCount'].append(line['RetweetCount'])
                        data_extracts['FollowersCount'].append(line['FollowersCount'])
                        data_extracts['FriendsCount'].append(line['FriendsCount'])
                        data_extracts['StatusesCount'].append(line['StatusesCount'])
                        data_extracts['FavouritesCount'].append(line['FavouritesCount'])
                        data_extracts['UserName'].append(line['UserName'])
                        data_extracts['Location'].append(line['Location'])
                        data_extracts['Description'].append(line['Description'])
                        data_extracts['UserURL'].append(line['UserURL'])
                        data_extracts['VerifiedAccount'].append(line['VerifiedAccount'])
                        data_extracts['UserID'].append(line['UserID'])
                        data_extracts['TimeZone'].append(line['TimeZone'])
                        data_extracts['TweetFavouriteCount'].append(line['TweetFavouriteCount'])
                    else:
                        non_english_tweets +=1
                except (KeyError, ValueError): # skip malformed records and missing fields
                    continue
            df0 = pd.DataFrame(data_extracts) #convert data extracts to pandas DataFrame
            df0['CreatedAt']=pd.to_datetime(data_extracts['CreatedAt'],errors='coerce') # convert to datetime
            df0['AccountCreated']=pd.to_datetime(data_extracts['AccountCreated'],errors='coerce')
            df0 = df0.dropna(subset=['AccountCreated','CreatedAt']) # drop na in datetime
            # compute the account age (in days) directly from the datetime columns
            df0['AccountAge'] = (df0.CreatedAt - df0.AccountCreated).dt.days
            # add/define additional features ...
            df0['Retweets'] = df0.RawTweets.apply(lambda x: str(x).split()[:1]==['RT']) # guard against empty tweets
            df0['RawTweetsLen'] = df0.RawTweets.apply(lambda x: len(str(x))) # modified
            df0['DescriptionLen'] = df0.Description.apply(lambda x: len(str(x)))
            df0['UserNameLen'] = df0.UserName.apply(lambda x: len(str(x)))
            df0['ScreenNameLen'] = df0.ScreenName.apply(lambda x: len(str(x)))
            df0['LocationLen'] = df0.Location.apply(lambda x: len(str(x)))
            df0['Activeness'] = df0.StatusesCount.truediv(df0.AccountAge)
            df0['Friendship'] = df0.FriendsCount.truediv(df0.FollowersCount)
            df0['Followership'] = df0.FollowersCount.truediv(df0.FriendsCount)
            df0['Interestingness'] = df0.FavouritesCount.truediv(df0.StatusesCount)
            df0['BidirFriendship'] = (df0.FriendsCount + df0.FollowersCount).truediv(df0.FriendsCount)
            df0['BidirFollowership'] = (df0.FriendsCount + df0.FollowersCount).truediv(df0.FollowersCount)
            df0['NamesRatio'] = df0.ScreenNameLen.truediv(df0.UserNameLen)
            df0['CleanTweetsLen'] = df0.CleanTweets.apply(lambda x: len(str(x)))
            df0['LexRichness'] = df0.CleanTweetsLen.truediv(df0.RawTweetsLen)       
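            # Note: the ratio features above divide by counts that can be zero,
            # yielding inf (or NaN for 0/0); detector() later maps these to NaN
            # and drops the affected rows, which can empty the frame entirely.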
            # Remove all RTs, set UserID as index and save relevant files:
            df0 = df0[~df0.Retweets] # remove retweets
            df0 = df0.set_index('UserID')
            df0 = df0[~df0.index.duplicated()] # drop duplicate users
            df0.to_csv(data_source[:5]+'all_extracts.csv') # save all extracts as csv
            with open(data_source[:5]+'non_English.txt','w') as d: # save count of non-English tweets
                d.write('{}'.format(non_english_tweets))
        return df0

    
    def detector(self, data_sources): # accept a list of files of raw tweets as JSON objects
        self.data_sources = data_sources
        for data_source in data_sources:
            df0 = self.extractor(data_source)
            # drop fields not required for prediction
            X = df0.drop(['Language','TweetID','RawTweets','CleanTweets','CreatedAt','AccountCreated','ScreenName',\
                 'Retweets','UserName','Location','Description','UserURL','VerifiedAccount','RetweetCount','TimeZone','TweetFavouriteCount'], axis=1)
            X = X.replace([np.inf,-np.inf],np.nan) # replace infinite values (from zero division) with NaN ...
            X = X.dropna() # ... and drop every row that contains one
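            # If every row still contained a NaN or inf at this point (e.g. an
            # AccountAge of 0 makes Activeness infinite, and a FriendsCount or
            # FollowersCount of 0 does the same to the ratio features), dropna()
            # leaves X empty and predict() below fails with "ValueError: Found
            # array with 0 sample(s)". A quick sanity check:
            print('rows before/after dropna:', len(df0), len(X))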
            # reload the trained model for use:
            with open('trained_rf.pkl','rb') as m:
                spd_filter = pickle.load(m)
            PredictedClass = spd_filter.predict(X) # predict spam or automated accounts/tweets
            X['PredictedClass'] = PredictedClass # include the predicted class in the dataframe
            df0 = df0.loc[X.index] # keep df0 aligned with the rows that survived dropna()
            nonspam = df0[X.PredictedClass.values==1] # sort out the nonspam accounts
            spam = df0[X.PredictedClass.values==0] # sort out spam/automated accounts
            relevant_tweets = nonspam[['CreatedAt','AccountCreated','ScreenName','Location','TimeZone','Description','VerifiedAccount','RawTweets', 'CleanTweets','TweetFavouriteCount','Retweets']]
            relevant_tweets = relevant_tweets.reset_index() # reset index and remove it from the dataframe
            # save files:
            X.to_csv(data_source[:5]+'_all_predicted_classes.csv') # save all predicted classes as csv
            nonspam.to_csv(data_source[:5]+'_nonspam_accounts.csv')
            spam.to_csv(data_source[:5]+'_spam_accounts.csv')
            relevant_tweets.to_csv(data_source[:5]+'_relevant_tweets.csv') # relevant tweets for subsequent analysis
        return relevant_tweets # or return relevant_tweets, nonspam, spam

if __name__ =='__main__':
    main()
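
One way to narrow an error like this down is to count how many rows survive each filtering step before predict() is called. A minimal sketch, assuming the phoneall_extracts.csv written by extractor() above (the column selection here is simplified with select_dtypes rather than the explicit drop list):

import numpy as np
import pandas as pd

# Sketch: trace where the rows disappear in detector().
df0 = pd.read_csv('phoneall_extracts.csv', index_col='UserID') # extractor() output
print('rows extracted:', len(df0))

X = df0.select_dtypes(include=[np.number]) # keep only the numeric features (simplification)
X = X.replace([np.inf, -np.inf], np.nan)
print('rows containing NaN/inf:', int(X.isna().any(axis=1).sum()))

X = X.dropna()
print('rows left for predict():', len(X)) # 0 here reproduces the ValueError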
@glemaitre (Member) commented:

Please provide a minimal example.

@glemaitre (Member) commented:

You can have a look at https://scikit-learn.org/dev/developers/minimal_reproducer.html to craft one.
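
For reference, a reproducer for this particular error can be as small as fitting on dummy data and predicting on an empty array (a sketch, not tied to the script above):

import numpy as np
from sklearn.ensemble import RandomForestClassifier

# Fit on 10 dummy samples with the same 19 features, then predict on 0 rows:
clf = RandomForestClassifier(n_estimators=10).fit(np.random.rand(10, 19), np.random.randint(0, 2, 10))
clf.predict(np.empty((0, 19)))
# ValueError: Found array with 0 sample(s) (shape=(0, 19)) while a minimum of 1 is required.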
