Python pandas tips to make data analysis faster: tricks from my library

How to Create Custom Django Management Commands

Following are some easy-to-use pandas library tricks you can implement in a second.

Remove duplicates of csv file:

import pandas as pd
def remove_dumplicates_of_csv(input_path='/home/bharat/a.csv',
                              output_path='/home/bharat/b.csv'):
  """Drop fully-duplicated rows from a CSV file.

  Reads *input_path*, keeps only the first occurrence of each duplicated
  row, and writes the result to *output_path* (without the index column).

  Args:
    input_path: CSV file to read. Default kept for backward compatibility
      with the original hard-coded path.
    output_path: where the de-duplicated CSV is written.

  Returns:
    The de-duplicated DataFrame (the original returned None; returning
    the frame is backward-compatible and makes the function testable).
  """
  df = pd.read_csv(input_path).drop_duplicates(keep='first')
  df.to_csv(output_path, index=False)
  print(df)
  return df
Remove duplicates of csv file based on id column:

import pandas as pd
def remove_dumplicates_of_csv(input_path='/home/bharat/a.csv',
                              output_path='/home/bharat/b.csv',
                              subset=('id',)):
  """Drop rows from a CSV that are duplicated on the given columns.

  Reads *input_path*, keeps only the first row for each distinct value of
  the *subset* columns, and writes the result to *output_path*.

  Args:
    input_path: CSV file to read (default preserves the original path).
    output_path: where the de-duplicated CSV is written.
    subset: column names used to decide duplication. A tuple default is
      used instead of the original hard-coded ``['id']`` to avoid a
      mutable default argument while keeping the same behavior.

  Returns:
    The de-duplicated DataFrame.
  """
  df = pd.read_csv(input_path).drop_duplicates(keep='first',
                                               subset=list(subset))
  df.to_csv(output_path, index=False)
  print(df)
  return df
Print the differences between two text files:

def get_duplicates_from_csv(path_a='/home/bharat/a.txt',
                            path_b='/home/bharat/b.txt'):
  """Print (and return) the unified diff between two text files.

  NOTE: despite the name, this reports the *differences* between the two
  files, not their duplicated lines. The original compared
  ``diff(b.txt, a.txt)``; that argument order is preserved.

  Args:
    path_a: first file (the diff "target"; default: original a.txt path).
    path_b: second file (the diff "source"; default: original b.txt path).

  Returns:
    The list of unified-diff lines (original returned None; this is a
    backward-compatible addition).
  """
  import difflib  # the original snippet used difflib without importing it

  # Context managers close the handles deterministically; the original
  # leaked both file objects.
  with open(path_a) as fh:
      text2 = fh.readlines()
  with open(path_b) as fh:
      text1 = fh.readlines()

  diff_lines = list(difflib.unified_diff(text1, text2))
  for line in diff_lines:
      print(line)
  return diff_lines
Merge two CSV files, removing duplicates from each, then drop duplicate rows from the combined result:

def dtop_duplicates_from_two_csv(path_a='/home/bharat/a.csv',
                                 path_b='/home/bharat/b.csv'):
  """Concatenate two CSVs and remove duplicate rows from the result.

  Each input is first de-duplicated on its own, the two frames are
  concatenated, then rows duplicated *across* the files are dropped.

  BUG FIX: the original called ``df.drop_duplicates(keep=False)`` without
  assigning the result — ``drop_duplicates`` is not in-place, so the
  second dedup never took effect and both prints showed the same frame.
  The result is now assigned, using ``keep='first'`` to match the stated
  intent of "remove duplication from the new file".

  Args:
    path_a: first CSV (default preserves the original path).
    path_b: second CSV (default preserves the original path).

  Returns:
    The combined, de-duplicated DataFrame.
  """
  df_1 = pd.read_csv(path_a).drop_duplicates(keep='first')
  df_2 = pd.read_csv(path_b).drop_duplicates(keep='first')
  df = pd.concat([df_1, df_2])
  print(df)
  df = df.drop_duplicates(keep='first')
  print(df)
  return df
Merge all files inside the folder and create new combined csv:

def merge_csv_files_into_one(folder='/home/bharat/files/',
                             output_name='combined_csv.csv'):
  """Concatenate every ``*.csv`` in *folder* into one combined CSV.

  The combined file is written into the same folder, UTF-8 with BOM
  (``utf-8-sig``) so Excel opens it correctly — same as the original.

  Improvements over the original:
  - no ``os.chdir``: chdir mutates process-global state and breaks any
    caller that relies on the current working directory;
  - ``sorted(...)`` makes the concatenation order deterministic
    (``glob`` order is platform-dependent).

  Args:
    folder: directory scanned for ``*.csv`` files (default preserved).
    output_name: filename of the combined CSV written into *folder*.

  Returns:
    The combined DataFrame. Raises ``ValueError`` (from ``pd.concat``)
    when the folder contains no CSV files, as the original did.
  """
  import glob
  import os

  import pandas as pd

  all_filenames = sorted(glob.glob(os.path.join(folder, '*.csv')))
  # combine all files in the list
  combined_csv = pd.concat([pd.read_csv(f) for f in all_filenames])
  # export to csv
  combined_csv.to_csv(os.path.join(folder, output_name),
                      index=False, encoding='utf-8-sig')
  return combined_csv
Extract a particular row from each CSV inside the folder and generate a new combined CSV:


def extract_row_from_csvs_folder(folder='/home/bharat/files/',
                                 output_path='/home/bharat/file_output.csv',
                                 row_index=1):
  """Collect one row from every CSV in *folder* and write them to one CSV.

  By default row index 1 is taken — the first data row after the header,
  matching the original ``examples[1]``.

  Improvements over the original:
  - ``endswith('.csv')`` instead of ``'.csv' in f`` (the substring test
    also matched names like ``a.csv.bak``);
  - files with too few rows are skipped via an explicit length check
    instead of a bare ``except: pass``, which silently swallowed *every*
    error, not just the missing-row case;
  - ``os.path.join`` instead of string concatenation;
  - ``sorted(...)`` makes the output row order deterministic
    (``os.listdir`` order is platform-dependent).

  Args:
    folder: directory scanned for CSV files (default preserved).
    output_path: where the collected rows are written (default preserved).
    row_index: which row to take from each file.

  Returns:
    The list of extracted rows (original returned None; this addition is
    backward-compatible).
  """
  import csv
  import os

  rows = []
  for name in sorted(os.listdir(folder)):
      if not name.endswith('.csv'):
          continue
      with open(os.path.join(folder, name), 'r', newline='') as fh:
          records = list(csv.reader(fh))
      if len(records) > row_index:
          rows.append(records[row_index])

  with open(output_path, 'w', newline='') as csvfile:
      csv.writer(csvfile).writerows(rows)

  return rows
Shuffle or rearrange the columns of all CSVs inside the folder:

def shuffle_columns_of_csvs(folder='/home/bharat/files/',
                            column_order=None):
  """Rewrite every CSV in *folder* with its columns in a fixed order.

  Each ``name.csv`` produces a ``name_shuffle.csv`` next to it. Matching
  the original behavior, the output is written WITHOUT a header row
  (``header=False``) and without the index.

  Improvements over the original:
  - the column list and folder are parameters (defaults preserved);
  - ``endswith('.csv')`` instead of the substring test ``'.csv' in f``;
  - ``os.path.splitext`` instead of ``file.split(".")[0]``, which broke
    filenames containing extra dots;
  - ``os.path.join`` instead of string concatenation.

  Args:
    folder: directory scanned for CSV files.
    column_order: desired column order; None selects the original
      hard-coded event-table column list.
  """
  import os

  import pandas as pd

  if column_order is None:
      column_order = ['id', 'appid', 'mtimestamp', 'uptimestamp',
                      'ipaddress', 'city', 'country', 'eventname',
                      'createddate', 'createdtime', 'events', 'deviceid',
                      'timestamp', 'kafkaoffset']

  for name in [f for f in os.listdir(folder) if f.endswith('.csv')]:
      print("processing start:- ", name)
      df = pd.read_csv(os.path.join(folder, name), low_memory=False)
      # reorder (shuffle) columns
      df = df[list(column_order)]
      base = os.path.splitext(name)[0]
      # header=False keeps the original contract: no header in the output
      df.to_csv(os.path.join(folder, base + '_shuffle.csv'),
                index=False, header=False)
      print("done:- ", name)

  return
Filter a particular column value in every CSV inside the folder and create a new CSV with the filtered rows:

def registration_filteration_in_csv(folder='/home/bharat/files/',
                                    event_name='Register'):
  """Write the rows matching one ``eventname`` value for every CSV.

  Each ``name.csv`` in *folder* produces a ``name_reg.csv`` containing
  only the rows whose ``eventname`` column equals *event_name*.

  Improvements over the original:
  - ``error_bad_lines`` was deprecated in pandas 1.3 and removed in 2.0;
    ``on_bad_lines='skip'`` is the modern equivalent of
    ``error_bad_lines=False`` (skip malformed lines silently);
  - removed the unused ``arr_dict`` local;
  - ``endswith('.csv')`` instead of the substring test, and
    ``os.path.splitext`` / ``os.path.join`` for path handling;
  - the folder and event value are parameters (defaults preserved).

  Args:
    folder: directory scanned for CSV files.
    event_name: value of the ``eventname`` column to keep.
  """
  import os

  import pandas as pd

  for name in [f for f in os.listdir(folder) if f.endswith('.csv')]:
      print("processing start:- ", name)
      df = pd.read_csv(os.path.join(folder, name), low_memory=False,
                       on_bad_lines='skip')
      filtered = df[df['eventname'] == event_name]
      base = os.path.splitext(name)[0]
      filtered.to_csv(os.path.join(folder, base + '_reg.csv'), index=False)