Understanding price data

12 minute read

Published:

import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from ipywidgets import interact
os.chdir('D:/Documents/MSc course/MAT002/coursework1')
a = pd.read_csv('price_quote_2017_09.csv')
a.head()
headings = a.columns
def region_code(x):
    if x==2:
        return 'London'
    elif x==1:
        return 'Catalogue'
    elif x==3:
        return 'South-East'
    elif x==4:
        return 'South-West'
    elif x==5:
        return 'East Anglia'
    elif x==6:
        return 'East Midlands'
    elif x==7:
        return 'West Midlands'
    elif x==8:
        return 'Yorks & Humber'
    elif x==9:
        return 'North-West'
    elif x==10:
        return 'North'
    elif x==11:
        return 'Wales'
    elif x==12:
        return 'Scotland'
    elif x==13:
        return 'Northern Ireland'
    else:
        return x
    
def shop_code(x):
    if x==1:
        return 'Chain of stores'
    elif x==2:
        return 'Independent'
    else:
        return x

a['REGION'] = a['REGION'].apply(lambda x: region_code(x))
a['SHOP_TYPE'] = a['SHOP_TYPE'].apply(lambda x: shop_code(x))

a.head()
QUOTE_DATEITEM_IDITEM_DESCVALIDITYSHOP_CODEPRICEINDICATOR_BOXORIG_INDICATOR_BOXPRICE_RELATIVELOG_PRICE_RELATIVESTRATUM_WEIGHTSTRATUM_TYPESTART_DATEEND_DATEREGIONSHOP_TYPESHOP_WEIGHTBASE_PRICEBASE_VALIDITYSTRATUM_CELL
0201709210102LARGE LOAF-WHITE-UNSLICED-800G1270.0TT0.00.011.741201704999999North-WestChain of stores11.0249
1201709210102LARGE LOAF-WHITE-UNSLICED-800G1400.0TT0.00.012.841201702999999South-EastChain of stores11.0933
2201709210102LARGE LOAF-WHITE-UNSLICED-800G1440.0TT0.00.07.601201702999999East MidlandsChain of stores11.1036
3201709210102LARGE LOAF-WHITE-UNSLICED-800G1520.0MM0.00.07.601201702999999East MidlandsIndependent11.7036
4201709210102LARGE LOAF-WHITE-UNSLICED-800G1550.0TT0.00.09.051201702999999East AngliaChain of stores11.0935
a.groupby(by='ITEM_DESC').count()['PRICE'].sort_values(ascending=True)
ITEM_DESC
PRE-RECORDED DVD (NON-FILM)        44
COMPUTER GAME TOP 20 CHART         45
WEEKLY NANNY FEES                  46
CD ALBUM (NOT CHART)               46
BLU RAY DISC (FILM) FROM CHART     47
                                 ... 
MOBILE PHONE ACCESSORY            426
RESTAURANT CUP OF COFFEE          464
BOTTLE OF WINE 70-75CL            464
RESTAURANT - SWEET COURSE         465
RESTAURANT MAIN COURSE            466
Name: PRICE, Length: 568, dtype: int64
def confidence_intervals(x):
    result = a[a['ITEM_DESC']==x]
    result = result[result['PRICE']>0]
    
    d = 1.0*np.array(result['PRICE'])
    n = len(d)
    m, se = np.mean(d), stats.sem(d)
    h = se * 2.575
    return m, m-h, m+h
def statistical(x):
    plt.clf()
    result = a[a['ITEM_DESC']==x]
    result = result[result['PRICE']>0]
    
    sns.distplot(result['PRICE'],kde=False, fit=stats.norm)
    plt.tight_layout()
    plt.xlabel('Price of a {}, £'.format(str.capitalize(x)))
    plt.ylabel('Probability distribution')
    plt.axvline(confidence_intervals(x)[0])
    plt.axvline(confidence_intervals(x)[1],linestyle='--')
    plt.axvline(confidence_intervals(x)[2],linestyle='--')
    plt.savefig(x+'_normal.png')
    plt.show()
    
    print('Mean price = £{:.2f}'.format(np.mean(result['PRICE'])))
    print('Lower confidence interval = £{:.2f}'.format(confidence_intervals(x)[1]))
    print('Upper confidence interval = £{:.2f}'.format(confidence_intervals(x)[2]))
    print('Number of quotes = {}'.format(len(result)))
    # Shapiro-Wilk test for normality
    print('Shapiro-Wilk test = {}'.format(stats.shapiro(result['PRICE'])))
def summarise(x):
    result = a[a['ITEM_DESC']==x]
    result = result[result['PRICE']>0]
    return result['PRICE'].describe()
interact(statistical, x=['LAGER - PINT 3.4-4.2%','LIQUEUR PER NIP   SPECIFY ML'])
D:\Anaconda\lib\site-packages\seaborn\distributions.py:2557: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
  warnings.warn(msg, FutureWarning)

png

Mean price = £3.39
Lower confidence interval = £3.30
Upper confidence interval = £3.47
Number of quotes = 399
Shapiro-Wilk test = ShapiroResult(statistic=0.9917417764663696, pvalue=0.025665421038866043)





<function __main__.statistical(x)>
summarise("HOME KILLED BEEF-LEAN MINCE KG")
count    271.000000
mean       7.476568
std        2.064686
min        2.900000
25%        6.180000
50%        7.490000
75%        8.780000
max       13.980000
Name: PRICE, dtype: float64

Test normality for all products

S_W_tests = []

for i in pd.unique(a['ITEM_DESC']):

    result = a[a['ITEM_DESC']==i]
    result = result[result['PRICE']>0]
    
    # Looking at all ANOVA values
    S_W_tests.append((i,stats.shapiro(result['PRICE'])[0],stats.shapiro(result['PRICE'])[1]))
    
Swilk_stats = pd.DataFrame(S_W_tests,columns=['Item','T_Cr','p_val'])
Swilk_stats[['Item','T_Cr','p_val']].sort_values(by='p_val',ascending=True)
ItemT_Crp_val
394WOMENS LEGGINGS FULL LENGTH0.3599457.055836e-33
381WOMEN'S SKIRT: CASUAL0.4615392.500297e-31
378WOMEN'S CARDIGAN0.4498834.000081e-31
383WOMEN'S VEST/STRAPPY TOP0.4255454.382403e-31
407WOMEN'S NIGHTDRESS/PYJAMAS0.4512215.599286e-31
............
445DAILY DISPOSABLE SOFT LENSES0.9840989.940200e-02
347HOME CARE ASSISTANT HRLY RATE0.9814161.663436e-01
494CAR REPAIRS LOCAL GARAGE0.9848712.340499e-01
199DRAUGHT STOUT PER PINT0.9957323.606236e-01
344HOME REMOVAL- 1 VAN0.9902746.221716e-01

568 rows × 3 columns

Wilcoxon signed rank stat

def wilcoxon(x):
    plt.clf()
    result = a[a['ITEM_DESC']==x]
    result = result[result['PRICE']>0]
    N = len(result)
    
    sns.distplot(result['PRICE'],kde=False, fit=stats.norm)
    plt.xlabel('Price of a {}, £'.format(str.capitalize(x)))
    plt.ylabel('Probability distribution')
    plt.axvline(confidence_intervals(x)[0])
    plt.axvline(confidence_intervals(x)[1],linestyle='--')
    plt.axvline(confidence_intervals(x)[2],linestyle='--')
    plt.show()
    
    print('Mean price = £{:.2f}'.format(np.mean(result['PRICE'])))
    print('Median price = £{:.2f}'.format(np.median(result['PRICE'])))
    print('Lower confidence interval (normal approx.) = £{:.2f}'.format(confidence_intervals(x)[1]))
    print('Upper confidence interval (normal approx.) = £{:.2f}'.format(confidence_intervals(x)[2]))
    # Shapiro-Wilk test for normality
    print('Shapiro-Wilk test = {}'.format(stats.shapiro(result['PRICE'])))
    print('Sample size = {}'.format(len(result['PRICE'])))
    
    #Wilcoxon signed-rank test
    test, p_val = stats.wilcoxon(x=np.median(result['PRICE'])*np.ones(shape=(N)), y=result['PRICE'])
    print('Wilcoxon Test stat = {}'.format(test))
    print('Wilcoxon P_value = {}'.format(p_val))
    
    confidence = []
    for i in np.linspace(np.median(result['PRICE'])*1, np.median(result['PRICE'])*1.5, num=66):
        p_val = stats.wilcoxon(x=i*np.ones(shape=(N)), y=result['PRICE'])[1]
        confidence.append((i,p_val))
    wilcoxon = pd.DataFrame(confidence,columns=['Medians','p_val'])
    return wilcoxon
wilcoxon("HOURLY RATE FOR SOLICITOR")
D:\Anaconda\lib\site-packages\seaborn\distributions.py:2557: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
  warnings.warn(msg, FutureWarning)

png

Mean price = £254.09
Median price = £240.00
Lower confidence interval (normal approx.) = £235.69
Upper confidence interval (normal approx.) = £272.49
Shapiro-Wilk test = ShapiroResult(statistic=0.8421726226806641, pvalue=5.8617789733261816e-08)
Sample size = 83
Wilcoxon Test stat = 986.5
Wilcoxon P_value = 0.09462102392757735
Mediansp_val
0240.0000009.462102e-02
1241.8461542.393837e-01
2243.6923083.660094e-01
3245.5384624.161481e-01
4247.3846158.167931e-01
.........
61352.6153857.855655e-13
62354.4615385.633270e-13
63356.3076925.633270e-13
64358.1538465.269514e-13
65360.0000006.677325e-13

66 rows × 2 columns

def regional_chart(x):
    plt.clf()
    result = a[a['ITEM_DESC']==x]
    result = result[result['PRICE']>0]
    result = result[result['REGION']!='Northern Ireland']
    
    g = result.groupby('REGION')
    
    sns.set(style="ticks")

    # Initialize the figure
    f, ax = plt.subplots(figsize=(7, 6))
    
    sns.boxplot(y="REGION", x="PRICE", data=result)
    sns.swarmplot(y="REGION", x="PRICE", data=result, size=2, color='.3')
    # Add in points to show each observation
    
    plt.title('Boxplot for {}'.format(str.capitalize(x)))
    plt.ylabel('Region code')
    plt.xlabel('Price, £')
    plt.savefig(x+'_regional.png')
    plt.show()
    
    print(g['PRICE'].mean())
    
    # ANOVA
    
    samples = [np.array(x[1]['PRICE']) for x in g]
    f_val, p_val = stats.kruskal(*samples)
    print('F value: {:.3f}, p value: {:.3f}'.format(f_val, p_val))
regional_chart('TAKEAWAY CHICKEN & CHIPS')
<Figure size 432x288 with 0 Axes>

png

REGION
East Anglia       3.883571
East Midlands     4.806250
London            3.916429
North             3.846667
North-West        4.232667
Scotland          5.371818
South-East        5.050000
South-West        4.793000
Wales             4.325000
West Midlands     3.848182
Yorks & Humber    3.253077
Name: PRICE, dtype: float64
F value: 30.428, p value: 0.001
F_Stats = []

for i in pd.unique(a['ITEM_DESC']):

    result = a[a['ITEM_DESC']==i]
    result = result[result['PRICE']>0]
    
    # Looking at all ANOVA values
    g = result.groupby('REGION')
    samples = [np.array(x[1]['PRICE']) for x in g]
    f_val, p_val = stats.kruskal(*samples)
    
    F_Stats.append((i,f_val,p_val))

F_Stats = pd.DataFrame(F_Stats,columns=['Item','f_val','p_val'])
F_Stats.sort_values(by='p_val',ascending=False).head(n=30)
Itemf_valp_val
144MELON EACH EG HONEYDEW1.2386780.999852
512MP4 PLAYER1.5911430.999821
221VODKA-70 CL BOTTLE1.3422920.999780
311WASHING UP LIQUID 380-900ML1.3720810.999754
122VEGETABLE STIR FRY FRESH PACK1.4671710.999659
118PRE-PACKED SALAD 100-250G1.5745460.999518
123SWEET POTATO PER KG1.6729650.999355
6DRY SPAGHETTI OR PASTA 500G1.8959510.998830
207LAGER 12 - 24 CANS (440-500ML)2.0172620.998434
384WOMENS DRESS -CASUAL/FORMAL2.5210250.998084
196VENDING MACHINE - SOFT DRINK2.2087620.997618
77TEA BAGS-2-PACKET OF 2402.2273700.997524
206LAGER 4 BOTTLES- PREMIUM2.2542850.997384
143FRUIT FRESH SNACKING 150-350G2.3292990.996964
92SUGAR -GRANULATED-WHITE-PER KG2.4221040.996377
142BLUEBERRIES PUNNET PER KG2.4719170.996030
12HOT OAT CEREAL2.4828880.995951
501PUSHCHAIR/STROLLER2.5137570.995720
124CANNED TOMATOES 390-400G2.6854670.994264
539PORTABLE DIGITAL STORAGE DEVIC2.6970060.994155
150CANNED SOUP-390-425G2.7460300.993674
20BISCUITS HALF CHOC 260-400G2.9615170.991225
166COFFEE PODS PACK 8-163.1310890.988877
447MOBILE PHONE ACCESSORY3.7129220.988077
516FLAT PANEL TV 14-22"/35-55CM3.8302460.986308
43CONTINENTAL SLICED DELI MEAT3.2928380.986262
161CHILLED PIZZA EACH 300-600G3.4313010.983712
78FRUIT SQUASH, 750ML - 1.5LT3.4536520.983272
45CHICKEN KIEV 2 PACK 240-310G3.4858120.982626
158CHILLED READY MEAL-MEAT-SERVS13.5300370.981709
len(F_Stats[F_Stats['p_val']<0.05])
106
shoptype_chart('MENS SOCKS 1 PAIR')

png

SHOP_TYPE
Chain of stores    2.013203
Independent        7.511379
Name: PRICE, dtype: float64
Shapiro-Wilk test = [ShapiroResult(statistic=0.7307813167572021, pvalue=4.8377427903997244e-21), ShapiroResult(statistic=0.9316719770431519, pvalue=0.0028513146098703146)]
Variances = [2.1291349375008, 10.721570511296076]
F value: 401.414, p value: 0.000
F_Stats = []

for i in pd.unique(a['ITEM_DESC']):

    result = a[a['ITEM_DESC']==i]
    result = result[result['PRICE']>0]
    
    # Looking at all ANOVA values
    g = result.groupby('SHOP_TYPE')
    samples = [x[1]['PRICE'] for x in g]
    
    if len(samples)>1:
        f_val, p_val = stats.f_oneway(*samples)

        F_Stats.append((i,f_val,p_val))

F_Stats = pd.DataFrame(F_Stats,columns=['Item','f_val','p_val'])
F_Stats.sort_values(by='p_val',ascending=True)
Itemf_valp_val
381MENS SOCKS 1 PAIR433.8848242.628217e-63
120BANANAS-PER KG520.1316533.259348e-63
99FRESH VEG-CUCUMBER-WHOLE416.0194142.653576e-57
100FRESH VEG-LETTUCE-ICEBERG-EACH362.4046381.065520e-51
327MENS JEANS238.1779301.581973e-41
............
138DRIED POTTED SNACK 50-120G0.0056529.401826e-01
322FUNERAL-CREMATION0.0011499.729816e-01
24FROZEN BEEFBURGERS PACK OF 40.0001329.908450e-01
449NON-NHS MEDICINE-PHYSIOTHERAPY0.0001159.914533e-01
263ELECTRIC COOKER0.0000289.958069e-01

523 rows × 3 columns

Seasonal trends

b = pd.concat(pd.read_csv(i,names=headings,skiprows=1) for i in os.listdir() if i.startswith('upload'))
b.head()
QUOTE_DATEITEM_IDITEM_DESCVALIDITYSHOP_CODEPRICEINDICATOR_BOXORIG_INDICATOR_BOXPRICE_RELATIVELOG_PRICE_RELATIVESTRATUM_WEIGHTSTRATUM_TYPESTART_DATEEND_DATEREGIONSHOP_TYPESHOP_WEIGHTBASE_PRICEBASE_VALIDITYSTRATUM_CELL
0201601210102LARGE LOAF-WHITE-UNSLICED-800G1210.0TT0.00.08.2112015029999998111.39038
1201601210102LARGE LOAF-WHITE-UNSLICED-800G1270.0TT0.00.011.5712015049999999111.38349
2201601210102LARGE LOAF-WHITE-UNSLICED-800G1400.0TT0.00.08.2112015029999998111.39038
3201601210102LARGE LOAF-WHITE-UNSLICED-800G1430.0MM0.00.010.7312015029999992113.69032
4201601210102LARGE LOAF-WHITE-UNSLICED-800G1470.0TT0.00.08.2112015029999998111.39038
import datetime as dt
b['QUOTE_DATE'] = b['QUOTE_DATE'].apply(lambda x: dt.datetime.strftime(dt.datetime.strptime(str(x)[-2:],'%m'),'%b'))
b.head()
QUOTE_DATEITEM_IDITEM_DESCVALIDITYSHOP_CODEPRICEINDICATOR_BOXORIG_INDICATOR_BOXPRICE_RELATIVELOG_PRICE_RELATIVESTRATUM_WEIGHTSTRATUM_TYPESTART_DATEEND_DATEREGIONSHOP_TYPESHOP_WEIGHTBASE_PRICEBASE_VALIDITYSTRATUM_CELL
0Jan210102LARGE LOAF-WHITE-UNSLICED-800G1210.0TT0.00.08.2112015029999998111.39038
1Jan210102LARGE LOAF-WHITE-UNSLICED-800G1270.0TT0.00.011.5712015049999999111.38349
2Jan210102LARGE LOAF-WHITE-UNSLICED-800G1400.0TT0.00.08.2112015029999998111.39038
3Jan210102LARGE LOAF-WHITE-UNSLICED-800G1430.0MM0.00.010.7312015029999992113.69032
4Jan210102LARGE LOAF-WHITE-UNSLICED-800G1470.0TT0.00.08.2112015029999998111.39038
def seasonal_analysis(x):
    
    plt.clf()
    
    result = b[b['ITEM_DESC']==x]
    result = result[result['PRICE']>0]
    
    # Initialize the figure
    f, ax = plt.subplots(figsize=(7, 6))
    
    sns.boxplot(y="PRICE", x="QUOTE_DATE", data=result)
    sns.swarmplot(y="PRICE", x="QUOTE_DATE", data=result, size=2, color='.3')
    # Add in points to show each observation
    
    plt.title('Boxplot for {}'.format(str.capitalize(x)))
    plt.ylabel('Price, £')
    plt.xlabel('Month of 2016')

    plt.show()

    g = result.groupby('QUOTE_DATE')
    
    # ANOVA
    
    samples = [np.array(x[1]['PRICE']) for x in g]
    
    # Look at min length of array

    minimum_sample = np.min([len(i) for i in samples])

    # Randomly remove

    samples = [np.random.choice(i,minimum_sample) for i in samples]
    
    #Friedman
    f_val, p_val = stats.friedmanchisquare(*samples)
    print('F value: {:.3f}, p value: {:.3f}'.format(f_val, p_val))
    
    print(g['PRICE'].mean())
seasonal_analysis("""KEROSENE - 1000L DELIVERED""")
D:\Anaconda\lib\site-packages\seaborn\categorical.py:1296: UserWarning: 5.3% of the points cannot be placed; you may want to decrease the size of the markers or use stripplot.
  warnings.warn(msg, UserWarning)



<Figure size 432x288 with 0 Axes>

png

F value: 506.239, p value: 0.000
QUOTE_DATE
Aug    353.054436
Dec    444.412647
Jul    362.422985
Jun    361.453603
Nov    410.575344
Oct    423.470534
Sep    373.458947
Name: PRICE, dtype: float64
F_Stats = []

for i in pd.unique(b['ITEM_DESC']):

    try:
        result = b[b['ITEM_DESC']==i]
        result = result[result['PRICE']>0]

        # Looking at all ANOVA values
        g = result.groupby('QUOTE_DATE')
        samples = [np.array(x[1]['PRICE']) for x in g]

        minimum_sample = np.min([len(i) for i in samples])

        samples = [np.random.choice(i,minimum_sample) for i in samples]

        f_val, p_val = stats.friedmanchisquare(*samples)

        F_Stats.append((i,f_val,p_val))
    
    except ValueError:
        pass
    
F_Stats = pd.DataFrame(F_Stats,columns=['Item','f_val','p_val'])
F_Stats.sort_values(by='p_val',ascending=True).head(n=20)
Itemf_valp_val
797KEROSENE - 1000L DELIVERED489.1836901.796869e-102
773CIGARETTES 8275.1624291.704665e-56
774CIGARETTES 12262.8589487.309182e-54
776CIGARETTES 20190.8987581.638615e-38
696MELON EACH EG HONEYDEW190.8973781.639723e-38
77720 FILTER - OTHER BRAND176.8448011.586925e-35
248KEROSENE - 1000L DELIVERED161.6867286.355599e-34
224CIGARETTES 15158.5886582.934914e-33
223CIGARETTES 12146.3489301.233071e-30
775CIGARETTES 15145.0383548.653131e-29
640FRUIT SQUASH, 750ML - 1.5LT142.8738132.479238e-28
772CREAM LIQUER 70CL-1LT 15-20%129.1077681.980945e-25
222CIGARETTES 8110.4342295.880041e-23
54MARGARINE/LOW FAT SPREAD-500G107.8342212.107678e-22
519BOOK-NON-FICTION-HARD-COVER107.3366372.690798e-22
624MARGARINE/LOW FAT SPREAD-500G110.4842771.613894e-21
132SMALL TYPE ORANGES PER PACK/KG124.6074292.139395e-21
1037BOOK-NON-FICTION-PAPER-BACK103.5506084.552591e-20
82PURE FRUIT SMOOTHIE 750ML-1LTR111.5022319.177381e-19
694BLUEBERRIES PUNNET PER KG91.8472611.251904e-17