(.env) boris@boris-All-Series:~/VOTING/PYSPARK$ cat PySparkDataFrame2.py
import pyspark
from pyspark.sql import SparkSession
import matplotlib.pyplot as plt
import pandas as pd
# Build a small Spark DataFrame, convert it to pandas, and plot salary per
# first name as a red line overlaid with a bar chart on the same axes.
spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()
try:
    # Sample rows; field order matches `columns` below.
    data = [
        ("James", "", "Smith", "36636", "M", 60000),
        ("Michael", "Rose", "", "40288", "M", 70000),
        ("Robert", "", "Williams", "42114", "", 400000),
        ("Maria", "Anne", "Jones", "39192", "F", 500000),
        ("Jen", "Mary", "Brown", "", "F", 0),
    ]
    columns = ["first_name", "middle_name", "last_name", "dob", "gender", "salary"]

    # Only column names are supplied as the schema, so Spark infers the
    # column types from the data.
    pysparkDF = spark.createDataFrame(data=data, schema=columns)
    pysparkDF.printSchema()
    pysparkDF.show(truncate=False)

    # Converting dataframe to pandas (collects all rows to the driver).
    pandasDF = pysparkDF.toPandas()
finally:
    # Spark is not needed once the data lives in pandas; stop the session so
    # its resources are released even on failure, and so they are not held
    # while plt.show() blocks waiting for the plot window to close.
    spark.stop()

print(pandasDF)

# Plotting the pandas frame: both plots share one Axes so the line is drawn
# over the bars.
ax = plt.gca()
pandasDF.plot(kind='line', x='first_name', y='salary', ax=ax, color='red')
pandasDF.plot(kind='bar', x='first_name', y='salary', ax=ax)
plt.show()
No comments:
Post a Comment