pyspark 初试
1、安装jdk
2、安装spark
curl -o spark.tgz https://mirrors.tuna.tsinghua.edu.cn/apache/spark/spark-4.0.0/spark-4.0.0-bin-hadoop3.tgz
tar -xvf spark.tgz
mv spark-4.0.0-bin-hadoop3 /opt/spark
export SPARK_HOME=/opt/spark
export PATH=$PATH:$SPARK_HOME/bin:$SPARK_HOME/sbin
source /etc/profile
spark-shell
# Quick pyspark smoke test: load a CSV into a DataFrame and exercise the
# basic inspection / query API.
#import findspark
#findspark.init()
from pyspark.sql import SparkSession

# Entry point: create (or reuse) a SparkSession for DataFrame operations.
spark = SparkSession.builder.appName('test').getOrCreate()

#df = spark.read.text("name.txt")
#df.show(2)

# Read the CSV; header=True uses the first row as column names,
# inferSchema=True lets Spark scan the data to guess column types.
df = spark.read.csv("911.csv", header=True, inferSchema=True)

df.show(5)            # pretty-print the first 5 rows
df.head(5)            # first 5 rows as a list of Row objects
df.printSchema()      # inferred column names and types
df.count()            # total number of rows (triggers a full scan)
df.describe().show()  # summary statistics (count/mean/stddev/min/max)

# Random ~5% sample. Fixed typo: 'frction' -> 'fraction'
# (DataFrame.sample keyword argument).
df.sample(fraction=0.05).show()

row = df.head()   # fetch only the first row
row.asDict()      # convert the Row to a plain dict
df.columns        # list of column names (metadata only, no job run)

# Query: select salary plus a derived 10% bonus column.
df.select(df['salary'], ((df['salary'] * 0.1).alias('bonus'))).show()