```python
import matplotlib.pyplot as plt

# add a subplot
ax1 = plt.subplot(2, 2, 1)
# add a subplot with no frame
ax2 = plt.subplot(2, 2, 2, frameon=False)
# add a polar subplot
plt.subplot(2, 2, 3, projection='polar')
# add a red subplot that shares the x-axis with ax1
plt.subplot(2, 2, 4, sharex=ax1, facecolor='red')

# delete ax2 from the figure
plt.delaxes(ax2)
# add ax2 to the figure again
plt.subplot(ax2)
```
```python
from cycler import cycler
import numpy as np
import matplotlib.pyplot as plt

x = np.linspace(0, 2 * np.pi)
offsets = np.linspace(0, 2 * np.pi, 4, endpoint=False)
# Create array with shifted-sine curve along each column
yy = np.transpose([np.sin(x + phi) for phi in offsets])

fig, ax1 = plt.subplots()  # not in the excerpt; fig and ax1 are assumed to exist

# 2. Define prop cycle for a single set of axes: color & width
ax1.set_prop_cycle(cycler('color', ['c', 'm', 'y', 'k']) +
                   cycler('lw', [1, 2, 3, 4]))
ax1.plot(yy)
ax1.set_title('Set axes color cycle to cmyk')

# Tweak spacing between subplots to prevent labels from overlapping
fig.subplots_adjust(hspace=0.3)
plt.show()
```
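The "# 2." label suggests the excerpt's missing first step set the default prop cycle applied to every new set of axes; a minimal sketch of that step, assuming the standard rcParams route (the colors and linestyles here are illustrative):

```python
import matplotlib.pyplot as plt
from cycler import cycler

# Set the default property cycle used by all axes created afterwards
plt.rc('axes', prop_cycle=(cycler('color', ['r', 'g', 'b', 'y']) +
                           cycler('linestyle', ['-', '--', ':', '-.'])))
```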
Sample 4 - set figure size and save the picture
```python
#### way 1
from matplotlib.pyplot import figure

figure(figsize=(20, 10), dpi=80)
```
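Only the sizing half appears here; a minimal sketch of the saving half, assuming the usual `savefig` call (file name and options are illustrative):

```python
import matplotlib.pyplot as plt

fig = plt.figure(figsize=(20, 10), dpi=80)
plt.plot([1, 2, 3], [4, 5, 6])
# write the figure to disk; bbox_inches='tight' trims surplus whitespace
fig.savefig('figure.png', dpi=80, bbox_inches='tight')
```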
```python
import pyspark.sql.functions as F
from pyspark.sql.types import IntegerType

# define the function behind the new column
def Total(Course_Fees, Discount):
    res = Course_Fees - Discount
    return res

# wrap it as a UDF and apply it
new_f = F.udf(Total, IntegerType())
new_df = df.withColumn("Total_price", new_f("Course_Fees", "Discount"))
```
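A self-contained sketch to exercise the UDF, assuming an active SparkSession and the column names from the snippet (the sample rows are illustrative):

```python
import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(100, 10), (200, 25)], ["Course_Fees", "Discount"])

new_f = F.udf(lambda fees, discount: fees - discount, IntegerType())
df.withColumn("Total_price", new_f("Course_Fees", "Discount")).show()
```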
```python
from pyspark.sql.functions import udf, col
from pyspark.sql.types import IntegerType

# via the udf decorator
@udf(IntegerType())
def Total(Course_Fees, Discount):
    res = Course_Fees - Discount
    return res

# via udf + a lambda function
function = udf(lambda col1, col2: col1 - col2, IntegerType())
new_df = old_df.withColumn('col_n', function(col('col_1'), col('col_2')))
```
The code:

```python
from pyspark.sql.functions import *

@F.udf(IntegerType())
def TimeDiff(a, b):
    return abs(a - b)
```

The error:

```
TypeError: Invalid argument, not a string or column: 1 of type <class 'int'>. For column literals, use 'lit', 'array', 'struct' or 'create_map' function.
```
This happens because the star import `from pyspark.sql.functions import *` shadows Python's built-in `abs`, so the UDF body calls the `abs` from `pyspark.sql.functions`, which expects a Column, on plain Python ints.
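A minimal sketch of the fix: drop the star import and use a module alias, so the built-in `abs` is left intact.

```python
import pyspark.sql.functions as F  # aliased import; built-in abs is not shadowed
from pyspark.sql.types import IntegerType

@F.udf(IntegerType())
def TimeDiff(a, b):
    return abs(a - b)  # Python's built-in abs, not pyspark.sql.functions.abs
```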
TypeError: Can not infer schema for type: <class 'str'>
This error is raised when calling `rdd.toDF()` on an RDD of plain strings.
```python
from pyspark.sql import Row

row = Row("val")  # or some other column name
rdd.map(row).toDF()
```
```scala
import java.io.File
import java.nio.file.Paths
import org.ini4j.{Ini, IniPreferences}

// read the S3 credentials from ~/.aws/credentials
val cf = Paths.get(System.getProperty("user.home"), ".aws/credentials")
val c = new Ini(new File(cf.toUri))
val prefs = new IniPreferences(c)
val awsAccessKeyId = prefs.node("default").get("aws_access_key_id", "no")
val awsSecretAccessKey = prefs.node("default").get("aws_secret_access_key", "no")
```
```
➜ ~ hdfs dfs -copyFromLocal /Users/wii/heap /tmp/
2022-06-04 23:20:22,272 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
2022-06-04 23:21:23,169 WARN hdfs.DataStreamer: Exception in createBlockOutputStream blk_1073741843_1019
org.apache.hadoop.net.ConnectTimeoutException: 60000 millis timeout while waiting for channel to be ready for connect. ch : java.nio.channels.SocketChannel[connection-pending remote=/10.1.0.191:9866]
	at org.apache.hadoop.net.NetUtils.connect(NetUtils.java:589)
	at org.apache.hadoop.hdfs.DataStreamer.createSocketForPipeline(DataStreamer.java:253)
	at org.apache.hadoop.hdfs.DataStreamer.createBlockOutputStream(DataStreamer.java:1774)
	at org.apache.hadoop.hdfs.DataStreamer.nextBlockOutputStream(DataStreamer.java:1728)
	at org.apache.hadoop.hdfs.DataStreamer.run(DataStreamer.java:713)
	...
```