# PySpark examples: three ways to give a DataFrame its schema.
#
# Every section below uses a session object named `spark`, which is not
# created here.
# NOTE(review): presumably this is the SparkSession provided by the pyspark
# shell / notebook -- confirm, or build one before running this as a script.

from pyspark.sql.types import (
    DoubleType,
    IntegerType,
    StringType,
    StructField,
    StructType,
)

# ---------------------------------------------------------------------------
# Create Schema for a File
# ---------------------------------------------------------------------------
# Explicit three-column schema applied while reading a CSV file. The third
# argument of each StructField (True) marks the column as nullable.
EmpSchema = StructType([
    StructField('Empno', IntegerType(), True),
    StructField('Empname', StringType(), True),
    StructField('salary', DoubleType(), True),
])

# NOTE(review): the URI is written with four slashes ("file:////"); the usual
# local-file form is "file:///". Left unchanged -- confirm it resolves.
empDF = spark.read.schema(EmpSchema).csv("file:////home/ak/datasets/emp.csv")

# ---------------------------------------------------------------------------
# Create a Schema for Data
# ---------------------------------------------------------------------------
# Explicit seven-column schema for in-memory rows. This rebinds `EmpSchema`
# (and, below, `empDF`) from the previous section.
EmpSchema = StructType([
    StructField('Empno', IntegerType(), True),
    StructField('Empname', StringType(), True),
    StructField('MGR', IntegerType(), True),
    StructField('YOJ', StringType(), True),
    StructField('deptno', StringType(), True),
    StructField('gender', StringType(), True),
    StructField('salary', DoubleType(), True),
])

# One tuple per employee; values are positional and line up with the seven
# fields of `EmpSchema` above. Note the last two rows carry an empty string
# (not None) in the gender position.
emp = [
    (1, "Smith", 1, "2018", "10", "M", 3000.00),
    (2, "Rose", 1, "2010", "20", "M", 4000.00),
    (3, "Williams", 1, "2010", "10", "M", 1000.00),
    (4, "Jones", 2, "2005", "10", "F", 2000.00),
    (5, "Brown", 2, "2010", "40", "", 300.00),
    (6, "Brown", 2, "2010", "50", "", 2000.00),
]

empDF = spark.createDataFrame(data=emp, schema=EmpSchema)

# ---------------------------------------------------------------------------
# Column name for data (Infer Schema)
# ---------------------------------------------------------------------------
# Only column names are supplied here, so the column types are left for
# Spark to infer from the values in `emp`. This rebinds `empDF` once more.
empColumns = [
    "emp_id",
    "name",
    "superior_emp_id",
    "year_joined",
    "emp_dept_id",
    "gender",
    "salary",
]

empDF = spark.createDataFrame(data=emp, schema=empColumns)