Reddit DataFrame Schema
This is the schema to use when reading the Reddit comments data into a Spark DataFrame:
from pyspark.sql import types
# Explicit schema for the Reddit comments data, for use when reading it into
# a Spark DataFrame. Each entry below is (column name, Spark type class);
# the columns are declared in the order listed.
#
# Note that 'created_utc' and 'edited' are declared as strings here, not as
# numeric or timestamp types, so cast them before doing arithmetic on them.
comments_schema = types.StructType([
    types.StructField(column, spark_type())
    for column, spark_type in (
        ('archived', types.BooleanType),
        ('author', types.StringType),
        ('author_flair_css_class', types.StringType),
        ('author_flair_text', types.StringType),
        ('body', types.StringType),
        ('controversiality', types.LongType),
        ('created_utc', types.StringType),
        ('distinguished', types.StringType),
        ('downs', types.LongType),
        ('edited', types.StringType),
        ('gilded', types.LongType),
        ('id', types.StringType),
        ('link_id', types.StringType),
        ('name', types.StringType),
        ('parent_id', types.StringType),
        ('retrieved_on', types.LongType),
        ('score', types.LongType),
        ('score_hidden', types.BooleanType),
        ('subreddit', types.StringType),
        ('subreddit_id', types.StringType),
        ('ups', types.LongType),
        # Disabled columns, kept for reference.
        # NOTE(review): presumably for inputs that also carry year/month
        # columns -- confirm against the data set before enabling.
        # ('year', types.IntegerType),
        # ('month', types.IntegerType),
    )
])
You can then read the data with:
# Read the JSON input with the explicit schema attached to the reader, so the
# columns get the declared types rather than inferred ones.
comments = spark.read.schema(comments_schema).json(inputs)
Updated Mon Aug. 29 2022, 10:52 by ggbaker.