# Source code for pyiem.nws.bufkit
"""A BUFKIT File Reader."""
import re
from io import StringIO
import numpy as np
import pandas as pd
from pyiem.util import LOG
# Matches ``KEY = value`` pairs: the key is exactly four uppercase letters or
# digits, the value is a run of digits, ``-``, ``.`` and ``/``.  It is applied
# with ``findall`` in ``_read_sounding`` to collect the per-section settings
# (``STIM`` and ``TIME`` among them) into a dict.
KEY_VAL_RE = re.compile(r"(?P<key>[A-Z0-9]{4}) = (?P<value>[0-9\-\./]+)")
def _read_station(text: str):
    """Parse the station table of a BUFKIT file into a DataFrame.

    ``text`` is expected to start with the whitespace separated column names
    (the caller slices the file at ``STN YYMMDD/HHMM``), followed by the
    values as one flat stream of whitespace separated tokens.

    Args:
      text (str): the file content from the station header onwards.

    Returns:
      pandas.DataFrame: one row per ``len(keys)`` tokens.  The
      ``YYMMDD/HHMM`` column is replaced by a UTC-aware ``utc_valid`` column
      and the remaining columns are cast to float where that is possible.
    """
    # GEMPAK variables always start with letters; the column names are
    # assumed to sit within the first 1000 characters.
    keys = [t for t in text[:1000].split() if t[0].isalpha()]
    width = len(keys)
    # Everything after the last column name is data
    meat = text.split(keys[-1])[1]
    numbers = meat.split()
    if len(numbers) % width != 0:
        LOG.info(
            "BUFKIT reader found len(numbers)[%s] %% len(keys)[%s] != 0",
            len(numbers),
            width,
        )
        # Likely a corrupted file, so an evasive hack: drop everything from
        # a stray sounding section marker onwards.
        pos = meat.find("STID")
        # str.find() returns -1 when the marker is absent; slicing with that
        # value would silently chop the last character off the data.
        if pos != -1:
            numbers = meat[:pos].split()
    rows = [numbers[i : i + width] for i in range(0, len(numbers), width)]
    df = pd.DataFrame(rows, columns=keys)
    df["utc_valid"] = pd.to_datetime(
        df["YYMMDD/HHMM"],
        format="%y%m%d/%H%M",
        utc=True,
    )
    # errors="ignore" leaves data that cannot be cast to float (such as the
    # utc_valid timestamps) untouched instead of raising.
    return df.drop("YYMMDD/HHMM", axis=1).astype(float, errors="ignore")
def _read_sounding(text):
    """Parse the sounding portion of a BUFKIT file.

    Args:
      text (str): the file content preceding the station table.

    Returns:
      tuple: ``(sndf, stndf)``.  ``sndf`` holds one row per group of
      ``len(SNPARM)`` values, prefixed with the section's ``STIM`` (cast to
      int).  ``stndf`` holds one row per ``STID =`` section built from its
      ``KEY = value`` pairs, with ``TIME`` replaced by a UTC-aware
      ``utc_valid`` column.
    """
    snparm = []
    stnprm = []
    # The parameter lists are looked up in a leading sample of the text only;
    # the first occurrence of each wins.
    for header_line in text[:1000].split("\n"):
        if header_line.startswith("SNPARM"):
            if not snparm:
                snparm = header_line.split("=")[1].strip().split(";")
        elif header_line.startswith("STNPRM") and not stnprm:
            # parsed alongside SNPARM, but not consumed further down
            stnprm = header_line.split("=")[1].strip().split(";")

    width = len(snparm)
    profile_rows = []
    station_rows = []
    # Whatever precedes the first "STID =" is the header handled above
    for chunk in text.split("STID =")[1:]:
        params = dict(KEY_VAL_RE.findall(chunk))
        station_rows.append(params)
        # The values follow the last sounding parameter name
        values = chunk.split(snparm[-1])[-1].split()
        if len(values) % width != 0:
            LOG.info(
                "BUFKIT reader found len(numbers)[%s] %% len(snparm)[%s] != 0",
                len(values),
                width,
            )
            # Likely a corrupted section: its settings stay in station_rows,
            # its profile values are dropped.
            continue
        profile_rows.extend(
            [params["STIM"], *values[start : start + width]]
            for start in range(0, len(values), width)
        )

    stndf = pd.DataFrame(station_rows)
    stndf["utc_valid"] = pd.to_datetime(
        stndf["TIME"], format="%y%m%d/%H%M", utc=True
    )
    stndf = stndf.drop(columns="TIME").astype(
        float, copy=False, errors="ignore"
    )

    sndf = pd.DataFrame(profile_rows, columns=["STIM", *snparm], dtype=float)
    sndf["STIM"] = sndf["STIM"].astype(int)
    return sndf, stndf
# [docs]
def read_bufkit(mixedobj):
    """Read a BUFKIT file and return two pandas dataframes.

    The first dataframe is the sounding values with a column called `STIM`,
    which can be joined against the index of the station_dataframe.

    Args:
      mixedobj (str, os.PathLike or filelike): What to read.  A ``str`` or
        path-like object is treated as a filename and opened as UTF-8 text.
        A ``StringIO`` is consumed through ``getvalue()``, i.e. its whole
        buffer.  Any other object with a ``read()`` method is read from its
        current position; ``bytes`` content is decoded as UTF-8.

    Returns:
      (profile_dataframe, station_dataframe)

    Raises:
      ValueError: if ``mixedobj`` is of an unsupported type, if its
        ``read()`` does not yield text, or if the station data delimiter
        ``STN YYMMDD/HHMM`` is not found.
    """
    import os  # local import, only needed for the PathLike check

    if isinstance(mixedobj, (str, os.PathLike)):
        with open(mixedobj, encoding="utf8") as fh:
            text = fh.read()
    elif isinstance(mixedobj, StringIO):
        text = mixedobj.getvalue()
    elif callable(getattr(mixedobj, "read", None)):
        text = mixedobj.read()
        if isinstance(text, (bytes, bytearray)):
            text = bytes(text).decode("utf8")
        if not isinstance(text, str):
            raise ValueError("Provided mixedobj.read() did not return text")
    else:
        raise ValueError(
            "Provided mixedobj should be str, path-like or file-like"
        )
    # Step 0 remove CR
    text = text.replace("\r", "")
    # Step 1 split the text into two sections: soundings first, then the
    # station table that starts at its column header
    pos = text.find("STN YYMMDD/HHMM")
    if pos == -1:
        raise ValueError("Failed to find station data delimiter")
    sounding_text = text[:pos]
    station_text = text[pos:]
    sndf, paramdf = _read_sounding(sounding_text)
    stndf = _read_station(station_text)
    # Join the paramdf into stndf; the outer join keeps times present in
    # only one of the two frames
    stndf = pd.merge(
        stndf, paramdf, how="outer", left_on="utc_valid", right_on="utc_valid"
    ).set_index("STIM")
    # -9999 is missing
    stndf = stndf.replace({-9999: np.nan})
    sndf = sndf.replace({-9999: np.nan})
    return sndf, stndf