import pandas as pd
datapath = 'Sim Data (Clusters; 800, Seed; 2639)' # all of the data is within a folder in this .ipynb file's directory
stardata = pd.read_csv(datapath + '/All Star Data.txt', delimiter=' ') # read the data from the .txt file into a dataframe
equats = stardata['Equatorial'] # get the equatorial positions of all of the stars
polars = stardata['Polar'] # get the polar positions of all of the stars
parallax = stardata['Parallax'] # get the parallax of the stars
Plotting the Local Galaxy¶
Now that the data is imported, let's take a look at all of the stars that are nearby to us. The dataset includes parallax angles, so it's a simple task to select the stars with non-negligible parallax and plot them.
# now, lets plot all of the stars that have some parallax angle
import matplotlib.pyplot as plt
indexes = [i for i, x in enumerate(parallax) if x > 0.001] # this will return the indexes of all stars that have some parallax
# now to populate new lists with all of the equatorial/polar angles of stars that have some parallax
localequats = [equats[i] for i in indexes]
localpolars = [polars[i] for i in indexes]
# now lets plot these close stars!
fig = plt.figure(figsize=(15, 15)) # make a figure to plot stuff on
ax = fig.add_subplot(1, 1, 1) # and add an axis so we can plot stuff on the figure
ax.scatter(localequats, localpolars, s=0.5, c='w') # plot small-ish dots, all white in colour
ax.set_xlim([0, 360]) # we go 0->360 degrees equatorially
ax.set_ylim([0, 180]) # we go 0->180 degrees along the poles
ax.invert_yaxis() # the data is set up so that polar angle of 0 is at the top, so we need to flip the axis
ax.set_facecolor('k') # space has a black background, duh
ax.set_aspect(1) # equal aspect: one degree spans the same distance on both axes, so the plot ends up twice as wide as it is tall - no stretching!
plt.show() # now, finally show the figure
This looks really cool! We can see a horizontal band of stars as we'd expect from our local galaxy. We can also see some not-too-distant galaxies to the top right of the picture. I'm curious to see what it looks like in 3D, so I'll program that too.
This is going to be a bit more complicated than the above, since we have data in (mostly) spherical coordinates, but we need it in Cartesian coordinates (x, y, z) in order to plot it in 3D.
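For reference, the conversion applied in the code below is the standard spherical-to-Cartesian one, where $\phi$ is the equatorial angle, $\theta$ is the polar angle, and the distance is $r = 1/p$ parsecs for a parallax angle $p$ in arcseconds:
$$ x = r\cos(\phi)\sin(\theta), \quad y = r\sin(\phi)\sin(\theta), \quad z = r\cos(\theta) $$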
import numpy as np
localparallax = np.array(parallax[indexes]) # parallax angles (in arcseconds) of the nearby stars
distance = (1 / localparallax) # distance in parsecs: d = 1/p for a parallax p in arcseconds
equat, polar = np.radians(localequats), np.radians(localpolars) # convert the angles to radians
# now to convert the angles and distances to cartesian coordinates
x = distance * np.cos(equat) * np.sin(polar)
y = distance * np.sin(equat) * np.sin(polar)
z = distance * np.cos(polar)
fig = plt.figure(figsize=(10, 10)); # and finally plot as before
ax = fig.add_subplot(projection='3d')
ax.scatter(x, y, z, s=0.5); ax.set_xlabel('pc'); ax.set_ylabel('pc'); ax.set_zlabel('pc');
That looks really cool! We can see a couple of galaxies here. Let's zoom in on that big spiral galaxy in the center. I'm going to do that by restricting the stars we plot to only those with high parallax.
closeindexes = [i for i, x in enumerate(localparallax) if x > 0.007] # restriction step
closex = x[closeindexes]; closey = y[closeindexes]; closez = z[closeindexes]; # get the XYZ of close stars
fig = plt.figure(figsize=(8, 8)); # and now plot
ax = fig.add_subplot(projection='3d')
ax.scatter(closex, closey, closez, s=0.5);
ax.set_xlabel('pc'); ax.set_ylabel('pc'); ax.set_zlabel('pc');
ax.set_xlim([-100, 100]); ax.set_ylim([-100, 100]); ax.set_zlim([-100, 100]); # we want some equal axis limits so things are proportional
plt.show()
It looks like we're in an SBa galaxy (tightly wound spiral arms, with a clear central bar)! You can also see some distortion in our data to the left of the image due to the parallax measurements. Since the measured parallax angles are discrete, we have discontinuities in the distances to far away stars. This isn't anything wrong, just a limitation of the data.
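As a quick illustration of that quantisation effect (a sketch with a made-up step size, not the survey's actual precision): if parallax angles were only recorded in steps of 0.001″, the possible distances would pile up in discrete shells that get further and further apart for more distant stars.
step = 0.001  # hypothetical parallax resolution in arcseconds
quantised = step * np.arange(1, 8)  # the first few representable parallax values
print(1 / quantised)  # allowed distances in pc: roughly [1000, 500, 333, 250, 200, 167, 143]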
Determining Variable Star Properties¶
Now, let's take a look at the stars that have some variability AND parallax data. Firstly, let's get the names and rough luminosities (in the V band) of each star, where luminosity is calculated from the flux $F$ and distance $r$ by
$$ L = 4\pi r^2 F $$
variables = stardata["Variable?"]
variableindexes = [i for i, x in enumerate(parallax) if x > 0.005 and variables[i] == 1] # we only want stars that are pretty close!
starnames = stardata['Name']; starnames = [i for i in starnames]
variablenames = [starnames[i] for i in variableindexes]
variableparallaxes = np.array([stardata["Parallax"][i] for i in variableindexes])
Vfluxes = stardata['GreenF']; variableVfluxes = np.array([Vfluxes[i] for i in variableindexes])
variableVlumins = variableVfluxes * 4 * np.pi * ((1 / variableparallaxes) * 3.086 * 10**16)**2 # L = 4*pi*r^2*F, with r = 1/parallax converted from parsecs to metres (1 pc = 3.086e16 m)
Now we've got some luminosities. Let's work out what the periods of these light curves are.
Calculating Periods from Timeseries Data¶
This can be a simple job to do by hand for one or two lightcurves, but doing it accurately across potentially thousands of lightcurves can be difficult. There are several ways you could do this; the one most familiar to me is the Lomb-Scargle periodogram. Let's start by importing some packages and seeing how we find the period of one star.
from astropy.timeseries import LombScargle
from glob import glob
variablepath = datapath + "/Variable Star Data/"
fnames = glob(variablepath + "*.txt")
freqs = np.linspace(1/120, 0.45, 10000) # frequency grid shouldn't go higher than Nyquist limit
for lightcurve in fnames[-2:]: # I'm only looking at the last couple of files here, to avoid a noisy first lightcurve!
    if lightcurve[len(variablepath):-4] in variablenames: # this checks whether the star of this lightcurve is in our variable stars
        data = pd.read_csv(lightcurve, delimiter=' ') # load in the data
        time, flux = data['Time'], data['NormalisedFlux'] # just extract the columns as variables
        plt.scatter(time, flux);
        plt.title('Star lightcurve'); plt.xlabel('Hours'); plt.ylabel('Normalised Flux'); plt.show()
        LS = LombScargle(time, flux) # initialise a Lomb-Scargle periodogram
        power = LS.power(freqs) # calculate the LS power at each trial frequency
        plt.plot(freqs, power); plt.xlabel('Frequency (1/hrs)'); plt.ylabel('LS Power')
        plt.show()
        print('Most likely period: %.2f h' % (1 / freqs[np.argmax(power)]))
        break # I only want to plot one lightcurve here!
Most likely period: 28.63 h
This looks excellent! We can clearly see (top graph) that this star has a well-defined period, with a roughly triangular light curve. The Lomb-Scargle periodogram (bottom graph) shows that one dominant frequency makes up the overall light curve, and the corresponding most likely period (the inverse of the frequency at the largest peak) is about 28.63 hours.
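As a quick sanity check (my own addition, assuming time, flux, freqs and power from the cell above are still in scope), we can phase-fold the lightcurve at the recovered period. If the period is right, the cycles should stack neatly on top of one another; a wrong period would smear the folded points out.
best_period = 1 / freqs[np.argmax(power)]  # the ~28.63 h period found above
phase = (time % best_period) / best_period  # phase of each observation, between 0 and 1
plt.scatter(phase, flux, s=2)
plt.xlabel('Phase'); plt.ylabel('Normalised Flux'); plt.show()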
We can now do this analysis across all of the nearby variable stars to approximate their periods.
periods = [] # initialise a list to hold all of our period data
for lightcurve in fnames:
    if lightcurve[len(variablepath):-4] in variablenames: # this checks whether the star of this lightcurve is in our variable stars
        data = pd.read_csv(lightcurve, delimiter=' ') # load in the data
        time, flux = data['Time'], data['NormalisedFlux'] # just extract the columns as variables
        LS = LombScargle(time, flux) # initialise a Lomb-Scargle fitting
        power = LS.power(freqs) # calculate LS power
        bestfreq = freqs[np.argmax(power)] # which frequency has the highest Lomb-Scargle power?
        pred = LS.model(time, bestfreq) # sine-wave model at the best frequency (not used below, but handy for checking fits)
        periods.append(1 / bestfreq) # add each period to the list
periods = np.array(periods) # turn it from a list to an array
Now with that done, we can plot the data and see where our variable stars lie on a period-luminosity graph.
fig, ax = plt.subplots()
ax.scatter(periods, variableVlumins, s=1)
ax.set_yscale('log')
ax.set_xlabel('Period (hours)')
ax.set_ylabel('V-band Luminosity (W)');
We can see some noisy data scattered mostly to the left, which we can ignore. What's left seems to be three nicely defined linear trends in period and log-luminosity.
Monte Carlo Uncertainty Propagation¶
Time to work out the period-luminosity relations, starting with the lower-period relation. Before we do this, however, I must mention uncertainty in the data. With any experimental/observational data in science, there is some inherent uncertainty in the values we obtain, and it's important to propagate these errors when we fit trendlines or do maths with them! One of the easiest ways to do this is Monte Carlo uncertainty propagation. I've created a gif outlining what this does, but the tl;dr is this: in most cases we can assume that the uncertainty in each data point follows a normal distribution, with the measured value as its mean and the uncertainty as its standard deviation. So, we can simulate thousands of datasets sampled from those distributions (using Python), fit a trendline to each one, and look at the spread of the fitted parameters. This gives us best-fit parameters for our model as well as uncertainties on those parameters!
With this in mind, I'm going to create a function that will do this all for me.
def monte_carlo(xdata, xuncs, ydata, yuncs, iterations):
    ''' Simulate many samples from the data distributions and obtain a *linear* trendline with parameters and uncertainties.
    Parameters
    ----------
    x/ydata : np.array
        The measured x and y values.
    x/yuncs : float or np.array
        Standard deviations of our data. Can be a single value if the uncertainty is the same across all data points,
        or an array of unique SDs.
    iterations : int
        Number of simulated datasets to generate and fit.
    Returns
    -------
    np.array
        [mean gradient, SD of gradient, mean y-intercept, SD of y-intercept] over all iterations.
    '''
    # initialise arrays to store our data in
    grads = np.zeros(iterations)
    yints = np.zeros(iterations)
    x_rand = np.zeros(len(xdata))
    y_rand = np.zeros(len(xdata))
    # if our uncertainty is a scalar, make it an array with N copies of that value (N being the length of our data array)
    if np.size(xuncs) == 1:
        xuncs = np.ones(len(xdata)) * xuncs
    if np.size(yuncs) == 1:
        yuncs = np.ones(len(ydata)) * yuncs
    # now to perform n=iterations random samples of our data distributions
    for i in range(iterations):
        for j in range(len(xdata)):
            # generate a random normal variable for each of our XY data points
            x_rand[j] = np.random.normal(xdata[j], xuncs[j])
            y_rand[j] = np.random.normal(ydata[j], yuncs[j])
        # now fit a line to our random data. A 1-dimensional polynomial is just a straight line!
        grads[i], yints[i] = np.polyfit(x_rand, y_rand, 1)
    # now get the statistics of our *iterations* trendline parameters
    meangrad = np.mean(grads)
    SDgrad = np.std(grads)
    meanyint = np.mean(yints)
    SDyint = np.std(yints)
    return np.array([meangrad, SDgrad, meanyint, SDyint])
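Before using this on the real data, here's a quick sanity check on synthetic data (my own addition, not part of the original analysis): generate points along a known straight line, add noise at the stated uncertainty level, and confirm that monte_carlo recovers the true gradient and y-intercept to within its quoted uncertainties.
# sanity check of monte_carlo() on synthetic data with a known trend (gradient 2, y-intercept 5)
test_x = np.linspace(0, 10, 30)
test_y = 2 * test_x + 5 + np.random.normal(0, 0.5, len(test_x))  # y = 2x + 5 with some scatter
result = monte_carlo(test_x, 0.1, test_y, 0.5, 1000)
print(f"gradient = {result[0]:.2f} ± {result[1]:.2f}, y-intercept = {result[2]:.2f} ± {result[3]:.2f}")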
Calculating Period-Luminosity Functions¶
With our Monte Carlo trendline-fitting function now defined, we can use it to fit some lines to the data! Let's start with the low-period variable data as before.
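A quick note on the uncertainties used below: the periods are given an uncertainty of 0.1 hours, and for the luminosities I'll assume (my assumption, not something stated in the data) a roughly 1% relative uncertainty. Since the fit is done in log-space, a relative uncertainty $\sigma_L / L$ becomes
$$ \sigma_{\log_{10} L} = \frac{\sigma_L}{L \ln 10} \approx \frac{0.01}{\ln 10}, $$
which is where the 0.01 / np.log(10) value in the code comes from.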
lowperiodindexes = [i for i, x in enumerate(periods) if 15 <= x <= 30]
# we need another step to remove the outlier(s) at the bottom: only keep indexes whose luminosity is high enough
lowperiodindexes = [lowperiodindexes[i] for i, x in enumerate(variableVlumins[lowperiodindexes]) if x > 10**25]
lowperiods = periods[lowperiodindexes]
lowperiodlumins = np.log10(variableVlumins[lowperiodindexes]) # get lumins in log scale
lowperiodunc = 0.01 / np.log(10) # ~1% relative luminosity uncertainty, expressed as an uncertainty in log10(L)
# now, lets fit a linear trend to this data. We can do this with a polynomial fit of one degree from numpy:
shortgradient, shortgradSD, shortyint, shortyintSD = monte_carlo(lowperiods, 0.1, lowperiodlumins, lowperiodunc, 20000)
print(f"Short grad = {round(shortgradient, 3)}±{round(shortgradSD, 3)}; Short y-int = {round(shortyint, 2)}±{round(shortyintSD, 2)}")
fitperiods = np.linspace(min(lowperiods), max(lowperiods), 10)
fitlumins = shortgradient * fitperiods + shortyint
fig, ax = plt.subplots()
# ax.scatter(lowperiods, lowperiodlumins);
ax.errorbar(lowperiods, lowperiodlumins, xerr=0.1, yerr=lowperiodunc, fmt='.', lw=1)
ax.plot(fitperiods, fitlumins, 'r-')
ax.set_xlabel('Period (hours)'); ax.set_ylabel(r'Log$_{10}$ V-band luminosity (W)');
Short grad = -0.087±0.002; Short y-int = 28.27±0.04
This gives a reasonably close and consistent fit with the data, with a trendline of approximately
$$\text{Log}_{10}(L_V) = (-0.087\pm 0.002)P + (28.27\pm0.04)$$
Now we should do the same with the longer-period data:
longerperiodindexes = [i for i, x in enumerate(periods) if 30 <= x <= 52]
# we need another step to remove the outlier(s) at the bottom: only keep indexes whose luminosity is high enough
longerperiodindexes = [longerperiodindexes[i] for i, x in enumerate(variableVlumins[longerperiodindexes]) if x > 10**23]
longerperiods = periods[longerperiodindexes]
longerperiodlumins = np.log10(variableVlumins[longerperiodindexes]) # get lumins in log scale
longerperiodunc = 0.01 / np.log(10) # again, ~1% relative luminosity uncertainty in log-space
# now, lets fit a linear trend to this data. We can do this with a polynomial fit of one degree from numpy:
longergradient, longergradSD, longeryint, longeryintSD = monte_carlo(longerperiods, 0.1, longerperiodlumins, longerperiodunc, 20000)
print(f"Longer grad = {round(longergradient, 3)}±{round(longergradSD, 3)}; Longer y-int = {round(longeryint, 2)}±{round(longeryintSD, 2)}")
fitperiods = np.linspace(min(longerperiods), max(longerperiods), 10)
fitlumins = longergradient * fitperiods + longeryint
fig, ax = plt.subplots()
ax.errorbar(longerperiods, longerperiodlumins, xerr=0.1, yerr=longerperiodunc, fmt='.', lw=1)
ax.plot(fitperiods, fitlumins, 'r-')
ax.set_xlabel('Period (hours)'); ax.set_ylabel(r'Log$_{10}$ V-band luminosity (W)');
Longer grad = -0.087±0.002; Longer y-int = 33.71±0.08
This trendline gives a fit of approximately
$$\text{Log}_{10}(L_V) = (-0.087\pm 0.002)P + (33.71\pm 0.08)$$
This seems like quite a strong trend! Finally, let's fit to the third period-luminosity trend.
longestperiodindexes = [i for i, x in enumerate(periods) if 80 <= x]
# we need another step to remove the outlier(s) at the bottom: only keep indexes whose luminosity is high enough
longestperiodindexes = [longestperiodindexes[i] for i, x in enumerate(variableVlumins[longestperiodindexes]) if x > 10**23]
longestperiods = periods[longestperiodindexes]
longestperiodlumins = np.log10(variableVlumins[longestperiodindexes]) # get lumins in log scale
longestperiodunc = 0.01 / np.log(10) # as before, ~1% relative luminosity uncertainty in log-space
# now, lets fit a linear trend to this data. We can do this with a polynomial fit of one degree from numpy:
longestgradient, longestgradSD, longestyint, longestyintSD = monte_carlo(longestperiods, 0.1, longestperiodlumins,
longestperiodunc, 20000)
print(f"Longest grad = {round(longestgradient, 3)}±{round(longestgradSD, 3)}; Longest y-int = {round(longestyint, 2)}±{round(longestyintSD, 2)}")
fitperiods = np.linspace(min(longestperiods), max(longestperiods), 10)
fitlumins = longestgradient * fitperiods + longestyint
fig, ax = plt.subplots()
ax.errorbar(longestperiods, longestperiodlumins, xerr=0.1, yerr=longestperiodunc, fmt='.', lw=1)
ax.plot(fitperiods, fitlumins, 'r-')
ax.set_xlabel('Period (hours)'); ax.set_ylabel(r'Log$_{10}$ V-band luminosity (W)');
Longest grad = -0.087±0.001; Longest y-int = 34.35±0.06
This trendline gives a linear fit of
$$\text{Log}_{10}(L_V) = (-0.087\pm0.001)P + (34.35\pm 0.06)$$
In practice, any of these three period-luminosity relations can be used to find the distance to resolved variable stars: measure a star's period, read off its luminosity from the relation, and compare that to its observed flux. Ideally, all three relations would be used for a given galaxy so that we can best determine its distance. In truth, especially for distant galaxies, only the PL relation for the brightest stars will be of much practical use, since only those stars can be resolved at such distances, despite their smaller numbers on the HR diagram.
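As a closing illustration (my own sketch, with a made-up example flux rather than a measurement from the data), this is how one of these relations would be used in practice: measure the period of a resolved variable star in a distant galaxy, get its luminosity from the appropriate PL relation, and then invert $F = L / (4\pi r^2)$ to estimate the distance.
# hypothetical example: a short-period variable with P = 20 h and an observed V-band flux of 1e-13 W/m^2
example_period = 20  # hours
example_flux = 1e-13  # W/m^2 (made-up value, for illustration only)
logL = shortgradient * example_period + shortyint  # log10 luminosity from the short-period PL relation
distance_m = np.sqrt(10**logL / (4 * np.pi * example_flux))  # invert F = L / (4*pi*r^2) to get r in metres
print(f"Estimated distance: {distance_m / (3.086 * 10**16):.0f} pc")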